# Baseline Model

### Libraries Import

In [95]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, log_loss, balanced_accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import FunctionTransformer

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

import dill

### Data Loading

In [96]:
x_train = pd.read_csv("X_Train_Data_Input.csv")
y_train = pd.read_csv("Y_Train_Data_Target.csv")
x_test = pd.read_csv("X_Test_Data_Input.csv")
y_test = pd.read_csv("Y_Test_Data_Target.csv")

### Train Test Split

In [97]:
IDs = y_train['ID']

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train['target'])
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((628106, 23), (157027, 23), (628106, 2), (157027, 2))

In [98]:
x_train.reset_index(drop=True, inplace=True)
x_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)

### Data Preprocessing

In [99]:
x_train.drop(['ID', 'Column9', 'Column14'], axis=1, inplace=True)
x_val.drop(['ID', 'Column9', 'Column14'], axis=1, inplace=True)
x_test.drop(['ID', 'Column9', 'Column14'], axis=1, inplace=True)

In [100]:
y_train['target'] = y_train['target'].astype('category')
y_val['target'] = y_val['target'].astype('category')

In [101]:
# Impute missing values
imputer = SimpleImputer(strategy='median')
# imputer = KNNImputer(n_neighbors=5)

x_imputed = imputer.fit_transform(x_train[x_train.select_dtypes('number').columns])
x_imputed = pd.DataFrame(x_imputed, columns = x_train.select_dtypes('number').columns)
x_train_imput = pd.concat([y_train['ID'], x_imputed], axis = 1)

x_val_imputed = imputer.transform(x_val[x_val.select_dtypes('number').columns])
x_val_imputed = pd.DataFrame(x_val_imputed, columns = x_val.select_dtypes('number').columns)
x_val_imput = pd.concat([y_val['ID'], x_val_imputed], axis = 1)

x_test_imputed = imputer.transform(x_test[x_test.select_dtypes('number').columns])
x_test_imputed = pd.DataFrame(x_test_imputed, columns = x_test.select_dtypes('number').columns)
x_test_imput = pd.concat([y_test['ID'], x_test_imputed], axis = 1)

In [102]:
# Initialize the StandardScaler
sc = StandardScaler()

# Apply the scaler to the numerical columns in the training data
x_train_imput_scaled = sc.fit_transform(x_train_imput.select_dtypes('number'))

# Apply the scaler to the numerical columns in the validation data
x_val_imput_scaled = sc.transform(x_val_imput.select_dtypes('number'))

# Apply the scaler to the numerical columns in the validation data
x_test_imput_scaled = sc.transform(x_test_imput.select_dtypes('number'))

# If you want to keep the scaled data in DataFrame format, you can do:
x_train_imput_scaled_df = pd.DataFrame(x_train_imput_scaled, columns=x_train_imput.select_dtypes('number').columns)
x_val_imput_scaled_df = pd.DataFrame(x_val_imput_scaled, columns=x_val_imput.select_dtypes('number').columns)
x_test_imput_scaled_df = pd.DataFrame(x_test_imput_scaled, columns=x_val_imput.select_dtypes('number').columns)

x_train_imput = x_train_imput_scaled_df.copy()
x_val_imput = x_val_imput_scaled_df.copy()
x_test_imput = x_test_imput_scaled_df.copy()

In [103]:
# Oversample after imputation
smote = SMOTE(random_state=42)
X_imputed_resampled, y_imputed_resampled = smote.fit_resample(x_train_imput.select_dtypes('number'), y_train['target'])

# Convert back to DataFrame for convenience
x_train_imput = pd.DataFrame(X_imputed_resampled, columns=x_train_imput.select_dtypes('number').columns)
Y_train = pd.DataFrame(y_imputed_resampled, columns=['target'])

# Model Building

In [104]:
xgb_params = {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1}
lightbgm_params = {'lambda_l1': 3.962980604776118, 'lambda_l2': 1.5075196212131254, 'num_leaves': 90, 'feature_fraction': 0.9675647296514234, 'bagging_fraction': 0.9979919873256766, 'bagging_freq': 2, 'min_child_samples': 63}
catboost_params =  {
    'iterations': 700,
    'learning_rate': 0.12851432422429102,
    'l2_leaf_reg': 0.8373406120034129,
    'border_count': 171,
    'bagging_temperature': 22.360639700244604,
    'random_strength': 1.3991869546371338,
    'boosting_type': 'Plain',
    'depth': 9,
    'grow_policy': 'SymmetricTree',
    'eval_metric': 'F1', 
}

In [105]:
models = {
    'XGBClassifier': XGBClassifier(**xgb_params),
    'CatBoostClassifier': CatBoostClassifier(**catboost_params, verbose = False),
    'LGBMClassifier': LGBMClassifier(**lightbgm_params, verbose = -1)
}

# Model Training

In [106]:
for name, model in models.items():
    model.fit(x_train_imput.select_dtypes('number') , Y_train['target'])
    print(f'{name} Trained \n')
    print(' * ' * 35 +'\n')

XGBClassifier Trained 

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

CatBoostClassifier Trained 

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

LGBMClassifier Trained 

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 



# Training Results

In [107]:
results_train = {}
for name, model in models.items():
    
    prediction = model.predict(x_train_imput.select_dtypes('number'))
    probabilities = model.predict_proba(x_train_imput.select_dtypes('number'))
    
    # ROC AUC Score
    roc_auc = roc_auc_score(Y_train['target'], prediction)

    # Accuracy Score
    accuracy = accuracy_score(Y_train['target'], prediction)
    precision = precision_score(Y_train['target'], prediction)
    recall = recall_score(Y_train['target'], prediction)
    f1 = f1_score(Y_train['target'], prediction)
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(Y_train['target'], prediction)
    
    conf = []
    conf.append(conf_matrix)
    
    logloss = log_loss(Y_train['target'], probabilities)
    
    results_train[name] = [roc_auc, accuracy, precision, recall, f1, logloss, conf]
    print(name + "'s Training Results Recorded")

XGBClassifier's Training Results Recorded
CatBoostClassifier's Training Results Recorded
LGBMClassifier's Training Results Recorded


In [108]:
def print_results(results, dataset):
    print('\n' + ' * ' * 30 + '\n')
    print('                                Dataset Name :', dataset)
    print('\n' + ' * ' * 30 + '\n')
    for (model, result) in zip(results.keys(), results.values()):
        print('Model Name :', model)
        print('ROC Score :', result[0])
        print('Accuracy Score :', result[1])
        print('Precision Score :', result[2])
        print('Recall Score :', result[3])
        print('F1 Score :', result[4])
        print('Log Loss :', result[5])
        print('\n' + ' * ' * 15 +'\n')

In [109]:
print_results(results_train, 'Training Dataset')


 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

                                Dataset Name : Training Dataset

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : XGBClassifier
ROC Score : 0.9857931373927719
Accuracy Score : 0.9857931373927717
Precision Score : 0.9737966410873605
Recall Score : 0.9984531008297005
F1 Score : 0.9859707472200138
Log Loss : 0.04693734960252023

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : CatBoostClassifier
ROC Score : 0.988563493179581
Accuracy Score : 0.9885634931795809
Precision Score : 0.9785101371825243
Recall Score : 0.9990683448178878
F1 Score : 0.9886823830663679
Log Loss : 0.0392169975032979

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : LGBMClassifier
ROC Score : 0.9860682393474898
Accuracy Score : 0.9860682393474898
Precision Score : 0.9746076753682094
Recall Score : 0.9981419631556743
F1 Score : 0.9862344409060554
Log

# Validation Results

In [110]:
results_val = {}
for name, model in models.items():
    
    val_prediction = model.predict(x_val_imput.select_dtypes('number'))
    probabilities = model.predict_proba(x_val_imput.select_dtypes('number'))
    
    # ROC AUC Score
    roc_auc = roc_auc_score(y_val['target'], val_prediction)

    # Accuracy Score
    accuracy = accuracy_score(y_val['target'], val_prediction)
    precision = precision_score(y_val['target'], val_prediction)
    recall = recall_score(y_val['target'], val_prediction)
    f1 = f1_score(y_val['target'], val_prediction)
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_val['target'], val_prediction)
    conf = []
    conf.append(conf_matrix)

    logloss = log_loss(y_val['target'], probabilities)
        
    results_val[name] = [roc_auc, accuracy, precision, recall, f1, logloss, conf]
    print(name + "'s Validation Results Recorded")

XGBClassifier's Validation Results Recorded
CatBoostClassifier's Validation Results Recorded
LGBMClassifier's Validation Results Recorded


In [111]:
print_results(results_val, 'Validation Dataset')


 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

                                Dataset Name : Validation Dataset

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : XGBClassifier
ROC Score : 0.9797674070604235
Accuracy Score : 0.9740363122265597
Precision Score : 0.790094084567968
Recall Score : 0.9868305531167691
F1 Score : 0.8775712441067836
Log Loss : 0.06926239323344127

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : CatBoostClassifier
ROC Score : 0.9746017157505795
Accuracy Score : 0.9750361402816076
Precision Score : 0.8031070772314717
Recall Score : 0.9740663199837915
F1 Score : 0.8803637917353354
Log Loss : 0.06698774140387906

 *  *  *  *  *  *  *  *  *  *  *  *  *  *  * 

Model Name : LGBMClassifier
ROC Score : 0.9782188194994981
Accuracy Score : 0.9743547288046005
Precision Score : 0.7940534642662302
Recall Score : 0.9829810224893631
F1 Score : 0.8784742131152489


# Best Models Results

In [112]:
results_val['XGBClassifier']

[0.9797674070604235,
 0.9740363122265597,
 0.790094084567968,
 0.9868305531167691,
 0.8775712441067836,
 0.06926239323344127,
 [array([[138338,   3882],
         [   195,  14612]], dtype=int64)]]

In [113]:
results_val['CatBoostClassifier']

[0.9746017157505795,
 0.9750361402816076,
 0.8031070772314717,
 0.9740663199837915,
 0.8803637917353354,
 0.06698774140387906,
 [array([[138684,   3536],
         [   384,  14423]], dtype=int64)]]

In [114]:
results_val['LGBMClassifier']

[0.9782188194994981,
 0.9743547288046005,
 0.7940534642662302,
 0.9829810224893631,
 0.8784742131152489,
 0.06827327850604616,
 [array([[138445,   3775],
         [   252,  14555]], dtype=int64)]]

# Model Ensembling

In [115]:
# Step 1: Make predictions using the 3 models
xgb_pred = models['XGBClassifier'].predict(x_test_imput)
catboost_pred = models['CatBoostClassifier'].predict(x_test_imput)
lgbm_pred = models['LGBMClassifier'].predict(x_test_imput)

# Step 2: Create a DataFrame with predictions from all models
predictions_df_majority = pd.DataFrame({
    'XGBoost_Pred': xgb_pred,
    'CatBoost_Pred': catboost_pred,
    'LightGBM_Pred': lgbm_pred
})

# Step 3: Ensemble (majority voting)
# Here we take the mode (most frequent value) across the rows to get the ensemble prediction
predictions_df_majority['Ensemble_Majority'] = predictions_df_majority.mode(axis=1)[0]

# Step 4: Calculate evaluation metrics on the ensemble predictions
ensemble_pred = predictions_df_majority['Ensemble_Majority']

predictions_df_majority

Unnamed: 0,XGBoost_Pred,CatBoost_Pred,LightGBM_Pred,Ensemble_Majority
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
261707,0,0,0,0
261708,0,0,0,0
261709,0,0,0,0
261710,0,0,0,0


# Individual Model Results

In [116]:
# Step 1: Create a list to store evaluation metrics
metrics_list = []

# Evaluate XGBoost predictions
xgb_pred = predictions_df_majority['XGBoost_Pred']
xgb_accuracy = accuracy_score(y_test['target'], xgb_pred)
xgb_precision = precision_score(y_test['target'], xgb_pred)
xgb_recall = recall_score(y_test['target'], xgb_pred)
xgb_f1 = f1_score(y_test['target'], xgb_pred)
xgb_roc_auc = roc_auc_score(y_test['target'], xgb_pred)
DenScore = (xgb_accuracy + xgb_precision + xgb_recall + xgb_f1 + xgb_roc_auc) / 5

metrics_list.append({'Model': 'XGBoost',
                     'Accuracy': xgb_accuracy,
                     'Precision': xgb_precision,
                     'Recall': xgb_recall,
                     'F1-Score': xgb_f1,
                     'ROC AUC': xgb_roc_auc,
                     'Den Score': DenScore})

# Evaluate CatBoost predictions
catboost_pred = predictions_df_majority['CatBoost_Pred']
catboost_accuracy = accuracy_score(y_test['target'], catboost_pred)
catboost_precision = precision_score(y_test['target'], catboost_pred)
catboost_recall = recall_score(y_test['target'], catboost_pred)
catboost_f1 = f1_score(y_test['target'], catboost_pred)
catboost_roc_auc = roc_auc_score(y_test['target'], catboost_pred)
DenScore = (catboost_accuracy + catboost_precision + catboost_recall + catboost_f1 + catboost_roc_auc) / 5

metrics_list.append({'Model': 'CatBoost',
                     'Accuracy': catboost_accuracy,
                     'Precision': catboost_precision,
                     'Recall': catboost_recall,
                     'F1-Score': catboost_f1,
                     'ROC AUC': catboost_roc_auc,
                     'Den Score': DenScore})

# Evaluate LightGBM predictions
lgbm_pred = predictions_df_majority['LightGBM_Pred']
lgbm_accuracy = accuracy_score(y_test['target'], lgbm_pred)
lgbm_precision = precision_score(y_test['target'], lgbm_pred)
lgbm_recall = recall_score(y_test['target'], lgbm_pred)
lgbm_f1 = f1_score(y_test['target'], lgbm_pred)
lgbm_roc_auc = roc_auc_score(y_test['target'], lgbm_pred)
DenScore = (lgbm_accuracy + lgbm_precision + lgbm_recall + lgbm_f1 + lgbm_roc_auc) / 5

metrics_list.append({'Model': 'LightGBM',
                     'Accuracy': lgbm_accuracy,
                     'Precision': lgbm_precision,
                     'Recall': lgbm_recall,
                     'F1-Score': lgbm_f1,
                     'ROC AUC': lgbm_roc_auc,
                     'Den Score': DenScore})

# Step 2: Convert metrics list to DataFrame
metrics = pd.DataFrame(metrics_list)

# Print the evaluation metrics for each model
print("Individual Model Results Evaluation Metrics:")
print(metrics)

# Calculate and print confusion matrices for each model
print("\nConfusion Matrices:")
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test['target'], xgb_pred))
print("CatBoost Confusion Matrix:\n", confusion_matrix(y_test['target'], catboost_pred))
print("LightGBM Confusion Matrix:\n", confusion_matrix(y_test['target'], lgbm_pred))

# Step 3: Print classification reports for each model
print("\nClassification Reports:")
print("XGBoost Classification Report:\n", classification_report(y_test['target'], xgb_pred))
print("CatBoost Classification Report:\n", classification_report(y_test['target'], catboost_pred))
print("LightGBM Classification Report:\n", classification_report(y_test['target'], lgbm_pred))

Individual Model Results Evaluation Metrics:
      Model  Accuracy  Precision    Recall  F1-Score   ROC AUC  Den Score
0   XGBoost  0.973585   0.786301  0.988532  0.875895  0.980281   0.920919
1  CatBoost  0.974789   0.799417  0.978037  0.879752  0.976244   0.921648
2  LightGBM  0.974094   0.790968  0.985777  0.877692  0.979327   0.921572

Confusion Matrices:
XGBoost Confusion Matrix:
 [[230404   6630]
 [   283  24395]]
CatBoost Confusion Matrix:
 [[230978   6056]
 [   542  24136]]
LightGBM Confusion Matrix:
 [[230605   6429]
 [   351  24327]]

Classification Reports:
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99    237034
           1       0.79      0.99      0.88     24678

    accuracy                           0.97    261712
   macro avg       0.89      0.98      0.93    261712
weighted avg       0.98      0.97      0.97    261712

CatBoost Classification Report:
               precision    recall

# Ensembled Results Feature Importance

In [117]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Get feature importances from each model
xgb_importance = models['XGBClassifier'].feature_importances_
cat_importance = models['CatBoostClassifier'].get_feature_importance()
lgb_importance = models['LGBMClassifier'].feature_importances_

# Step 2: Normalize the importances
scaler = MinMaxScaler()
xgb_importance = scaler.fit_transform(xgb_importance.reshape(-1, 1)).flatten()
cat_importance = scaler.fit_transform(cat_importance.reshape(-1, 1)).flatten()
lgb_importance = scaler.fit_transform(lgb_importance.reshape(-1, 1)).flatten()

# Step 3: Combine the importances (using average here)
avg_importance = (xgb_importance + cat_importance + lgb_importance) / 3

# Step 4: Create DataFrame for visualization
feature_names = x_train.columns  # Ensure that x_train has the feature names
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': avg_importance
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
16,Column18,0.743763
1,Column1,0.365753
3,Column3,0.247013
2,Column2,0.231806
4,Column4,0.209701
7,Column7,0.165368
5,Column5,0.147758
8,Column8,0.115455
13,Column15,0.08843
6,Column6,0.083055


# Ensembling Model Results on different Parameters

In [118]:
# Accuracy
accuracy = accuracy_score(y_test['target'], ensemble_pred)

# Precision
precision = precision_score(y_test['target'], ensemble_pred)

# Recall
recall = recall_score(y_test['target'], ensemble_pred)

# F1-Score
f1 = f1_score(y_test['target'], ensemble_pred)

roc_auc = roc_auc_score(y_test['target'], ensemble_pred)

DenScore = (accuracy + precision + recall + f1 + roc_auc) / 5

# Classification Report
classification_rep = classification_report(y_test['target'], ensemble_pred)

# Step 8: Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test['target'], ensemble_pred)

# Step 5: Output the results
print("Ensemble Model Performance:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Den Score: {DenScore}")

print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

Ensemble Model Performance:
Accuracy: 0.9742617839457113
Precision: 0.791588116752259
Recall: 0.986870897155361
F1-Score: 0.8785080441526585
ROC AUC: 0.9799099627823938
Den Score: 0.9222277609576766

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    237034
           1       0.79      0.99      0.88     24678

    accuracy                           0.97    261712
   macro avg       0.90      0.98      0.93    261712
weighted avg       0.98      0.97      0.98    261712


Confusion Matrix:
[[230622   6412]
 [   324  24354]]


In [119]:
# Step 1: Get predicted probabilities using the 3 models
xgb_prob = models['XGBClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
catboost_prob = models['CatBoostClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
lgbm_prob = models['LGBMClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1

# Step 2: Create a DataFrame with probabilities from all models
prob_df = pd.DataFrame({
    'XGBoost_Prob': xgb_prob,
    'CatBoost_Prob': catboost_prob,
    'LightGBM_Prob': lgbm_prob
})

# Step 3: Ensemble (take the mean of the probabilities)
prob_df['Ensemble_Prob'] = prob_df.mean(axis=1)

# Step 4: Convert averaged probabilities to binary classes using a threshold of 0.5
prob_df['Ensemble_Pred'] = (prob_df['Ensemble_Prob'] >= 0.5).astype(int)

# Step 5: Calculate evaluation metrics on the ensemble predictions
ensemble_pred = prob_df['Ensemble_Pred']

# Accuracy
accuracy = accuracy_score(y_test['target'], ensemble_pred)

# Precision
precision = precision_score(y_test['target'], ensemble_pred)

# Recall
recall = recall_score(y_test['target'], ensemble_pred)

# F1-Score
f1 = f1_score(y_test['target'], ensemble_pred)

# ROC AUC (for binary classification)
roc_auc = roc_auc_score(y_test['target'], prob_df['Ensemble_Prob'])

DenScore = (accuracy + precision + recall + f1 + roc_auc) / 5

# Classification Report
classification_rep = classification_report(y_test['target'], ensemble_pred)

# Step 8: Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test['target'], ensemble_pred)

# Step 6: Output the results
print("Ensemble Model Performance (Based on Probabilities):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Den Score: {DenScore}")
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

prob_df

Ensemble Model Performance (Based on Probabilities):
Accuracy: 0.9744757596136211
Precision: 0.7932225480612577
Recall: 0.9864656779317611
F1-Score: 0.879352694697298
ROC AUC: 0.9942574455701296
Den Score: 0.9255548251748135

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    237034
           1       0.79      0.99      0.88     24678

    accuracy                           0.97    261712
   macro avg       0.90      0.98      0.93    261712
weighted avg       0.98      0.97      0.98    261712


Confusion Matrix:
[[230688   6346]
 [   334  24344]]


Unnamed: 0,XGBoost_Prob,CatBoost_Prob,LightGBM_Prob,Ensemble_Prob,Ensemble_Pred
0,0.000003,3.187385e-08,0.000037,0.000013,0
1,0.000004,2.258158e-08,0.000039,0.000014,0
2,0.000005,3.735947e-09,0.000040,0.000015,0
3,0.000005,1.158518e-07,0.000043,0.000016,0
4,0.000004,8.352285e-07,0.000041,0.000015,0
...,...,...,...,...,...
261707,0.000001,2.290419e-06,0.000030,0.000011,0
261708,0.000004,5.223751e-09,0.000046,0.000017,0
261709,0.000004,2.567144e-09,0.000036,0.000013,0
261710,0.000004,2.680107e-09,0.000041,0.000015,0


In [120]:
# Step 1: Calculate accuracy for each model on the validation set
xgb_val_pred = models['XGBClassifier'].predict(x_val_imput)
catboost_val_pred = models['CatBoostClassifier'].predict(x_val_imput)
lgbm_val_pred = models['LGBMClassifier'].predict(x_val_imput)

xgb_accuracy = accuracy_score(y_val['target'], xgb_val_pred)
catboost_accuracy = accuracy_score(y_val['target'], catboost_val_pred)
lgbm_accuracy = accuracy_score(y_val['target'], lgbm_val_pred)

# Store accuracies
accuracies = np.array([xgb_accuracy, catboost_accuracy, lgbm_accuracy])

# Step 2: Normalize the accuracies to use as weights
normalized_weights = accuracies / accuracies.sum()

# Step 3: Get predicted probabilities using the 3 models
xgb_prob = models['XGBClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
catboost_prob = models['CatBoostClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
lgbm_prob = models['LGBMClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1

# Step 4: Create a DataFrame with probabilities from all models
prob_df = pd.DataFrame({
    'XGBoost_Prob': xgb_prob,
    'CatBoost_Prob': catboost_prob,
    'LightGBM_Prob': lgbm_prob
})

# Step 5: Ensemble (take the weighted mean of the probabilities)
prob_df['Ensemble_Prob'] = (prob_df['XGBoost_Prob'] * normalized_weights[0] +
                             prob_df['CatBoost_Prob'] * normalized_weights[1] +
                             prob_df['LightGBM_Prob'] * normalized_weights[2])

# Step 6: Convert averaged probabilities to binary classes using a threshold of 0.5
prob_df['Ensemble_Pred'] = (prob_df['Ensemble_Prob'] >= 0.5).astype(int)

# Step 7: Calculate evaluation metrics on the ensemble predictions
ensemble_pred = prob_df['Ensemble_Pred']

# Accuracy
accuracy = accuracy_score(y_test['target'], ensemble_pred)

# Precision
precision = precision_score(y_test['target'], ensemble_pred)

# Recall
recall = recall_score(y_test['target'], ensemble_pred)

# F1-Score
f1 = f1_score(y_test['target'], ensemble_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test['target'], prob_df['Ensemble_Prob'])

DenScore = (accuracy + precision + recall + f1 + roc_auc) / 5

# Classification Report
classification_rep = classification_report(y_test['target'], ensemble_pred)

# Step 8: Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test['target'], ensemble_pred)

# Step 8: Output the results
print("Ensemble Model Performance (Weighted by Accuracy):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Den Score: {DenScore}")
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

prob_df

Ensemble Model Performance (Weighted by Accuracy):
Accuracy: 0.9744757596136211
Precision: 0.7932225480612577
Recall: 0.9864656779317611
F1-Score: 0.879352694697298
ROC AUC: 0.9942574247137355
Den Score: 0.9255548210035347

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    237034
           1       0.79      0.99      0.88     24678

    accuracy                           0.97    261712
   macro avg       0.90      0.98      0.93    261712
weighted avg       0.98      0.97      0.98    261712


Confusion Matrix:
[[230688   6346]
 [   334  24344]]


Unnamed: 0,XGBoost_Prob,CatBoost_Prob,LightGBM_Prob,Ensemble_Prob,Ensemble_Pred
0,0.000003,3.187385e-08,0.000037,0.000013,0
1,0.000004,2.258158e-08,0.000039,0.000014,0
2,0.000005,3.735947e-09,0.000040,0.000015,0
3,0.000005,1.158518e-07,0.000043,0.000016,0
4,0.000004,8.352285e-07,0.000041,0.000015,0
...,...,...,...,...,...
261707,0.000001,2.290419e-06,0.000030,0.000011,0
261708,0.000004,5.223751e-09,0.000046,0.000017,0
261709,0.000004,2.567144e-09,0.000036,0.000013,0
261710,0.000004,2.680107e-09,0.000041,0.000015,0


In [121]:
# Step 1: Calculate F1 score for each model on the validation set
xgb_val_pred = models['XGBClassifier'].predict(x_val_imput)
catboost_val_pred = models['CatBoostClassifier'].predict(x_val_imput)
lgbm_val_pred = models['LGBMClassifier'].predict(x_val_imput)

# Calculate F1 scores
xgb_f1 = f1_score(y_val['target'], xgb_val_pred)
catboost_f1 = f1_score(y_val['target'], catboost_val_pred)
lgbm_f1 = f1_score(y_val['target'], lgbm_val_pred)

# Store F1 scores
f1_scores = np.array([xgb_f1, catboost_f1, lgbm_f1])

# Step 2: Normalize the F1 scores to use as weights
normalized_weights = f1_scores / f1_scores.sum()

# Step 3: Get predicted probabilities using the 3 models
xgb_prob = models['XGBClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
catboost_prob = models['CatBoostClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1
lgbm_prob = models['LGBMClassifier'].predict_proba(x_test_imput)[:, 1]  # Probability of class 1

# Step 4: Create a DataFrame with probabilities from all models
prob_df = pd.DataFrame({
    'XGBoost_Prob': xgb_prob,
    'CatBoost_Prob': catboost_prob,
    'LightGBM_Prob': lgbm_prob
})

# Step 5: Ensemble (take the weighted mean of the probabilities)
prob_df['Ensemble_Prob'] = (prob_df['XGBoost_Prob'] * normalized_weights[0] +
                             prob_df['CatBoost_Prob'] * normalized_weights[1] +
                             prob_df['LightGBM_Prob'] * normalized_weights[2])

# Step 6: Convert averaged probabilities to binary classes using a threshold of 0.5
prob_df['Ensemble_Pred'] = (prob_df['Ensemble_Prob'] >= 0.5).astype(int)

# Step 7: Calculate evaluation metrics on the ensemble predictions
ensemble_pred = prob_df['Ensemble_Pred']

# Accuracy
accuracy = accuracy_score(y_test['target'], ensemble_pred)

# Precision
precision = precision_score(y_test['target'], ensemble_pred)

# Recall
recall = recall_score(y_test['target'], ensemble_pred)

# F1-Score
f1 = f1_score(y_test['target'], ensemble_pred)

# ROC AUC
roc_auc = roc_auc_score(y_test['target'], prob_df['Ensemble_Prob'])

DenScore = (accuracy + precision + recall + f1 + roc_auc) / 5

# Classification Report
classification_rep = classification_report(y_test['target'], ensemble_pred)

# Step 8: Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test['target'], ensemble_pred)

# Step 9: Output the results
print("Ensemble Model Performance (Weighted by F1 Score):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Den Score: {DenScore}")
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

prob_df

Ensemble Model Performance (Weighted by F1 Score):
Accuracy: 0.9744757596136211
Precision: 0.7932225480612577
Recall: 0.9864656779317611
F1-Score: 0.879352694697298
ROC AUC: 0.9942573959934551
Den Score: 0.9255548152594786

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    237034
           1       0.79      0.99      0.88     24678

    accuracy                           0.97    261712
   macro avg       0.90      0.98      0.93    261712
weighted avg       0.98      0.97      0.98    261712


Confusion Matrix:
[[230688   6346]
 [   334  24344]]


Unnamed: 0,XGBoost_Prob,CatBoost_Prob,LightGBM_Prob,Ensemble_Prob,Ensemble_Pred
0,0.000003,3.187385e-08,0.000037,0.000013,0
1,0.000004,2.258158e-08,0.000039,0.000014,0
2,0.000005,3.735947e-09,0.000040,0.000015,0
3,0.000005,1.158518e-07,0.000043,0.000016,0
4,0.000004,8.352285e-07,0.000041,0.000015,0
...,...,...,...,...,...
261707,0.000001,2.290419e-06,0.000030,0.000011,0
261708,0.000004,5.223751e-09,0.000046,0.000017,0
261709,0.000004,2.567144e-09,0.000036,0.000013,0
261710,0.000004,2.680107e-09,0.000041,0.000015,0


# Best Pipeline Approach

In [122]:
class CustomVotingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, models):
        self.models = models  # Dictionary of model names and instances
        self.model_names = list(models.keys())

    def _convert_to_dataframe(self, X, y=None):
        """Helper function to convert input to DataFrame if it's not already one."""
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        if y is not None and not isinstance(y, pd.DataFrame):
            y = pd.DataFrame(y)
        return X, y

    def fit(self, X, y):
        
        X, y = self._convert_to_dataframe(X, y)
            
        # Fit each model
        for model in self.models.values():
            model.fit(X, y)
            
        self.fitted_ = True 
        return self

    def predict(self, X):
        # Convert X to DataFrame if needed
        X, _ = self._convert_to_dataframe(X)

        # Get predictions from each model
        predictions = np.array([model.predict(X) for model in self.models.values()])
        
        # Use majority voting
        return np.round(np.mean(predictions, axis=0)).astype(int)

    def predict_proba(self, X):
        # Convert X to DataFrame if needed
        X, _ = self._convert_to_dataframe(X)

        # Get probabilities from each model
        probabilities = np.array([model.predict_proba(X) for model in self.models.values()])
        
        # Average probabilities
        return np.mean(probabilities, axis=0)


In [123]:
# Function to drop specified columns
def drop_columns(X):
    return X.drop(columns=['ID', 
                           'Column9'
                           ])

# Create the FunctionTransformer
drop_columns_transformer = FunctionTransformer(drop_columns)

In [124]:
imputer = SimpleImputer(strategy='median')
sc = StandardScaler()

In [125]:
# Model parameters
xgb_params = {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1}
lightbgm_params = {'lambda_l1': 3.962980604776118, 'lambda_l2': 1.5075196212131254, 'num_leaves': 90, 'feature_fraction': 0.9675647296514234, 'bagging_fraction': 0.9979919873256766, 'bagging_freq': 2, 'min_child_samples': 63}
catboost_params = {
    'iterations': 700,
    'learning_rate': 0.12851432422429102,
    'l2_leaf_reg': 0.8373406120034129,
    'border_count': 171,
    'bagging_temperature': 22.360639700244604,
    'random_strength': 1.3991869546371338,
    'boosting_type': 'Plain',
    'depth': 9,
    'grow_policy': 'SymmetricTree',
    'eval_metric': 'F1', 
}

# Initialize models
models = {
    'XGBClassifier': XGBClassifier(**xgb_params),
    # 'XGBClassifier': XGBClassifier(),
    'CatBoostClassifier': CatBoostClassifier(**catboost_params, verbose=False),
    # 'CatBoostClassifier': CatBoostClassifier(verbose=False),
    'LGBMClassifier': LGBMClassifier(**lightbgm_params, verbose=-1)
    # 'LGBMClassifier': LGBMClassifier(verbose=-1)
}

In [126]:
# Assuming your models are stored in a dictionary called 'models'
final_models = {
    'XGBClassifier': models['XGBClassifier'],    # Replace with your actual XGBClassifier object
    'CatBoostClassifier': models['CatBoostClassifier'],  # Replace with your actual CatBoostClassifier object
    'LGBMClassifier': models['LGBMClassifier']   # Replace with your actual LGBMClassifier object
}

In [127]:
# Instantiate the custom voting classifier
voting_clf = CustomVotingClassifier(models=final_models)

# # Convert the dictionary to a list of tuples
# estimators_list = [(name, model) for name, model in final_models.items()]
# voting_clf = VotingClassifier(estimators=estimators_list)

In [128]:
x_train_final = pd.read_csv("X_Train_Data_Input.csv")
y_train_final = pd.read_csv("Y_Train_Data_Target.csv")
x_train_final, x_val_final, y_train_final, y_val_final = train_test_split(x_train_final, y_train_final, test_size=0.2, random_state=42, stratify=y_train_final['target'])

x_train_final.reset_index(drop=True, inplace=True)
x_val_final.reset_index(drop=True, inplace=True)
y_train_final.reset_index(drop=True, inplace=True)
y_val_final.reset_index(drop=True, inplace=True)

x_test_final = pd.read_csv("X_Test_Data_Input.csv")
y_test_final = pd.read_csv("Y_Test_Data_Target.csv")

In [129]:
# Pipeline
pipeline = Pipeline([
    ('drop_columns', drop_columns_transformer),  # Drop specified columns
    ('imputer', imputer),                        # Impute missing values
    ('scaler', sc),                             # Scale the data
    ('voting', voting_clf)                      # Train the voting classifier
])

In [130]:
# X_train, y_train are your training data and target variable
pipeline.fit(x_train_final, y_train_final.iloc[:, -1])

In [131]:
# Assuming 'pipeline' is your pipeline object
with open('pipeline.pkl', 'wb') as f:
    dill.dump(pipeline, f)

In [132]:
with open('pipeline.pkl', 'rb') as f:
    loaded = dill.load(f)

In [133]:
y_pred_final = loaded.predict(x_val_final)
y_pred_final

array([0, 0, 0, ..., 1, 0, 0])

In [134]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_val_final['target'], y_pred_final)
precision = precision_score(y_val_final['target'], y_pred_final)
recall = recall_score(y_val_final['target'], y_pred_final)
f1 = f1_score(y_val_final['target'], y_pred_final)
roc_auc = roc_auc_score(y_val_final['target'], y_pred_final)

conf_matrix = confusion_matrix(y_val_final['target'], y_pred_final)
class_report = classification_report(y_val_final['target'], y_pred_final)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC Score: {roc_auc}")

Accuracy: 0.9786788259343935
Precision: 0.8546140991520703
Recall: 0.9325319105828325
F1 Score: 0.8918744348275417
ROC AUC Score: 0.9580076231299762


In [135]:
print("\nConfusion Matrix:")
conf_matrix


Confusion Matrix:


array([[139871,   2349],
       [   999,  13808]], dtype=int64)

In [136]:
print("\nClassification Report:")
print(class_report)


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    142220
           1       0.85      0.93      0.89     14807

    accuracy                           0.98    157027
   macro avg       0.92      0.96      0.94    157027
weighted avg       0.98      0.98      0.98    157027



In [137]:
y_pred_test_final = loaded.predict(x_test_final)
y_pred_test_final

array([0, 0, 0, ..., 0, 0, 0])

In [138]:
# Calculate evaluation metrics
accuracy_test = accuracy_score(y_test_final['target'], y_pred_test_final)
precision_test = precision_score(y_test_final['target'], y_pred_test_final)
recall_test = recall_score(y_test_final['target'], y_pred_test_final)
f1_test = f1_score(y_test_final['target'], y_pred_test_final)
roc_auc_test = roc_auc_score(y_test_final['target'], y_pred_test_final)

conf_matrix_test = confusion_matrix(y_test_final['target'], y_pred_test_final)
class_report_test = classification_report(y_test_final['target'], y_pred_test_final)

# Print the evaluation results
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1 Score: {f1_test}")
print(f"ROC AUC Score: {roc_auc_test}")

Accuracy: 0.9785680442623953
Precision: 0.8503656340719509
Recall: 0.937717805332685
F1 Score: 0.8919080380027365
ROC AUC Score: 0.9602694176135653


In [139]:
print("\nConfusion Matrix:")
conf_matrix_test


Confusion Matrix:


array([[232962,   4072],
       [  1537,  23141]], dtype=int64)

In [140]:
print("\nClassification Report:")
print(class_report_test)


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    237034
           1       0.85      0.94      0.89     24678

    accuracy                           0.98    261712
   macro avg       0.92      0.96      0.94    261712
weighted avg       0.98      0.98      0.98    261712



In [141]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Get feature importances from each model
xgb_importance = loaded.named_steps['voting'].models['XGBClassifier'].feature_importances_
cat_importance = loaded.named_steps['voting'].models['CatBoostClassifier'].get_feature_importance()
lgb_importance = loaded.named_steps['voting'].models['LGBMClassifier'].feature_importances_

# Step 2: Normalize the importances
scaler = MinMaxScaler()
xgb_importance = scaler.fit_transform(xgb_importance.reshape(-1, 1)).flatten()
cat_importance = scaler.fit_transform(cat_importance.reshape(-1, 1)).flatten()
lgb_importance = scaler.fit_transform(lgb_importance.reshape(-1, 1)).flatten()

# Step 3: Combine the importances (using average here)
avg_importance = (xgb_importance + cat_importance + lgb_importance) / 3

# Step 4: Create DataFrame for visualization
columns = ['Column0', 'Column1', 'Column2', 'Column3', 'Column4', 'Column5',
       'Column6', 'Column7', 'Column8', 'Column10', 'Column11', 'Column12', 'Column13',
       'Column14', 'Column15', 'Column16', 'Column17', 'Column18', 'Column19',
       'Column20', 'Column21']

feature_names = columns  # Ensure that x_train has the feature names
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': avg_importance
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
17,Column18,0.72569
1,Column1,0.364509
2,Column2,0.298605
7,Column7,0.186993
8,Column8,0.155464
13,Column14,0.122809
3,Column3,0.109014
4,Column4,0.105901
6,Column6,0.075759
5,Column5,0.071232


In [142]:
# Different cases for dropped columns
column_cases = {
    'exclude_both': ['ID', 'Column9', 'Column14'],
    'include_column9': ['ID', 'Column14'],
    'include_column14': ['ID', 'Column9'],
    'include_both': ['ID']
}

# Function to dynamically drop columns based on the case
def drop_columns_by_case(X, columns):
    return X.drop(columns=columns)

# Different classifier setups (CustomVotingClassifier vs VotingClassifier)
voting_cases = {
    'custom_voting_param': CustomVotingClassifier(models=final_models),  # Parameterized
    'custom_voting_non_param': CustomVotingClassifier(models=models),    # Non-parameterized
    'sklearn_voting_param': VotingClassifier(estimators=list(final_models.items()), voting='soft'),
    'sklearn_voting_non_param': VotingClassifier(estimators=list(models.items()), voting='soft')
}

# Define a dictionary to store the results for each case
results = {}


In [143]:
# Function to evaluate model
def evaluate_model(y_true, y_pred, y_proba):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    den_score = (f1 + roc_auc + balanced_acc) / 3
    
    log_loss_value = log_loss(y_true, y_proba)
    
    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'den_score': den_score,
        'balanced_accuracy': balanced_acc,
        'log_loss': log_loss_value,
        'conf_matrix': conf_matrix,
        'class_report': class_report
    }

# Loop through each combination of column case and voting classifier
for column_case, columns_to_drop in column_cases.items():
    for voting_case, classifier in voting_cases.items():
        # Create pipeline for each case
        drop_columns_transformer_case = FunctionTransformer(lambda X: drop_columns_by_case(X, columns_to_drop))
        
        pipeline = Pipeline([
            ('drop_columns', drop_columns_transformer_case),  # Drop specified columns
            ('imputer', imputer),                            # Impute missing values
            ('scaler', sc),                                  # Scale the data
            ('voting', classifier)                           # Use the appropriate voting classifier
        ])
        
        # Train the pipeline
        pipeline.fit(x_train_final, y_train_final.iloc[:, -1])
        
        # Make predictions
        y_pred_test_final = pipeline.predict(x_test_final)
        
        y_proba_test_final = pipeline.predict_proba(x_test_final)
        
        # Evaluate and store the results
        result = evaluate_model(y_test_final['target'], y_pred_test_final, y_proba_test_final)
        results[f'{column_case}_{voting_case}'] = result

In [144]:
best_case = max(results.items(), key=lambda item: item[1]['den_score'])

print(f"Best case: {best_case[0]}")
print(f"Accuracy: {best_case[1]['accuracy']}")
print(f"Precision: {best_case[1]['precision']}")
print(f"Recall: {best_case[1]['recall']}")
print(f"F1 Score: {best_case[1]['f1']}")
print(f"ROC AUC Score: {best_case[1]['roc_auc']}")
print(f"Balanced Accuracy Score: {best_case[1]['balanced_accuracy']}")
print(f"Den Score: {best_case[1]['den_score']}")
print(f"Log Loss: {best_case[1]['log_loss']}")
print("\nConfusion Matrix:")
print(best_case[1]['conf_matrix'])
print("\nClassification Report:")
print(best_case[1]['class_report'])

Best case: include_column14_custom_voting_param
Accuracy: 0.9785680442623953
Precision: 0.8503656340719509
Recall: 0.937717805332685
F1 Score: 0.8919080380027365
ROC AUC Score: 0.9602694176135653
Balanced Accuracy Score: 0.9602694176135653
Den Score: 0.9374822910766224
Log Loss: 0.049076202121271076

Confusion Matrix:
[[232962   4072]
 [  1537  23141]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    237034
           1       0.85      0.94      0.89     24678

    accuracy                           0.98    261712
   macro avg       0.92      0.96      0.94    261712
weighted avg       0.98      0.98      0.98    261712

