### XGBoosting (model 3)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, classification_report, f1_score, fbeta_score, make_scorer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.compose import make_column_transformer

from xgboost import XGBClassifier



pd.set_option('display.float_format', lambda x: '%.2f' % x)
RSEED = 42

In [2]:
df = pd.read_csv('data/cleaned/train.csv')
df_test = pd.read_csv('data/cleaned/test.csv')
df.head()


Unnamed: 0,district,client_id,client_catg,region,target,reading_remarque,counter_coeff,consommation_per_month,months_number,elec,gaz
0,63,train_Client_33962,11,101,0.0,6.0,1.0,113.27,4.0,1,1
1,62,train_Client_32174,11,301,0.0,6.0,1.0,8.11,3.4,1,1
2,69,train_Client_18868,11,107,0.0,6.0,1.0,70.77,4.0,1,1
3,62,train_Client_39728,11,310,0.0,6.0,1.0,144.65,4.11,1,0
4,60,train_Client_34246,11,101,0.0,6.0,1.0,120.93,3.96,1,1


In [3]:
y_train = df['target']
X_train = df.drop(['target', 'client_id'], axis=1)

In [None]:
# Helper: Check specific columns:
X_train.iloc[:,5].sort_values(ascending=False).nunique() # Not sure why we need this line of code

105312

In [5]:
#Split target from test data
y_test = df_test['target']
X_test = df_test.drop(['target'], axis=1)

In [6]:
# Helper (compare X_train and X_test below):
X_train.head(2)

Unnamed: 0,district,client_catg,region,reading_remarque,counter_coeff,consommation_per_month,months_number,elec,gaz
0,63,11,101,6.0,1.0,113.27,4.0,1,1
1,62,11,301,6.0,1.0,8.11,3.4,1,1


In [7]:
# Helper (compare X_train and X_test):
X_test.head(2)

Unnamed: 0,district,client_catg,region,reading_remarque,counter_coeff,consommation_per_month,months_number,elec,gaz
0,69,11,104,8.0,1.0,52.0,4.0,1,0
1,62,11,301,6.0,1.0,117.75,4.0,1,0


In [12]:
y_train.unique()

array([0., 1.])

In [13]:
model3 = XGBClassifier(
    n_estimators=4000,
    learning_rate=0.01,  # Reduced learning rate
    max_depth=3,
    objective='binary:logistic',
    random_state=RSEED,
    scale_pos_weight=sum(y_train == 0) / sum(y_train == 1),
    gamma=0.1,  # Adjust gamma for regularization
    reg_lambda=1,  # Adjust reg_lambda for L2 regularization
    reg_alpha=0,  # Adjust reg_alpha for L1 regularization
)

In [None]:
# run took 14m14s
model3.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=500
)

[0]	validation_0-logloss:0.69108
[500]	validation_0-logloss:0.45959
[1000]	validation_0-logloss:0.42734
[1500]	validation_0-logloss:0.41983
[2000]	validation_0-logloss:0.41660
[2500]	validation_0-logloss:0.41848
[3000]	validation_0-logloss:0.42139
[3500]	validation_0-logloss:0.42046
[3999]	validation_0-logloss:0.41909


#### Evaluation

In [16]:
# Make probability predictions for X_train
train_probs3 = model3.predict_proba(X_train)[:, 1]
train_predictions3 = model3.predict(X_train)

In [18]:
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs3)}')

Train ROC AUC Score: 0.7281352847389368


In [19]:
print(confusion_matrix(y_train, train_predictions3))
print(classification_report(y_train, train_predictions3))

[[80184 46204]
 [ 2333  5219]]
              precision    recall  f1-score   support

         0.0       0.97      0.63      0.77    126388
         1.0       0.10      0.69      0.18      7552

    accuracy                           0.64    133940
   macro avg       0.54      0.66      0.47    133940
weighted avg       0.92      0.64      0.73    133940



In [None]:
# Make probability predictions test data
# Run took 1m13s
test_probs3 = model3.predict_proba(X_test)[:, 1]
test_predictions3 = model3.predict(X_test)

In [21]:
print(f'Test ROC AUC Score: {roc_auc_score(y_test, test_probs3)}')

Test ROC AUC Score: 0.5727353117401761


In [22]:
print(confusion_matrix(y_test, test_predictions3))
print(classification_report(y_test, test_predictions3))

[[952927  77582]
 [ 74320  13893]]
              precision    recall  f1-score   support

         0.0       0.93      0.92      0.93   1030509
         1.0       0.15      0.16      0.15     88213

    accuracy                           0.86   1118722
   macro avg       0.54      0.54      0.54   1118722
weighted avg       0.87      0.86      0.87   1118722



---

### try again

In [None]:
ros = RandomOverSampler(random_state=RSEED)
X_train, y_train = ros.fit_resample(X_train, y_train)