In [51]:
import pandas as pd
import numpy as np
from scipy import stats
import csv
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

# XGBoost

In [52]:
# Read the labelled training data and the unlabelled competition test data,
# both indexed by their "Id" column.
campaign_ad, campaign_test = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("MLUnige2023_subscriptions_train.csv",
                     "MLUnige2023_subscriptions_test.csv")
)

# Separate the target column from the predictors.
y = campaign_ad['subscription']
X = campaign_ad.drop(columns='subscription')

In [53]:
# Columns treated as numeric; every other predictor is handled as categorical.
num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']
# Keep the DataFrame column order so the list is deterministic across runs
# (a set difference has no stable order).
cat_vars = [col for col in X.columns if col not in num_vars]

for col in cat_vars:
    X[col] = X[col].astype("category")
    # Reuse the training dtype so both frames share one category -> integer-code
    # mapping. Casting each frame independently can assign different codes to the
    # same label when the sets of observed values differ. Values never seen in
    # training become NaN here.
    campaign_test[col] = campaign_test[col].astype(X[col].dtype)

In [54]:
# 70 % of the rows go to training; the remaining 30 % is halved into a
# validation set and a test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=12
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=46
)

In [55]:
# Base XGBoost classifier used as the estimator of the hyper-parameter search.
# enable_categorical=True is set because the categorical predictors are passed
# as pandas "category" columns rather than being encoded numerically.
xgb_clf = XGBClassifier(tree_method="hist",
                        objective="binary:logistic",
                        eval_metric='auc',
                        enable_categorical=True,
                        n_jobs=-2,
                        random_state=391)

# Search space: integer ranges are sampled from their listed values,
# stats.uniform(loc, scale) samples from [loc, loc + scale].
param_search = {'n_estimators': range(50, 301),
                'max_depth': range(1, 8),
                'learning_rate': stats.uniform(.001, .3), # [.001, .301]
                'colsample_bytree': stats.uniform(.5,.5), # [.5, 1]
                'gamma': stats.uniform(0, .5),            # [0, .5]
                'scale_pos_weight':[1]}

# Randomized search: 250 sampled candidates, each evaluated with shuffled,
# stratified 5-fold cross-validation (1250 fits in total).
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by the
# estimator's default score (accuracy) rather than by the 'auc' eval_metric - confirm.
model = RandomizedSearchCV(xgb_clf,
                           param_distributions=param_search,
                           n_iter=250,
                           return_train_score=True,
                           verbose=1,
                           cv= StratifiedKFold(n_splits=5, shuffle=True, random_state=611),
                           random_state=547)

# The search only sees the training split; validation and test rows stay untouched.
model.fit(X_train, y_train)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


In [56]:
# Keep the best parameter combination found by the randomized search and display it.
best_params=model.best_params_
best_params

{'colsample_bytree': 0.9605195575049101,
 'gamma': 0.2780067362138148,
 'learning_rate': 0.16146905484916832,
 'max_depth': 6,
 'n_estimators': 114,
 'scale_pos_weight': 1}

In [57]:
# Full cross-validation log of the search: one row per sampled candidate,
# including per-fold train scores (return_train_score=True).
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_learning_rate,param_max_depth,param_n_estimators,param_scale_pos_weight,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.223724,0.021920,0.013715,0.002283,0.931867,0.128518,0.274064,5,169,1,...,0.856847,0.012540,125,0.997207,0.995811,0.995412,0.996210,0.995013,0.995930,0.000753
1,0.242867,0.005849,0.014362,0.000634,0.654448,0.264387,0.112567,5,179,1,...,0.861317,0.013400,17,0.961093,0.969280,0.963096,0.958308,0.956314,0.961618,0.004478
2,0.268770,0.017404,0.014558,0.000728,0.655366,0.080637,0.119778,4,225,1,...,0.860200,0.014343,39,0.948324,0.952723,0.948534,0.947137,0.952124,0.949769,0.002228
3,0.245334,0.007157,0.014406,0.000724,0.730592,0.484692,0.118224,7,124,1,...,0.858125,0.013813,87,0.976656,0.977459,0.979852,0.977259,0.979254,0.978096,0.001233
4,0.213474,0.015741,0.013748,0.000614,0.587381,0.407906,0.061792,2,261,1,...,0.850306,0.016222,193,0.873703,0.872531,0.868741,0.869739,0.863754,0.869694,0.003473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.144516,0.006347,0.016439,0.001112,0.766438,0.106116,0.172644,1,111,1,...,0.841527,0.015319,231,0.851955,0.851386,0.847397,0.846998,0.844804,0.848508,0.002735
246,0.343685,0.023394,0.019625,0.001415,0.932065,0.100493,0.098399,3,231,1,...,0.857647,0.014725,102,0.911413,0.914821,0.917016,0.906443,0.907441,0.911427,0.004086
247,0.987697,0.032408,0.028742,0.001212,0.511207,0.014147,0.009678,7,272,1,...,0.856530,0.013220,128,0.925778,0.922801,0.920806,0.920207,0.922601,0.922439,0.001947
248,0.284875,0.006795,0.019171,0.000755,0.927412,0.100206,0.173574,5,112,1,...,0.856848,0.012093,124,0.975459,0.977858,0.976860,0.970277,0.976661,0.975423,0.002683


In [None]:
# Parameters obtained from a search run with cv=3 and n_iter=1000.
# Recorded for reference only: this dict is displayed, not assigned to a variable.
{'colsample_bytree': 0.7231233684423828,
 'gamma': 0.246830272707522,
 'learning_rate': 0.2818210861049318,
 'max_depth': 4,
 'n_estimators': 243,
 'scale_pos_weight': 1}

In [54]:
# Alternative XGBoost parameter set with rounded values.
# NOTE(review): the name suggests it came from a grid search - confirm provenance.
params_s_grid = dict(
    colsample_bytree=1,
    gamma=0.1,
    learning_rate=0.19,
    max_depth=4,
    n_estimators=150,
    scale_pos_weight=1,
)

In [55]:
# Alternative XGBoost parameter set kept for reference.
# NOTE(review): presumably the winner of an earlier search - confirm provenance.
params_s_best = dict(
    colsample_bytree=0.8480431984105623,
    gamma=0.43302507108794236,
    learning_rate=0.05825570698271222,
    max_depth=5,
    n_estimators=277,
    scale_pos_weight=1,
)

In [8]:
# Mean cross-validated score of the best candidate found by the search.
model.best_score_

0.863071976538604

In [58]:
# Rebuild a classifier from the tuned hyper-parameters plus the fixed settings
# used during the search, then fit it on the training split only.
fixed_settings = {
    'tree_method': 'hist',
    'objective': "binary:logistic",
    'eval_metric': 'auc',
    'enable_categorical': True,
}
xgb_model = XGBClassifier(**best_params, **fixed_settings)

xgb_model.fit(X_train, y_train)

In [59]:
# Predicted probability of the positive class (column 1) on the training and
# validation splits.
train_pred, valid_pred = (
    xgb_model.predict_proba(features)[:, 1] for features in (X_train, X_valid)
)

In [60]:
# Coarse scan of decision cutoffs (0.10 to 0.90 in steps of 0.05): accuracy on
# the training and validation splits for each cutoff, rounded to 3 decimals.
cutoff_table = pd.DataFrame({'cutoff': 1e-2*np.arange(10, 95, 5)})
for column, truth, proba in (('train_acc', y_train, train_pred),
                             ('valid_acc', y_valid, valid_pred)):
    scores = []
    for cutoff in cutoff_table['cutoff']:
        labels = (proba > cutoff).astype(int)
        scores.append(np.round(accuracy_score(truth, labels), 3))
    cutoff_table[column] = scores
cutoff_table

Unnamed: 0,cutoff,train_acc,valid_acc
0,0.1,0.863,0.804
1,0.15,0.899,0.82
2,0.2,0.922,0.832
3,0.25,0.94,0.838
4,0.3,0.955,0.841
5,0.35,0.965,0.844
6,0.4,0.974,0.842
7,0.45,0.978,0.84
8,0.5,0.98,0.843
9,0.55,0.981,0.836


In [62]:
# Finer scan of decision cutoffs (0.30 to 0.54 in steps of 0.01): accuracy on
# the training and validation splits for each cutoff, rounded to 4 decimals.
cutoff_table = pd.DataFrame({'cutoff': 1e-2*np.arange(30, 55, 1)})
for column, truth, proba in (('train_acc', y_train, train_pred),
                             ('valid_acc', y_valid, valid_pred)):
    scores = []
    for cutoff in cutoff_table['cutoff']:
        labels = (proba > cutoff).astype(int)
        scores.append(np.round(accuracy_score(truth, labels), 4))
    cutoff_table[column] = scores
cutoff_table

Unnamed: 0,cutoff,train_acc,valid_acc
0,0.3,0.955,0.8407
1,0.31,0.9582,0.8414
2,0.32,0.9606,0.8414
3,0.33,0.9623,0.8436
4,0.34,0.9641,0.8436
5,0.35,0.9652,0.8444
6,0.36,0.9674,0.8414
7,0.37,0.9687,0.8429
8,0.38,0.971,0.8429
9,0.39,0.9727,0.8414


Choosing cutoff

In [67]:
# Decision cutoff applied to the predicted positive-class probabilities.
# The identifier's spelling ("treshold") is kept because later cells reference it.
treshold = 0.5

In [68]:
# Classify the held-out test split at the chosen cutoff and report its accuracy.
# NOTE(review): the recorded output (0.976) is far above the validation accuracy
# at the same cutoff (0.843) and close to the training accuracy, which suggests
# the model may already have been refit on all of X when this cell ran - re-check
# on a fresh kernel.
test_proba = xgb_model.predict_proba(X_test)[:, 1]
y_test_pred = (test_proba > treshold).astype(int)
accuracy_score(y_test, y_test_pred)

0.9761727475800447

In [69]:
# Refit the same classifier object on the full labelled data, then label the
# competition test set at the chosen cutoff.
# NOTE: after this cell `xgb_model` has seen every labelled row, so re-running
# the earlier evaluation cells would report optimistic scores.
xgb_model.fit(X, y)
submission_proba = xgb_model.predict_proba(campaign_test)[:, 1]
y_REAL_test = (submission_proba > treshold).astype(int)

In [70]:
# Accuracy of the current model on the full labelled data X.
# When run after the refit on (X, y) this is a training accuracy, not an
# estimate of generalisation.
full_proba = xgb_model.predict_proba(X)[:, 1]
y_pred = (full_proba > treshold).astype(int)
accuracy_score(y, y_pred)

0.9729669347631814

In [27]:
# Write the submission file: one row per competition test observation with its
# Id and the predicted label.
# - The `with` block closes the file even if writing fails.
# - newline='' is what the csv module expects; without it blank lines appear
#   between rows on Windows.
# - Ids come from the test frame's "Id" index instead of a positional counter,
#   so the file stays correct when the Ids are not exactly 0..n-1.
with open('test_file_xgboost.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'subscription'])
    writer.writerows(zip(campaign_test.index, y_REAL_test))

# Missing values imputation 

XGBoost does not need categorical variables to be encoded numerically, so we keep the columns in their original form. The imputation is therefore redone here: the resulting train, validation and test datasets keep their categories as strings, and no dummy variables are created.

In [None]:
# Columns treated as numeric in this section.
num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']

# Reload both raw files (indexed by "Id") so the imputation starts from
# untouched data.
campaign_ad, campaign_test = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("MLUnige2023_subscriptions_train.csv",
                     "MLUnige2023_subscriptions_test.csv")
)

In [None]:
# Separate target and predictors, then split with the same seeds as the
# modelling section: 70 % train, 15 % validation, 15 % test.
y = campaign_ad['subscription']
X = campaign_ad.drop(columns='subscription')

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=12
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=46
)

In [None]:
# Encoder for the 'device' labels with a fixed category order, so that
# 'smartphone' maps to 0 and 'desktop' maps to 1 (later cells map 0/1 back to these labels).
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])

In [None]:
# Build the device-imputation data from the training split:
#   X_tr_imp_dev - rows whose device is 'na' (to be predicted)
#   X_tr_dev / y_tr_dev - rows with a known device and their encoded labels
tr_base = X_train.drop(columns=['job', 'education'])
tr_missing = tr_base['device'] == 'na'

X_tr_imp_dev = tr_base[tr_missing].drop(columns='device')
tr_known = tr_base[~tr_missing]  # this is the data set that does not have NAs

X_tr_dev = tr_known.drop(columns='device')
# Flatten the (n, 1) encoder output into a 1-D label vector.
y_tr_dev = enc.fit_transform(tr_known[['device']]).ravel()

In [None]:
# Same partition for the validation split: rows with device 'na' versus rows
# with a known device (features plus encoded labels).
v_base = X_valid.drop(columns=['job', 'education'])
v_missing = v_base['device'] == 'na'

X_v_imp_dev = v_base[v_missing].drop(columns='device')
v_known = v_base[~v_missing]

X_v_dev = v_known.drop(columns='device')
y_v_dev = enc.fit_transform(v_known[['device']])

In [None]:
# Same partition for the test split: rows with device 'na' versus rows with a
# known device (features plus encoded labels).
te_base = X_test.drop(columns=['job', 'education'])
te_missing = te_base['device'] == 'na'

X_te_imp_dev = te_base[te_missing].drop(columns='device')
te_known = te_base[~te_missing]

X_te_dev = te_known.drop(columns='device')
y_te_dev = enc.fit_transform(te_known[['device']])

In [None]:
# Random forest used to predict the missing 'device' values.
rfc_dev = RandomForestClassifier(n_estimators=100, random_state=59, n_jobs=-2)

# One-hot encode 'marital' and 'outcome_old'; all other columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), ['marital', 'outcome_old'])],
    remainder='passthrough',
)

# Preprocessing followed by the classifier, applied as one estimator.
device_imputation = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("rfc_dev", rfc_dev),
])

In [None]:
# Fit the device-imputation pipeline on the training rows whose device is known.
device_imputation.fit(X_tr_dev, y_tr_dev)

In [None]:
# Predict the encoded device (0 / 1) for the rows with a missing device in
# each split.
y_tr_imp_dev, y_v_imp_dev, y_te_imp_dev = (
    device_imputation.predict(missing_rows)
    for missing_rows in (X_tr_imp_dev, X_v_imp_dev, X_te_imp_dev)
)

In [None]:
# Write the predicted device codes back into the rows that had 'na'.
for frame, missing_rows, imputed_codes in (
    (X_train, X_tr_imp_dev, y_tr_imp_dev),
    (X_valid, X_v_imp_dev, y_v_imp_dev),
    (X_test, X_te_imp_dev, y_te_imp_dev),
):
    frame.loc[missing_rows.index, 'device'] = imputed_codes

In [None]:
# Turn the imputed numeric codes back into the original string labels.
for frame in (X_train, X_valid, X_test):
    for code, label in ((0, 'smartphone'), (1, 'desktop')):
        frame.loc[frame['device'] == code, 'device'] = label

In [None]:
# Replace the 'na' placeholder in 'job' and 'education' with the most frequent value.
# The imputer is fitted on the training split only and then applied unchanged to
# validation and test. Refitting on each split would fill every split with its
# own mode and let held-out data influence its own preprocessing.
imputer = SimpleImputer(missing_values="na", strategy='most_frequent')
X_train[['job', 'education']] = imputer.fit_transform(X_train[['job', 'education']])
X_valid[['job', 'education']] = imputer.transform(X_valid[['job', 'education']])
X_test[['job', 'education']]  = imputer.transform(X_test[['job', 'education']])

In [None]:
# Every non-numeric predictor is categorical. Keep the DataFrame column order so
# the list is deterministic across runs (a set difference has no stable order).
cat_vars = [col for col in X_train.columns if col not in num_vars]

for col in cat_vars:
    X_train[col] = X_train[col].astype("category")
    # Reuse the training dtype so all splits share one category -> integer-code
    # mapping. Casting each split independently can assign different codes to the
    # same label when a split lacks some value. Values never seen in training
    # become NaN here.
    train_dtype = X_train[col].dtype
    X_valid[col] = X_valid[col].astype(train_dtype)
    X_test[col] = X_test[col].astype(train_dtype)

In [None]:
# Apply the same imputation steps to the competition test set.

# Split the rows by whether 'device' is missing, as done for the other splits.
X_campaign_test = campaign_test.drop(columns=['job', 'education'])
X_imp_campaign_test = X_campaign_test[X_campaign_test['device'] == 'na'].drop(columns='device')
X_campaign_test = X_campaign_test[X_campaign_test['device'] != 'na']
y_campaign_test = X_campaign_test[['device']]
X_campaign_test = X_campaign_test.drop(columns='device')
y_campaign_test = enc.fit_transform(y_campaign_test)

# Predict the missing devices with the pipeline fitted on the training split,
# write the codes back and decode them into the string labels.
y_imp_campaign_test = device_imputation.predict(X_imp_campaign_test)
campaign_test.loc[X_imp_campaign_test.index, 'device'] = y_imp_campaign_test
campaign_test.loc[campaign_test['device'] == 0, 'device'] = 'smartphone'
campaign_test.loc[campaign_test['device'] == 1, 'device'] = 'desktop'

# NOTE(review): the imputer is refitted on the competition test set here, so
# 'job' / 'education' are filled with this set's own most frequent values rather
# than those learned from training data - confirm this is intended.
campaign_test[['job', 'education']] = imputer.fit_transform(campaign_test[['job', 'education']])

for col in cat_vars:
    # Cast with the training split's categorical dtype so the integer codes match
    # the ones the model is trained on. An independent astype("category") can
    # assign different codes when the sets of observed values differ. Values never
    # seen in training become NaN here.
    campaign_test[col] = campaign_test[col].astype(X_train[col].dtype)