In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import csv

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

# XGBoost

In [2]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

In [3]:
num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']
cat_vars = list(set(X.columns).difference(num_vars))

for col in cat_vars:
  X[col] = X[col].astype("category")
  campaign_test[col] = campaign_test[col].astype("category")

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=46)

In [5]:
# xgb_clf = XGBClassifier(tree_method="hist", enable_categorical=True)
# param_search = {'n_estimators': [150, 200, 250],
#                 'learning_rate': [0.16, 0.17, 0.18, 0.19],
#                 'max_depth': range(2, 5),
#                 'colsample_bytree': [1],
#                 'gamma': [i/10.0 for i in range(1, 4)],
#                 'scale_pos_weight':[.7, .8, 1]}
# model = GridSearchCV(xgb_clf, scoring='accuracy', param_grid=param_search, return_train_score=True, verbose=1, cv=3)
# model.fit(X_train, y_train)

In [6]:
xgb_clf = XGBClassifier(tree_method="hist",
                        objective="binary:logistic",
                        eval_metric='auc',
                        enable_categorical=True,
                        n_jobs=-2,
                        random_state=391)

param_search = {'n_estimators': range(50, 301),
                'max_depth': range(1, 8),
                'learning_rate': stats.uniform(.001, .3), # [.001, .3.001]
                'colsample_bytree': stats.uniform(.5,.5), # [.5, 1]
                'gamma': stats.uniform(0, .5),            # [0, .5]
                'scale_pos_weight':[1]}

model = RandomizedSearchCV(xgb_clf,
                           param_distributions=param_search,
                           n_iter=250,
                           return_train_score=True,
                           verbose=2,
                           cv= StratifiedKFold(n_splits=5, shuffle=True, random_state=611))

model.fit(X_train, y_train)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[CV] END colsample_bytree=0.9578299326676318, gamma=0.31401680160036444, learning_rate=0.27704362159434015, max_depth=3, n_estimators=212, scale_pos_weight=1; total time=   0.3s
[CV] END colsample_bytree=0.9578299326676318, gamma=0.31401680160036444, learning_rate=0.27704362159434015, max_depth=3, n_estimators=212, scale_pos_weight=1; total time=   0.2s
[CV] END colsample_bytree=0.9578299326676318, gamma=0.31401680160036444, learning_rate=0.27704362159434015, max_depth=3, n_estimators=212, scale_pos_weight=1; total time=   0.2s
[CV] END colsample_bytree=0.9578299326676318, gamma=0.31401680160036444, learning_rate=0.27704362159434015, max_depth=3, n_estimators=212, scale_pos_weight=1; total time=   0.2s
[CV] END colsample_bytree=0.9578299326676318, gamma=0.31401680160036444, learning_rate=0.27704362159434015, max_depth=3, n_estimators=212, scale_pos_weight=1; total time=   0.2s
[CV] END colsample_bytree=0.9510455315367058, 

In [7]:
best_params=model.best_params_
best_params

{'colsample_bytree': 0.8362720899180284,
 'gamma': 0.4168929215817826,
 'learning_rate': 0.11983128552815249,
 'max_depth': 7,
 'n_estimators': 210,
 'scale_pos_weight': 1}

In [None]:
#params with cv=3, n_iter = 1000
{'colsample_bytree': 0.7231233684423828,
 'gamma': 0.246830272707522,
 'learning_rate': 0.2818210861049318,
 'max_depth': 4,
 'n_estimators': 243,
 'scale_pos_weight': 1}

In [54]:
params_s_grid = {'colsample_bytree': 1,
 'gamma': 0.1,
 'learning_rate': 0.19,
 'max_depth': 4,
 'n_estimators': 150,
 'scale_pos_weight': 1}

In [55]:
params_s_best = {'colsample_bytree': 0.8480431984105623,
               'gamma': 0.43302507108794236,
               'learning_rate': 0.05825570698271222,
               'max_depth': 5,
               'n_estimators': 277,
               'scale_pos_weight': 1}

In [8]:
model.best_score_

0.863071976538604

In [19]:
xgb_model = XGBClassifier(**best_params,
                          tree_method='hist',
                          objective="binary:logistic",
                          eval_metric='auc',
                          enable_categorical=True)

xgb_model.fit(X_train, y_train)

In [20]:
train_pred = xgb_model.predict_proba(X_train)[:,1]
valid_pred = xgb_model.predict_proba(X_valid)[:,1]

In [21]:
cutoff_table = pd.DataFrame({'cutoff': 1e-2*np.arange(10, 95, 5)})
cutoff_table['train_acc'] = [np.round(accuracy_score(y_train, (train_pred > cutoff).astype(int)), 3)
                             for cutoff in cutoff_table['cutoff']]
cutoff_table['valid_acc'] = [np.round(accuracy_score(y_valid, (valid_pred > cutoff).astype(int)), 3)
                             for cutoff in cutoff_table['cutoff']]
cutoff_table

Unnamed: 0,cutoff,train_acc,valid_acc
0,0.1,0.865,0.808
1,0.15,0.902,0.823
2,0.2,0.926,0.832
3,0.25,0.944,0.837
4,0.3,0.958,0.843
5,0.35,0.97,0.843
6,0.4,0.978,0.846
7,0.45,0.983,0.852
8,0.5,0.985,0.845
9,0.55,0.983,0.844


In [22]:
cutoff_table = pd.DataFrame({'cutoff': 1e-2*np.arange(30, 50, 1)})
cutoff_table['train_acc'] = [np.round(accuracy_score(y_train, (train_pred > cutoff).astype(int)), 4)
                             for cutoff in cutoff_table['cutoff']]
cutoff_table['valid_acc'] = [np.round(accuracy_score(y_valid, (valid_pred > cutoff).astype(int)), 4)
                             for cutoff in cutoff_table['cutoff']]
cutoff_table

Unnamed: 0,cutoff,train_acc,valid_acc
0,0.3,0.9579,0.8429
1,0.31,0.9614,0.8436
2,0.32,0.963,0.8436
3,0.33,0.9652,0.8444
4,0.34,0.9666,0.8429
5,0.35,0.9695,0.8429
6,0.36,0.971,0.8414
7,0.37,0.9732,0.8414
8,0.38,0.9746,0.8436
9,0.39,0.9765,0.8444


Choosing cutoff

In [23]:
treshold = 0.44

In [24]:
y_test_pred = (xgb_model.predict_proba(X_test)[:,1] > treshold).astype(int)
accuracy_score(y_test, y_test_pred)

0.8674609084139985

In [25]:
xgb_model.fit(X, y)
y_REAL_test = (xgb_model.predict_proba(campaign_test)[:,1] > treshold).astype(int)

In [26]:
y_pred = (xgb_model.predict_proba(X)[:,1] > treshold).astype(int)
accuracy_score(y, y_pred)

0.9712913315460232

In [27]:
file = open('test_file_xgboost.csv', 'w')
writer = csv.writer(file)
writer.writerow(['Id', 'subscription'])
for i in range(len(y_REAL_test)):
    writer.writerow([i, y_REAL_test[i]])
file.close()

# Missing values imputation 

XGboost doesn't need categorical variables to be encoded numerically, so we keep the columns in their original form. This is why the imputation is redone here, because we end up with train, valid and test datasets which still have categories as strings and no dummies created.

In [None]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

num_vars = ['age', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']

In [None]:
X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=46)

In [None]:
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])

In [None]:
X_tr_dev = X_train.drop(columns=['job', 'education'])
X_tr_imp_dev = X_tr_dev[X_tr_dev['device'] == 'na'].drop(columns='device')
X_tr_dev = X_tr_dev[X_tr_dev['device'] != 'na'] # this is the data set that does not have NAs
y_tr_dev = X_tr_dev[['device']] 
X_tr_dev = X_tr_dev.drop(columns='device')
y_tr_dev = enc.fit_transform(y_tr_dev)
y_tr_dev.shape = (y_tr_dev.shape[0],)

In [None]:
X_v_dev = X_valid.drop(columns=['job', 'education'])
X_v_imp_dev = X_v_dev[X_v_dev['device'] == 'na'].drop(columns='device')
X_v_dev = X_v_dev[X_v_dev['device'] != 'na']
y_v_dev = X_v_dev[['device']]
X_v_dev = X_v_dev.drop(columns='device')
y_v_dev = enc.fit_transform(y_v_dev)

In [None]:
X_te_dev = X_test.drop(columns=['job', 'education'])
X_te_imp_dev = X_te_dev[X_te_dev['device'] == 'na'].drop(columns='device')
X_te_dev = X_te_dev[X_te_dev['device'] != 'na']
y_te_dev = X_te_dev[['device']]
X_te_dev = X_te_dev.drop(columns='device')
y_te_dev = enc.fit_transform(y_te_dev)

In [None]:
preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(), ['marital', 'outcome_old'])],
                                 remainder='passthrough')

rfc_dev = RandomForestClassifier(n_estimators=100, random_state=59, n_jobs=-2)

device_imputation = Pipeline([
    ("preprocessor", preprocessor),
    ("rfc_dev", rfc_dev)
])

In [None]:
device_imputation.fit(X_tr_dev, y_tr_dev)

In [None]:
y_tr_imp_dev = device_imputation.predict(X_tr_imp_dev)
y_v_imp_dev  = device_imputation.predict(X_v_imp_dev)
y_te_imp_dev = device_imputation.predict(X_te_imp_dev)

In [None]:
X_train.loc[X_tr_imp_dev.index, 'device'] = y_tr_imp_dev
X_valid.loc[X_v_imp_dev.index, 'device'] = y_v_imp_dev
X_test.loc[X_te_imp_dev.index, 'device'] = y_te_imp_dev

In [None]:
X_train.loc[X_train['device'] == 0, 'device'] = 'smartphone'
X_train.loc[X_train['device'] == 1, 'device'] = 'desktop'
X_valid.loc[X_valid['device'] == 0, 'device'] = 'smartphone'
X_valid.loc[X_valid['device'] == 1, 'device'] = 'desktop'
X_test.loc[X_test['device'] == 0, 'device'] = 'smartphone'
X_test.loc[X_test['device'] == 1, 'device'] = 'desktop'

In [None]:
imputer = SimpleImputer(missing_values="na", strategy='most_frequent')
X_train[['job', 'education']] = imputer.fit_transform(X_train[['job', 'education']])
X_valid[['job', 'education']] = imputer.fit_transform(X_valid[['job', 'education']])
X_test[['job', 'education']]  = imputer.fit_transform(X_test[['job', 'education']])

In [None]:
cat_vars = list(set(X_train.columns).difference(num_vars))

for col in cat_vars:
  X_train[col] = X_train[col].astype("category")
  X_valid[col] = X_valid[col].astype("category")
  X_test[col]  = X_test[col].astype("category")

In [None]:
X_campaign_test = campaign_test.drop(columns=['job', 'education'])
X_imp_campaign_test = X_campaign_test[X_campaign_test['device'] == 'na'].drop(columns='device')
X_campaign_test = X_campaign_test[X_campaign_test['device'] != 'na']
y_campaign_test = X_campaign_test[['device']]
X_campaign_test = X_campaign_test.drop(columns='device')
y_campaign_test = enc.fit_transform(y_campaign_test)
y_imp_campaign_test = device_imputation.predict(X_imp_campaign_test)
campaign_test.loc[X_imp_campaign_test.index, 'device'] = y_imp_campaign_test
campaign_test.loc[campaign_test['device'] == 0, 'device'] = 'smartphone'
campaign_test.loc[campaign_test['device'] == 1, 'device'] = 'desktop'
campaign_test[['job', 'education']] = imputer.fit_transform(campaign_test[['job', 'education']])
for col in cat_vars:
  campaign_test[col] = campaign_test[col].astype("category")