# Introduction to Data Science

Authors: Lior Tondovski, Ilan Vasilevski, Maya Vilenko

---

### imports

In [1]:
#import numpy (used below for np.where and np.savetxt)
import numpy as np
import pandas as pd
#import AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#import XGBoost classifier
from xgboost import XGBClassifier
#import RandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
#import train_test_split
from sklearn.model_selection import train_test_split
#import shap
import shap
#import AUC and accuracy score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from utils import *


In [2]:
#load the preprocessed data splits that were pickled under processed_files/
#NOTE(review): pd.read_pickle unpickles arbitrary objects - only load files produced by this project
train_data_undersampled = pd.read_pickle('processed_files/train_data_undersampled.pkl')
train_data_undersampled_sm = pd.read_pickle('processed_files/train_data_undersampled_sm.pkl')
validation_data = pd.read_pickle('processed_files/validation_data.pkl')
test_data = pd.read_pickle('processed_files/test_data.pkl')

In [3]:
#take a subset of the data for faster hyperparameter tuning and model training
#the sample size is capped at the number of available rows: DataFrame.sample raises
#a ValueError when asked for more rows than the frame holds (sampling without replacement)
train_data_undersampled_sm = train_data_undersampled_sm.sample(
    min(50000, len(train_data_undersampled_sm)), random_state=42
)
train_data_undersampled = train_data_undersampled.sample(
    min(50000, len(train_data_undersampled)), random_state=42
)

### Hyperparameter Tuning - Random Search

##### We chose to train AdaBoost, XGBoost and Random Forest (all tree-based models because most of the time they perform best on tabular data with a lot of categorical features)
##### We chose Random Search for the hyperparameter tuning because it is relatively fast.

In [4]:
#separate the 'clicked' target from the predictors for both training variants
#plain undersampled training data
y_train = train_data_undersampled['clicked']
X_train = train_data_undersampled.drop(columns='clicked')
#undersampled training data with the "_sm" suffix
y_train_sm = train_data_undersampled_sm['clicked']
X_train_sm = train_data_undersampled_sm.drop(columns='clicked')

In [5]:
#ADABoost Random Search on the "_sm" training data
ada_clf = AdaBoostClassifier(random_state=12)
#The parameters to be tuned
ada_param_grid = {
    'n_estimators': [50, 100, 200, 300], # n_estimators is the number of stumps to be used
    'learning_rate': [0.01, 0.05, 0.1, 0.3] # learning_rate is the weight of each stump
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
ada_random_search_sm = RandomizedSearchCV(
    ada_clf,
    param_distributions=ada_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
ada_random_search_sm.fit(X_train_sm, y_train_sm)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(ada_random_search_sm.best_params_)
#print the best score
print(ada_random_search_sm.best_score_)


Fitting 2 folds for each of 5 candidates, totalling 10 fits
0.8672802531419253


In [6]:
#ADABoost Random Search on the plain undersampled training data
ada_clf = AdaBoostClassifier(random_state=12)

#The parameters to be tuned
ada_param_grid = {
    'n_estimators': [50, 100, 200, 300], # n_estimators is the number of stumps to be used
    'learning_rate': [0.01, 0.05, 0.1, 0.3] # learning_rate is the weight of each stump
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
ada_random_search = RandomizedSearchCV(
    ada_clf,
    param_distributions=ada_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
ada_random_search.fit(X_train, y_train)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(ada_random_search.best_params_)
#print the best score
print(ada_random_search.best_score_)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
0.8398484777936833


In [7]:
#XGBoost Random Search on the "_sm" training data
xgb_clf = XGBClassifier(random_state=12)
#The parameters to be tuned
xgb_param_grid = {
    'n_estimators': [50, 100, 200], # n_estimators is the number of weak classifiers to be used
    'learning_rate': [0.05, 0.1, 0.3], # learning_rate is the weight of each weak classifier
    'max_depth': [3, 4, 5], # max_depth is the maximum depth of each tree
    'gamma': [0, 0.5, 1],  # gamma is a hyperparameter that controls the tree pruning, higher values of gamma lead to more pruning
    'reg_lambda': [0, 0.5, 1] # reg_lambda is the L2 regularization term on weights
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
xgb_random_search_sm = RandomizedSearchCV(
    xgb_clf,
    param_distributions=xgb_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
xgb_random_search_sm.fit(X_train_sm, y_train_sm)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(xgb_random_search_sm.best_params_)
#print the best score
print(xgb_random_search_sm.best_score_)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


In [None]:
#XGBoost Random Search on the plain undersampled training data
xgb_clf = XGBClassifier(random_state=12)
#The parameters to be tuned
xgb_param_grid = {
    'n_estimators': [50, 100, 200], # n_estimators is the number of weak classifiers to be used
    'learning_rate': [0.05, 0.1, 0.3], # learning_rate is the weight of each weak classifier
    'max_depth': [3, 4, 5], # max_depth is the maximum depth of each tree
    'gamma': [0, 0.5, 1], # gamma controls the tree pruning, higher values of gamma lead to more pruning
    'reg_lambda': [0, 0.5, 1], # reg_lambda is the L2 regularization term on weights
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
xgb_random_search = RandomizedSearchCV(
    xgb_clf,
    param_distributions=xgb_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
#fit on X_train / y_train: this is the non-"_sm" search, so it must not reuse the "_sm" data
#(otherwise it only repeats xgb_random_search_sm and the two data variants are never compared)
xgb_random_search.fit(X_train, y_train)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(xgb_random_search.best_params_)
#print the best score
print(xgb_random_search.best_score_)

In [None]:
#Random Forest Random Search on the "_sm" training data
rf_clf = RandomForestClassifier(random_state=12)
#The parameters to be tuned
rf_param_grid = {
    'n_estimators': [50, 100, 200], # n_estimators is the number of trees to be used
    'max_depth': [3, 4, 5], # max_depth is the maximum depth of each tree
    'min_samples_split': [2, 3, 20], # min_samples_split is the minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 10], # min_samples_leaf is the minimum number of samples required to be at a leaf node
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
rf_random_search_sm = RandomizedSearchCV(
    rf_clf,
    param_distributions=rf_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
rf_random_search_sm.fit(X_train_sm, y_train_sm)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(rf_random_search_sm.best_params_)
#print the best score
print(rf_random_search_sm.best_score_)

In [None]:
#Random Forest Random Search on the plain undersampled training data
rf_clf = RandomForestClassifier(random_state=12)
#The parameters to be tuned
rf_param_grid = {
    'n_estimators': [50, 100, 200], # n_estimators is the number of trees to be used
    'max_depth': [3, 4, 5], # max_depth is the maximum depth of each tree
    'min_samples_split': [2, 3, 20], # min_samples_split is the minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 10], # min_samples_leaf is the minimum number of samples required to be at a leaf node
}

#try 5 randomly drawn parameter combinations, each scored by 2-fold cross-validated ROC-AUC
rf_random_search = RandomizedSearchCV(
    rf_clf,
    param_distributions=rf_param_grid,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=12,
)
#fit on X_train / y_train: this is the non-"_sm" search, so it must not reuse the "_sm" data
#(otherwise it only repeats rf_random_search_sm and the two data variants are never compared)
rf_random_search.fit(X_train, y_train)

#print the best parameters (a bare expression in the middle of a cell is never displayed)
print(rf_random_search.best_params_)
#print the best score
print(rf_random_search.best_score_)

The model that achieved the best performance in the ROC-AUC metric is XGBoost, trained on the data with the synthetic samples.

In [None]:
#Now we will use the best parameters to train the final model
#NOTE: X_train_sm / y_train_sm hold the subsample drawn at the top of the notebook, not the full training set
#use XGBClassifier with the best parameters from the random search;
#random_state=12 is passed explicitly so the final model uses the same seed as the tuned estimator
#(best_params_ only contains the searched hyperparameters, not the seed)
xgb_clf = XGBClassifier(random_state=12, **xgb_random_search_sm.best_params_)
xgb_clf.fit(X_train_sm, y_train_sm)
#predict on the validation set
X_val = validation_data.drop(['clicked'], axis=1)
y_val = validation_data['clicked']
#hard class labels
y_pred = xgb_clf.predict(X_val)
#probability of the positive class (second column of predict_proba)
y_pred_prob = xgb_clf.predict_proba(X_val)[:, 1]

### Evaluation on Validation Set

In [None]:
#threshold-based metrics are computed from the hard labels (y_pred)
#accuracy
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy Score : {val_accuracy}')
#precision
val_precision = precision_score(y_val, y_pred)
print(f'Precision Score : {val_precision}')
#recall
val_recall = recall_score(y_val, y_pred)
print(f'Recall Score : {val_recall}')
#f1
val_f1 = f1_score(y_val, y_pred)
print(f'F1 Score : {val_f1}')
#AUC is computed from the predicted probabilities (y_pred_prob), not the hard labels
val_auc = roc_auc_score(y_val, y_pred_prob)
print(f'AUC : {val_auc}')

In [None]:
#plot the ROC curve of the validation predictions
#NOTE(review): plot_roc_curve is presumably a helper pulled in by `from utils import *` - confirm its signature
plot_roc_curve(y_val, y_pred_prob)

In [None]:
#plot the precision recall curve of the validation predictions
#NOTE(review): plot_precision_recall_curve is presumably a helper pulled in by `from utils import *` - confirm its signature
plot_precision_recall_curve(y_val, y_pred_prob)


In [None]:
#plot the confusion matrix of the hard validation predictions
#NOTE(review): plot_confusion_matrix is presumably a helper pulled in by `from utils import *` - confirm its signature
plot_confusion_matrix(y_val, y_pred)

Based on the confusion matrix, it is evident that the model incorrectly identifies non-clicked ads as clicked.
The reason is that the model was trained on undersampled data, while the validation data represents the actual ratio of clicked ads.

#### Feature Importance With SHAP

In [None]:
#calculate the SHAP values of the trained XGBoost model on the validation set
explainer = shap.TreeExplainer(xgb_clf)
shap_values = explainer.shap_values(X_val)
#bar chart limited to the 10 features shown first by the summary plot
shap.summary_plot(
    shap_values,
    X_val,
    plot_type='bar',
    max_display=10,
)

In [None]:
#beeswarm (dot) summary plot, limited to 10 features
shap.summary_plot(
    shap_values,
    X_val,
    plot_type='dot',
    max_display=10,
)

According to the shap beeswarm and feature importance, the banner position is the most effective feature. It is very likely that there was no click if the banner was located at the bottom.

In addition, app_cat_games - casual attracts more clicks, while app_cat_games is exactly the opposite.

---

#### Local Interpretability with SHAP

In [None]:
#force plot for the first validation example
#print the real label and the predicted label of that example
print(f'Real Label : {y_val.iloc[0]}')
print(f'Predicted Label : {y_pred[0]}')
#load the SHAP JavaScript so the interactive plot can render in the notebook
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[0,:],
    X_val.iloc[0,:],
)

For that example, it can be seen that the model chose to classify as "not clicked", mainly due to the banner position, device model and the state.

In [None]:
#force plot for the second validation example
#print the real label and the predicted label of that example
print(f'Real Label : {y_val.iloc[1]}')
print(f'Predicted Label : {y_pred[1]}')
shap.force_plot(
    explainer.expected_value,
    shap_values[1,:],
    X_val.iloc[1,:],
)

For that example, it can be seen that the model chose to classify as "not clicked", mainly due to user_isp, the state, month, device model and hour.

In [None]:
#get the positional indices of the ones (clicked examples) in the validation labels
index = np.where(y_val == 1)[0]
#fail with a clear message instead of an opaque IndexError when there is no clicked example
if len(index) == 0:
    raise ValueError('validation set contains no clicked (label 1) examples to explain')
#position of the first clicked example
first_clicked = index[0]
#show real label and predicted label
print(f'Real Label: {y_val.iloc[first_clicked]}')
print(f'Predicted Label : {y_pred[first_clicked]}')

#force plot for that example (kept as the last top-level expression so the notebook renders it)
shap.force_plot(explainer.expected_value, shap_values[first_clicked,:], X_val.iloc[first_clicked,:])

In that example, it can be seen that the model chose to classify as "clicked", mostly due to the banner position and device model.
The device_width, the user_isp, as well as the day and hour, pulled strongly towards "not clicked", and therefore the model misclassified this example.

----

##### predict on the test set

In [None]:
#the test frame is used as the feature matrix as-is (no column is dropped here)
X_test = test_data
#hard class predictions for the test set
#NOTE(review): this overwrites y_pred, which earlier cells use for the validation predictions
y_pred = xgb_clf.predict(X_test)
#write the predictions to a text file, formatted as integers
np.savetxt(
    'output_12.txt',
    y_pred,
    fmt='%d',
)