In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc 
%matplotlib inline

In [2]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
email = pd.read_csv('email_table.csv')
email_opened = pd.read_csv('email_opened_table.csv')
link_clicked = pd.read_csv('link_clicked_table.csv')

In [4]:
email_opened['opened']=1
link_clicked['clicked']=1

In [5]:
email2=email.merge(email_opened, how='left', on='email_id')
email3=email2.merge(link_clicked, how='left', on='email_id')

In [6]:
email3['clicked'].fillna(0, inplace=True)
email3['opened'].fillna(0, inplace=True)

In [7]:
email3.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0.0,0.0
1,966622,long_email,personalized,12,Sunday,UK,2,1.0,1.0
2,777221,long_email,personalized,11,Wednesday,US,2,0.0,0.0
3,493711,short_email,generic,6,Monday,UK,1,0.0,0.0
4,106887,long_email,generic,14,Monday,US,6,0.0,0.0


In [None]:
email3['user_past_purchases'].value_counts().sort_index()

In [9]:
conditions = [
 email3['weekday'] == 'Monday',
 email3['weekday'] == 'Tuesday',
 email3['weekday'] == 'Wednesday',
 email3['weekday'] == 'Thursday',
 email3['weekday'] == 'Friday',
 email3['weekday'] == 'Saturday',
 email3['weekday'] == 'Sunday',
 ]

outputs = [0, 1, 2, 3, 4, 5, 6 ]

email3['weekday_num']=np.select(conditions, outputs)

In [10]:
email4 = pd.get_dummies(email3, columns=['email_text','email_version','user_country'], drop_first=True)
email4.drop(['weekday','opened'],axis=1, inplace=True)

In [11]:
email4.head()

Unnamed: 0,email_id,hour,user_past_purchases,clicked,weekday_num,email_text_short_email,email_version_personalized,user_country_FR,user_country_UK,user_country_US
0,85120,2,5,0.0,6,1,1,0,0,1
1,966622,12,2,1.0,6,0,1,0,1,0
2,777221,11,2,0.0,2,0,1,0,0,1
3,493711,6,1,0.0,0,1,0,0,1,0
4,106887,14,6,0.0,0,0,0,0,0,1


In [12]:
import math
email4['sin_wk']=email4['weekday_num'].apply(lambda x: math.sin(2*math.pi*x))
email4['cos_wk']=email4['weekday_num'].apply(lambda x: math.cos(2*math.pi*x))
email4['sin_hr']=email4['hour'].apply(lambda x: math.sin(2*math.pi*x))
email4['cos_hr']=email4['hour'].apply(lambda x: math.cos(2*math.pi*x))

In [13]:
X=email4.drop(['email_id','hour','clicked','weekday_num'], axis=1)
y=email4['clicked']

In [14]:
email4['clicked'].value_counts()  #about 2% clicked

0.0    97881
1.0     2119
Name: clicked, dtype: int64

In [15]:
X.head()

Unnamed: 0,user_past_purchases,email_text_short_email,email_version_personalized,user_country_FR,user_country_UK,user_country_US,sin_wk,cos_wk,sin_hr,cos_hr
0,5,1,1,0,0,1,-1.469576e-15,1.0,-4.898587e-16,1.0
1,2,0,1,0,1,0,-1.469576e-15,1.0,-2.939152e-15,1.0
2,2,0,1,0,0,1,-4.898587e-16,1.0,-9.79965e-15,1.0
3,1,1,0,0,1,0,0.0,1.0,-1.469576e-15,1.0
4,6,0,0,0,0,1,0.0,1.0,-3.429011e-15,1.0


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.3, random_state=42)

In [17]:
print('Train set shape: {}, positive ratio={:.3f}'.format(X_train.shape,y_train.mean()))
print('Test set shape: {}, positive ratio={:.3f}'.format(X_test.shape,y_test.mean()))

Train set shape: (70000, 10), positive ratio=0.021
Test set shape: (30000, 10), positive ratio=0.021


In [None]:
pip install bayesian-optimization

In [40]:
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.metrics import confusion_matrix,classification_report,precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

In [41]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier()

In [42]:
preds = rf.predict(X_test)

In [43]:
# https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc,recall_score,precision_score
check = rf.predict(X_test)
score = average_precision_score(y_test.values, check)
score

0.021952830188679245

In [50]:
from sklearn.model_selection import GridSearchCV
parameters = {'criterion': ['entropy','gini'],
             'n_estimators': [125,150,200],
             'max_depth': [3,5,7,9],
             'min_samples_split': [5,10,20]}

rf = RandomForestClassifier(class_weight = 'balanced',random_state = 32)
rf_model_gsv = GridSearchCV(rf, parameters, cv = 5, scoring = 'average_precision', n_jobs=-1)
rf_model_gsv = rf_model_gsv.fit(X_train, y_train)
rf_model_gsv.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_split': 5,
 'n_estimators': 125}

In [51]:
model_rf = RandomForestClassifier(criterion ='entropy', max_depth =7, min_samples_split=5,n_estimators=125,class_weight = 'balanced',random_state = 32)
model_rf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=7, min_samples_split=5, n_estimators=125,
                       random_state=32)

In [53]:
rf_train_pred = model_rf.predict(X_train)
rf_test_pred = model_rf.predict(X_test)

print('Random ForestTrain Dataset report')
print('   '*60)
print(average_precision_score(y_train,rf_train_pred))
print('-'*60)
print('Random Forest Test Dataset report')
print('   '*60)
print(average_precision_score(y_test,rf_test_pred))

y_train_proba = model_rf.predict_proba(X_train)
y_test_proba = model_rf.predict_proba(X_test)
print('-'*60)
print('Random Forest train roc auc score: {0:.3f}'.format(roc_auc_score(y_train, y_train_proba[:, 1])))
print('-'*60)
print('Random Forest test roc auc score: {0:.3f}'.format(roc_auc_score(y_test, y_test_proba[:, 1])))

Random ForestTrain Dataset report
                                                                                                                                                                                    
0.034580094314295456
------------------------------------------------------------
Random Forest Test Dataset report
                                                                                                                                                                                    
0.03323481381675524
------------------------------------------------------------


NameError: name 'roc_auc_score' is not defined

In [44]:
def stratified_kfold_score(clf,X,y,n_fold):
    X,y = X.values,y.values
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)
    accuracy_list = []

    for train_index, test_index in strat_kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        preds = clf.predict(x_test_fold)
        accuracy_test = average_precision_score(preds,y_test_fold)
        accuracy_list.append(accuracy_test)

    return np.array(accuracy_list).mean()

In [45]:
def bo_params_rf(max_samples,n_estimators,max_features):
    
    params = {
        'max_samples': max_samples,
        'max_features':max_features,
        'n_estimators':int(n_estimators)
    }
    clf = RandomForestClassifier(max_samples=params['max_samples'],max_features=params['max_features'],n_estimators=params['n_estimators'])
    score = stratified_kfold_score(clf,X_train, y_train,5)
    return score

In [46]:
rf_bo = BayesianOptimization(bo_params_rf, {
                                              'max_samples':(0.5,1),
                                                'max_features':(0.5,1),
                                              'n_estimators':(100,500)
                                             })

In [47]:
results = rf_bo.maximize(n_iter=200, init_points=20,acq='ei')

|   iter    |  target   | max_fe... | max_sa... | n_esti... |
-------------------------------------------------------------


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 1       [0m | [0m nan     [0m | [0m 0.6367  [0m | [0m 0.9306  [0m | [0m 287.0   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 2       [0m | [0m nan     [0m | [0m 0.6402  [0m | [0m 0.6075  [0m | [0m 224.5   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 3       [0m | [0m nan     [0m | [0m 0.9518  [0m | [0m 0.7902  [0m | [0m 262.1   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 4       [0m | [0m nan     [0m | [0m 0.6204  [0m | [0m 0.6548  [0m | [0m 456.2   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 5       [0m | [0m nan     [0m | [0m 0.7167  [0m | [0m 0.6595  [0m | [0m 451.0   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 6       [0m | [0m nan     [0m | [0m 0.8555  [0m | [0m 0.6357  [0m | [0m 194.2   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 7       [0m | [0m nan     [0m | [0m 0.7122  [0m | [0m 0.7351  [0m | [0m 466.7   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 8       [0m | [0m nan     [0m | [0m 0.5077  [0m | [0m 0.9903  [0m | [0m 260.8   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 9       [0m | [0m nan     [0m | [0m 0.5784  [0m | [0m 0.7265  [0m | [0m 280.0   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 10      [0m | [0m nan     [0m | [0m 0.8232  [0m | [0m 0.6965  [0m | [0m 439.3   [0m |


  recall = tps / tps[-1]
  recall = tps / tps[-1]


| [0m 11      [0m | [0m nan     [0m | [0m 0.9649  [0m | [0m 0.6133  [0m | [0m 102.3   [0m |


  recall = tps / tps[-1]


KeyboardInterrupt: 

In [None]:
# https://docs.ray.io/en/latest/tune/tutorials/tune-xgboost.html#training-a-simple-xgboost-classifier