In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle 

import warnings
warnings.simplefilter("ignore")
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = [6, 6]

In [2]:
# display all columns in preview
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
# load the dataset
df = pd.read_pickle('data_cleaning4.pkl')

In [4]:
df.shape

(111714, 27)

## Data imputation & processing

In [5]:
# Create a random sample from the dataset

df_sample = df.sample(frac=0.1)

In [6]:
df_sample.shape

(11171, 27)

In [7]:
df_sample.columns

Index(['Language', 'Website', 'Enquiry type', 'Enquiry status', 'Client budget', 'Num nights', 'Adults', 'Children', 'Flights booked', 'Country code', 'Click path', 'User agent', 'User repeat', 'User referral', 'GA source', 'GA medium', 'Device', 'GA keyword', 'Session duration', 'is booking', 'Sessions', 'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageviews', 'Hits', 'Created month', 'Created to arrival'], dtype='object')

In [8]:
numerical_col =  ['Num nights', 'Adults', 'Children', 'Session duration', 'Sessions', 
                  'Avg. session length (sec)', 'Avg. pageviews per session', 
                  'Pageviews', 'Hits', 'Created to arrival']
categorical_col = ['Language', 'Website',  'Enquiry type', 'Enquiry status',
                   'Client budget','Country code','GA source',
                   'GA medium', 'Device', 'Created month']

binary_col = ['Flights booked', 'User agent', 'User repeat', 'User referral']
text_col = ['Click path','GA keyword']
target = ['is booking']

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from category_encoders import OneHotEncoder,HashingEncoder, BinaryEncoder 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer 
from sklearn.pipeline import make_pipeline, make_union
from mlxtend.feature_selection import ColumnSelector
from mlxtend.preprocessing import DenseTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [12]:
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, accuracy_score
from sklearn.metrics import SCORERS
import time
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

In [13]:
# Numerical pipeline

numerical_pipeline = make_pipeline(
    ColumnSelector(cols=numerical_col),
    SimpleImputer(strategy="median"),
    StandardScaler()
)


In [14]:
# Categorical pipeline

categorical_pipeline = make_pipeline(
    ColumnSelector(cols=categorical_col),
    SimpleImputer(strategy="constant",fill_value = 'None'),
    OneHotEncoder()
)


In [15]:
# Binary pipeline

binary_pipeline = make_pipeline(
    ColumnSelector(cols = binary_col),
    SimpleImputer(strategy="most_frequent"),
    BinaryEncoder()
)

In [16]:
# Helper transformer for text pipelines

from sklearn.base import BaseEstimator
class ReshapeTransformer(BaseEstimator):
    def __init__(self):
        self.is_fitted = False

    def transform(self, X, y=None):
        return X.reshape(X.shape[0],)

    def fit(self, X, y=None):
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

In [17]:
# Text pipelines

text_pipeline_1 = make_pipeline(    
    ColumnSelector(cols = ['Click path']),
    SimpleImputer(strategy = 'constant',fill_value = ''),
    ReshapeTransformer(),
    HashingVectorizer(n_features = 2**11),
    DenseTransformer()
)

text_pipeline_2 = make_pipeline(
    ColumnSelector(cols = ['GA keyword']),
    SimpleImputer(strategy = 'constant',fill_value = ''),
    ReshapeTransformer(),
    TfidfVectorizer(),
    DenseTransformer()
)

In [18]:
# Pipeline union

processing_pipeline = make_union(
    numerical_pipeline,
    categorical_pipeline,
    binary_pipeline,
    text_pipeline_1,
    text_pipeline_2
)


In [19]:
X = df.drop('is booking',axis = 1)
y = df['is booking']

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,
                                                    random_state=42)

In [20]:
# svd = TruncatedSVD(n_components=87)
# pca =  PCA(n_components=58)
estimator =    BalancedRandomForestClassifier(bootstrap=False, class_weight=None,
                 criterion='gini', max_depth=60, max_features='sqrt',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_samples_leaf=1, min_samples_split=5,
                 min_weight_fraction_leaf=0.0, n_estimators=472, n_jobs=1,
                 oob_score=False, random_state=None, replacement=False,
                 sampling_strategy='auto', verbose=0, warm_start=False)

# LGBM is another model I've tried

# LGBMClassifier(scale_pos_weight = weight, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#          importance_type='split', learning_rate=0.1, max_depth=50,
#          min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
#          n_estimators=200, n_jobs=-1, num_leaves=300, objective=None,
#          random_state=None, reg_alpha=0.0, reg_lambda=0.0,
#           silent=True, subsample=0.8,
#          subsample_for_bin=200000, subsample_freq=10, 


# selector_kbest, pca, svd are all dimentionality reduction (feature selection) methods that I've tried

estimator_pipeline = make_pipeline(
    processing_pipeline,
#     selector_kbest800,
#     pca,
#     svd,
    estimator
)


In [22]:
%%time

estimator_pipeline.fit(X_train,y_train)

# (df_sample.drop(columns = target), 
#               df_sample[target])


CPU times: user 10min 6s, sys: 1min 44s, total: 11min 50s
Wall time: 11min 59s


Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['Num nights', 'Adults', 'Children', 'Session duration', 'Sessions', 'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageview...te=None, replacement=False,
                sampling_strategy='auto', verbose=0, warm_start=False))])

In [420]:
predictions = estimator_pipeline.predict(X_test)
lead_score = estimator_pipeline.predict_proba(X_test)

In [421]:
true_classes = y_test
pipeline_rocauc = roc_auc_score(true_classes, predictions)
pipeline_rocauc

0.6151179794593558

In [423]:
estimator.classes_

array([0., 1.])

In [424]:
leadscore = lead_score[:,1]
leadscore

array([0.3130349 , 0.26893659, 0.39319344, ..., 0.50385109, 0.5869939 ,
       0.36048197])

In [427]:
pd.DataFrame({'lead score':leadscore,'true class': y_test},index = X_test.index).sort_values(by='lead score')

Unnamed: 0_level_0,lead score,true class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
64563.0,0.093339,1.0
64566.0,0.093339,0.0
64564.0,0.093339,0.0
64574.0,0.093339,0.0
64578.0,0.093339,1.0
64546.0,0.093339,0.0
64582.0,0.093339,1.0
64562.0,0.093339,0.0
64550.0,0.093339,0.0
64557.0,0.093339,0.0


In [None]:
# Below I tried generating feature importances
# But since I used OneHotEncoder to turn categorical data into numerical, the column names are no longer recognizable

In [428]:
estimator.feature_importances_.shape

(4179,)

In [429]:
estimator.n_features_

4179

In [431]:
estimator_pipeline.steps[1][1].feature_importances_

array([1.18683361e-02, 1.32755702e-02, 6.71343398e-03, ...,
       5.83968171e-04, 9.73699362e-07, 9.70376006e-06])

In [432]:
final_numerical_col = numerical_col
final_numerical_col

['Num nights',
 'Adults',
 'Children',
 'Session duration',
 'Sessions',
 'Avg. session length (sec)',
 'Avg. pageviews per session',
 'Pageviews',
 'Hits',
 'Created to arrival']

In [433]:
num_final_numerical_col = len(final_numerical_col)
num_final_numerical_col

10

In [434]:
final_categorical_col = estimator_pipeline.steps[0][1].transformer_list[1][1].steps[2][1].get_feature_names()
num_final_categorical_col = len(final_categorical_col)
num_final_categorical_col

575

In [435]:
final_binary_col = binary_col
final_binary_col

['Flights booked', 'User agent', 'User repeat', 'User referral']

In [436]:
num_final_binary_col = len(final_binary_col)
num_final_binary_col

4

In [437]:
# Text1 pipeline - Hashingvectorizer's parameter n_features is set to 2**11, 
# which means : The number of features (columns) in the output matrices is 2048

num_final_text1_col = estimator_pipeline.steps[0][1].transformer_list[3][1].steps[3][1].get_params()['n_features']
num_final_text1_col 

2048

In [438]:
final_text2_col = estimator_pipeline.steps[0][1].transformer_list[4][1].steps[3][1].get_feature_names()
num_final_text2_col = len(final_text2_col)
num_final_text2_col

1542

In [439]:
num_final_numerical_col + num_final_categorical_col + num_final_binary_col + num_final_text1_col + num_final_text2_col

4179

In [None]:
# Here I did hyperparameter tuning

In [None]:
# grid = {'selectkbest__k':  [2500,3000,3500,4000]}

# grid_search_k_pipeline = GridSearchCV(estimator = estimator_pipeline,
#                                                 param_grid= grid,
#                                                 cv = 3, 
#                                                 scoring="roc_auc",
#                                                 return_train_score = True,
#                                                 n_jobs=-1)

In [None]:
# %%time
# grid_search_k_pipeline.fit(X_train,y_train)


In [417]:
# lightgbm - 0.6 - k=3000 - df 80%train

# grid_search_k_pipeline.best_score_

0.6015141648361549

In [419]:
# grid_search_k_pipeline.cv_results_

{'mean_fit_time': array([269.83327198, 283.67580573, 299.62866902, 301.27803993]),
 'std_fit_time': array([2.55518959, 2.97292174, 5.82198277, 2.04565118]),
 'mean_score_time': array([31.84869568, 30.65845537, 38.72996426, 38.90689373]),
 'std_score_time': array([0.58576587, 3.0633607 , 1.93420748, 1.60148928]),
 'param_selectkbest__k': masked_array(data=[2500, 3000, 3500, 4000],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'selectkbest__k': 2500},
  {'selectkbest__k': 3000},
  {'selectkbest__k': 3500},
  {'selectkbest__k': 4000}],
 'split0_test_score': array([0.59980508, 0.5997547 , 0.59773556, 0.59773556]),
 'split1_test_score': array([0.605817  , 0.60421145, 0.60369018, 0.605337  ]),
 'split2_test_score': array([0.59495857, 0.6005764 , 0.59765725, 0.59765725]),
 'mean_test_score': array([0.60019355, 0.60151416, 0.59969431, 0.60024324]),
 'std_test_score': array([0.00444141, 0.00193653, 0.00282567, 0.00360194]),
 'r

In [210]:
# grid_search_brf_pipeline.best_estimator_.steps[-2]

('pca',
 PCA(copy=True, iterated_power='auto', n_components=66, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False))

In [None]:
# Below is a way to use cross validation to run the model (only on a sample of the whole dataset to save time)
# and save the metrics in a dataframe for better comparison

In [19]:
RESULTS = {}

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = np.abs(results_df[col].apply(np.mean))
        results_df[col+"_idx"] = results_df[col] / results_df[col].min()
    return results_df

In [405]:
cv = cross_validate(estimator_pipeline,
               df_sample.drop(columns = target), 
               df_sample[target],
               scoring=["roc_auc","f1"], 
               cv=5)

In [406]:
RESULTS['10%_27columns_estimator300'] = cv

In [417]:
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
10%_28columns_brf,42.534658,2.922994,0.215966,0.610615,0.440094,0.998933,2.960593,2.586064,1.105653,1.068712,1.234592,1.122209
10%_28columns_3000best_brf,107.558348,1.816651,0.21211,0.608316,0.440701,0.998586,7.486518,1.607248,1.085913,1.064687,1.236294,1.121819
10%_28columns_1000best_brf,35.18499,1.885296,0.212036,0.604634,0.436701,0.993419,2.449025,1.667981,1.085532,1.058243,1.225074,1.116014
10%_28columns_500best_brf,24.483953,2.230586,0.213048,0.597928,0.425498,0.980433,1.704187,1.973469,1.090711,1.046506,1.193645,1.101426
10%_28columns_200best_brf,14.366939,2.158451,0.195329,0.571356,0.35647,0.890149,1.0,1.90965,1.0,1.0,1.0,1.0
10%_28columns_800best_brf,23.61506,1.766769,0.214124,0.601774,0.436068,0.991372,1.643709,1.563116,1.096222,1.053237,1.223295,1.113715
10%_28columns_800best_brf_notext1,19.060909,1.130287,0.201143,0.582023,0.402662,0.997938,1.32672,1.0,1.029762,1.018669,1.129582,1.121092
10%_28columns_800best_brf_notext2,19.078274,1.337045,0.212447,0.600669,0.432506,0.992907,1.327929,1.182926,1.087635,1.051303,1.213304,1.11544
10%_28columns_800best_brf_notext3,22.442875,1.615096,0.211705,0.602396,0.431866,0.991855,1.562119,1.428926,1.083839,1.054326,1.21151,1.114258
10%_28columns_800best_brf_notext2&3,19.047579,1.297907,0.207694,0.599308,0.42437,0.992253,1.325792,1.148299,1.063304,1.048921,1.190481,1.114705


In [475]:
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
truncatedsvd_brf_10%,7.128501,1.186547,0.171865,0.495871,0.373035,0.997907,1.0,1.122317,2.097006,1.0,1.0,1.028491
truncatedsvd_brf_10%_n=200,18.85444,1.100043,0.193598,0.551819,0.562077,0.999864,2.644938,1.040495,2.362184,1.112828,1.506766,1.030509
f_classif_10%_n=3000,9.215246,1.057231,0.081957,0.562386,0.894967,0.970263,1.292733,1.0,1.0,1.134139,2.399149,1.0
f_classif_10%_n=3000_email_brf,105.121813,2.189306,0.189332,0.583233,0.425607,0.998764,14.746693,2.070793,2.310135,1.17618,1.14093,1.029375
f_classif_10%_n=3000_w/o_email_brf,100.196568,4.042588,0.18787,0.58437,0.412495,0.998273,14.05577,3.823752,2.292297,1.178473,1.105781,1.028869


In [143]:
# display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
lgbm_sample_scale_pos_weight,15.415726,4.505231,0.187747,0.551174,0.596669,0.978039,4.195762,5.288134,1.033565,1.000459,1.047367,1.005186
lgbm_sample_scale_pos_weight_nocity,8.306123,2.243567,0.182149,0.560131,0.597089,0.978141,2.260712,2.633446,1.002743,1.016716,1.048103,1.00529
lgbm_sample_scale_pos_weight_nocountry,8.665675,2.154304,0.184571,0.557921,0.576837,0.97551,2.358573,2.528672,1.01608,1.012705,1.012553,1.002587
lgbm_sample_scale_pos_weight_noclickpath,6.783381,2.19709,0.182162,0.558504,0.569685,0.972993,1.846261,2.578893,1.002817,1.013763,1.0,1.0
lgbm_sample_scale_pos_weight_nokeyword,10.04042,2.431356,0.185538,0.552656,0.593393,0.977527,2.732742,2.853869,1.021401,1.003148,1.041616,1.00466
lgbm_sample_scale_pos_weight_nocampaign,10.070543,2.451107,0.183651,0.551771,0.589365,0.977249,2.740941,2.877052,1.011016,1.001541,1.034546,1.004373
lgbm_sample_scale_pos_weight_noclientemail,3.674119,0.851951,0.18165,0.550922,0.585309,0.976129,1.0,1.0,1.0,1.0,1.027424,1.003222


In [57]:
# the categorical feature parameter in lightGBM makes it a lot faster to run the model
# but the metric is a lot worse than using OneHotEncoder

# display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
lgbm_sample_catefeature_scale_pos_weight,2.126667,0.431753,0.140786,0.493068,0.557802,0.960783,1.0,1.0,1.0,1.0,1.0,1.0


In [93]:
# # lgbm without using parameters to balance target variable has very bad f1 score

# display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
lgbm_sample_isunbalance,15.963921,4.617685,0.180297,0.548344,0.615571,0.978,1.0,1.0,100.966311,1.001512,1.736302,1.084332
lgbm_sample,16.15305,5.832235,0.001786,0.552801,0.35453,0.962292,1.011847,1.263021,1.0,1.009651,1.0,1.066916
lgbm_sample_scale_pos_weight,16.639641,4.853642,0.178073,0.547516,0.612422,0.977613,1.042328,1.051098,99.720959,1.0,1.727421,1.083903
lgbm_30sample_scale_pos_weight,159.45307,33.184465,0.216887,0.600277,0.429822,0.901938,9.98834,7.186385,121.456624,1.096364,1.212372,1.0


In [48]:
# display_results(RESULTS)

# # balancedrandomforest on df_sample

Unnamed: 0,fit_time,score_time,test_f1,test_roc_auc,train_f1,train_roc_auc,fit_time_idx,score_time_idx,test_f1_idx,test_roc_auc_idx,train_f1_idx,train_roc_auc_idx
Pipeline,21.735293,4.769191,0.215772,0.60061,0.350174,0.896568,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Below I did hyperparameter tuning for LGBM model, but it performs worse than the model I finally chose

In [None]:
# lg = lgb.LGBMClassifier(silent=False)
# param_dist = {"max_depth": [25,50, 75],
#               "max_bin":[150,200,240]
#               "learning_rate" : [0.01,0.05,0.1],
#               "num_leaves": [300,900,1200],
#               "feature_fraction": [0.8,0.9,0.95],
#               "n_estimators": [200]
#              }
# grid_search = GridSearchCV(estimator = lg, 
#                            param_grid=param_dist, 
#                            cv = 3, 
#                            scoring="roc_auc",
#                            n_jobs=-1)
# grid_search.fit(train,y_train)
# grid_search.best_estimator_

In [101]:
# include client email domain as text performs better than including it as category
# but both perform worse than not including client email domain at all

# 0.59 drop client email domain, rest of categorical using onehot

display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score,fit_time_idx,score_time_idx,test_score_idx,train_score_idx
balanced_randomforest,9.544519,0.15029,0.5933,0.881427,1.721732,1.253783,1.128701,1.194438
balanced_bagger,31.763264,1.117723,0.578795,0.96311,5.729763,9.324532,1.101107,1.305128
RUSBoost,28.906347,0.332813,0.525648,0.922823,5.214405,2.776472,1.0,1.250535
easy_ensemble,97.440129,4.179468,0.55277,0.737943,17.577188,34.866954,1.051596,1.0
balanced_rf_hashing,5.543556,0.119869,0.567257,0.873023,1.0,1.0,1.079157,1.18305
balanced_rf_emailastext,12.676568,0.171573,0.586854,0.877544,2.286721,1.431338,1.116439,1.189176
