# Case Study - Click Through Prediction with Decision Trees

We will be working through a decision tree classifier on a dataset found on: https://www.kaggle.com/c/avazu-ctr-prediction/data


In [7]:
import pandas as pd
import numpy as np
from os.path import join
from mltoolbox.config import config


# read in data
DATA_DIR = join(config['data_dir'], 'click-rate-prediction')
click_df = pd.read_csv(join(DATA_DIR, 'train.csv'), nrows=150000)

# drop unnecessary columns for now
click_df.drop(['id', 'hour', 'device_id', 'device_ip'], axis=1, inplace=True)
click_df.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,44956a24,1,2,15706,320,50,1722,0,35,-1,79
1,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,711ee120,1,0,15704,320,50,1722,0,35,100084,79
2,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,8a4875bd,1,0,15704,320,50,1722,0,35,100084,79
3,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,6332421a,1,0,15706,320,50,1722,0,35,100084,79
4,0,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,779d90c2,1,0,18993,320,50,2161,0,35,-1,157


In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# first step is to label encode the categorical columns
# which includes all the columns
click_df = click_df.apply(LabelEncoder().fit_transform)
click_df.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2,0,126,928,2,808,30,0,794,1,1,136,3,2,36,0,1,0,18
1,0,2,0,126,928,2,808,30,0,1259,1,0,134,3,2,36,0,1,59,18
2,0,2,0,126,928,2,808,30,0,1529,1,0,134,3,2,36,0,1,59,18
3,0,2,0,126,928,2,808,30,0,1116,1,0,136,3,2,36,0,1,59,18
4,0,2,1,1080,551,0,808,30,0,1325,1,0,219,3,2,71,0,1,0,31


In [10]:
from sklearn.model_selection import train_test_split


# split X and y into np matricies explicitly
col_names = list(click_df)
X_names, y_names = list(filter(lambda name: name != 'click', col_names)), ['click']
X, y = np.array(click_df[X_names]), np.array(click_df[y_names])

# one hot encoding for categorical distance constaint
X_train = OneHotEncoder(categories='auto').fit_transform(X)

# split X and y into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


# now running the decision tree classifier using 
# grid search (combination of parameters) for optimization
parameters = {'max_depth': np.arange(5, 20, 3),
              'criterion': ['gini', 'entropy'],
              'min_samples_split': np.arange(20, 60, 10)}

# n_jobs = number of jobs to run in parallel
# cv = crossfold validation split
dtree = DecisionTreeClassifier()
gsearch = GridSearchCV(dtree, parameters, n_jobs=4, cv=3, scoring='roc_auc')
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)

{'criterion': 'entropy', 'max_depth': 11, 'min_samples_split': 50}


In [12]:
from sklearn.metrics import roc_auc_score


# obtain the decision tree object with the best parameter configuration
dtree_best = gsearch.best_estimator_

# obtain the classification score
dtree_prob_pred = dtree_best.predict_proba(X_test)[:, 1]
dtree_auc = roc_auc_score(y_test, dtree_prob_pred)
print(f'The ROC AUC on testing set is using optimized dtree classifier is {dtree_auc}')

The ROC AUC on testing set is using optimized dtree classifier is 0.7217812356826011
