In [3]:
import pandas as pd
from matplotlib import pyplot
import matplotlib.pyplot as plt
import numpy as np
import random
from operator import itemgetter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [4]:
# Load Train data 
df_train_prepared = pd.read_pickle('df_train_prepared')
 

train_payprice = df_train_prepared['payprice']
train_bidprice = df_train_prepared['bidprice']
train_click = df_train_prepared['click']

rtb_train = df_train_prepared.drop(['click','payprice','bidprice'], axis =1)
rtb_train.head()



Unnamed: 0,weekday,hour,region,city,adexchange,slotwidth,slotheight,slotprice,advertiser,useragent_encoded,IP_encoded,url_encoded,domain_encoded,slotid_encoded,slotvisibility_encoded,slotformat_encoded,creative_encoded,keypage_encoded,usertag_encoded
0,5,22,2,2,2,200,200,5,3427,31,253587,268016,21783,12845,2,0,104,0,744035
1,1,20,238,239,1,300,250,0,2821,29,282720,145905,728,50771,6,3,9,18,744035
2,3,13,40,41,2,250,250,5,3427,31,469429,262017,18426,9136,2,0,92,0,188248
3,6,23,1,1,1,160,600,0,1458,31,107833,658015,4770,50450,2,1,111,12,706361
4,5,6,216,233,2,728,90,133,2259,31,342191,157670,153,1517,8,3,84,18,744035


In [5]:
# Scale validation data for better prediction when using logistic 
standard_scaler = StandardScaler().fit(rtb_train)
train_x = standard_scaler.transform(rtb_train)


In [7]:
# Load validation data
df_val_prepared = pd.read_pickle('df_val_prepared')

val_payprice = df_val_prepared['payprice']
val_bidprice = df_val_prepared['bidprice']
val_click = df_val_prepared['click']

rtb_validation = df_val_prepared.drop(['click','payprice','bidprice'], axis =1)
rtb_validation.head()

Unnamed: 0,weekday,hour,region,city,adexchange,slotwidth,slotheight,slotprice,advertiser,useragent_encoded,IP_encoded,url_encoded,domain_encoded,slotid_encoded,slotvisibility_encoded,slotformat_encoded,creative_encoded,keypage_encoded,usertag_encoded
0,4,20,79,79,1,160,600,0,1458,28,137474,45239,8595,14275,2,1,109,12,110919
1,1,21,79,79,1,950,90,0,3476,26,178167,109524,7201,14355,0,1,108,7,69995
2,4,8,2,2,2,300,250,5,3358,28,52834,75356,8455,7129,2,0,99,14,108448
3,5,15,201,205,2,336,280,5,3358,26,192739,27901,8931,8936,2,0,116,14,109043
4,1,18,134,135,2,200,200,5,3476,26,55070,135142,6582,423,1,0,42,5,6344


In [8]:
# Scale validation data for better prediction when using logistic 
standard_scaler = StandardScaler().fit(rtb_validation)
test_x = standard_scaler.transform(rtb_validation)


### Average CTR for Validation

In [9]:
# THIS IS AVERAGE CTR FOR VALIDATION DATA 


val_click_1 = df_val_prepared[df_val_prepared['click'] == 1]
val_AvgCTR =  float("{0:.5f}".format(len(val_click_1)/len(df_val_prepared['click'])))
val_AvgCTR 

0.00066

### Logistic Regression to Predict Probability of Click or Not Click

### LOGISTIC PREDICTION USING UNDER SAMPLING 

- Data is way too imblance only very little of click -> large prediction error
- Solution: UNDER SAMPLE
- Balance the click and non click -> improve the prediction for click or non click
 

In [10]:
from imblearn.under_sampling import RandomUnderSampler

mean_class_size = int(pd.Series(train_click).value_counts().sum()/1500)
mean_class_size

ratio = {0: mean_class_size,
         1: mean_class_size}

rus = RandomUnderSampler (ratio = ratio, random_state = 0)
rus

train_x_rus, train_y_rus = rus.fit_sample(train_x, train_click)

In [11]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix 
 
log_reg = LogisticRegression()
clf2 = linear_model.LogisticRegression()
clf2.fit(train_x_rus, train_y_rus)

log2_prob  = clf2.predict_proba(test_x)
log2_pred = clf2.predict(test_x)

log2_error = sum(log2_pred != val_click) 
log2_correct = sum(log2_pred == val_click) 
print('Errors from Logistic with Under Sample:', log2_error)



Errors from Logistic with Under Sample: 79897


In [12]:
df_log2 = pd.DataFrame(columns=['probability_no_click','probability_click'], data = log2_prob)
df_log2['click'] = log2_pred
df_log2[840:845]

Unnamed: 0,probability_no_click,probability_click,click
840,0.548673,0.451327,0
841,0.388956,0.611044,1
842,0.40402,0.59598,1
843,0.68205,0.31795,0
844,0.71928,0.28072,0


### RANDOM FOREST

Random Forest produce slightly better prediction

In [13]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(train_x_rus, train_y_rus)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### TRAIN PREDICTION WITH RANDOM FOREST

In [14]:
# Prediction for train
train_tree_pred = tree_clf.predict(train_x)
train_tree_prob = tree_clf.predict_proba(train_x)

tree_error = sum(train_tree_pred != train_click) 
tree_correct = sum(train_tree_pred == train_click) 
print('Wrong Predictions from Decision Tree For Validation:', tree_error)


Wrong Predictions from Decision Tree For Validation: 589299


In [15]:
# Put in df
train_tree_predicted = pd.DataFrame(columns=['probability_no_click','probability_click'], data = train_tree_prob)

train_tree_predicted['click'] = train_tree_pred
train_tree_predicted.to_pickle('df_tree_prediction')
train_tree_predicted[840:845]

Unnamed: 0,probability_no_click,probability_click,click
840,0.613445,0.386555,0
841,0.613445,0.386555,0
842,0.613445,0.386555,0
843,0.434842,0.565158,1
844,0.613445,0.386555,0


### VALIDATION PREDICTION WITH RANDOM FOREST

In [16]:
# Prediction for validation
tree_pred = tree_clf.predict(test_x)
tree_prob = tree_clf.predict_proba(test_x)

tree_error = sum(tree_pred != val_click) 
tree_correct = sum(tree_pred == val_click) 
print('Wrong Predictions from Decision Tree For Validation:', tree_error)

Wrong Predictions from Decision Tree For Validation: 73985


In [17]:
# Put in df
df_tree_predicted = pd.DataFrame(columns=['probability_no_click','probability_click'], data = tree_prob)

df_tree_predicted['click'] = tree_pred 
df_tree_predicted.to_pickle('df_tree_prediction')
df_tree_predicted[840:845]

Unnamed: 0,probability_no_click,probability_click,click
840,0.613445,0.386555,0
841,0.100671,0.899329,1
842,0.613445,0.386555,0
843,0.613445,0.386555,0
844,0.613445,0.386555,0


In [18]:
# Mean square error for three models
#log1_rmse = np.sqrt(mean_squared_error(val_click, log1_pred))


log2_rmse = float("{0:.2f}".format(np.sqrt(mean_squared_error(val_click, log2_pred))))
tree_rmse = float("{0:.2f}".format(np.sqrt(mean_squared_error(val_click, tree_pred))))


model_errors = {'Logistic with Under Sample': [log2_rmse, log2_error, log2_correct], 'Decision Tree': [tree_rmse,tree_error, tree_correct]}
labels = ['RMSE', 'Misclassified', 'Correct Classified']

df_error = pd.DataFrame(model_errors, index = labels)
df_error



Unnamed: 0,Decision Tree,Logistic with Under Sample
RMSE,0.49,0.51
Misclassified,73985.0,79897.0
Correct Classified,229940.0,224028.0


### Linear Bidding 

basebid * pCRT/averageCTR 

pCTR = probability of click from Decision Tree Model 

In [19]:

pCTR_validation = df_tree_predicted['probability_click']
pCTR_validation.to_pickle('pCTR_validation')


In [20]:
pCTR_validation = pd.read_pickle('pCTR_validation')


In [22]:
pCTR_train = train_tree_predicted['probability_click']
pCTR_train.to_pickle('pCTR_train')