In [2]:
#importing packages for data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#for easy access to path
import sys
import os

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Extend the import search path: the `data` directory is resolved against the
# current working directory and appended; '../scripts_/' is put first so it
# takes precedence (presumably where the project modules imported later in
# the notebook live -- confirm).
sys.path.append(os.path.abspath(os.path.join('data')))
sys.path.insert(0,'../scripts_/')

In [7]:
#import packages for machine learning operation
import random
import math
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
from sklearn import tree
import scipy.stats as stat
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingClassifier

#import modules from script
from decision_tree import handler
from decision_tree import DecisionTreesModel
from XGboost import boost
from regression import Logistic
from helper import Helper

from app_logger import App_Logger

In [8]:
# Show every column and every row when a DataFrame is displayed.
# The fully qualified option names are used on purpose: abbreviated names are
# treated as search patterns, and pandas raises OptionError as soon as a
# pattern matches more than one registered option (newer pandas versions also
# register styler.render.max_columns / styler.render.max_rows).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Read the A/B test results and preview the first rows
df = pd.read_csv('../data/AdSmartABdata.csv')
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,6,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,0,0


In [9]:
# Users who responded to the questionnaire: rows with yes == 1 followed by
# rows with no == 1. pd.concat replaces DataFrame.append, which is deprecated
# and removed in pandas 2.0. Boolean indexing already returns new frames, so
# no explicit deep copy of `df` is needed and `df` itself is left untouched.
answered_df = pd.concat([df[df['yes'] == 1], df[df['no'] == 1]])

In [10]:
# Split the respondents by experiment arm
control_df = answered_df.loc[answered_df['experiment'].eq('control')]
exposed_df = answered_df.loc[answered_df['experiment'].eq('exposed')]


In [11]:
# Build one combined DataFrame in which the separate `yes` / `no` columns are
# replaced by a single `response` column: 1 means yes, 0 means no.
# pd.concat is used throughout because DataFrame.append is deprecated and
# removed in pandas 2.0.

# All users who answered yes
all_yes_df = pd.concat([control_df[control_df['yes'] == 1],
                        exposed_df[exposed_df['yes'] == 1]])
all_yes_df = all_yes_df.drop(['yes', 'no'], axis=1)
all_yes_df['response'] = 1

# All users who answered no
all_no_df = pd.concat([control_df[control_df['no'] == 1],
                       exposed_df[exposed_df['no'] == 1]])
all_no_df = all_no_df.drop(['yes', 'no'], axis=1)
all_no_df['response'] = 0

# Final combined DataFrame: shuffle the rows so yes/no answers are interleaved.
# The shuffle is seeded so that a fresh "Restart & Run All" reproduces the
# same row order (and therefore the same downstream CSVs and splits).
combined_df = pd.concat([all_yes_df, all_no_df])
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
combined_df.head(5)

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,response
0,1862cede-8050-4913-a9f0-45381c323c9e,control,2020-07-03,15,STF-L09,6,Facebook,0
1,dfa5e7ca-4f00-4bc9-b2fd-c2cec5a287f2,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,1
2,60e51496-0008-4836-baf5-24295395ac08,exposed,2020-07-03,10,Generic Smartphone,6,Chrome Mobile,1
3,4170b5fa-c561-4ef2-9242-a92c8e4c6342,control,2020-07-04,20,Samsung SM-G970F,6,Facebook,1
4,9484b215-8f77-484b-acce-0197401f85e1,exposed,2020-07-07,12,Generic Smartphone,6,Chrome Mobile,1


> All users who responded, whether yes or no, are now combined into a single `response` column.

### Encoding categorical variables using Label Encoder

In [12]:
def encode_labels(combined_df):
    """Label-encode the categorical columns of the combined response frame.

    Each of ``date``, ``device_make``, ``browser``, ``experiment`` and
    ``response`` is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone.

    Args:
        combined_df: DataFrame containing at least the five columns above.

    Returns:
        A new DataFrame with those columns encoded. The input frame is not
        modified, so the raw values stay available to the caller and the
        function can safely be re-run on the same input.
    """
    encoded = combined_df.copy()

    # One fresh encoder per column, each column encoded exactly once. The
    # earlier version encoded 'browser' a second time with the response
    # encoder, a copy-paste slip that did not change the values.
    for column in ('date', 'device_make', 'browser', 'experiment', 'response'):
        encoded[column] = preprocessing.LabelEncoder().fit_transform(encoded[column])

    return encoded

In [13]:
# Preview the combined frame; encode_labels has only been defined so far, not called
combined_df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,response
0,1862cede-8050-4913-a9f0-45381c323c9e,control,2020-07-03,15,STF-L09,6,Facebook,0
1,dfa5e7ca-4f00-4bc9-b2fd-c2cec5a287f2,control,2020-07-03,15,Generic Smartphone,6,Chrome Mobile,1
2,60e51496-0008-4836-baf5-24295395ac08,exposed,2020-07-03,10,Generic Smartphone,6,Chrome Mobile,1
3,4170b5fa-c561-4ef2-9242-a92c8e4c6342,control,2020-07-04,20,Samsung SM-G970F,6,Facebook,1
4,9484b215-8f77-484b-acce-0197401f85e1,exposed,2020-07-07,12,Generic Smartphone,6,Chrome Mobile,1


## Model Training


In [14]:
def feature_data(cleaned_df):
    """Split the frame into a browser feature set and a platform feature set.

    Both results share the columns experiment, hour, date and device_make and
    end with the response column; they differ only in carrying either
    ``browser`` or ``platform_os``.

    Returns:
        Tuple ``(browser_frame, platform_frame)``.
    """
    shared_cols = ["experiment", "hour", "date", 'device_make']

    browser_frame = cleaned_df[shared_cols + ['browser', 'response']]
    platform_frame = cleaned_df[shared_cols + ['platform_os', 'response']]
    return browser_frame, platform_frame

In [15]:
def save_encoded_df(encoded_df):
    """Write the browser and platform feature sets of ``encoded_df`` to CSV.

    The frame is split with ``feature_data``; the browser set is written to
    ``../data/browser.csv`` and the platform set to ``../data/platform.csv``,
    both without the index column.
    """
    feature_frames = feature_data(encoded_df)
    csv_targets = ('../data/browser.csv', '../data/platform.csv')

    # feature_data returns (browser, platform), matching the target order
    for frame, target in zip(feature_frames, csv_targets):
        frame.to_csv(target, index=False)

In [16]:
# Label-encode the categorical columns, then persist the browser and platform
# feature sets as CSV files for the modelling steps below
encoded_df = encode_labels(combined_df)
save_encoded_df(encoded_df)

In [17]:
# Reload the browser feature set written by save_encoded_df and preview it
browser_df= pd.read_csv("../data/browser.csv")
browser_df.head(5)

Unnamed: 0,experiment,hour,date,device_make,browser,response
0,0,15,0,40,4,0
1,0,15,0,13,1,1
2,1,10,0,13,1,1
3,0,20,1,71,4,1
4,1,12,4,13,1,1


In [18]:
# Reload the platform feature set written by save_encoded_df and preview it
df_platform= pd.read_csv("../data/platform.csv")
df_platform.head()

Unnamed: 0,experiment,hour,date,device_make,platform_os,response
0,0,15,0,40,6,0
1,0,15,0,13,6,1
2,1,10,0,13,6,1
3,0,20,1,71,6,1
4,1,12,4,13,6,1


#### Decision Tree

In [19]:
# Predictor columns for the browser-based model
feature_cols = ["experiment", "hour", "date", 'device_make', 'browser']

X = browser_df[feature_cols]
# Double brackets keep the target as a one-column DataFrame rather than a Series
y = browser_df[['response']]

# Fraction of rows held out as the test set
test_size = 0.1

# Fixed random_state makes the train/test partition repeatable for a given row order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [23]:
# Wrap the split data in the project's DecisionTreesModel helper
decisionTreesModel = DecisionTreesModel(X_train, X_test,  y_train, y_test)

# NOTE(review): train(folds) prints one validation accuracy/loss pair per step;
# presumably `folds` is the number of cross-validation folds and acc_arr /
# loss_arr hold the per-fold metrics -- confirm against decision_tree.py
folds = 5
clf, acc_arr, loss_arr = decisionTreesModel.train(folds)

step 0: Validation Accuracy of DecisionTreesModel is: 0.549
step 0: Validation Loss of DecisionTreesModel is: 0.671

step 1: Validation Accuracy of DecisionTreesModel is: 0.571
step 1: Validation Loss of DecisionTreesModel is: 0.655

step 2: Validation Accuracy of DecisionTreesModel is: 0.491
step 2: Validation Loss of DecisionTreesModel is: 0.713

step 3: Validation Accuracy of DecisionTreesModel is: 0.543
step 3: Validation Loss of DecisionTreesModel is: 0.676

step 4: Validation Accuracy of DecisionTreesModel is: 0.489
step 4: Validation Loss of DecisionTreesModel is: 0.715

