# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Read the training data from disk and preview its first rows.
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training

In [3]:
# Replace the raw CSV headers with snake_case names, keeping the file's column order.
snake_case_columns = [
    'line_item_id', 'date', 'app_url_id', 'isp_or_carrier_id',
    'device_type', 'exchange_id', 'operating_system', 'browser',
    'creative_size', 'advertiser_currency', 'impressions', 'io_id', 'cpm',
]
df.columns = snake_case_columns

In [4]:
# Parse the day-first date strings once, then derive the weekday name from them.
import datetime as dt  # kept from the original cell; `.dt` below is the pandas accessor, not this module
parsed_dates = pd.to_datetime(df['date'], format = '%d-%m-%Y')
df['date'] = parsed_dates
df['day_of_week'] = parsed_dates.dt.day_name()

In [5]:
# Rows whose app/URL id is the literal 'Unknown' are a very small fraction of
# the data, so they are removed rather than imputed.
is_unknown_app = df['app_url_id'].eq('Unknown')
unknown_app_ids = df.index[is_unknown_app.to_numpy()]
df.drop(index = unknown_app_ids, inplace = True)

In [6]:
# Cast app_url_id to a numeric dtype.
app_url_numeric = pd.to_numeric(df['app_url_id'])
df['app_url_id'] = app_url_numeric

In [7]:
# Preview the frame after renaming, date parsing and the app_url_id clean-up.
df.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


### Feature scaling is not required for tree-based models, because each node split is made on a single feature, so scale doesn't matter

### Encoder Techniques

    - Try l1,l2 regularization
    - try catboost in lightgbm
    - track R2, adjusted R2, RMSE.
    - Once encoding is fixed, Try NN.

#### Target Encoding

In [None]:
# Target encoding of the categorical columns against `cpm`.
# TODO: wrap this cell in target_encoder(df_temp, target, categorical, encoder).
# NOTE(review): the encoder is fitted on every row before any train/test split,
# so held-out rows contribute to the encodings — confirm this is acceptable.

# Imported here because this cell is the first one to use `ce`; without it a
# fresh-kernel run fails with NameError.
import category_encoders as ce

# Work on a copy so the cleaned `df` stays reusable by the other encoder cells.
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']


enc = ce.TargetEncoder(cols = categorical)

# Fit on the categorical columns and transform them in one pass.
encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# Overwrite the original categorical columns with their encoded values.
df_temp[list(encoded_cols)] = encoded_cols

df_temp.head()

#### OneHotEncoder

In [15]:
from category_encoders import OneHotEncoder

# Start from a fresh copy of the cleaned data, without the id columns.
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

features = ['device_type', 'day_of_week']
# Pass the column list by keyword: the encoder's first positional parameter is
# `verbose`, so `OneHotEncoder(features, )` never set `cols` and only worked
# because every column handed to fit_transform happened to be categorical.
ohe_encoder = OneHotEncoder(cols = features)
ohe_encoded_columns = ohe_encoder.fit_transform(df_temp[features], df_temp['cpm'])

# Append the indicator columns, then remove the source columns they replace.
df_temp[list(ohe_encoded_columns)] = ohe_encoded_columns
df_temp.drop(features, axis = 1, inplace = True)
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,...,device_type_2,device_type_3,device_type_4,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,2020-08-17,151640000000.0,1000,1,Macintosh,Chrome,728x90,CAD,2,0.0105,...,0,0,0,1,0,0,0,0,0,0
1,2020-08-17,1362606000.0,1000,1,Windows 10,Chrome,300x600,CAD,2,0.0125,...,0,0,0,1,0,0,0,0,0,0
2,2020-08-17,20303820000.0,207,1,Windows 7,Chrome,160x600,CAD,2,0.02,...,0,0,0,1,0,0,0,0,0,0
3,2020-08-17,20303820000.0,666,1,Windows 10,Chrome,160x600,CAD,2,0.035,...,0,0,0,1,0,0,0,0,0,0
4,2020-08-17,20303820000.0,1000,1,Windows 10,Chrome,160x600,CAD,4,0.022,...,0,0,0,1,0,0,0,0,0,0


#### CatBoost Encoder
    - Supported for continuous targets
    - training data must be randomly permuted

In [9]:
# CatBoost encoding of the remaining categorical columns against `cpm`.
# This cell continues from the `df_temp` built by the preceding encoder cell
# rather than starting again from `df`.
from category_encoders import CatBoostEncoder
import category_encoders as ce

# CatBoost encoding expects randomly permuted training rows, so shuffle the
# frame with a fixed seed and renumber the index.
np.random.seed(100)
shuffled_order = np.random.permutation(len(df_temp))
df_temp = df_temp.iloc[shuffled_order].reset_index(drop = True)

categorical = [
    'app_url_id', 'operating_system',
    'browser', 'creative_size', 'advertiser_currency',
]
enc = CatBoostEncoder(cols = categorical)

# Fit and transform in one pass, then overwrite the categorical columns with
# their encoded values.
cat_boost_encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])
df_temp[list(cat_boost_encoded_cols)] = cat_boost_encoded_cols

df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week,device_type_1,device_type_2,device_type_3,device_type_4
0,2020-08-17,2.10074,9,1,2.10074,2.10074,2.10074,2.10074,2,0.595,2.10074,0,1,0,0
1,2020-08-27,2.10074,330,1,1.34787,1.34787,1.34787,1.34787,3,0.699,2.10074,0,1,0,0
2,2020-08-22,2.10074,38,1,2.10074,2.10074,2.10074,2.10074,1,3.808,2.10074,0,1,0,0
3,2020-08-19,2.10074,266,1,2.10074,1.13158,2.10074,1.13158,1,0.907,2.10074,0,0,0,1
4,2020-08-22,2.10074,673,8,2.10074,2.10074,1.50387,2.10074,1,0.801,2.95437,1,0,0,0


#### JamesStein Encoder

In [None]:
from category_encoders import JamesSteinEncoder

# James-Stein encoding of the categorical columns against `cpm`.
# This cell continues from the `df_temp` produced by the earlier encoder cells.
candidate_columns = ['app_url_id', 'operating_system', 'day_of_week',
                     'browser','creative_size', 'advertiser_currency']

# The one-hot cell drops 'day_of_week' after expanding it, so selecting it here
# raised a KeyError. Encode only the candidates that are still columns.
categorical = [col for col in candidate_columns if col in df_temp.columns]

js_enc = JamesSteinEncoder(cols = categorical, 
                           randomized=True, 
                           random_state=10)

# Fit and transform in one pass.
js_encoded_cols = js_enc.fit_transform(df_temp[categorical], df_temp['cpm'])
# Overwrite the categorical columns with their encoded values.
df_temp[list(js_encoded_cols)] = js_encoded_cols
df_temp.head()

In [None]:
# Show column dtypes and non-null counts of the encoded frame.
df_temp.info()

### Train-test split
    Take the last available date as test data

In [10]:
# Preview the encoded frame that is about to be split.
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week,device_type_1,device_type_2,device_type_3,device_type_4
0,2020-08-17,2.10074,9,1,2.10074,2.10074,2.10074,2.10074,2,0.595,2.10074,0,1,0,0
1,2020-08-27,2.10074,330,1,1.34787,1.34787,1.34787,1.34787,3,0.699,2.10074,0,1,0,0
2,2020-08-22,2.10074,38,1,2.10074,2.10074,2.10074,2.10074,1,3.808,2.10074,0,1,0,0
3,2020-08-19,2.10074,266,1,2.10074,1.13158,2.10074,1.13158,1,0.907,2.10074,0,0,0,1
4,2020-08-22,2.10074,673,8,2.10074,2.10074,1.50387,2.10074,1,0.801,2.95437,1,0,0,0


In [11]:
from sklearn.model_selection import train_test_split

# Random 80/20 split with a fixed seed.
# Predictors are selected without mutating `df_temp`. The earlier in-place drop
# of 'date' made this cell fail on a second run and removed the column that the
# date-based split cell filters on.
predictors = df_temp.drop(['date', 'cpm'], axis = 1)
target = df_temp['cpm']

X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    target,
                                                    test_size = 0.2,
                                                    random_state = 100)
print(X_train.shape)
print(X_test.shape)

(1113539, 13)
(278385, 13)


In [None]:
# Hold out every row dated 27/08/2020 as the test set; the rest is training data.
is_test_day = df_temp['date'] == '2020-08-27'
df_test = df_temp[is_test_day].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

# Split each partition into a predictor matrix and a target vector (NumPy arrays).
# 'date' is not a model input and 'cpm' is the target, so neither is a predictor.
non_predictors = ['date', 'cpm']
X_test = df_test.drop(non_predictors, axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

X_train = df_train.drop(non_predictors, axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

# Report the size of each partition.
print("X_train shape: {} ".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))

# Release the intermediate frames; only the arrays are used from here on.
del df_temp, df_train, df_test

### ElasticNet regularization

In [None]:
from sklearn.linear_model import ElasticNetCV, ElasticNet

# 6-fold cross-validated search over the L1/L2 mixing ratio and 100 alphas.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases —
# confirm the installed version before re-running this cell.
l1_ratio_grid = [.1, .5, .7, .9, .95, .99, 1]
cv_model = ElasticNetCV(
    l1_ratio=l1_ratio_grid,
    eps=1e-3,
    n_alphas=100,
    fit_intercept=True,
    normalize=True,
    precompute='auto',
    max_iter=2000,
    tol=0.0001,
    cv=6,
    copy_X=True,
    verbose=0,
    n_jobs=-1,
    positive=False,
    random_state=0,
)

cv_model.fit(X_train, y_train)

# Report the selected hyper-parameters and the iterations needed to converge.
print('Optimal alpha: %.8f'%cv_model.alpha_)
print('Optimal l1_ratio: %.3f'%cv_model.l1_ratio_)
print('Number of iterations %d'%cv_model.n_iter_)

### Training

#### Linear Regression

In [12]:
# Ordinary least-squares baseline: fit, then report R2, adjusted R2 and MSE
# on both the training and the test partition.
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# R2 on each partition.
r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n rows and p predictors.
n_train, p_train = X_train.shape[0], X_train.shape[1]
n_test, p_test = X_test.shape[0], X_test.shape[1]
adj_r2_train = 1 - (1 - r2_train) * ((n_train - 1) / (n_train - p_train - 1))
adj_r2_test = 1 - (1 - r2_test) * ((n_test - 1) / (n_test - p_test - 1))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on each partition.
train_predictions = reg.predict(X_train)
test_predictions = reg.predict(X_test)
mse_train = mean_squared_error(y_train, train_predictions)
mse_test = mean_squared_error(y_test, test_predictions)
print("MSE Test: {}".format(mse_test))
print("MSE Train: {}".format(mse_train))

Linear Regression
Train R2 Score: 0.121
Test R2 Score: 0.128
Adjusted R2 train : 0.121
Adjusted R2 test : 0.128
MSE Test: 32.850240313595116
MSE Train: 33.24979335148933


#### SKlearn's new HistGradientBoostingRegressor

In [13]:
# Histogram-based gradient boosting with default hyper-parameters: time the
# fit, then report R2, adjusted R2 and MSE on both partitions.
# NOTE(review): the `enable_hist_gradient_boosting` import is presumably needed
# by the installed scikit-learn to expose the estimator — confirm before removing.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

model = HistGradientBoostingRegressor()
fit_started = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - fit_started))

# R2 on each partition.
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n rows and p predictors.
n_train, p_train = X_train.shape[0], X_train.shape[1]
n_test, p_test = X_test.shape[0], X_test.shape[1]
adj_r2_train = 1 - (1 - r2_train) * ((n_train - 1) / (n_train - p_train - 1))
adj_r2_test = 1 - (1 - r2_test) * ((n_test - 1) / (n_test - p_test - 1))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on each partition (`mean_squared_error` comes from the
# linear-regression cell's import).
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
mse_train = mean_squared_error(y_train, train_predictions)
mse_test = mean_squared_error(y_test, test_predictions)
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

done in 5.788s
HistGradientBoostingRegressor
Train R2 Score: 0.301
Test R2 Score: 0.278
Adjusted R2 train : 0.301
Adjusted R2 test : 0.278
MSE on test: 27.19325582519761
MSE on train: 26.44605961251656


#### GradientBoostingRegressor

In [14]:
# Classic gradient boosting with default hyper-parameters: fit, then report
# R2, adjusted R2 and MSE on both partitions.
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# R2 on each partition (printed at full precision in this cell).
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor")
print("Train R2 Score: {}".format(r2_train))
print("Test R2 Score: {}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n rows and p predictors.
n_train, p_train = X_train.shape[0], X_train.shape[1]
n_test, p_test = X_test.shape[0], X_test.shape[1]
adj_r2_train = 1 - (1 - r2_train) * ((n_train - 1) / (n_train - p_train - 1))
adj_r2_test = 1 - (1 - r2_test) * ((n_test - 1) / (n_test - p_test - 1))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on each partition (`mean_squared_error` comes from the
# linear-regression cell's import).
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)
mse_train = mean_squared_error(y_train, train_predictions)
mse_test = mean_squared_error(y_test, test_predictions)
print("GradientBoostingRegressor")
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

GradientBoostingRegressor
Train R2 Score: 0.2036726133167973
Test R2 Score: 0.204218236405129
Adjusted R2 train : 0.204
Adjusted R2 test : 0.204
GradientBoostingRegressor
MSE on test: 29.972886170794233
MSE on train: 30.137261544673464


    These models are underfitting rather than overfitting: train and test scores are both low and nearly identical. The choice of encoding could be one of the reasons.