## Import Basic Libraries

In [60]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)


### Print training data set

In [61]:
dt = pd.read_csv('Train_Data.csv')
print('Training data shape: ', dt.shape)
dt.head(2)

Training data shape:  (4571, 9)


Unnamed: 0,date,campaign,adgroup,ad,impressions,clicks,cost,conversions,revenue
0,01-08-2020,campaign 1,adgroup 1,ad 1,24,6,0.08,0,0.0
1,01-08-2020,campaign 1,adgroup 2,ad 1,1,0,0.0,0,0.0


### Print testing data set


In [62]:
test = pd.read_csv('Test_Data.csv', )
print('Testing data shape: ', test.shape)
test.head(2)

Testing data shape:  (318, 8)


Unnamed: 0,date,campaign,adgroup,ad,cost,impressions,clicks,conversions
0,01-03-2021,campaign 1,adgroup 1,ad 1,0.58,121,49,1
1,01-03-2021,campaign 1,adgroup 3,ad 1,0.17,22,12,0


In [63]:
submission = pd.read_csv('Sample_Submission.csv')
submission.head()

Unnamed: 0,revenue
0,25
1,25
2,25
3,25
4,25


### Drop date and campaign column since it has no effect on the result

In [64]:
dt = dt.drop(['date','campaign'],axis=1)

In [65]:
test = test.drop(['date','campaign'],axis=1)

### Use one hot encoder for asgroup column

In [66]:
l_adgroups_dummies = pd.get_dummies(dt['adgroup'],drop_first=True)

In [67]:
dt = pd.concat([dt,l_adgroups_dummies],axis=1)

In [68]:
dt= dt.drop(['adgroup'],axis =1)

In [69]:
dt.head(2)

Unnamed: 0,ad,impressions,clicks,cost,conversions,revenue,adgroup 2,adgroup 3,adgroup 4
0,ad 1,24,6,0.08,0,0.0,0,0,0
1,ad 1,1,0,0.0,0,0.0,1,0,0


In [70]:
l_adgroups_dummies1 = pd.get_dummies(test['adgroup'],drop_first=True)

In [71]:
test = pd.concat([test,l_adgroups_dummies1],axis=1)

In [72]:
test= test.drop(['adgroup'],axis =1)

In [73]:
test.head(2)

Unnamed: 0,ad,cost,impressions,clicks,conversions,adgroup 2,adgroup 3,adgroup 4
0,ad 1,0.58,121,49,1,0,0,0
1,ad 1,0.17,22,12,0,0,1,0


### Use label encoder for ad column

In [74]:
from sklearn.preprocessing import LabelEncoder

In [75]:
le = LabelEncoder()
label = le.fit_transform(dt['ad'])

In [76]:
le.classes_

array(['ad 1', 'ad 10', 'ad 11', 'ad 12', 'ad 13', 'ad 14', 'ad 15',
       'ad 16', 'ad 17', 'ad 18', 'ad 19', 'ad 2', 'ad 20', 'ad 21',
       'ad 22', 'ad 23', 'ad 24', 'ad 25', 'ad 26', 'ad 27', 'ad 28',
       'ad 29', 'ad 3', 'ad 30', 'ad 31', 'ad 32', 'ad 33', 'ad 34',
       'ad 35', 'ad 36', 'ad 37', 'ad 38', 'ad 39', 'ad 4', 'ad 40',
       'ad 41', 'ad 42', 'ad 43', 'ad 44', 'ad 45', 'ad 46', 'ad 47',
       'ad 48', 'ad 49', 'ad 5', 'ad 50', 'ad 51', 'ad 52', 'ad 53',
       'ad 54', 'ad 55', 'ad 56', 'ad 57', 'ad 58', 'ad 59', 'ad 6',
       'ad 60', 'ad 61', 'ad 62', 'ad 63', 'ad 64', 'ad 65', 'ad 66',
       'ad 67', 'ad 68', 'ad 69', 'ad 7', 'ad 70', 'ad 8', 'ad 9'],
      dtype=object)

In [77]:
dt1 = dt.drop("ad", axis =1)

In [78]:
dt1['ad'] = label

In [79]:
dt1

Unnamed: 0,impressions,clicks,cost,conversions,revenue,adgroup 2,adgroup 3,adgroup 4,ad
0,24,6,0.08,0,0.00,0,0,0,0
1,1,0,0.00,0,0.00,1,0,0,0
2,13,4,0.04,0,0.00,0,1,0,0
3,5,4,0.08,0,0.00,0,0,1,0
4,247,126,1.29,4,925.71,0,0,0,11
...,...,...,...,...,...,...,...,...,...
4566,19,6,0.07,0,0.00,0,1,0,50
4567,1,0,0.00,0,0.00,0,0,1,50
4568,16,8,0.12,1,93.20,0,1,0,51
4569,37,13,0.23,0,0.00,0,0,0,51


In [80]:
le1 = LabelEncoder()
label1 = le1.fit_transform(test['ad'])

In [81]:
le1.classes_

array(['ad 1', 'ad 10', 'ad 2', 'ad 3', 'ad 4', 'ad 5', 'ad 55', 'ad 56',
       'ad 6', 'ad 7', 'ad 71', 'ad 72', 'ad 73', 'ad 74', 'ad 75',
       'ad 8'], dtype=object)

In [82]:
test = test.drop("ad", axis =1)

In [83]:
test['ad'] = label1

In [84]:
test

Unnamed: 0,cost,impressions,clicks,conversions,adgroup 2,adgroup 3,adgroup 4,ad
0,0.58,121,49,1,0,0,0,0
1,0.17,22,12,0,0,1,0,0
2,0.05,5,3,0,0,0,1,0
3,0.01,2,1,0,1,0,0,0
4,0.01,3,1,0,1,0,0,2
...,...,...,...,...,...,...,...,...
313,0.01,1,1,0,0,0,1,6
314,0.01,5,1,0,0,1,0,6
315,0.14,25,11,0,0,0,0,6
316,0.06,9,4,0,0,1,0,7


In [85]:
data_cols = [col for col in dt1.columns if col not in ['revenue']]
revenue_col = ['revenue']

### Divide the training data set into train and test

In [86]:
from sklearn.model_selection import train_test_split

In [87]:
X_train, X_test, y_train, y_test = train_test_split(dt1[data_cols], dt1[revenue_col], test_size=0.2, random_state=42)

In [88]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [89]:
from sklearn import preprocessing
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(y_train)
print(training_scores_encoded)
print(utils.multiclass.type_of_target(y_train))
print(utils.multiclass.type_of_target(y_train.astype('int')))
print(utils.multiclass.type_of_target(training_scores_encoded))

[  0   0  89 ... 555   0 301]
continuous
multiclass
multiclass


In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [91]:
model2 = RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42)

In [92]:
model2.fit(X_train, training_scores_encoded)

RandomForestClassifier(max_depth=6, n_estimators=300, random_state=42)

In [93]:
y_pred = model2.predict(X_test)

In [94]:
from sklearn.metrics import mean_squared_error

In [95]:
mse = mean_squared_error(y_pred , y_test)
rmse = np.sqrt(mse)

In [96]:
rmse

811.353624619885

In [97]:
  import keras
from keras.models import Sequential
from keras.layers import Dense

In [101]:
model = Sequential()
model.add(Dense(500, input_dim=8, activation= "relu"))
model.add(Dense(100, activation= "relu"))
model.add(Dense(50, activation= "relu"))
model.add(Dense(1))

In [102]:
model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])

In [103]:
model.fit(X_train, training_scores_encoded, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x236f0ecff10>

In [104]:
pred_train= model.predict(X_train)
print(np.sqrt(mean_squared_error(training_scores_encoded,pred_train)))

pred= model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred))) 

82.03997354813569
764.4419824031581


In [106]:
res = pd.DataFrame(pred) #preditcions are nothing but the final predictions of your model on input features of your new unseen test data

res.columns = ["y_test"]
res.to_csv("prediction_results.csv", index = False)      # the csv file will be saved locally on the same location where this notebook is located.