In [1]:
!pip install skorch -q 

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, StandardScaler
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,auc, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve

In [3]:
data = pd.read_csv('/kaggle/input/click-through-rate-prediction/ad_10000records.csv')


In [4]:
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Timestamp,Clicked on Ad
0,62.26,32.0,69481.85,172.83,Decentralized real-time circuit,Lisafort,Male,Svalbard & Jan Mayen Islands,2016-06-09 21:43:05,0
1,41.73,31.0,61840.26,207.17,Optional full-range projection,West Angelabury,Male,Singapore,2016-01-16 17:56:05,0
2,44.4,30.0,57877.15,172.83,Total 5thgeneration standardization,Reyesfurt,Female,Guadeloupe,2016-06-29 10:50:45,0
3,59.88,28.0,56180.93,207.17,Balanced empowering success,New Michael,Female,Zambia,2016-06-21 14:32:32,0
4,49.21,30.0,54324.73,201.58,Total 5thgeneration standardization,West Richard,Female,Qatar,2016-07-21 10:54:35,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  10000 non-null  float64
 1   Age                       10000 non-null  float64
 2   Area Income               10000 non-null  float64
 3   Daily Internet Usage      10000 non-null  float64
 4   Ad Topic Line             10000 non-null  object 
 5   City                      10000 non-null  object 
 6   Gender                    10000 non-null  object 
 7   Country                   10000 non-null  object 
 8   Timestamp                 10000 non-null  object 
 9   Clicked on Ad             10000 non-null  int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 781.4+ KB


In [6]:
data.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Clicked on Ad
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,61.660757,35.9401,53840.047721,177.759831,0.4917
std,15.704142,8.572973,13343.708718,40.820951,0.499956
min,32.6,19.0,13996.5,105.22,0.0
25%,48.86,29.0,44052.3025,140.15,0.0
50%,59.59,35.0,56180.93,178.92,0.0
75%,76.58,42.0,61840.26,212.67,1.0
max,90.97,60.0,79332.33,269.96,1.0


In [7]:
print(data['Ad Topic Line'].value_counts());print('')
print(data['City'].value_counts());print('')
print(data['Gender'].value_counts());print('')
print(data['Country'].value_counts())

Ad Topic Line
Cloned explicit middleware                            344
Streamlined homogeneous analyzer                      212
Business-focused transitional solution                207
Sharable reciprocal project                           187
Intuitive exuding service-desk                        173
                                                     ... 
Multi-lateral empowering throughput                     1
Quality-focused zero-defect budgetary management        1
Versatile 6thgeneration parallelism                     1
Reverse-engineered content-based intranet               1
Multi-channeled reciprocal artificial intelligence      1
Name: count, Length: 559, dtype: int64

City
Hubbardmouth        336
West Brandonton     275
Lisafort            261
Wintersfort         230
Stewartbury         207
                   ... 
Karenton              1
Johnsonview           1
Jayville              1
Lake Michaelport      1
Timothyport           1
Name: count, Length: 521, dtype: int64


In [8]:
data.nunique()

Daily Time Spent on Site    460
Age                          39
Area Income                 524
Daily Internet Usage        505
Ad Topic Line               559
City                        521
Gender                        2
Country                     207
Timestamp                   567
Clicked on Ad                 2
dtype: int64

In [9]:
X = data.drop(labels = ['Clicked on Ad'],axis = 1)
Y = data['Clicked on Ad']
print(X.shape,Y.shape)

(10000, 9) (10000,)


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2,stratify = Y,random_state = 42)

In [11]:
le = LabelEncoder()
le.fit(X_train['Gender'])

def label_encode_gender(X):
    X = X.copy()
    X['Gender'] = le.transform(X['Gender'])
    return X

In [12]:
class CyclicalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, max_vals):
        self.cols = cols
        self.max_vals = max_vals

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col, max_val in zip(self.cols, self.max_vals):
            X[col + '_sin'] = np.sin(2 * np.pi * X[col] / max_val)
            X[col + '_cos'] = np.cos(2 * np.pi * X[col] / max_val)
        return X.drop(columns=self.cols)


class DateTimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, datetime_col):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.datetime_col] = pd.to_datetime(X[self.datetime_col], format="%Y-%m-%d %H:%M:%S")
        X['Month'] = X[self.datetime_col].dt.month
        X['Day'] = X[self.datetime_col].dt.day
        X['hour'] = X[self.datetime_col].dt.hour
        X['minute'] = X[self.datetime_col].dt.minute
        X['second'] = X[self.datetime_col].dt.second
        return X.drop(columns=[self.datetime_col])


In [13]:
datetime_cyclical_pipeline = Pipeline(steps=[
    ('datetime_transform', DateTimeTransformer(datetime_col='Timestamp')),
    ('cyclical_encoder', CyclicalEncoder(cols=['Month', 'Day', 'hour', 'minute', 'second'], 
                                         max_vals=[12, 31, 23, 59, 59]))
])

# Combining all transformers into a single pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('gender', FunctionTransformer(label_encode_gender), ['Gender']),
        ('target', ce.target_encoder.TargetEncoder(cols=['Ad Topic Line', 'City', 'Country']), ['Ad Topic Line', 'City', 'Country']),
        ('datetime_cyclical', datetime_cyclical_pipeline, ['Timestamp'])
    ], remainder=StandardScaler())



In [14]:
X_train_transformed = preprocessor.fit_transform(X_train, Y_train)
X_test_transformed = preprocessor.transform(X_test)

In [15]:
class clickclassifier(nn.Module):
    def __init__(self, l = 9):
        super().__init__()
        self.fc1 = nn.Linear(18,l)
        self.fc2 = nn.Linear(l,1)
    def forward(self , X):
        X = X.float()
        X = self.fc1(X)
        X = F.leaky_relu(X)
        X = self.fc2(X)
        X = F.sigmoid(X)
        return X




In [16]:
clf_1 = LogisticRegressionCV(cv = 10,solver="newton-cholesky")
clf_2 = DecisionTreeClassifier()
clf_3 = DecisionTreeClassifier(max_depth = 5)
clf_4 = DecisionTreeClassifier(max_depth = 10)
clf_5 = DecisionTreeClassifier(max_depth = 15)
clf_6 = RandomForestClassifier()
clf_7 = RandomForestClassifier(max_depth = 5)
clf_8 = RandomForestClassifier(max_depth = 5)
clf_9 = RandomForestClassifier(max_depth = 5)
clf_10 = GradientBoostingClassifier(random_state = 42)
clf_11 = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

CLASSIFIERS = [clf_1, clf_2 ,clf_3 ,clf_4 ,clf_5 ,clf_6 ,clf_7 ,clf_8 ,clf_9 ,clf_10 ,clf_11]

In [17]:
S = []
for clf in CLASSIFIERS:
    scores = cross_val_score(estimator=clf, X=X_train_transformed, y=Y_train, cv=10)
    S.append(str(type(clf).__name__ + f"acc = {scores.mean()} with +/- {scores.std()**2}"))

In [18]:
Y_train_resized = np.resize(Y_train, (Y_train.shape[0],1))

In [19]:
X_train_transformed = np.float32(X_train_transformed)
Y_train_resized = np.float32(Y_train_resized)

In [20]:
X_train_transformed.dtype ,Y_train_resized.dtype

(dtype('float32'), dtype('float32'))

In [None]:
# Neural network classifier
net = NeuralNetClassifier(
    clickclassifier,
    max_epochs=10,
    lr=0.1,
    iterator_train__shuffle=True,
    criterion=nn.BCELoss,
    optimizer=torch.optim.Adam,
    optimizer__weight_decay=0,
    verbose=0
)

params = {
    'lr': [1, 0.1, 0.01, 0.001, 0.0001],
    'module__l': [1,2,3,5,7,10],
    'max_epochs': [1,3,5,7,10],
    'optimizer__weight_decay': [0, 0.1, 0.01, 0.001, 0.0001]
}
gs = GridSearchCV(net, params, refit=True, cv=5, scoring='accuracy', verbose = 3)

gs.fit(X_train_transformed, Y_train_resized)
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

In [23]:
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

best score: 0.883, best params: {'lr': 0.01, 'max_epochs': 10, 'module__l': 10, 'optimizer__weight_decay': 0.001}


In [24]:
clf_12 = NeuralNetClassifier(
    clickclassifier(l = 10),
    max_epochs=10,
    lr=0.01,
    iterator_train__shuffle=True,
    criterion=nn.BCELoss,
    optimizer=torch.optim.Adam,
    verbose=0
)


scores = cross_val_score(estimator=clf_12, X=X_train_transformed, y=Y_train_resized, cv=10)
S.append(str(type(clf).__name__ + f"acc = {scores.mean()} with +/- {scores.std()**2}"))

In [25]:
for i in S:
    print(i)

LogisticRegressionCVacc = 0.8792500000000001 with +/- 0.00017568749999999993
DecisionTreeClassifieracc = 0.8402499999999999 with +/- 0.00021681250000000043
DecisionTreeClassifieracc = 0.8713749999999999 with +/- 7.326562499999986e-05
DecisionTreeClassifieracc = 0.8602500000000001 with +/- 6.743750000000009e-05
DecisionTreeClassifieracc = 0.844375 with +/- 0.0002407031250000003
RandomForestClassifieracc = 0.8958750000000002 with +/- 0.00013782812499999998
RandomForestClassifieracc = 0.8877499999999999 with +/- 0.00011556250000000005
RandomForestClassifieracc = 0.8863749999999999 with +/- 0.00011014062500000036
RandomForestClassifieracc = 0.8883750000000001 with +/- 0.00011939062499999988
GradientBoostingClassifieracc = 0.8943749999999999 with +/- 7.757812499999993e-05
XGBClassifieracc = 0.9 with +/- 6.0625000000000125e-05
XGBClassifieracc = 0.8801250000000002 with +/- 0.00014201562500000005


In [28]:
# Best Model is XGBoost Classifier for this particular application
final_clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
final_clf.fit(X_train_transformed,Y_train_resized)

Y_pred = final_clf.predict(X_test_transformed)
print(accuracy_score(Y_test,Y_pred))
print(precision_score(Y_test,Y_pred))
print(recall_score(Y_test,Y_pred))
print(f1_score(Y_test,Y_pred))
print(roc_curve(Y_test,Y_pred))
print(roc_auc_score(Y_test,Y_pred))


0.891
0.8805970149253731
0.9003051881993896
0.8903420523138833
(array([0.       , 0.1179941, 1.       ]), array([0.        , 0.90030519, 1.        ]), array([2, 1, 0]))
0.8911555439522022


In [32]:
conf = confusion_matrix(Y_test,Y_pred)
print(conf)

[[897 120]
 [ 98 885]]


In [30]:
import joblib
joblib.dump(final_clf, '/kaggle/working/clickthroughmodel.pkl')

import cloudpickle
with open('/kaggle/working/pipeline1.pkl', 'wb') as f:
    cloudpickle.dump(preprocessor, f)

['/kaggle/working/clickthroughmodel.pkl']