In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("advertising.csv")

In [3]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [7]:
df.drop(['Ad Topic Line', 'City', 'Country'], axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,0,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,1,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,0,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,1,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,0,2016-06-03 03:36:18,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Daily Time Spent on Site    1000 non-null float64
Age                         1000 non-null int64
Area Income                 1000 non-null float64
Daily Internet Usage        1000 non-null float64
Male                        1000 non-null int64
Timestamp                   1000 non-null object
Clicked on Ad               1000 non-null int64
dtypes: float64(3), int64(3), object(1)
memory usage: 54.8+ KB


In [9]:
df.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Male                        0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [13]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [16]:
x = df['Timestamp'].iloc[2] #getting the first object of time

In [19]:
df['Hour'] = df['Timestamp'].apply(lambda t: t.hour)

In [20]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad,Hour
0,68.95,35,61833.9,256.09,0,2016-03-27 00:53:11,0,0
1,80.23,31,68441.85,193.77,1,2016-04-04 01:39:02,0,1
2,69.47,26,59785.94,236.5,0,2016-03-13 20:35:42,0,20
3,74.15,29,54806.18,245.89,1,2016-01-10 02:31:19,0,2
4,68.37,35,73889.99,225.58,0,2016-06-03 03:36:18,0,3


In [21]:
def hour_to_part_of_day(n):
    if 2 <= n < 8:
        return "Night"
    if 8 <= n < 12:
        return "Morning"
    if 12 <= n < 18:
        return "Noon"
    if 18 <= n < 24 or n < 2:
        return "GoodTime"

In [23]:
df['Partofday'] = df['Hour'].apply(hour_to_part_of_day)

In [24]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Timestamp,Clicked on Ad,Hour,Partofday
0,68.95,35,61833.9,256.09,0,2016-03-27 00:53:11,0,0,GoodTime
1,80.23,31,68441.85,193.77,1,2016-04-04 01:39:02,0,1,GoodTime
2,69.47,26,59785.94,236.5,0,2016-03-13 20:35:42,0,20,GoodTime
3,74.15,29,54806.18,245.89,1,2016-01-10 02:31:19,0,2,Night
4,68.37,35,73889.99,225.58,0,2016-06-03 03:36:18,0,3,Night


In [27]:
df.drop(['Timestamp', 'Hour'], axis=1, inplace=True)

In [29]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,Partofday
0,68.95,35,61833.9,256.09,0,0,GoodTime
1,80.23,31,68441.85,193.77,1,0,GoodTime
2,69.47,26,59785.94,236.5,0,0,GoodTime
3,74.15,29,54806.18,245.89,1,0,Night
4,68.37,35,73889.99,225.58,0,0,Night


In [33]:
df = pd.get_dummies(data=df, columns=['Partofday'], drop_first=True)

In [34]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,Partofday_Morning,Partofday_Night,Partofday_Noon
0,68.95,35,61833.9,256.09,0,0,0,0,0
1,80.23,31,68441.85,193.77,1,0,0,0,0
2,69.47,26,59785.94,236.5,0,0,0,0,0
3,74.15,29,54806.18,245.89,1,0,0,1,0
4,68.37,35,73889.99,225.58,0,0,0,1,0


## *Manual Way*

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X = df.drop('Clicked on Ad', axis=1)
y = df['Clicked on Ad']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12)

In [58]:
from sklearn.linear_model import LogisticRegression

In [59]:
lm = LogisticRegression()

In [60]:
lm.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
train_predictions = lm.predict(X_train)
test_predictions = lm.predict(X_test)

In [62]:
from sklearn.metrics import classification_report

In [63]:
print("TRAIN:\n", classification_report(y_train, train_predictions))

TRAIN:
              precision    recall  f1-score   support

          0       0.87      0.92      0.90       345
          1       0.91      0.86      0.88       325

avg / total       0.89      0.89      0.89       670



In [64]:
print("TEST:\n", classification_report(y_test, test_predictions))

TEST:
              precision    recall  f1-score   support

          0       0.87      0.95      0.91       155
          1       0.95      0.87      0.91       175

avg / total       0.91      0.91      0.91       330



## The way

In [65]:
from sklearn.model_selection import GridSearchCV

In [None]:
LogisticRegression()

In [106]:
parameters = {'penalty':['l1', 'l2'], 'C':[1, 10, 30, 70, 100]}

In [171]:
gscv = GridSearchCV(LogisticRegression(n_jobs=1), 
                    param_grid=parameters, 
                    scoring='f1', 
                    cv=3,
                    return_train_score=True)

In [172]:
gscv.fit(X, y)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10, 30, 70, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [174]:
gscv.best_params_

{'C': 10, 'penalty': 'l1'}

In [175]:
gscv.best_score_

0.9665947410825458

In [178]:
gscv.cv_results_['mean_test_score'].mean()

0.9308368473894646

In [179]:
gscv.cv_results_['std_test_score'].mean()

0.01191770660234626

In [180]:
gscv.grid_scores_

[mean: 0.96321, std: 0.01124, params: {'C': 1, 'penalty': 'l1'},
 mean: 0.89672, std: 0.00566, params: {'C': 1, 'penalty': 'l2'},
 mean: 0.96659, std: 0.01983, params: {'C': 10, 'penalty': 'l1'},
 mean: 0.89672, std: 0.00566, params: {'C': 10, 'penalty': 'l2'},
 mean: 0.96557, std: 0.01714, params: {'C': 30, 'penalty': 'l1'},
 mean: 0.89672, std: 0.00566, params: {'C': 30, 'penalty': 'l2'},
 mean: 0.96466, std: 0.02028, params: {'C': 70, 'penalty': 'l1'},
 mean: 0.89672, std: 0.00566, params: {'C': 70, 'penalty': 'l2'},
 mean: 0.96471, std: 0.02247, params: {'C': 100, 'penalty': 'l1'},
 mean: 0.89672, std: 0.00566, params: {'C': 100, 'penalty': 'l2'}]

In [181]:
model = gscv.best_estimator_

In [184]:
print(classification_report(y_test, model.predict(X_test), digits=3))

             precision    recall  f1-score   support

          0      0.962     0.981     0.971       155
          1      0.983     0.966     0.974       175

avg / total      0.973     0.973     0.973       330



In [183]:
print(classification_report(y_train, model.predict(X_train), digits=3))

             precision    recall  f1-score   support

          0       0.96      0.98      0.97       345
          1       0.98      0.96      0.97       325

avg / total       0.97      0.97      0.97       670



In [189]:
model.classes_

array([0, 1], dtype=int64)

In [207]:
one_datapoint = X_test.iloc[65]

In [198]:
one_datapoint

Daily Time Spent on Site       77.05
Age                            34.00
Area Income                 65756.36
Daily Internet Usage          236.08
Male                            0.00
Partofday_Morning               0.00
Partofday_Night                 0.00
Partofday_Noon                  0.00
Name: 797, dtype: float64

In [208]:
predict_click = np.round( model.predict_proba([one_datapoint]), 3)[0,1]

In [210]:
predict_click

0.013

In [213]:
predict_click = np.round( model.predict_proba(X.sample(10)), 3)[:,1]

In [214]:
predict_click

array([0.999, 0.008, 0.716, 0.948, 1.   , 0.993, 0.024, 0.981, 0.296,
       0.98 ])