## Import the relevant libraries

In [32]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load the data

In [33]:
raw_data = pd.read_csv('advertising.csv')
raw_data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


Here, out objective is to know whether an individual might click on the ad analysing a set of inputs.

In [34]:
# We make sure to create a copy of the data before we start altering it. Note that we don't change the original data we loaded.
data = raw_data.copy()
# Removing irrelevent coloumns.
data = data.drop(['Timestamp','Ad Topic Line'], axis = 1)
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Male,Country,Clicked on Ad
0,68.95,35,61833.9,256.09,Wrightburgh,0,Tunisia,0
1,80.23,31,68441.85,193.77,West Jodi,1,Nauru,0
2,69.47,26,59785.94,236.5,Davidton,0,San Marino,0
3,74.15,29,54806.18,245.89,West Terrifurt,1,Italy,0
4,68.37,35,73889.99,225.58,South Manuel,0,Iceland,0


In [35]:
data.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [36]:
#Calculating Varience Inflation Factor

from statsmodels.stats.outliers_influence import variance_inflation_factor
variables = data[['Daily Time Spent on Site','Age','Area Income','Daily Internet Usage','Male']]
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
vif["Features"] = variables.columns

In [37]:
vif

Unnamed: 0,VIF,Features
0,22.802493,Daily Time Spent on Site
1,8.670393,Age
2,18.585312,Area Income
3,23.488407,Daily Internet Usage
4,1.902386,Male


In [38]:
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,City,Male,Country,Clicked on Ad
0,68.95,35,61833.9,256.09,Wrightburgh,0,Tunisia,0
1,80.23,31,68441.85,193.77,West Jodi,1,Nauru,0
2,69.47,26,59785.94,236.5,Davidton,0,San Marino,0
3,74.15,29,54806.18,245.89,West Terrifurt,1,Italy,0
4,68.37,35,73889.99,225.58,South Manuel,0,Iceland,0


In [39]:
#Removing some more irrelevent coloumns
dataf = data.drop(['City','Country'],axis=1)

In [40]:
dataf.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
0,68.95,35,61833.9,256.09,0,0
1,80.23,31,68441.85,193.77,1,0
2,69.47,26,59785.94,236.5,0,0
3,74.15,29,54806.18,245.89,1,0
4,68.37,35,73889.99,225.58,0,0


In [41]:
#Rearranging coloumns a bit
dataf.columns.values

array(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Male', 'Clicked on Ad'], dtype=object)

In [42]:
cols = ['Clicked on Ad','Daily Time Spent on Site', 'Age', 'Area Income', 'Male',
       ]

In [43]:
data_preprocessed = dataf[cols]
data_preprocessed.head()

Unnamed: 0,Clicked on Ad,Daily Time Spent on Site,Age,Area Income,Male
0,0,68.95,35,61833.9,0
1,0,80.23,31,68441.85,1
2,0,69.47,26,59785.94,0
3,0,74.15,29,54806.18,1
4,0,68.37,35,73889.99,0


### Declare the dependent and independent variables

In [44]:
y = data_preprocessed['Clicked on Ad']
x1 = data_preprocessed.drop(['Clicked on Ad'],axis=1)

# Scaling

In [45]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x1)

StandardScaler()

In [46]:
x1_scaled = scaler.transform(x1)

# Splitting 

In [47]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x1_scaled,y, test_size=0.2, random_state=365)

In [48]:
x1_scaled

array([[ 0.24926659, -0.11490498,  0.50969109, -0.96269532],
       [ 0.96113227, -0.57042523,  1.00253021,  1.03875025],
       [ 0.28208309, -1.13982553,  0.35694859, -0.96269532],
       ...,
       [-0.84377541,  1.707176  , -0.93857029,  1.03875025],
       [-0.59638946, -1.93698596, -0.97548353, -0.96269532],
       [-1.26155474, -1.13982553, -1.87383208, -0.96269532]])

In [49]:
x_train

array([[-0.51056168,  1.59329594, -2.47814483, -0.96269532],
       [-0.49226017,  0.68225545,  0.2042844 , -0.96269532],
       [-0.99649837,  1.13777569, -0.43210266, -0.96269532],
       ...,
       [-0.73649069, -0.22878504, -0.79989037,  1.03875025],
       [ 0.07950429,  1.25165575, -3.01701717,  1.03875025],
       [-0.95673991,  0.2267352 ,  0.95326532,  1.03875025]])

### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [50]:
x_scaled = sm.add_constant(x_train)
reg_log = sm.Logit(y_train,x_scaled)
results_log = reg_log.fit()
# Get the regression summary
results_log.summary2()

Optimization terminated successfully.
         Current function value: 0.194859
         Iterations 8


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.719
Dependent Variable:,Clicked on Ad,AIC:,321.7745
Date:,2021-09-27 00:54,BIC:,345.1975
No. Observations:,800,Log-Likelihood:,-155.89
Df Model:,4,LL-Null:,-554.52
Df Residuals:,795,LLR p-value:,3.018e-171
Converged:,1.0000,Scale:,1.0
No. Iterations:,8.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,1.1381,0.2072,5.4917,0.0000,0.7319,1.5443
x1,-3.2488,0.2755,-11.7904,0.0000,-3.7889,-2.7087
x2,1.4886,0.1777,8.3795,0.0000,1.1404,1.8368
x3,-1.5217,0.1908,-7.9738,0.0000,-1.8957,-1.1476
x4,-0.2793,0.1535,-1.8203,0.0687,-0.5801,0.0214


### Confusion Matrix

It is a function that takes the input data, predict output and compares them with orignal outputs and provide accuracy.

In [51]:
def confusion_matrix(data,actual_values,model):
        pred_values = model.predict(data)
        bins=np.array([0,0.5,1])
        cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
        accuracy = (cm[0,0]+cm[1,1])/cm.sum()
        return cm, accuracy

In [52]:
confusion_matrix(x_scaled,y_train,results_log)

(array([[376.,  23.],
        [ 39., 362.]]),
 0.9225)

## Testing the model

In [53]:
x_test_f=sm.add_constant(x_test)

In [54]:
analysis = confusion_matrix(x_test_f, y_test, results_log)
analysis

(array([[97.,  4.],
        [ 9., 90.]]),
 0.935)

In [55]:
analysis[1]*100

93.5

Thus we conclude that our regression model predicted the outcome with an accuracy of 93.5%