# Train Test Split Method:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

In [3]:
df=pd.read_csv("advertising.csv")

In [5]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [7]:
df["Timestamp"]=pd.to_datetime(df["Timestamp"],format="%d-%m-%Y %H:%M")

In [8]:
df["hour"]=df["Timestamp"].dt.hour

In [9]:
df["day"]=df["Timestamp"].dt.dayofweek

In [10]:
x=df[["Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Male","hour","day"]]
y=df["Clicked on Ad"]

In [16]:
model=LogisticRegression(max_iter=1200)

In [17]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [18]:
model.fit(x_train,y_train)

In [19]:
y_train_pred=model.predict(x_train)
y_test_pred=model.predict(x_test)

In [22]:
test_accuracy=accuracy_score(y_train,y_train_pred)
train_accuracy=accuracy_score(y_test,y_test_pred)
print("train accuracy",train_accuracy)
print("test accuracy",test_accuracy)

train accuracy 0.96
test accuracy 0.9728571428571429


# Shuffle Split Cross Validation Method:

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import ShuffleSplit,cross_validate

In [24]:
df=pd.read_csv("advertising.csv")

In [25]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [27]:
df["Timestamp"]=pd.to_datetime(df["Timestamp"],format="%d-%m-%Y %H:%M")

In [28]:
df["hour"]=df["Timestamp"].dt.hour

In [29]:
df["day"]=df["Timestamp"].dt.dayofweek

In [30]:
x=df[["Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Male","hour","day"]]
y=df["Clicked on Ad"]

In [40]:
model=LogisticRegression(max_iter=5000)

In [47]:
shuffle_split=ShuffleSplit(n_splits=200,test_size=0.3,random_state=42)

In [48]:
cv_results=cross_validate(model,x,y,cv=shuffle_split,scoring="accuracy",return_train_score=True)

In [44]:
test_score=cv_results["test_score"]

In [49]:
train_score=cv_results["train_score"]

In [51]:
print("cross validation: test score is:",np.round(test_score.mean(),2))
print("cross validation: train score is:",np.round(train_score.mean(),2))

cross validation: test score is: 0.97
cross validation: train score is: 0.97


# KFold Cross Validation Method:

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold,cross_validate

In [53]:
df=pd.read_csv("advertising.csv")

In [54]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [56]:
df["Timestamp"]=pd.to_datetime(df["Timestamp"],format="%d-%m-%Y %H:%M")

In [57]:
df["hour"]=df["Timestamp"].dt.hour

In [58]:
df["day"]=df["Timestamp"].dt.dayofweek

In [59]:
x=df[["Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Male","hour","day"]]
y=df["Clicked on Ad"]

In [60]:
model=LogisticRegression(max_iter=5000)

In [61]:
Kfold=KFold(n_splits=200,shuffle=True,random_state=42)

In [62]:
cv_results=cross_validate(model,x,y,cv=Kfold,scoring="accuracy",return_train_score=True)

In [63]:
test_score=cv_results["test_score"]

In [64]:
test_score=cv_results["test_score"]

In [65]:
print("cross validation: test score is:",np.round(test_score.mean(),2))
print("cross validation: train score is:",np.round(train_score.mean(),2))

cross validation: test score is: 0.97
cross validation: train score is: 0.97


# Stratified Shuffle Split Method:

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import StratifiedShuffleSplit

In [67]:
df=pd.read_csv("advertising.csv")

In [68]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [70]:
df["Timestamp"]=pd.to_datetime(df["Timestamp"],format="%d-%m-%Y %H:%M")

In [71]:
df["hour"]=df["Timestamp"].dt.hour

In [72]:
df["day"]=df["Timestamp"].dt.dayofweek

In [73]:
x=df[["Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Male","hour","day"]]
y=df["Clicked on Ad"]

In [74]:
model=LogisticRegression(max_iter=5000)

In [77]:
stratified_split=StratifiedShuffleSplit(n_splits=200,test_size=0.3,random_state=42)

In [79]:
for train_index,test_index in stratified_split.split(x,y):
  x_train_strat,x_test_strat=x.iloc[train_index],x.iloc[test_index]
  y_train_strat,y_test_strat=y.iloc[train_index],y.iloc[test_index]

In [81]:
model.fit(x_train_strat,y_train_strat)

In [85]:
y_train_strat_pred=model.predict(x_train_strat)
y_test_strat_pred=model.predict(x_test_strat)

In [91]:
train_accuracy=accuracy_score(y_train_strat,y_train_strat_pred)
print(train_accuracy)
test_accuracy=accuracy_score(y_test_strat,y_test_strat_pred)
print(test_accuracy)

0.9685714285714285
0.9666666666666667
