In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
%%time
# Load the raw advertising dataset from a CSV file in the working directory.
df = pd.read_csv('advertising.csv')

CPU times: total: 15.6 ms
Wall time: 9.97 ms


In [3]:
# Total number of cells in the frame (rows x columns).
df.size

10000

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 10)

In [5]:
# Preview the first rows to eyeball column contents and formats.
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 1:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 2:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 3:36,0


### checking missing values

In [6]:
# Number of missing values in each column.
df.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [7]:
# Count fully duplicated rows once, then report the count and its share of the data set.
n_duplicates = df.duplicated().sum()
duplicate_share = np.round(100 * n_duplicates / len(df), 1)
print(f'Duplicates in data set: {n_duplicates} ({duplicate_share}%)')

Duplicates in data set: 0 (0.0%)


##### NO Missing Value Found

### checking datatypes

In [8]:
# Storage dtype of every column (object = text columns).
df.dtypes

Daily Time Spent on Site    float64
Age                           int64
Area Income                 float64
Daily Internet Usage        float64
Ad Topic Line                object
City                         object
Male                          int64
Country                      object
Timestamp                    object
Clicked on Ad                 int64
dtype: object

    There are 4 object-dtype columns (Ad Topic Line, City, Country, Timestamp).
    Excluding the target, the numeric features are 3 continuous (float64) and 2 discrete (int64: Age, Male).

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025
min,32.6,19.0,13996.5,104.78,0.0,0.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0
max,91.43,61.0,79484.8,269.96,1.0,1.0


In [10]:
# Pairwise correlation between the numeric columns only.
# The frame also contains text columns (Ad Topic Line, City, Country, Timestamp);
# pandas >= 2.0 no longer drops those silently and raises instead, so select the
# numeric columns explicitly. This form works on older pandas versions as well.
df.select_dtypes(include='number').corr()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad
Daily Time Spent on Site,1.0,-0.331513,0.310954,0.518658,-0.018951,-0.748117
Age,-0.331513,1.0,-0.182605,-0.367209,-0.021044,0.492531
Area Income,0.310954,-0.182605,1.0,0.337496,0.001322,-0.476255
Daily Internet Usage,0.518658,-0.367209,0.337496,1.0,0.028012,-0.786539
Male,-0.018951,-0.021044,0.001322,0.028012,1.0,-0.038027
Clicked on Ad,-0.748117,0.492531,-0.476255,-0.786539,-0.038027,1.0


### Renaming column 

In [11]:
# Replace the multi-word column headers with snake_case names that are easier to
# reference in code; columns not listed here keep their original name.
df = df.rename(
    columns={
        'Daily Time Spent on Site': 'daily_time_spent',
        'Area Income': 'area_income',
        'Daily Internet Usage': 'daily_internet_usage',
        'Ad Topic Line': 'ad_topic_line',
        'Clicked on Ad': 'clicked_ad',
    }
)

In [None]:
from dataprep.eda import plot, plot_correlation, create_report, plot_missing

In [None]:
#plot(df)

In [None]:
#create_report(df)

In [None]:
# dataprep `plot` of Age against daily_internet_usage (the charts shown are chosen by dataprep).
plot(df, "Age", "daily_internet_usage")

In [None]:
# dataprep `plot` of Age against the target column clicked_ad.
plot(df, "Age", "clicked_ad")

In [None]:
# dataprep `plot` of area_income against daily_internet_usage.
plot(df, "area_income", "daily_internet_usage")

### Model Building  

In [12]:
# Predictors: the five numeric columns. Text columns and the timestamp are left out.
x = df.loc[:, ['daily_time_spent', 'Age', 'area_income', 'daily_internet_usage', 'Male']]

# Target: 1 if the user clicked on the ad, 0 otherwise.
y = df.loc[:, 'clicked_ad']

In [13]:
# Hold out 27% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.27,
    random_state=1000,
)

In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so no test information leaks into the scaler.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

## Logistic Regression, Decision Tree, Random Forest, SVM, Gaussian NB
    Applying different Classification algorithms

### Converting Dataframe into array for faster computation 

In [16]:
# Make sure all four splits are plain NumPy arrays before model fitting.
x_train, x_test, y_train, y_test = (
    np.array(split) for split in (x_train, x_test, y_train, y_test)
)

array([[ 0.19377788, -1.27549843,  1.00270855,  0.19802615,  1.01379376],
       [-0.96568716,  1.87471761, -0.71526816, -1.18305551,  1.01379376],
       [-0.93263086,  0.29960959,  1.03120978, -0.98637959, -0.98639392],
       ...,
       [-1.46652117,  0.52462502,  1.04806257, -0.28507768, -0.98639392],
       [-1.8157952 ,  0.86214817, -1.58970184, -0.32171114,  1.01379376],
       [ 0.25552454, -0.48794442,  1.32730618,  0.95107283,  1.01379376]])

### Logistic Regression 

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression estimator with scikit-learn's default settings.
LR = LogisticRegression()

In [None]:
# Grid search over solver and regularisation strength for logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

# Search space
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers, penalty=penalty, C=c_values)

# Cross-validation scheme: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search over a fresh estimator so that re-running this cell never wraps the
# previous search result (LR is rebound to the fitted search below).
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=grid, n_jobs=-1,
                           cv=cv, scoring='accuracy', error_score=0)

# Fit on the scaled TRAINING split only. Fitting on the full, unscaled x/y would
# leak the test rows into model selection and would not match the scaled x_test
# used for evaluation further down.
LR = grid_search.fit(x_train, y_train)

In [None]:
# LR is the fitted grid search at this point, so report the winning parameter
# combination rather than the constructor arguments of the search wrapper.
print('Parameters currently in use:\n')
print(LR.best_params_)

In [None]:
# Evaluate the tuned logistic regression on the held-out test split.
from sklearn.metrics import confusion_matrix

# Predict here so the cell does not depend on a variable defined in a later cell.
predictions = LR.predict(x_test)

# Accuracy
accuracy = LR.score(x_test, y_test)
print('\n Accuracy',accuracy*100,'%')

# Confusion matrix (rows: true class, columns: predicted class)
result = confusion_matrix(y_test, predictions)
print(result)

# Classification report
print(classification_report(y_test, predictions))

In [None]:
# Predict the class label of every test row with the fitted logistic-regression search.
predictions=LR.predict(x_test)  

### Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: tree construction is randomised internally, so without it the
# scores below change from run to run.
DTC = DecisionTreeClassifier(random_state=0)

DTC.fit(x_train, y_train)
y_pred = DTC.predict(x_test)

In [None]:
# Parameters the decision tree was built with (`get_params` is a method;
# there is no `get_params_` attribute).
DTC.get_params()

In [None]:
# Test-set evaluation of the decision tree.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Share of correctly classified test rows
accuracy = DTC.score(x_test, y_test)
print('\n Accuracy',accuracy*100,'%')

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print('\n Confusion Matrix\n',cm)

# Per-class precision / recall / F1
print('\n Classification Report\n',classification_report(y_test, y_pred))

In [None]:
# Heatmap of the classification report for the decision tree fitted above.
# DTC is deliberately NOT rebound here: replacing it with another model would make
# this "Decision Tree" figure describe a different classifier.
y_pred = DTC.predict(x_test)
cr = classification_report(y_test, y_pred, output_dict=True)
cr_df = pd.DataFrame(cr)

fig, ax = plt.subplots(figsize=(7, 5))
# Drop the last row of the report (support) so only the score rows are coloured.
sns.heatmap(cr_df.iloc[:-1, :].T, annot=True, vmin=0.9, cmap='copper_r', linewidths=3, ax=ax)
ax.set_title('Classification report')
plt.show()

### RandomForest 

In [None]:
# Fit a random forest on the training split and report its test performance.
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the bootstrap samples and feature draws are reproducible.
RFC = RandomForestClassifier(random_state=0)
RFC.fit(x_train, y_train)

# Predict with THIS model; otherwise the report below would describe the stale
# y_pred left over from the previous section.
y_pred = RFC.predict(x_test)

# Classification report
print('\n Classification Report\n',classification_report(y_test, y_pred))

Random forests make it easy to measure the relative importance of each feature. scikit-learn measures a feature's importance by
looking at how much the tree nodes that use that feature reduce impurity on average (across all trees in the forest); more precisely, it is a
weighted average, where each node's weight is equal to the number of training samples associated with it.

In [None]:
# Parameters currently set on the RFC estimator.
RFC.get_params()

In [None]:
# Exhaustive grid search over random-forest hyperparameters, scored by ROC AUC.
# NOTE: this is still a large grid (thousands of candidates x 3 folds) and is slow.
from sklearn.ensemble import RandomForestClassifier

# max_features must lie between 1 and the number of input columns; values outside
# that range make every corresponding fit fail.
n_features = x_train.shape[1]

forest_params = [{
                  'n_estimators': [10, 200],
                  'max_depth': list(range(10, 25)),
                  'max_features': list(range(1, n_features + 1)),
                  'max_leaf_nodes': list(range(10, 50)),
                }]

# Cross-validation. A fresh, seeded forest is searched so that re-running the
# cell never wraps an earlier search object inside a new one.
RFC = GridSearchCV(RandomForestClassifier(random_state=0), forest_params,
                   cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')
RFC.fit(x_train, y_train)

In [None]:
# Best parameter combination found by the search stored in RFC.
RFC.best_params_

In [None]:
%%time

from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV

forest_params = [{
                  'n_estimators': [10,200],
                  'max_depth': list(range(10, 25)), 
                  'max_features': list(range(0,30)), 
                  'max_leaf_nodes':list(range(10, 50)),
                }]

# Corss-Validation
RFC = HalvingGridSearchCV(RFC, forest_params, cv = 3, n_jobs = -1, verbose = 2,scoring='roc_auc')
RFC.fit(x_train, y_train)

In [None]:
# Best parameter combination found by the search stored in RFC.
RFC.best_params_

In [None]:
# Test-set evaluation of the tuned random forest.
from sklearn.metrics import accuracy_score, classification_report

# Predict with the tuned model so the metrics below describe it and not the
# y_pred left over from an earlier section.
y_pred = RFC.predict(x_test)

# Accuracy. Computed from the predictions because the searches above use
# scoring='roc_auc', so RFC.score() would return ROC AUC rather than accuracy.
accuracy = accuracy_score(y_test, y_pred)
print('\n Accuracy',accuracy*100,'%')

# Classification Report
print('\n Classification Report\n',classification_report(y_test, y_pred))


### Support Vector Machine 

In [None]:
# Stray `tqdm` expression disabled: the name is never imported in this notebook,
# so evaluating it raised NameError and stopped a Restart & Run All.

In [None]:
# Linear-kernel support vector classifier trained on the training split.
from sklearn.svm import SVC

svc = SVC(kernel='linear', random_state=0)
svc.fit(x_train, y_train)

# Predicted labels for the held-out rows
y_pred = svc.predict(x_test)

In [None]:
pip install scikit-learn --upgrade

In [None]:
# Parameters the SVC was built with (`get_params` is a method;
# there is no `get_params_` attribute).
svc.get_params()

In [None]:
# Test-set evaluation of the linear SVC.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Share of correctly classified test rows
accuracy = svc.score(x_test, y_test)
print('\n Accuracy',accuracy*100,'%')

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
print('\n Confusion Matrix\n',cm)

# Per-class precision / recall / F1
print('\n Classification Report\n',classification_report(y_test, y_pred))


In [None]:
# Annotated heatmap of the confusion matrix `cm` computed in the previous cell.
# scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows: true class, columns: predicted class), so the flattened order is
# TN, FP, FN, TP and the labels must follow that order.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ['{0:0.0f}'.format(value) for value in cm.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in cm.flatten()/np.sum(cm)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2,2)

ax = sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

### GaussianNB 

In [None]:
# Gaussian naive Bayes classifier trained on the training split.
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(x_train, y_train)

# Predicted labels for the held-out rows
y_pred = GNB.predict(x_test)

In [None]:
# Parameters the Gaussian NB model was built with (`get_params` is a method;
# there is no `get_params_` attribute).
GNB.get_params()

In [None]:
# Test-set evaluation of the Gaussian naive Bayes model.
from sklearn.metrics import classification_report, confusion_matrix

# Accuracy
accuracy = GNB.score(x_test, y_test)
print('\n Accuracy',accuracy*100,'%')

# Confusion matrix (rows: true class, columns: predicted class).
# The label previously read 'Comfusion Matrix'; fixed to match the other model sections.
cm = confusion_matrix(y_test, y_pred)
print('\n Confusion Matrix\n',cm)

# Classification Report
print('\n Classification Report\n',classification_report(y_test, y_pred))

# Upcoming Additions

In [None]:
# Working on below Cells 

In [None]:
# LDA: supervised projection of the scaled features.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA()

# Store the projection under new names instead of overwriting x_train / x_test.
# Overwriting made this cell non-idempotent and silently changed the input of
# every model cell executed after it.
x_train_lda = lda.fit_transform(x_train, y_train)
x_test_lda = lda.transform(x_test)

In [None]:
# Annotated heatmap of the most recent confusion matrix `cm`: each tile shows the
# quadrant name, its raw count and its share of all test rows.
group_names = ['True Neg','False Pos','False Neg','True Pos']

cm_flat = cm.flatten()
group_counts = ['{0:0.0f}'.format(count) for count in cm_flat]
group_percentages = ['{0:.2%}'.format(share) for share in cm_flat/np.sum(cm)]

labels = np.asarray([
    f'{name}\n{count}\n{share}'
    for name, count, share in zip(group_names, group_counts, group_percentages)
]).reshape(2,2)

sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')

In [None]:
# Grid search for the RBF-kernel SVC: regularisation strength C and kernel width gamma.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

Hyper_parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# 15-fold cross-validation, using all available cores
GridSearch_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=Hyper_parameters,
    cv=15,
    n_jobs=-1,
)
GridSearch_svc.fit(x_train, y_train)

In [None]:
# Train an RBF-kernel SVC with fixed hyperparameters and record train/test accuracy.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc = SVC(C=100, gamma=0.1, kernel='rbf', probability=True)

# x_train / x_test are already standardised by the StandardScaler cell; no
# separate *_scaled variables exist in this notebook.
svc.fit(x_train, y_train)

pred_svc_test = svc.predict(x_test)
pred_svc_train = svc.predict(x_train)

train_accuracy_svc = accuracy_score(y_train, pred_svc_train)*100
# Test accuracy must compare TEST labels with TEST predictions.
test_accuracy_svc = accuracy_score(y_test, pred_svc_test)*100

# Accumulators for comparing models; created here if no earlier cell defined them.
if 'train_accuracy' not in globals():
    train_accuracy = []
if 'test_accuracy' not in globals():
    test_accuracy = []

train_accuracy.append(train_accuracy_svc)
test_accuracy.append(test_accuracy_svc)

print("Accuracy on Train Data: {}".format(train_accuracy_svc))
print("Accuracy on Test Data:{}".format(test_accuracy_svc))