Loading the dataset

In [82]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB


To ignore warning

In [81]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [47]:
df = pd.read_csv("data/online_shoppers_data.csv")

checking Five columns rows

In [48]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


Checking number of rows and column

In [49]:
df.shape

(12330, 18)

Checking Dataset information

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Feature Engineering

Checking unique values in Weekend column

In [51]:
df['Weekend'].unique()

array([False,  True])

Convert Weekend column to binary values

In [52]:
df['Weekend']= df['Weekend'].replace((True,False),(1,0))

In [53]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,False


Checking unique values in Revenue column

In [54]:
df['Revenue'].unique()

array([False,  True])

Convert Revenue column to binary values

In [55]:
df['Weekend']= df['Weekend'].replace((True,False),(1,0))
df['Revenue']= df['Revenue'].replace((True,False),(1,0))

In [56]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0


Added Returning_Visitor column

Checking unique values in VisitorType column

In [57]:
df['Weekend']= df['Weekend'].replace((True,False),(1,0))
df['Revenue']= df['Revenue'].replace((True,False),(1,0))

In [58]:
df['VisitorType'].unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

Adding Returning_Visitor column and Convert VisitorType column 
to binary values

In [59]:
df['Weekend']= df['Weekend'].replace((True,False),(1,0))
df['Revenue']= df['Revenue'].replace((True,False),(1,0))

In [60]:
condition = df['VisitorType']=='Returning_Visitor'
df['Returning_Visitor'] = np.where(condition,1,0)

In [61]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0,1


Drop VisitorType column

In [62]:
df = df.drop(columns = ['VisitorType'])

In [63]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,1,0,1


Checking all columns data type

In [64]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
Weekend                      int64
Revenue                      int64
Returning_Visitor            int32
dtype: object

Checking unique values in Month column

In [65]:
df['Month'].unique()

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

Applying Ordinal Encoding on Month column

In [66]:
ordinal_encoder = OrdinalEncoder()
df['Month'] = ordinal_encoder.fit_transform(df[['Month']])

In [67]:
df['Month'].unique()

array([2., 5., 6., 8., 4., 3., 0., 7., 9., 1.])

Let’s understand the Revenue column

In [68]:
df.Revenue.value_counts()

Revenue
0    10422
1     1908
Name: count, dtype: int64

Access required columns

In [69]:
result = df.columns[1:]
print(result)

Index(['Administrative_Duration', 'Informational', 'Informational_Duration',
       'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates',
       'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser',
       'Region', 'TrafficType', 'Weekend', 'Revenue', 'Returning_Visitor'],
      dtype='object')


Create a DataFrame with required columns

In [70]:
result = df[df.columns[1:]]
print(result)

       Administrative_Duration  Informational  Informational_Duration  \
0                          0.0              0                     0.0   
1                          0.0              0                     0.0   
2                          0.0              0                     0.0   
3                          0.0              0                     0.0   
4                          0.0              0                     0.0   
...                        ...            ...                     ...   
12325                    145.0              0                     0.0   
12326                      0.0              0                     0.0   
12327                      0.0              0                     0.0   
12328                     75.0              0                     0.0   
12329                      0.0              0                     0.0   

       ProductRelated  ProductRelated_Duration  BounceRates  ExitRates  \
0                   1                 0.000000   

Correlation

In [71]:
result = df[df.columns[1:]].corr()
print(result)

                         Administrative_Duration  Informational  \
Administrative_Duration                 1.000000       0.302710   
Informational                           0.302710       1.000000   
Informational_Duration                  0.238031       0.618955   
ProductRelated                          0.289087       0.374164   
ProductRelated_Duration                 0.355422       0.387505   
BounceRates                            -0.144170      -0.116114   
ExitRates                              -0.205798      -0.163666   
PageValues                              0.067608       0.048632   
SpecialDay                             -0.073304      -0.048219   
Month                                   0.029061       0.019743   
OperatingSystems                       -0.007343      -0.009527   
Browser                                -0.015392      -0.038235   
Region                                 -0.005561      -0.029169   
TrafficType                            -0.014376      -0.03449

In [72]:
result = df[df.columns[1:]].corr()['Revenue']
print(result)

Administrative_Duration    0.093587
Informational              0.095200
Informational_Duration     0.070345
ProductRelated             0.158538
ProductRelated_Duration    0.152373
BounceRates               -0.150673
ExitRates                 -0.207071
PageValues                 0.492569
SpecialDay                -0.082305
Month                      0.080150
OperatingSystems          -0.014668
Browser                    0.023984
Region                    -0.011595
TrafficType               -0.005113
Weekend                    0.029295
Revenue                    1.000000
Returning_Visitor         -0.103843
Name: Revenue, dtype: float64


In [73]:
result = df[df.columns[1:]].corr()['Revenue']
result1 = result.sort_values(ascending=False)
print(result1)

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Informational              0.095200
Administrative_Duration    0.093587
Month                      0.080150
Informational_Duration     0.070345
Weekend                    0.029295
Browser                    0.023984
TrafficType               -0.005113
Region                    -0.011595
OperatingSystems          -0.014668
SpecialDay                -0.082305
Returning_Visitor         -0.103843
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64


Create features and target

In [79]:
X = df.drop(['Revenue'], axis=1)
y = df['Revenue']
print("Features and target created")

Features and target created


Prepairing Train and Test Dataset

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

print("Created train and test datasets")

Created train and test datasets


 create model pipeline

In [84]:
def model_pipeline(X, model):  
    n_c = X.select_dtypes(exclude=['object']).columns.values.tolist()
    c_c = X.select_dtypes(include=['object']).columns.values.tolist()

    numeric_columns = n_c
    categorical_columns = c_c

    numeric_pipeline = SimpleImputer(strategy = 'constant')

    categorical_pipeline = OneHotEncoder(handle_unknown = 'ignore')

    a = ('numeric', numeric_pipeline, numeric_columns)
    b = ('categorical', categorical_pipeline, categorical_columns)

    preprocessor = ColumnTransformer(

    transformers = [a, b], 
    remainder = 'passthrough'

    )

    c = ('preprocessor', preprocessor)
    d = ('smote', SMOTE(random_state = 1))
    e = ('scaler', MinMaxScaler())
    f = ('feature_selection', SelectKBest(score_func = chi2, k = 6))
    g = ('model', model)

    bundled_pipeline = imbpipeline(steps = [c, d, e, f, g])

    return bundled_pipeline

Model Selection

In [85]:
def select_model(X, y, pipeline=None):

    classifiers = {}
    

    c_d4 = {"RandomForestClassifier": RandomForestClassifier()}
    classifiers.update(c_d4)

    c_d5 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d5)

    c_d9 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d9)

    c_d10 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d10)

    c_d13 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d13)

    c_d14 = {"SVC": SVC()}
    classifiers.update(c_d14)
   
    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns = cols)

    for key in classifiers:
        
        start_time = time.time()
        
        print()
        print("Step 12: model_pipeline run successfully on", key)

        pipeline = model_pipeline(X_train, classifiers[key])
        
        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')

        row = {'model': key,
               'run_time': format(round((time.time() - start_time)/60,2)),
               'roc_auc': cv.mean(),
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)
        
    df_models = df_models.sort_values(by='roc_auc', ascending=False)
	
    return df_models

Access Model select_model function

In [86]:
models = select_model(X_train, y_train)


Step 12: model_pipeline run successfully on RandomForestClassifier

Step 12: model_pipeline run successfully on DecisionTreeClassifier

Step 12: model_pipeline run successfully on KNeighborsClassifier

Step 12: model_pipeline run successfully on RidgeClassifier

Step 12: model_pipeline run successfully on BernoulliNB

Step 12: model_pipeline run successfully on SVC


Lets see total model with score

In [88]:
print(models)

                    model run_time   roc_auc
0  RandomForestClassifier     0.88  0.886458
5                     SVC     0.89  0.885963
4             BernoulliNB     0.01  0.857851
3         RidgeClassifier     0.01  0.855441
2    KNeighborsClassifier     0.02  0.840505
1  DecisionTreeClassifier     0.03  0.733111


Accessing best model and training

In [89]:
selected_model = SVC()
bundled_pipeline = model_pipeline(X_train, selected_model)
bundled_pipeline.fit(X_train, y_train)

Accessing best model and training

In [90]:
y_pred = bundled_pipeline.predict(X_test)

print(y_pred)


[0 0 0 ... 0 0 0]


ROC and AOC score

In [91]:
roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)


print('ROC/AUC:', roc_auc)
print('Accuracy:', accuracy)
print('F1 score:', f1_score)

ROC/AUC: 0.7772768502330849
Accuracy: 0.8780751554474182
F1 score: 0.6330349877949553


Classification report

In [92]:
classif_report = classification_report(y_test, y_pred)

print(classif_report)

              precision    recall  f1-score   support

           0       0.92      0.93      0.93      3077
           1       0.64      0.63      0.63       622

    accuracy                           0.88      3699
   macro avg       0.78      0.78      0.78      3699
weighted avg       0.88      0.88      0.88      3699

