# Importing Libraries

In [16]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Loading dataset

In [17]:
data = pd.read_csv('online_shoppers_intention.csv')

In [18]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [19]:
data1 = data.copy()

# Encoding the Categorical Columns to Numerical

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
LB = LabelEncoder()

In [22]:
data1['Month'] = LB.fit_transform(data['Month'])
data1['VisitorType'] = LB.fit_transform(data['VisitorType'])
data1['Weekend'] = LB.fit_transform(data['Weekend'])
data1['Revenue'] = LB.fit_transform(data['Revenue'])

In [23]:
data1.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


# Boxcox Transformation 

In [24]:
data2 = data1+1

In [26]:
for i in data1.columns:
    data2[i],lamb= stats.boxcox(data2[i])

In [28]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score
import sklearn.metrics as metrics

In [29]:
X =  data2.drop('Revenue',axis=1)
Y =  data1.Revenue

## Decision Tree

In [30]:
DT = DecisionTreeClassifier()

In [31]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
GS.best_params_

{'max_depth': 2}

In [33]:
DT = DecisionTreeClassifier(max_depth=2)

###  K Fold Cross Validation

In [34]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [35]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    DT.fit(xtrain,ytrain)
    y_predict = DT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.73  Recall: 0.46  ROC-AOC: 0.71


## Random Forest

In [36]:
RT = RandomForestClassifier(random_state=0)

In [37]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 8}

In [39]:
RT = RandomForestClassifier(n_estimators=8,max_depth=5)

###  K Fold Cross Validation

In [40]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [41]:
precision=[]
recall  =[]
roc_auc = []
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    precision.append(metrics.precision_score(ytest,y_predict))
    recall.append(metrics.recall_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
print(' Precision: %0.02f  Recall: %0.02f  ROC-AOC: %0.02F'%(np.mean(precision),np.mean(recall),np.mean(roc_auc)))

 Precision: 0.78  Recall: 0.48  ROC-AOC: 0.72
