# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings("ignore")

# Loading dataset

In [2]:
data = pd.read_csv('online_shoppers_intention.csv')

In [3]:
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
data1 = data.copy()

# Encoding the Categorical Columns to Numerical

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
LB = LabelEncoder()

In [7]:
data1['Month'] = LB.fit_transform(data['Month'])
data1['Weekend'] = LB.fit_transform(data['Weekend'])
data1['VisitorType'] = LB.fit_transform(data['VisitorType'])
data1['Revenue'] = LB.fit_transform(data['Revenue'])

In [8]:
data1.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


In [9]:
X = data1.drop('Revenue',axis=1)
Y = data1.Revenue

# Univariate Selection

    The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.
    We have used Chi square test

In [10]:
test = SelectKBest(score_func=chi2)
fit = test.fit(X, Y)

In [11]:
ff = pd.DataFrame([data1.columns[:-1],fit.scores_],index=['Features','Scores']).T
ff.sort_values('Scores',ascending = False)

Unnamed: 0,Features,Scores
5,ProductRelated_Duration,877404.0
8,PageValues,175127.0
1,Administrative_Duration,41754.8
3,Informational_Duration,35059.8
4,ProductRelated,19317.3
0,Administrative,1133.97
2,Informational,357.982
10,Month,86.1637
9,SpecialDay,53.7971
15,VisitorType,37.5475


## Logistic Regression

In [12]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [13]:
from sklearn.model_selection import GridSearchCV,KFold
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [14]:
X = data1[['ProductRelated_Duration','PageValues','Administrative_Duration','Informational_Duration',
          'ProductRelated','Administrative','Informational','Month','SpecialDay','VisitorType',
          'BounceRates', 'ExitRates']]

In [15]:
Y = data1.Revenue

In [16]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.88  F1: 0.50 ROC-AOC: 0.68


# Random Forest

In [17]:
RT = RandomForestClassifier(random_state=0)

In [18]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.90  F1: 0.61 ROC-AOC: 0.75


# Recursive Feature Elimination

    It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute

In [19]:
from sklearn.feature_selection import RFE

In [20]:
X = data1.drop('Revenue',axis=1)
Y = data1.Revenue

In [21]:
model = LogisticRegression()
rfe = RFE(model)
fit = rfe.fit(X, Y)
ff = pd.DataFrame([data1.columns[:-1],fit.support_,fit.ranking_],index=['Features','Support','Rankings']).T
ff.sort_values('Rankings')

Unnamed: 0,Features,Support,Rankings
8,PageValues,True,1
10,Month,True,1
9,SpecialDay,True,1
15,VisitorType,True,1
7,ExitRates,True,1
6,BounceRates,True,1
16,Weekend,True,1
2,Informational,True,1
11,OperatingSystems,False,2
0,Administrative,False,3


## Logistic Regression

In [22]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [23]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [24]:
X = data1[['PageValues','Month','SpecialDay','VisitorType','ExitRates','BounceRates',
           'Weekend','Informational']]

In [25]:
Y = data1.Revenue

In [26]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.88  F1: 0.49 ROC-AOC: 0.67


# Random Forest

In [27]:
RT = RandomForestClassifier(random_state=0)

In [28]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.88  F1: 0.58 ROC-AOC: 0.73


# Feature Importance

    Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance of features

In [29]:
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
X = data1.drop('Revenue',axis=1)
Y = data1.Revenue

In [31]:
model = ExtraTreesClassifier()
model.fit(X, Y)
ff = pd.DataFrame([data1.columns[:-1],model.feature_importances_],index=['Features','Importance']).T
ff.sort_values('Importance',ascending = False)

Unnamed: 0,Features,Importance
8,PageValues,0.33371
7,ExitRates,0.0709816
5,ProductRelated_Duration,0.0682977
4,ProductRelated,0.0637527
6,BounceRates,0.0578388
10,Month,0.0544314
0,Administrative,0.0504058
1,Administrative_Duration,0.0486242
13,Region,0.0451208
14,TrafficType,0.0435808


## Logistic Regression

In [32]:
LR = LogisticRegression()

###  K Fold Cross Validation

In [33]:
kf = KFold(n_splits=5,shuffle=True,random_state=0)

In [34]:
X = data1[['PageValues','ExitRates','BounceRates','ProductRelated',
           'Month','Administrative','TrafficType',
           'Region']]

In [35]:
X = data1[['PageValues','ExitRates','BounceRates','ProductRelated','ProductRelated_Duration',
           'Month','Administrative_Duration','Administrative','TrafficType',
           'Region']]

In [36]:
Y = data1.Revenue

In [37]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    LR.fit(xtrain,ytrain)
    y_predict = LR.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.88  F1: 0.49 ROC-AOC: 0.67


# Random Forest

In [38]:
RT = RandomForestClassifier(random_state=0)

In [39]:
F1=[]
roc_auc = []
accuracy=[]
for train,test in kf.split(X,Y):
    xtrain,xtest=X.iloc[train,:],X.iloc[test,:]
    ytrain,ytest=Y.iloc[train],Y.iloc[test]
    RT.fit(xtrain,ytrain)
    y_predict = RT.predict(xtest)
    F1.append(metrics.f1_score(ytest,y_predict))
    fpr,tpr,_=metrics.roc_curve(ytest,y_predict)
    roc_auc.append(metrics.auc(fpr,tpr))
    accuracy.append(metrics.accuracy_score(ytest,y_predict))
print(' Accuracy: %0.02f  F1: %0.02f ROC-AOC: %0.02F'
      %(np.mean(accuracy),np.mean(F1),np.mean(roc_auc)))

 Accuracy: 0.90  F1: 0.62 ROC-AOC: 0.75


# Inference:
        From all the three feature selection models both extra bag classifier and Univariate selection has given the best scores
        Extra bag classifier has given importance to the variables that are highly correlated with the Target variable.
        It can be used as a final selection model