In [1]:
import pandas as pd 
import numpy as np 

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

In [2]:
data = pd.read_csv('online_shoppers_intention.csv') 

In [3]:
# scaling
scalar_col = ['Administrative_Duration','Informational_Duration','ProductRelated_Duration']
data[scalar_col] = StandardScaler().fit_transform(data[scalar_col]) 
# labelling 
data['Month'] = LabelEncoder().fit_transform(data['Month'])
data['VisitorType'] = LabelEncoder().fit_transform(data['VisitorType'])
data['Weekend'] = LabelEncoder().fit_transform(data['Weekend'])
data['Revenue'] = LabelEncoder().fit_transform(data['Revenue']) 

In [4]:
X = data.drop('Revenue', axis = 1) 
y = data['Revenue'] 

In [19]:
print(X.shape)
print(y.shape)

(12330, 17)
(12330,)


In [5]:
# Oversampling 
smote = SMOTE(sampling_strategy = 'minority') 
sampled_x, sampled_y = smote.fit_sample(X, y) 

In [6]:
sampled_x.shape

(20844, 17)

In [18]:
sampled_y.shape

(20844,)

In [24]:
y1=pd.DataFrame(sampled_y)

In [27]:
y1[0].value_counts()

1    10422
0    10422
Name: 0, dtype: int64

In [7]:
# train, test split 
X_train, X_test, y_train, y_test = train_test_split(sampled_x, sampled_y, test_size = 0.3, random_state = 1) 

In [8]:
# XGB Classifier 
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test) 

In [9]:
accuracy_score(y_test,pred_xgb) 

0.929325231851615

In [10]:
print(classification_report(y_test, pred_xgb))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      3190
           1       0.93      0.93      0.93      3064

    accuracy                           0.93      6254
   macro avg       0.93      0.93      0.93      6254
weighted avg       0.93      0.93      0.93      6254



In [11]:
pred_data = xgb.predict(X.values)   

In [12]:
print('Classification Report on Original data')
print(classification_report(y.values, pred_data)) 

Classification Report on Original data
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     10422
           1       0.67      0.72      0.69      1908

    accuracy                           0.90     12330
   macro avg       0.81      0.83      0.82     12330
weighted avg       0.90      0.90      0.90     12330

