# Web Application Notebook containing the model and all relevant data preprocessing code

This notebook contains all the relevant code for loading, splitting and saving the data and the best model so that they can be used in the Streamlit app.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## 1. Data Cleaning and Preprocessing

In [2]:
# Read the raw shopper sessions; the relative path is resolved against the current working directory.
raw_csv_path = "../Data/online_shoppers_intention.csv"
df = pd.read_csv(raw_csv_path)

#### create function to conduct all data preprocessing and cleaning in one step

In [3]:

def preprocess_data(df):
    """Clean the raw shopper sessions and encode every column as float.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called repeatedly on the same raw data.

    Steps:
      * cast the boolean "Weekend" and "Revenue" columns to float,
      * replace the month labels in "Month" with their calendar number,
      * rename the "Other" visitor type to "Different",
      * one-hot encode the remaining text columns (first level dropped) and
        cast the whole frame to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns "Weekend", "Revenue",
        "Month" and "VisitorType".

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column has a float dtype.

    Raises
    ------
    ValueError
        If "Month" contains a label that is missing from the month mapping
        and therefore cannot be converted to a number.
    """
    # Note that the mapping key for June is the full word "June", not "Jun".
    month_numbers = {'Jan': 1, 'Feb': 2, "Mar": 3, 'Apr': 4, "May": 5, "June": 6,
                     "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}

    # Work on a private copy so the caller's DataFrame is not modified.
    df = df.copy()

    # convert boolean variables into float format
    df["Weekend"] = df["Weekend"].astype(float)
    df["Revenue"] = df["Revenue"].astype(float)

    # Convert month values into numbers; dummy variables are not necessary since
    # the boosted tree can also detect non-linear patterns. The explicit float
    # cast guarantees "Month" is numeric before get_dummies runs instead of
    # relying on replace() to infer the dtype (a text-typed "Month" would be
    # one-hot encoded below).
    df["Month"] = df["Month"].replace(month_numbers).astype(float)

    # Rename visitor type "Other" -- presumably so that it sorts first and
    # becomes the level removed by drop_first below.
    df["VisitorType"] = df["VisitorType"].replace({'Other': 'Different'})

    # convert everything to floats and dummies
    df = pd.get_dummies(df, drop_first=True).astype(float)
    return df

In [4]:
# Run the preprocessing pipeline on the raw frame and keep an independent
# (deep) copy of the result as the working dataset.
data = preprocess_data(df=df).copy(deep=True)

## 2. Instantiate Boosted Tree Model

In [5]:
# import relevant packages
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pickle

### Create Training and Test Set

In [6]:
# Separate the target from the features and hold out 5% of the rows.
# random_state pins the shuffle so the exported CSV files, the fitted model and
# the evaluation report are identical on every Restart & Run All.
y = data["Revenue"]
X = data.drop("Revenue", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)

### Instantiate and Train Model

In [7]:
# Build the boosted-tree classifier with the chosen hyper-parameters and fit it
# on the training split; fit() hands back the fitted estimator.
xgb = XGBClassifier(
    n_estimators=500,
    max_depth=20,
    learning_rate=0.01,
    reg_alpha=6,
    random_state=1,
).fit(X_train, y_train)

### Save Train and Test Data

In [8]:
# X_train.index / X_test.index hold row labels, so select with label-based .loc;
# positional .iloc only picks the same rows while the index is the default 0..n-1 range.
data.loc[X_train.index, :].to_csv("../Data/online_shoppers_app_dev.csv", index=False)
# The held-out rows are written without the "Revenue" target column.
data.loc[X_test.index, :].drop("Revenue", axis=1).to_csv("../Data/new_shoppers.csv", index=False)

### Save Model

In [9]:
filename = 'finalized_default_model.sav'
# The context manager closes the file handle as soon as the dump finishes, so
# the pickle is fully written to disk before it is read back later on.
with open(filename, 'wb') as model_file:
    pickle.dump(xgb, model_file)

## 3. Check whether saved model works by predicting test data

In [10]:
# Read the held-out feature file back from disk and display it
new_shoppers_path = "../Data/new_shoppers.csv"
new_customers = pd.read_csv(new_shoppers_path)
new_customers

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,VisitorType_New_Visitor,VisitorType_Returning_Visitor
0,0.0,0.000000,0.0,0.0,2.0,35.000000,0.000000,0.050000,0.000000,0.0,11.0,2.0,2.0,1.0,10.0,0.0,1.0,0.0
1,4.0,134.390000,1.0,191.9,24.0,1592.533333,0.003704,0.013580,5.245869,0.0,10.0,3.0,2.0,1.0,5.0,0.0,0.0,1.0
2,6.0,163.666667,10.0,403.0,15.0,665.566667,0.007143,0.004762,0.000000,0.0,3.0,3.0,2.0,8.0,8.0,0.0,1.0,0.0
3,0.0,0.000000,0.0,0.0,26.0,664.850000,0.026923,0.066667,0.000000,0.0,12.0,2.0,10.0,7.0,1.0,1.0,0.0,1.0
4,3.0,49.500000,1.0,29.5,45.0,1545.500000,0.000000,0.004000,20.971440,0.0,3.0,2.0,4.0,3.0,8.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.0,0.000000,0.0,0.0,58.0,3200.366667,0.003448,0.027586,0.000000,0.0,3.0,2.0,2.0,6.0,1.0,0.0,0.0,1.0
613,7.0,67.000000,2.0,725.5,101.0,4713.800000,0.009444,0.010482,20.624741,0.0,5.0,1.0,1.0,6.0,3.0,1.0,0.0,1.0
614,0.0,0.000000,0.0,0.0,2.0,34.200000,0.000000,0.100000,0.000000,0.0,9.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0
615,2.0,17.000000,1.0,105.0,9.0,148.250000,0.020000,0.022000,0.000000,0.0,3.0,3.0,2.0,1.0,1.0,0.0,0.0,1.0


In [11]:
# load model and make predictions
# NOTE: pickle.load can execute arbitrary code -- only load model files that come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
preds = loaded_model.predict(new_customers)

In [12]:
# Score the reloaded model's predictions against the held-out labels
from sklearn.metrics import classification_report

holdout_report = classification_report(y_true=y_test, y_pred=preds)
print(holdout_report)

              precision    recall  f1-score   support

         0.0       0.92      0.95      0.94       524
         1.0       0.67      0.56      0.61        93

    accuracy                           0.89       617
   macro avg       0.80      0.75      0.77       617
weighted avg       0.89      0.89      0.89       617

