# Web Application Notebook containing the model and all relevant data preprocessing code

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## 1. Data Cleaning and Preprocessing

In [17]:
# load the raw online-shoppers sessions from the project's Data folder
df = pd.read_csv(filepath_or_buffer="../Data/online_shoppers_intention.csv")

#### Create a function that performs all data preprocessing and cleaning in one step

In [18]:

def preprocess_data(df):
    """Turn the raw online-shoppers frame into an all-float feature table.

    Steps:
      * cast the boolean ``Weekend`` and ``Revenue`` columns to float,
      * encode ``Month`` labels as calendar month numbers (Jan=1 ... Dec=12),
      * rename the visitor type ``Other`` to ``Different``,
      * one-hot encode the remaining text columns with ``drop_first=True``
        and cast the whole frame to float.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Weekend``, ``Revenue``, ``Month``
        and ``VisitorType``. A ``Month`` column that is already numeric is
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame in which every column has dtype float.

    Raises
    ------
    ValueError
        If ``Month`` contains a non-null label that is not a known month name.
    """
    # both "Jun" and "June" are accepted for the sixth month
    month_to_number = {
        "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6, "June": 6,
        "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
    }

    # work on a copy so the caller's frame is left untouched
    out = df.copy()

    # convert boolean variables into float format
    out["Weekend"] = out["Weekend"].astype(float)
    out["Revenue"] = out["Revenue"].astype(float)

    # convert month values into numbers, dummy-variables are not necessary since
    # the boosted tree can also detect non-linear patterns
    if not pd.api.types.is_numeric_dtype(out["Month"]):
        # .map yields a numeric column directly, so get_dummies below never
        # sees Month as a text column and never one-hot encodes it
        month_numbers = out["Month"].map(month_to_number)
        unknown = out.loc[month_numbers.isna() & out["Month"].notna(), "Month"].unique()
        if len(unknown) > 0:
            raise ValueError(
                "Unknown month label(s) in 'Month': " + ", ".join(sorted(map(str, unknown)))
            )
        out["Month"] = month_numbers

    # rename value of visitor type "other"; 'Different' sorts first, so it is the
    # level removed by drop_first and the two named visitor dummies remain
    out["VisitorType"] = out["VisitorType"].replace({'Other': 'Different'})

    # convert everything to floats and dummies
    return pd.get_dummies(out, drop_first=True).astype(float)

In [19]:
# run the preprocessing function on the raw frame and keep an
# independent (deep) copy of the cleaned result to work with
data = preprocess_data(df).copy(deep=True)

## 2. Instantiate Boosted Tree Model

In [21]:
# import relevant packages
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pickle

### Create Training and Test Set

In [22]:
# target: the Revenue column; features: every other column
y = data["Revenue"]
X = data.drop("Revenue", axis=1)
# hold out 5% of the rows as "new" shoppers; the fixed seed makes the split
# (and therefore the saved CSVs, the trained model and the report) identical
# on every Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)

### Instantiate and Train Model

In [23]:
# build the boosted-tree classifier (one hyperparameter per line) and
# fit it on the training split in the same expression
xgb = XGBClassifier(
    random_state=1,
    max_depth=20,
    n_estimators=500,
    learning_rate=0.01,
    reg_alpha=6,
).fit(X_train, y_train)

### Save Train and Test Data

In [26]:
# X_train.index / X_test.index hold index *labels* of `data`, so the rows are
# selected with the label-based .loc indexer (positional .iloc is only correct
# while `data` happens to carry a default 0..n-1 index)
data.loc[X_train.index, :].to_csv("../Data/online_shoppers_app_dev.csv", index=False)
# the held-out rows are stored without the target column
data.loc[X_test.index, :].drop("Revenue", axis=1).to_csv("../Data/new_shoppers.csv", index=False)

### Save Model

In [27]:
filename = 'finalized_default_model.sav'
# the context manager closes (and flushes) the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(xgb, model_file)

## 3. Check whether saved model works by predicting test data

In [31]:
# read the held-out feature rows back from disk and display them
new_customers = pd.read_csv(filepath_or_buffer="../Data/new_shoppers.csv")
new_customers

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,VisitorType_New_Visitor,VisitorType_Returning_Visitor
0,4.0,68.400000,0.0,0.000,14.0,337.533333,0.000000,0.023529,0.000000,0.0,8.0,2.0,2.0,7.0,2.0,0.0,1.0,0.0
1,0.0,0.000000,0.0,0.000,12.0,259.500000,0.008333,0.033333,0.000000,0.0,5.0,2.0,4.0,6.0,6.0,0.0,0.0,1.0
2,10.0,262.000000,0.0,0.000,22.0,623.416667,0.018519,0.045267,0.000000,0.2,5.0,3.0,2.0,1.0,2.0,0.0,0.0,1.0
3,13.0,340.591667,0.0,0.000,43.0,863.346825,0.000000,0.006731,40.010284,0.0,11.0,2.0,2.0,4.0,1.0,0.0,0.0,1.0
4,0.0,0.000000,0.0,0.000,2.0,13.000000,0.000000,0.100000,0.000000,0.0,12.0,1.0,1.0,3.0,2.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.0,0.000000,0.0,0.000,6.0,117.500000,0.000000,0.033333,0.000000,0.8,2.0,1.0,1.0,1.0,3.0,1.0,0.0,1.0
613,0.0,0.000000,0.0,0.000,3.0,117.000000,0.066667,0.100000,0.000000,0.0,5.0,3.0,2.0,1.0,11.0,0.0,0.0,1.0
614,0.0,0.000000,0.0,0.000,1.0,0.000000,0.200000,0.200000,0.000000,0.0,3.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0
615,9.0,250.222222,5.0,343.875,82.0,5076.380556,0.000000,0.010982,28.417081,0.0,12.0,3.0,2.0,1.0,13.0,0.0,0.0,1.0


In [33]:
# load model and make predictions
# NOTE: pickle.load can execute arbitrary code - only load model files that
# were written by this notebook or come from a trusted source
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
preds = loaded_model.predict(new_customers)

In [34]:
# compare the reloaded model's predictions with the held-out labels and
# print per-class precision / recall / f1
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       529
         1.0       0.72      0.66      0.69        88

    accuracy                           0.91       617
   macro avg       0.83      0.81      0.82       617
weighted avg       0.91      0.91      0.91       617

