In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [8]:
# Load the dataset CSV into `data`.
# The path is written as a raw string: in a normal string literal "\F" and
# "\o" are invalid escape sequences that Python only tolerates with a
# warning, so the raw form yields the same path without relying on that.
# NOTE(review): this is an absolute, machine-specific location - consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(r"E:\Final Project\online_shoppers_intention.csv")

In [9]:
# Month labels numbered 1-12 in calendar order (note the spelling 'June').
months = {
    label: number
    for number, label in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Translate labels to numbers; any label missing from `months` maps to NaN
# and that row is dropped.
data = data.assign(Month=data['Month'].map(months)).dropna(subset=['Month'])

# Cyclical encoding: project the month onto sine/cosine of its angle on a
# 12-step circle, then discard the raw month column.
month_angle = 2 * np.pi * data['Month'].astype(int) / 12
data = data.assign(
    Month_sin=np.sin(month_angle),
    Month_cos=np.cos(month_angle),
).drop(columns='Month')

In [10]:
# Columns that get one dummy column per level (first level dropped).
cats = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']

# VisitorType is encoded separately so its dummies carry the shorter
# 'Visitor' prefix instead of the column name.
visit = pd.get_dummies(data['VisitorType'], prefix='Visitor', drop_first=True)

# Encode the remaining categorical columns, then append the visitor dummies
# as the last columns of the frame.
encoded = pd.get_dummies(
    data.drop(columns='VisitorType'),
    columns=cats,
    drop_first=True,
)
data = pd.concat([encoded, visit], axis=1)

In [11]:
# Cast the Weekend and Revenue columns to integers in one pass
# (presumably True/False flags in the raw file - TODO confirm).
data = data.astype({'Weekend': int, 'Revenue': int})

In [12]:
# Columns replaced by log(1 + x).
log_cols = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
    'PageValues',
]

# Apply the transform to all listed columns at once.
# NOTE(review): log1p is only finite for values greater than -1 - confirm
# these columns contain no -1 sentinel values.
data[log_cols] = np.log1p(data[log_cols])

In [13]:
# Numeric columns to standardize (the log-transformed columns included).
scale_cols = ['Administrative','Informational','ProductRelated','BounceRates','ExitRates','SpecialDay'] + log_cols

# Split BEFORE scaling. Fitting the scaler on the full frame would let the
# held-out rows influence the mean/variance applied to the training rows
# (data leakage) and make the test metrics optimistic.
X = data.drop('Revenue', axis=1)
y = data['Revenue']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Work on explicit copies so the column assignments below do not write into
# slices of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to the test rows.
# `data` and `X` are intentionally left unscaled; only X_train / X_test
# carry standardized values.
scaler = StandardScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])
