In [1]:
# Load the data from the Apporto machine to the Colab environment.
# files.upload() is Colab's upload helper; as the cell output shows, the chosen
# CSV is saved under its original file name, which is the name the later
# pd.read_csv call reads from.

from google.colab import files
# `uploaded` is not referenced again in the visible cells; presumably it maps
# each uploaded file name to its contents -- TODO confirm against the Colab docs.
uploaded = files.upload()

Saving online_shoppers_intention1stSelection.csv to online_shoppers_intention1stSelection.csv


In [2]:
# Pandas is the Python package for data frames

import pandas as pd

In [3]:
# Part 1 Data Acquisition

# Name of the uploaded CSV file
csv_name = 'online_shoppers_intention1stSelection.csv'

# Load the CSV contents into a data frame
df = pd.read_csv(csv_name)

# Show the first ten rows to check whether this is an individual-level data set
first_ten = df.head(10)
print(first_ten)

# Show the list of variable (column) names
variable_names = df.columns.values
print(variable_names)

# Show the dimensions to confirm the portrait shape:
# the first element is the number of rows, the second the number of columns
n_rows, n_cols = df.shape
print((n_rows, n_cols))

   Administrative  Informational  ...  Weekend  Revenue
0               0              0  ...    False    False
1               0              0  ...    False    False
2               0              0  ...    False    False
3               0              0  ...    False    False
4               0              0  ...     True    False
5               0              0  ...    False    False
6               0              0  ...    False    False
7               1              0  ...     True    False
8               0              0  ...    False    False
9               0              0  ...    False    False

[10 rows x 12 columns]
['Administrative' 'Informational' 'ProductRelated'
 'ProductRelated_Duration' 'BounceRates' 'ExitRates' 'PageValues'
 'SpecialDay' 'Month' 'VisitorType' 'Weekend' 'Revenue']
(12330, 12)


In [4]:
# Part 2 Variable Lists

# Split the variable names into two groups used later for column indexing:
# nvar_list holds the variables treated as numerical,
# cvar_list holds the variables treated as categorical.
nvar_list = [
    'Administrative',
    'Informational',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
]
cvar_list = [
    'SpecialDay',
    'Month',
    'VisitorType',
    'Weekend',
    'Revenue',
]

In [5]:
# Part 3 Missing Value Imputation

# Count the missing entries of every variable in the data frame;
# the counts are displayed as the cell's result
missing_per_variable = df.isna().sum()
missing_per_variable

Administrative             0
Informational              0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [6]:
# Part 4 Variable Transformation

# Standardize the numerical variables to z-scores: subtract each column's mean
# and divide by its standard deviation.
# NOTE(review): the mean and standard deviation are computed over every row,
# including rows that later end up in the test partition. Computing them on the
# non-test partition only would avoid leaking test information into the scaling.
df_sample = df.copy()
df_sample1 = df_sample.copy()
df_sample1[nvar_list] = (df_sample[nvar_list] - df_sample[nvar_list].mean())/df_sample[nvar_list].std()

# Set the datatype for the variables in the cvar_list to be categorical in Python
# Set the datatype for the variables in the nvar_list to be numerical in Python
df_sample2 = df_sample1.copy()
df_sample2[cvar_list] = df_sample1[cvar_list].astype('category')
df_sample2[nvar_list] = df_sample1[nvar_list].astype('float64')

# Convert the categorical variables into dummies (Step 1 of dummy coding)
# prefix_sep is the symbol placed between the variable name and the category
# value when the dummy variable names are built (e.g. Month_Aug).
# dtype is pinned to uint8 so the dummies are 0/1 integers on every pandas
# version; from pandas 2.0 the default dummy dtype is bool, which would turn
# the displayed 0/1 values into False/True.
df_sample3 = pd.get_dummies(df_sample2, prefix_sep='_', dtype='uint8')

# Remove the redundant dummies (Step 2 of dummy coding)
# Placeholder variable: rdummies
# For k-NN model, we need to keep all the dummies for categorical predictors,
# so only Revenue_False is dropped and Revenue_True remains.
rdummies = ['Revenue_False']
df_sample4 = df_sample3.drop(columns=rdummies)

# Get the remaining variable list after the variable transformation
print(df_sample4.columns.values)

# Display the milestone dataframe. Compare it with the original dataframe.
print(df_sample4)
print(df)


['Administrative' 'Informational' 'ProductRelated'
 'ProductRelated_Duration' 'BounceRates' 'ExitRates' 'PageValues'
 'SpecialDay_0.0' 'SpecialDay_0.2' 'SpecialDay_0.4' 'SpecialDay_0.6'
 'SpecialDay_0.8' 'SpecialDay_1.0' 'Month_Aug' 'Month_Dec' 'Month_Feb'
 'Month_Jul' 'Month_June' 'Month_Mar' 'Month_May' 'Month_Nov' 'Month_Oct'
 'Month_Sep' 'VisitorType_New_Visitor' 'VisitorType_Other'
 'VisitorType_Returning_Visitor' 'Weekend_False' 'Weekend_True'
 'Revenue_True']
       Administrative  Informational  ...  Weekend_True  Revenue_True
0           -0.696965      -0.396462  ...             0             0
1           -0.696965      -0.396462  ...             0             0
2           -0.696965      -0.396462  ...             0             0
3           -0.696965      -0.396462  ...             0             0
4           -0.696965      -0.396462  ...             1             0
...               ...            ...  ...           ...           ...
12325        0.206164      -0.396462  .

In [8]:
# Part 5 Data Partition

# Required package: scikit-learn. Package name in Python: sklearn
# Required subpackage: model_selection. Required function name: train_test_split
from sklearn.model_selection import train_test_split

# Placeholder variables: df4partition, testpart_size
# testpart_size is the share of rows assigned to the test partition (passed as test_size)
df4partition = df_sample4
testpart_size = 0.2

# random_state is the seed for the random number generator.
# random_state = 1 unless otherwise noted
partitions = train_test_split(
    df4partition,
    test_size=testpart_size,
    random_state=1,
)
df_nontestData, df_testData = partitions

print(df_nontestData)

       Administrative  Informational  ...  Weekend_True  Revenue_True
3339        -0.696965      -0.396462  ...             0             0
10953       -0.696965      -0.396462  ...             0             0
8536        -0.696965      -0.396462  ...             0             0
7766         1.109294      -0.396462  ...             0             0
8280         0.808251      -0.396462  ...             0             0
...               ...            ...  ...           ...           ...
10955        0.507207      -0.396462  ...             0             0
905          0.206164       3.540061  ...             1             1
5192        -0.395922      -0.396462  ...             0             0
12172       -0.094879      -0.396462  ...             0             1
235          0.206164       3.540061  ...             0             0

[9864 rows x 29 columns]


In [9]:
# Part 6 Nearest neighbor

# Required package: scikit-learn. Package name in Python: sklearn
# Required subpackage: neighbors
# Required function name: KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Placeholder variable: DV (name of the dependent-variable column)
DV = 'Revenue_True'

# X keeps every column of the non-test partition except the DV (the predictors);
# y is the DV column on its own
X = df_nontestData.drop(columns=[DV])
y = df_nontestData[DV]


In [10]:
# Run Nearest neighbor with k-fold cross validation
from sklearn.model_selection import GridSearchCV

# Placeholder variable: kfolds (number of cross-validation folds)
kfolds = 5

# Search scope: every k (k-th nearest neighbor) from 1 up to and including max_k
max_k = 200
candidate_ks = list(range(1, max_k + 1))
param_grid = {'n_neighbors': candidate_ks}

# The search looks for the model that maximizes whatever the scoring
# function - for this case roc_auc - returns.
# n_jobs is set to -1 to run on all CPU cores.
knn_model = KNeighborsClassifier(metric='euclidean')
gridsearch = GridSearchCV(
    knn_model,
    param_grid,
    scoring='roc_auc',
    cv=kfolds,
    n_jobs=-1,
)
gridsearch.fit(X, y)

# Keep the best model found by the search
clf_bestkNN = gridsearch.best_estimator_


In [11]:
# Metric used to score the selected model on the test partition
from sklearn.metrics import roc_auc_score

# Display the optimal k (k-th nearest neighbor)
best_k = clf_bestkNN.n_neighbors
print(best_k)

# X_test is the predictor values in the test partition
X_test = df_testData.drop(columns=[DV])

# y_test_actual is the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

# Get the AUC of the final selected k-NN model.
# Column 1 of the predict_proba output is used as the score of each test row.
test_scores = clf_bestkNN.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test_actual, test_scores))

83
0.8954173486088379
