Project 1 Shopper Prediction
Maxwell Warren - msw8gh
INFOTC 3040

Import modules / dependencies

In [240]:
import numpy as np
import math
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
import matplotlib.pyplot as plt

Load Data from shopping.xlsx using pandas library

In [241]:
def loadData(filename):
    """Load an Excel workbook into a pandas DataFrame.

    Args:
        filename: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel(filename)``.
    """
    frame = pd.read_excel(filename)
    return frame

# Load the training workbook first, then the unseen workbook used for testing.
rawShoppingData, rawTestingData = [
    loadData(workbook) for workbook in ("shopping.xlsx", "unseen.xlsx")
]

Clean the data (converting strings / bools into numerical representation)
We are going to create a dictionary that maps each string option to an integer

In [242]:
# Integer codes for the categorical (string) columns, keyed by column name.
# NOTE(review): June is spelled "June" while every other month uses a
# three-letter abbreviation -- presumably this matches the spelling in the
# spreadsheet; confirm against the data.
stringConverter = {
    "Month": {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "June": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12
    },

    "VisitorType": {
        "New_Visitor": 2,
        "Returning_Visitor": 1,
        "Other": 3
    }
}


def cleanData(data):
    """Return a numeric copy of ``data``.

    The ``Month`` and ``VisitorType`` columns are replaced by their integer
    codes from ``stringConverter`` and the ``Weekend`` and ``Revenue``
    columns are cast to ``int``.

    Args:
        data: DataFrame containing at least the ``Month``, ``VisitorType``,
            ``Weekend`` and ``Revenue`` columns.

    Returns:
        A new DataFrame with the four columns converted. Values that are not
        keys of the corresponding ``stringConverter`` mapping become NaN
        (the behaviour of ``Series.map``).
    """
    # Work on a copy so the caller's frame is left untouched. Converting in
    # place would overwrite the raw data, and cleaning the same frame a second
    # time would map the already-converted integers to NaN.
    cleaned = data.copy()

    cleaned['Month'] = cleaned['Month'].map(stringConverter['Month'])
    cleaned['VisitorType'] = cleaned['VisitorType'].map(stringConverter['VisitorType'])

    cleaned['Weekend'] = cleaned['Weekend'].astype(int)
    cleaned['Revenue'] = cleaned['Revenue'].astype(int)

    return cleaned


# Convert both datasets to their numeric form (training set first, then the test set).
cleanShoppingData, cleanTestingData = [
    cleanData(frame) for frame in (rawShoppingData, rawTestingData)
]

We need to split the data into X and y, where X contains training examples and y contains the outcome
X will be 5000 x 17 and y will be 5000 x 1

In [243]:
# Training split: the first 17 columns are the features, "Revenue" is the outcome.
X_train, y_train = (
    cleanShoppingData.iloc[:, :17].to_numpy(),
    cleanShoppingData["Revenue"].to_numpy(),
)

# Testing split, built the same way from the unseen data.
X_test, y_test = (
    cleanTestingData.iloc[:, :17].to_numpy(),
    cleanTestingData["Revenue"].to_numpy(),
)

# Indexing convention: X[row][col], y[row]

The project description says we need to test 3 different feature scaling methods: MinMax, Mean, and Z-Score

Since there is a lot of shared code / formulas, I will combine all 3 methods into one parent function with a parameter to control the type of scaling

In [244]:
def scaleData(data, sType, reference=None):
    """Feature-scale every column of a 2-D array.

    Args:
        data: 2-D array-like of shape (m, n): one row per example, one column
            per feature.
        sType: Scaling method, one of
            "Minmax"  -> (x - min) / (max - min)
            "Mean"    -> (x - mean) / (max - min)
            "Z-Score" -> (x - mean) / std
        reference: Optional 2-D array-like with the same number of columns as
            ``data``. When given, the per-column min / max / mean / std are
            taken from it instead of from ``data`` (for example, to scale a
            test set with the training set's statistics). Defaults to
            ``data`` itself.

    Returns:
        A new float array with the same shape as ``data``. Columns whose
        statistics source is constant are shifted but not divided, so they
        come out as 0 instead of NaN.

    Raises:
        ValueError: If ``sType`` is not one of the three methods, if an input
            is not 2-D, or if ``reference`` has a different column count.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (examples, features)")

    stats_source = data if reference is None else np.asarray(reference, dtype=float)
    if stats_source.ndim != 2 or stats_source.shape[1] != data.shape[1]:
        raise ValueError("reference must be 2-D with the same number of columns as data")

    # Per-column statistics, computed once for the whole matrix.
    col_min = stats_source.min(axis=0)
    col_max = stats_source.max(axis=0)
    col_mean = stats_source.mean(axis=0)
    col_std = stats_source.std(axis=0)

    # A constant column has max == min (and std == 0); dividing by that would
    # produce NaN/inf, so use a divisor of 1 for those columns instead.
    is_constant = col_max == col_min
    col_range = np.where(is_constant, 1.0, col_max - col_min)
    col_std = np.where(is_constant, 1.0, col_std)

    if sType == "Minmax":
        return (data - col_min) / col_range
    if sType == "Mean":
        return (data - col_mean) / col_range
    if sType == "Z-Score":
        return (data - col_mean) / col_std

    raise ValueError(
        f"Unknown scaling type {sType!r}; expected 'Minmax', 'Mean' or 'Z-Score'"
    )


Next, I am going to define the sigmoid function as well as a function that initializes all feature weights to 0. Theta is often used to represent the weights.
I am also going to add a function that adds the bias column to a dataset, since this will be useful once we are done feature scaling the data.

In [245]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied elementwise.

    Args:
        z: Scalar or NumPy array of raw scores.

    Returns:
        Value(s) in [0, 1] with the same shape as ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def init_weights(n):
    """Return a weight vector of ``n`` zeros.

    ``n`` is the number of weights to create; when the bias column has
    already been added to the data it must include that bias weight.
    """
    return np.zeros(n)

def add_bias_column(X):
    """Return a copy of ``X`` with a leading column of ones (the bias term).

    Args:
        X: Array with one row per example.

    Returns:
        Array with one more column than ``X``; column 0 is all ones.
    """
    ones = np.ones(X.shape[0])
    return np.column_stack((ones, X))
 

Next, we are going to implement the logistic cost function as well as a cost function with L2 regularization

In [246]:
def cost(X, y, theta):
    """Mean logistic (binary cross-entropy) cost of the weights ``theta``.

    Args:
        X: (m, n) feature matrix, bias column included.
        y: Length-m array of 0/1 labels.
        theta: Length-n weight vector.

    Returns:
        The average of -[y*log(h) + (1-y)*log(1-h)] over the m examples,
        where h = sigmoid(X @ theta).
    """
    y = np.asarray(y)
    m = len(y)
    z = np.dot(X, theta)

    # With h = sigmoid(z):  -log(h) = logaddexp(0, -z)  and
    # -log(1 - h) = logaddexp(0, z).  Working from z directly keeps the cost
    # finite when the sigmoid saturates; taking log(h) / log(1 - h) of a
    # rounded 0 or 1 would give 0 * -inf = NaN.
    per_example = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)

    return (1 / m) * np.sum(per_example)

def cost_function_with_L2(X, y, theta, lambda_):
    """Mean logistic cost plus an L2 penalty on the non-bias weights.

    Args:
        X: (m, n) feature matrix, bias column first.
        y: Length-m array of 0/1 labels.
        theta: Length-n weight vector; ``theta[0]`` is the bias weight.
        lambda_: Regularization strength.

    Returns:
        The cross-entropy cost plus (lambda_ / (2m)) * sum(theta[1:] ** 2).
    """
    y = np.asarray(y)
    theta = np.asarray(theta)
    m = len(y)
    z = np.dot(X, theta)

    # With h = sigmoid(z):  -log(h) = logaddexp(0, -z)  and
    # -log(1 - h) = logaddexp(0, z).  Working from z directly keeps the cost
    # finite when the sigmoid saturates; taking log(h) / log(1 - h) of a
    # rounded 0 or 1 would give 0 * -inf = NaN.
    per_example = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
    data_cost = (1 / m) * np.sum(per_example)

    # The bias weight theta[0] is deliberately left out of the penalty.
    regularization_term = (lambda_ / (2 * m)) * np.sum(np.square(theta[1:]))

    return data_cost + regularization_term

The next step is to use gradient descent to optimize the weights so that the cost function is minimized. We will define one function below for each of the cost variations

In [247]:
def gradient_descent(X, y, theta, learning_rate, iterations):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        X: (m, n) feature matrix, bias column included.
        y: Length-m array of 0/1 labels.
        theta: Length-n vector of starting weights (not modified).
        learning_rate: Step size applied to the gradient on every iteration.
        iterations: Number of update steps to run.

    Returns:
        A new float array holding the weights after the last update.
    """
    m = len(y)  # number of training examples

    # Work on a float copy so the caller's array is never overwritten and an
    # integer-typed starting vector does not break the in-place update below.
    theta = np.array(theta, dtype=float)

    for _ in range(iterations):
        h = sigmoid(np.dot(X, theta))  # predicted probabilities

        gradient = (1 / m) * np.dot(X.T, (h - y))  # gradient of the cost function
        theta -= learning_rate * gradient  # update weights

    return theta


def gradient_descent_with_L2(X, y, theta, learning_rate, iterations, lambda_):
    """Fit logistic-regression weights with L2-regularized gradient descent.

    Args:
        X: (m, n) feature matrix, bias column first.
        y: Length-m array of 0/1 labels.
        theta: Length-n vector of starting weights (not modified);
            ``theta[0]`` is the bias weight.
        learning_rate: Step size applied to the gradient on every iteration.
        iterations: Number of update steps to run.
        lambda_: Regularization strength.

    Returns:
        A new float array holding the weights after the last update.
    """
    m = len(y)

    # Work on a float copy so the caller's array is never overwritten and an
    # integer-typed starting vector does not break the in-place update below.
    theta = np.array(theta, dtype=float)

    for _ in range(iterations):
        h = sigmoid(np.dot(X, theta))  # predicted probabilities

        # One matrix product gives the data gradient for every weight.
        gradient = (1 / m) * np.dot(X.T, (h - y))

        # The L2 penalty applies to the feature weights only, so the bias
        # weight theta[0] keeps its unregularized gradient.
        gradient[1:] += (lambda_ / m) * theta[1:]

        theta -= learning_rate * gradient

    return theta

Implement prediction and accuracy functions for model testing


In [248]:
def predict(X_test, theta):
    """Classify each row of ``X_test`` as 1 or 0.

    Args:
        X_test: Feature matrix, bias column included.
        theta: Weight vector.

    Returns:
        A list of ints: 1 where sigmoid(X_test @ theta) >= 0.5, otherwise 0.
    """
    scores = np.dot(X_test, theta)
    return [int(prob >= 0.5) for prob in sigmoid(scores)]

def getAccuracy(y_true, y_pred):
    """Return the percentage (0-100) of predictions that match the labels.

    Args:
        y_true: Array-like of true labels.
        y_pred: Array-like of predicted labels, one per true label.

    Returns:
        100 * (number of matching positions) / (number of labels).

    Raises:
        ValueError: If the two inputs do not hold the same number of labels.
    """
    # Convert and flatten both inputs so the comparison is always elementwise.
    # Two plain lists would otherwise compare as a single True/False, and an
    # (m, 1) column against an (m,) vector would broadcast to an m x m grid.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} labels but y_pred has {y_pred.size}"
        )

    return np.mean(y_true == y_pred) * 100

Create a master function that trains, tests, and logs a model for every scaling type and finds the best one

In [249]:
def _prepare_scaled_features(sType):
    """Scale the global train/test features with ``sType`` and add the bias column to each."""
    X_train_bias = add_bias_column(scaleData(X_train, sType))  # scale before adding bias
    X_test_bias = add_bias_column(scaleData(X_test, sType))
    return X_train_bias, X_test_bias


def test_models(sTypes, learning_rate=0.01, iterations=5000, lambda_=0.05):
    """Train and evaluate one model per scaling type, then retrain the best with L2.

    For every entry of ``sTypes`` the global ``X_train`` / ``X_test`` are
    scaled, a model is trained without regularization, and its accuracy on
    the test set is printed. The scaling type with the highest test accuracy
    (the first one on ties) is then used to train a second model with L2
    regularization, whose test accuracy is printed as well.

    NOTE(review): the test set is scaled with its own statistics rather than
    the training set's, and the scaling method is chosen by test accuracy.
    Both can make the reported accuracy optimistic -- confirm this is what
    the assignment intends.

    Args:
        sTypes: Scaling type names understood by ``scaleData``.
        learning_rate: Gradient-descent step size.
        iterations: Number of gradient-descent steps per model.
        lambda_: L2 regularization strength used for the retrained model.

    Raises:
        ValueError: If ``sTypes`` is empty.
    """
    sTypes = list(sTypes)
    if not sTypes:
        raise ValueError("sTypes must contain at least one scaling type")

    print("Testing all scaling types WITHOUT any regularization.\n")

    # Start with no winner (rather than accuracy 0 / method "") so that a
    # method is always selected, even if every model scores 0%.
    best_accuracy = None
    best_scaling_method = None
    best_features = None

    for sType in sTypes:
        print("Testing model with", sType, "scaling")

        X_train_bias, X_test_bias = _prepare_scaled_features(sType)

        # Initialize weights (all zeros)
        theta_initial = init_weights(X_train_bias.shape[1])

        # Train the model using gradient descent without regularization
        optimized_theta = gradient_descent(
            X_train_bias, y_train, theta_initial, learning_rate, iterations
        )

        y_pred_test = predict(X_test_bias, optimized_theta)
        test_accuracy = getAccuracy(y_test, y_pred_test)

        # Strict ">" keeps the earliest method when accuracies tie.
        if best_accuracy is None or test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_scaling_method = sType
            best_features = (X_train_bias, X_test_bias)

        print(f"Accuracy with {sType} scaling: {test_accuracy}%\n")

    # Output the best scaling method
    print(f"{best_scaling_method} has the best accuracy. We are going to retrain the model using L2 regularization with this scaling method, per the instructions.\n")

    ## Retraining with the most effective scaling method (add L2 Regularization)

    # Reuse the matrices already built for the winning method.
    X_train_bias, X_test_bias = best_features

    theta_initial = init_weights(X_train_bias.shape[1])

    optimized_theta_L2 = gradient_descent_with_L2(
        X_train_bias, y_train, theta_initial, learning_rate, iterations, lambda_
    )

    y_pred_test_L2 = predict(X_test_bias, optimized_theta_L2)

    test_accuracy_L2 = getAccuracy(y_test, y_pred_test_L2)

    print(f"Model Accuracy with L2 Regularization using {best_scaling_method} scaling: {test_accuracy_L2}%")


# Compare the three scaling methods, then retrain the best one with L2 regularization.
test_models(sTypes=["Minmax", "Mean", "Z-Score"])


Testing all scaling types WITHOUT any regularization.

Testing model with Minmax scaling
Accuracy with Minmax scaling: 84.8%

Testing model with Mean scaling
Accuracy with Mean scaling: 84.8%

Testing model with Z-Score scaling
Accuracy with Z-Score scaling: 86.6%

Z-Score has the best accuracy. We are going to retrain the model using L2 regularization with this scaling method, per the instructions.

Model Accuracy with L2 Regularization using Z-Score scaling: 86.6%
