In [None]:
# install

!pip install sklearn
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=bcc4435805a0f0c4ddfe109124311277a05e9e152295bb082449ca1d4457afa7
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 81 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [None]:
# imports

import pickle
import random
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
#from tensorflow.keras.utils.vis_utils import plot_model
from catboost import CatBoostClassifier
#from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, SCORERS, RocCurveDisplay, roc_curve
from sklearn.model_selection import train_test_split,KFold, RandomizedSearchCV

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# sanity check: the cell output is the name of the GPU device TensorFlow reports
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
#     ***** FEATURES ORDER (columns 0-12 of every page vector; the last column, 13, holds the label): *****

#     "num_page": num_page,
#     "num_category": len(cat_dict), 
#     "max_category": cat_dict[max_cat], 
#     "num_product": len(product_dict),
#     "max_product": product_dict[max_product],
#     "cur_type": row['event_type'], 
#     "num_cart": type_dict['cart'],
#     "num_remove_cart": type_dict['remove_from_cart'],
#     "num_view": type_dict['view'],
#     "price": row['price'],
#     "max_price": max_price,
#     "min_price": min_price,
#     "mean_price": mean_price 
# 
#     ***** EVENT TYPES: *****
# cart = 0, remove_from_cart = 1, view = 2, purchase = 3

In [None]:
def load_dataset(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# make a balanced dataset by undersampling the majority class (not randomly)
def balance_dataset(X, Y):
    """Undersample the larger of the two classes so both have the same size.

    Samples are kept in their original order; for each class the first
    ``min(#zeros, #ones)`` samples are retained and the rest are dropped.
    Previously only class 1 was ever cut down, so a dataset whose majority
    class was 0 came back unbalanced. Results are unchanged when class 1 is
    the majority.

    Args:
        X: sequence of samples (anything iterable in step with ``Y``).
        Y: sequence of binary labels; a label counts as class 1 when
           ``int(y)`` is non-zero, otherwise as class 0.

    Returns:
        ``(X_balanced, Y_balanced)`` as numpy arrays. If one of the classes
        is absent the result is empty, because no balanced subset exists.
    """
    # normalise every label to 0/1 once so counting and filtering agree
    classes = [1 if int(y) else 0 for y in Y]
    count_ones = sum(classes)
    count_zeros = len(classes) - count_ones
    quota = min(count_zeros, count_ones)

    kept_per_class = [0, 0]
    X_input = []
    Y_input = []
    for x, y, cls in zip(X, Y, classes):
        if kept_per_class[cls] < quota:
            X_input.append(x)
            Y_input.append(y)
            kept_per_class[cls] += 1

    return np.array(X_input), np.array(Y_input)


# make balanced dataset using oversampling
def oversampling_balance(X, Y):
    """Randomly oversample the minority class until it matches the majority.

    Implemented with numpy only: the previous version called imblearn's
    ``RandomOverSampler``, whose import is commented out in this notebook
    (NameError), and then reshaped the enlarged array back to the *original*
    shape, which cannot work once rows have been added.

    Args:
        X: array of samples with the sample axis first (any number of
           trailing dimensions).
        Y: array of labels, one per sample.

    Returns:
        ``(X_resampled, y_resampled)``: the original samples in their
        original order, followed by minority-class samples drawn at random
        with replacement. Trailing dimensions of ``X`` are preserved.
        Inputs that are already balanced, or that contain a single class,
        are returned unchanged.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    classes, counts = np.unique(Y, return_counts=True)
    if len(classes) < 2:
        return X, Y

    n_missing = int(counts.max() - counts.min())
    if n_missing == 0:
        return X, Y

    minority_label = classes[np.argmin(counts)]
    minority_idx = np.flatnonzero(Y == minority_label)
    # draw the extra minority samples with replacement
    extra_idx = np.random.choice(minority_idx, size=n_missing, replace=True)

    keep = np.concatenate([np.arange(len(Y)), extra_idx])
    return X[keep], Y[keep]


# extract a balanced test set from dataset
def balanced_testset(ds, split_ratio):
    """Carve a class-balanced test set out of `ds`.

    The label of a session is read from the last column of its first row.
    Walking through `ds` in order, the first `int(len(ds) * split_ratio / 2)`
    sessions labelled 1 and the first as many labelled 0 go to the test set;
    every other session goes to the train set.

    Returns:
        (testds, trainds)
    """
    per_class_quota = int((len(ds) * split_ratio) / 2)
    taken = {0: 0, 1: 0}
    in_test = []

    for session in ds:
        label = session[0, -1]
        chosen = False
        for cls in (1, 0):
            if label == cls and taken[cls] < per_class_quota:
                taken[cls] += 1
                chosen = True
        in_test.append(chosen)

    in_test = np.array(in_test, dtype=bool)
    return ds[in_test], ds[~in_test]

# extract X and Y label from all sessions
def extract_XY(X):
    """Split a 3-D session array into features and labels.

    The label of each session is taken from the last column of its first
    row; the features are every column except the last one.

    Returns:
        (features, labels)
    """
    labels = np.array(list(X[:, 0, -1]))
    features = X[:, :, :-1]
    return features, labels

# split dataset to test, validation, train
def split_dataset(X, Y, test_size, val_size):
    """Split (X, Y) into train, test and validation parts.

    The test part is split off first with `test_size`; the validation part
    is then split off from what remains, so `val_size` is relative to the
    remainder, not to the full dataset. No random seed is fixed here.

    Returns:
        X_train, X_test, X_val, y_train, y_test, y_val
    """
    X_rest, X_test, y_rest, y_test = train_test_split(X, Y, test_size=test_size)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=val_size)
    return X_train, X_test, X_val, y_train, y_test, y_val


# for Catboost model only the last page view and its features is needed
def make_dataset2(ds):
    """Reduce every session to a single row: its last page before padding.

    A page whose first value equals 0 marks the end of the session
    (presumably zero padding -- confirm against the feature builder). The
    row just before the first such page is kept; when no such page exists,
    the session's final row is kept.

    Returns:
        numpy array with one row per session.
    """
    last_pages = []
    for session in ds:
        first_pad = next(
            (idx for idx, page in enumerate(session) if page[0] == 0.),
            None,
        )
        if first_pad is None:
            last_pages.append(session[-1])
        else:
            # as in the original, first_pad == 0 wraps around to session[-1]
            last_pages.append(session[first_pad - 1])
    return np.array(last_pages)

# extract X and Y from catboost dataset
def extract_CXY(ds):
    """Split a 2-D array into features (all but the last column) and labels (last column).

    Returns:
        (CX, CY)
    """
    targets = np.array(list(ds[:, -1]))
    inputs = ds[:, :-1]
    return inputs, targets

In [None]:
# attention layer

class attention(layers.Layer):
    """Additive attention over axis 1 of the input.

    Assumes the input is laid out as (batch, timesteps, features) -- TODO
    confirm against the caller. A score ``tanh(x . W + b)`` is computed per
    position on axis 1, turned into weights with a softmax along that axis,
    and used to scale the input.

    Args:
        return_sequences: if True the weighted input is returned as is; if
            False it is summed over axis 1.
        **kwargs: standard Keras ``Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, return_sequences=True, **kwargs):
        # initialise the base layer first so attribute tracking is set up
        # before any attribute is assigned
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        # W maps the last input axis to one score per position
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        # the bias has one entry per position on axis 1, so that axis must have a fixed size
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="normal")
        super(attention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def get_config(self):
        # include the constructor argument so the layer can be re-created from its config
        config = super(attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

In [None]:
# load train dataset
# (the commented-out line loads the full feature file instead; it is only needed
#  when re-creating the train/test split -- see the commented block further down)
# dataset = load_dataset('/content/drive/MyDrive/purchase_prediction_features.pickle')
dataset = load_dataset('/content/drive/MyDrive/dataset_karamuzi/purchase_prediction_trainset.pickle')
print("Type of dataset for GRU is: ", type(dataset))
print("Shape of dataset for GRU is: ", dataset.shape)
print("*"*30)

# class counts of the train dataset: the label is read from the last column of
# the first page of every session, and class 1 is counted by summing the labels
c = int(np.sum(dataset[:,0,-1]))
print("size of class 1: ", c)
print("size of class 0: ", len(dataset) - c)
print("Total: ", len(dataset))
print("*"*30)

# extract X (features) and Y (labels) for all sessions in the train dataset;
# X and Y are notebook-level names used by the GRU training functions
X, Y = extract_XY(dataset)
print(f"shape of X is: {X.shape} and shape of Y is {Y.shape}")

Type of dataset for GRU is:  <class 'numpy.ndarray'>
Shape of dataset for GRU is:  (408124, 100, 14)
******************************
size of class 1:  331800
size of class 0:  76324
Total:  408124
******************************
shape of X is: (408124, 100, 13) and shape of Y is (408124,)


In [None]:
# ********** extract a test set from the dataset using the balanced_testset() function **********
# Run this block only once:
#   1. In the cell above, uncomment load_dataset('/content/drive/MyDrive/purchase_prediction_features.pickle')
#      and comment out load_dataset('/content/drive/MyDrive/dataset_karamuzi/purchase_prediction_trainset.pickle').
#   2. Uncomment the code in this block and run it; it saves the results to
#      purchase_prediction_testset.pickle and purchase_prediction_trainset.pickle.
#   3. Comment this block out again and restore the original load_dataset line in the cell above.
#   4. Now you can run the cell above and the next cell to get the train set and the test set from Drive.

# test_set, train_set = balanced_testset(dataset, 0.02)

# path = '/content/drive/MyDrive/purchase_prediction_testset.pickle'
# with open(path, 'wb') as f:
#     pickle.dump(test_set, f, protocol=pickle.HIGHEST_PROTOCOL)

# path = '/content/drive/MyDrive/purchase_prediction_trainset.pickle'
# with open(path, 'wb') as f:
#     pickle.dump(train_set, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load test dataset
testdata = load_dataset('/content/drive/MyDrive/dataset_karamuzi/purchase_prediction_testset.pickle') 
print("Type of test dataset for GRU is: ", type(testdata))
print("Shape of test dataset for GRU is: ", testdata.shape)
print("*"*30)

# class counts of the test dataset: the label is read from the last column of
# the first page of every session (note: this reuses the notebook-level name `c`)
c = int(np.sum(testdata[:,0,-1]))
print("size of class 1: ", c)
print("size of class 0: ", len(testdata) - c)
print("Total: ", len(testdata))
print("*"*30)


# extract X (features) and Y (labels) for all sessions in the test dataset
Xtest, Ytest = extract_XY(testdata)
print(f"shape of X test is: {Xtest.shape} and shape of Y test is {Ytest.shape}")

Type of test dataset for GRU is:  <class 'numpy.ndarray'>
Shape of test dataset for GRU is:  (8328, 100, 14)
******************************
size of class 1:  4164
size of class 0:  4164
Total:  8328
******************************
shape of X test is: (8328, 100, 13) and shape of Y test is (8328,)


In [None]:
# add layers of GRU model in this function
# layers:
#   input layer
#   mask layer
#   GRU layer
#   dropout layer
#   pooling layer
#   Flatten layer
#   dense layer (feed forward)
#   dense layer (sigmoid)

def GRU_model(input_shape, units, lr, dr, pooling):
    """Build and compile the GRU classifier.

    Layer order: Input -> Masking(0.) -> one GRU per entry of `units` (all
    returning sequences) -> optional Dropout -> optional 1-D pooling ->
    Flatten -> Dense(128, relu) -> Dense(1, sigmoid).

    Args:
        input_shape: two-element sequence; its two entries are passed, in
            order, as the shape of the Input layer.
        units: iterable with the number of hidden units of each GRU layer.
        lr: learning rate of the Adam optimizer.
        dr: dropout rate; 0 disables the Dropout layer.
        pooling: 'max' for max pooling, 'avg' (or its alias 'mean') for
            average pooling, both with pool_size=3; any other value adds no
            pooling layer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, metrics
        'accuracy' and 'auc').
    """
    model = Sequential()

    model.add(layers.Input(shape=(input_shape[0], input_shape[1])))

    model.add(layers.Masking(mask_value=0.))

    for u in units:
        model.add(layers.GRU(u, return_sequences=True))

    # model.add(attention())

    if dr != 0:
        model.add(layers.Dropout(dr))

    if pooling == 'max':
        model.add(layers.MaxPooling1D(pool_size=3))
    elif pooling in ('avg', 'mean'):
        # 'mean' is accepted as an alias so it no longer silently disables pooling
        model.add(layers.AveragePooling1D(pool_size=3))

    model.add(layers.Flatten())

    model.add(Dense(128, activation='relu'))

    model.add(Dense(1, activation='sigmoid'))

    opt = tf.keras.optimizers.Adam(learning_rate=lr)

    # Name the AUC metric explicitly: built from the string 'AUC' it gets an
    # auto-generated name (auc, auc_1, auc_2, ...) that changes with every
    # model created in the same session, which breaks callbacks monitoring
    # 'val_auc' from the second model on.
    model.compile(loss='binary_crossentropy', optimizer=opt,
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    return model

In [None]:
def run_simple_GRU(units, lr, dr, pooling):
    """Train one GRU model on a single 95/5 split of the notebook-level X, Y.

    The 5% split is used both for early stopping and for the AUC printed at
    the end, so the printed score is a validation score, not a score on the
    separate test set.

    Args:
        units, lr, dr, pooling: forwarded unchanged to GRU_model().
    """
    # Reset Keras' global state so auto-generated layer/metric names start
    # from scratch; otherwise a second run in the same session can end up
    # with a metric called 'auc_1' and early stopping on 'val_auc' never fires.
    K.clear_session()

    # input shape to pass to GRU model
    input_shape = [X.shape[1], X.shape[2]]

    # split train dataset (fixed seed, so the split is the same on every run)
    X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.05, random_state=1)

    # build GRU model
    model = GRU_model(input_shape, units, lr, dr, pooling)

    # summary() prints by itself and returns None, so it must not be wrapped in print()
    model.summary()

    # early stopping on validation AUC, patience 5; restore the best weights
    # so the model scored below is the best one, not the one from the last epoch
    cb = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=5, mode='max',
                                          restore_best_weights=True)

    # class weights: each class is weighted by the frequency of the other class
    c1 = int(np.sum(y_train))
    c0 = len(y_train) - c1
    cw = {0: c1 / len(y_train), 1: c0 / len(y_train)}

    # fit model with early stopping and class weight, batch size 32, at most 50 epochs
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32,
                        epochs=50, class_weight=cw, verbose=1, callbacks=[cb])

    # predict on the held-out split
    preds = model.predict(X_val)

    # calculate auc
    auc = roc_auc_score(y_val, preds)

    print("AUC = ", auc)

    # RocCurveDisplay.from_predictions(y_val, preds)
    # plt.show()

    # plt.plot(history.history['accuracy'])
    # plt.plot(history.history['val_accuracy'])
    # plt.title('model accuracy')
    # plt.ylabel('accuracy')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()

In [None]:
# hyper parameters (candidate values that were tried)
# learning_rate : [0.005, 0.001, 0.0001, 0.0005]
# dropout : [0, 0.2, 0.25, 0.5]
# GRU layers : [1, 2, 3] --> hidden nodes of each layer: 128, 64, 32
# pooling : 'max' or 'avg' (these are the strings GRU_model checks for); use 'none' to skip the pooling layer

units = [128, 64]
lr = 0.001
dr = 0.2
pooling = 'max'

run_simple_GRU(units, lr, dr, pooling)

In [None]:
def run_kfold_GRU(units, lr, dr, pooling):
    """Same as run_simple_GRU, repeated over a shuffled k-fold split of X, Y.

    For every fold a fresh model is trained on the training part with early
    stopping on the validation AUC of the held-out part, and the ROC AUC on
    that held-out part is recorded. The per-fold AUCs and their mean are
    printed at the end.

    Args:
        units, lr, dr, pooling: forwarded unchanged to GRU_model().
    """
    input_shape = [X.shape[1], X.shape[2]]

    # AUC of each fold
    outs = []

    num_folds = 10

    # k fold cross validation
    kfold = KFold(n_splits=num_folds, shuffle=True)

    for train, test in kfold.split(X, Y):

        print("#"*50)

        # Drop the previous fold's model and reset Keras' name counters:
        # without this the metric of fold 2 can be called 'auc_1', so
        # monitoring 'val_auc' no longer works, and old models pile up in memory.
        K.clear_session()

        # index once per fold: every X[...] fancy-index makes a full copy
        X_tr, y_tr = X[train], Y[train]
        X_te, y_te = X[test], Y[test]

        model = GRU_model(input_shape, units, lr, dr, pooling)

        # summary() prints by itself and returns None
        model.summary()

        cb = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=5, mode='max',
                                              restore_best_weights=True)

        # class weights: each class is weighted by the frequency of the other class
        c1 = int(np.sum(y_tr))
        c0 = len(y_tr) - c1
        cw = {0: c1 / len(y_tr), 1: c0 / len(y_tr)}

        model.fit(X_tr, y_tr, validation_data=(X_te, y_te), batch_size=32,
                  epochs=50, class_weight=cw, verbose=1, callbacks=[cb])

        preds = model.predict(X_te)

        outs.append(roc_auc_score(y_te, preds))

    # average over the folds actually run (was a hard-coded division by 10)
    mean_auc = sum(outs) / len(outs)

    for i, x in enumerate(outs):
        print(f"GRU fold {i}", "  AUC = ", x)

    print("GRU Mean AUC: ", mean_auc)

In [None]:
# build catboost train dataset: one row per session (see make_dataset2)
Cds = make_dataset2(dataset) 
print("Shape of dataset for Catboost is: ", Cds.shape) 

# extract X and Y from catboost train dataset (Y is the last column);
# CX and CY are notebook-level names used by the CatBoost functions
CX, CY = extract_CXY(Cds)
print(f"shape of CX is: {CX.shape} and shape of CY is {CY.shape}")

Shape of dataset for Catboost is:  (408124, 14)
shape of CX is: (408124, 13) and shape of CY is (408124,)


In [None]:
# build catboost test dataset: one row per session (see make_dataset2)
Cdstest = make_dataset2(testdata) 
print("Shape of test dataset for Catboost is: ", Cdstest.shape) 

# extract X and Y from catboost test dataset (Y is the last column);
# CXt and CYt are notebook-level names used by best_catboost()
CXt, CYt = extract_CXY(Cdstest)
print(f"shape of CXt is: {CXt.shape} and shape of CYt is {CYt.shape}")

Shape of test dataset for Catboost is:  (8328, 14)
shape of CXt is: (8328, 13) and shape of CYt is (8328,)


In [None]:
def run_catboost():
    """Randomized hyper-parameter search for CatBoost on the notebook-level CX, CY.

    Uses 'balanced' class weights computed from CY, scores candidates by
    ROC AUC with 10-fold cross validation, and prints the best parameters
    and the best score.
    """
    # 'balanced' class weights, keyed by class label
    labels = np.unique(CY)
    balanced = compute_class_weight(class_weight='balanced', classes=labels, y=CY)
    class_weights = {label: weight for label, weight in zip(labels, balanced)}

    cbc_model = CatBoostClassifier(class_weights=class_weights)

    # candidate values sampled by the random search
    search_space = {
        "learning_rate": [0.1, 0.01, 0.05, 0.005, 0.001],
        "max_depth": [6, 8, 10],
        'subsample': [0.4, 0.6, 0.8],
    }

    # random search with 10-fold cross validation, scored by ROC AUC
    rscv = RandomizedSearchCV(estimator=cbc_model, param_distributions=search_space,
                              scoring='roc_auc', cv=10)

    # the extra keyword arguments are forwarded to every CatBoost fit
    # NOTE(review): no eval_set is passed, so early_stopping_rounds presumably
    # has nothing to monitor here -- confirm against the CatBoost docs
    rscv.fit(CX, CY, early_stopping_rounds=5, plot=True)

    # report the tuned parameters and their score
    print("Catboost best params", rscv.best_params_)
    print("Catboost best scores", rscv.best_score_)

In [None]:
def best_catboost():
    """Train CatBoost with fixed hyper-parameters and print its AUC on the test set.

    The model is fitted on 95% of the notebook-level CX, CY, with the other
    5% passed as eval_set, and then scored on CXt, CYt. The hyper-parameters
    are hard-coded (presumably the best ones found by run_catboost() --
    confirm).
    """
    fit_X, eval_X, fit_y, eval_y = train_test_split(CX, CY, test_size=0.05, random_state=1)

    # NOTE(review): devices='0:1' looks like it asks for two GPUs -- confirm
    # this is intended on a single-GPU runtime
    params = dict(
        learning_rate=0.05,
        subsample=0.8,
        max_depth=6,
        eval_metric='AUC',
        task_type="GPU",
        bootstrap_type='Poisson',
        devices='0:1',
    )
    model = CatBoostClassifier(**params)

    model.fit(fit_X, fit_y, eval_set=(eval_X, eval_y))

    # the second column of predict_proba is used as the score (presumably class 1)
    positive_scores = model.predict_proba(CXt)[:, 1]

    auc = roc_auc_score(CYt, positive_scores)

    print("AUC = ", auc)