# Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
import os, random
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud
import time
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics 
from tensorflow import keras 
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from modules.DataProcessing import *
from itertools import product # custom grid search
import random 
import sys

input_path = 'inputs\\data\\comestic'
input_clean = 'outputs\\data\\comestic'
name = '2019-Dec'

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Recommendation System 

## 1. Interaction Matrix

In collaborative filtering, it will based on the past interaction of user and of similar users to give recommended items. We need 3 main matrix:

1. User matrix: containing user content

2. Product matrix: contains product information

3. Interation matrix: display the interaction score (e.g. ratings) of all users toward every available products. 

Alternating Least Squares algorithm is one approach to collaborative filtering that can be used to train a model on this interaction matrix. We will derive the interaction matrix based on recency combined with weight decay. Recent interactions are often more indicative of a user's current preferences than older interactions. In this case, we will intialize the interval recency for interaction matrix as 15 days (half-month) and weight decay to be 0.5 for this interval.

In [124]:
df = pd.read_csv(f'{input_clean}\\{name}-feature.csv').sort_values(by='user_id')

In [125]:
cleandata = pd.read_csv(f'{input_clean}\\{name}-clean.csv')

In [126]:
cleaner = DataCleaner(cleandata)

In [127]:
userTable = cleaner.create_user_table(is_return=True)
productTable = cleaner.create_product_table(is_return=True)

CREATING USER TABLE...
Basic features processing...
Interaction features processing...
Calculating relative price...
End calculating relative price. Finished in 5.062s.
-------------------------------------------------------------
Interaction rates processing...
Check datatypes processing...
Create user table successfully. Finished in 56.073s.
-------------------------------------------------------------
CREATING PRODUCT TABLE...
Basic features processing...
Interaction rates processing...
Create product table successfully. Finished in 25.803s.
-------------------------------------------------------------


In [7]:
def checkpoint_call(model_name=None):
    if model_name==None:
        raise NameError('Need to specify model name.')
    
    checkpoint_path = 'outputs\\checkpoints'
    checkpoint_path = checkpoint_path + f'\\{model_name}'
        
    if not os.path.exists(f'{checkpoint_path}'):
        os.makedirs(f'{checkpoint_path}')
        
    model_ver = max([int(i) for i in os.listdir(f"{checkpoint_path}")]+[0]) + 1
    filepath=f"{checkpoint_path}\\{model_ver}\\"+"{epoch}.ckpt"

    return ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                        save_weights_only=True, save_best_only=True, mode='min')

In [8]:
def model_setup(model_name=None, learning_rate=0.01, lr_schedule=False, optimizer='adam', callback=True):

    setup_params = {
        'optimizer': None,
        'loss':  tf.keras.losses.MeanSquaredError(),
        'callbacks': None
    }

    # learning rate
    initial_learning_rate = learning_rate
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=100,
        decay_rate=0.96,
        staircase=True)
    
    if lr_schedule:
        learning_rate = lr_schedule
    else: learning_rate = learning_rate

    # optimizer
    optimizers = {
        'adam': tf.keras.optimizers.Adam(learning_rate=learning_rate, decay=1e-6),
        'rmsprop': tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
        'sgd': tf.keras.optimizers.SGD(learning_rate=learning_rate)
    }

    # callback 
    logdir = 'logs'
    callbacks = {
        True: [TensorBoard(log_dir=logdir), checkpoint_call(model_name=model_name)],
        False: None,
    }

    setup_params['optimizer'] = optimizers[optimizer]
    setup_params['callbacks'] = callbacks[callback]

    return setup_params

In [9]:
def evaluate_model(y_true, y_pred):
    mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)
    mse = sklearn.metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
    return mae, rmse

## 3.2. Content-based by Neural Network

In [10]:
# dicts and constants

# attributes
attributes =['interaction_score', 'user_id', 'views_user', 'carts_user', 
            'remove_from_carts_user', 'purchases_user', 'avg_view_price',
            'avg_purchase_price', 'avg_view_relative_price',
            'avg_purchase_relative_price', 'distinct_view_product',
            'distinct_cart_product', 'distinct_remove_product',
            'distinct_purchase_product', 'cart_per_view_user',
            'purchase_per_view_user', 'remove_per_cart_user',
            'purchase_per_cart_user', 
            'product_id', 'category_id', 'avg_price', 'relative_price', 'views_product',
            'carts_product', 'remove_from_carts_product', 'purchases_product',
            'cart_per_view_product', 'purchase_per_view_product',
            'remove_per_cart_product', 'purchase_per_cart_product']

# user and product attributes
excludes = ['interaction_score', 'user_id', 'product_id']
user_attr = [attr for attr in attributes[1:18] if attr not in excludes]
product_attr = [attr for attr in attributes[18:] if attr not in excludes]
target = 'interaction_score'
scaled_attr = [attr for attr in attributes if attr not in excludes]

user_size = len(user_attr)
product_size = len(product_attr)
print('User size: {}, product size: {}'.format(user_size, product_size))

User size: 16, product size: 11


In [11]:
def overlap_split(df,train_size=0.9):
    # train test split
    test_size=0.5
    train, test = train_test_split(df, train_size=train_size, random_state=42)
    dev, test = train_test_split(test, test_size=test_size, random_state=42)
    return train, dev, test

def disjoint_split(df,train_size=0.9):
    test_size=0.5
    
    # Group data by user_id
    user_grouped = df.groupby('user_id')
    unique_users = list(user_grouped.groups.keys())

    # Shuffle the order of user IDs
    random.shuffle(unique_users)

    # Calculate the number of users for each split
    num_users = len(unique_users)
    num_train_users = int(train_size * num_users)
    num_dev_users = int((1-train_size) * num_users)

    # Split user IDs into train, dev, and test sets
    train_users = unique_users[:num_train_users]
    dev_users = unique_users[num_train_users:]

    # Filter data based on split user IDs
    train = df[df['user_id'].isin(train_users)]
    dev = df[df['user_id'].isin(dev_users)]
    dev = dev.sample(frac = 1)
    dev, test = train_test_split(dev, test_size=test_size, random_state=42)

    # Verify user distribution in dev and test sets
    # common_dev_users = set(dev_users).intersection(train_users)
    # if not common_dev_users:
    #     print("Test and Train users are disjoint.")
    # else: print("Test and Train users are overlap!!!")

    return train, dev, test

In [12]:
def dataScaler(df):

    print('Scaling data processing...')
    scaler = StandardScaler()
    df_scaled = df.copy()
    scaler.fit(df[scaled_attr])
    df_scaled[scaled_attr] = scaler.fit_transform(df[scaled_attr])
    print('Scaled successfully.')
    # print(df_scaled.describe())

    return df_scaled

In [13]:
def ModelPipeline(df, models, split_type='disjoint_split', train_size=0.99, epochs=3, batch_size=256,
                   learning_rate=0.01, lr_schedule=False, callback=True):

    split_types = {
        'disjoint_split': disjoint_split,
        'overlap_split': overlap_split,
    }

    #df_scaled = dataScaler(df)
    df_scaled = df
    
    print('Train-Dev-Test plitting proceesing... Split type:', split_type)
    train, dev, test = split_types[split_type](df_scaled, train_size=train_size)

    print('Found {} model(s)'.format(len(models)))
    print(models.keys())

    # loop through models
    for key, model in zip(models.keys(), models.values()):
        print('Running {} model...'.format(key))

        start = time.time()
        currentmodel = model
        
        # model summary
        num_layers = len(currentmodel.layers)
        print("Number of layers: {}".format(num_layers))
        currentmodel.summary()

        # set up model
        setup = model_setup(model_name=key, learning_rate=learning_rate, lr_schedule=lr_schedule, callback=callback)
        callbacks = setup['callbacks']
        currentmodel.compile(optimizer=setup['optimizer'], loss=setup['loss'], metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # train model
        print('Training processing...')
        hist = currentmodel.fit([train[user_attr], train[product_attr]], train[target],
                        validation_data = [[dev[user_attr], dev[product_attr]], dev[target]], 
                        epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[callbacks])
        
        # predict
        print('Predicting processing...')
        preds = currentmodel.predict([test[user_attr], test[product_attr]])

        print('Evaluation processing...')
        eval_loss = currentmodel.evaluate([test[user_attr], test[product_attr]], test[target])
        print("Evaluate on test data: {}".format(eval_loss))

        rae, rmae = evaluate_model(test[target], preds)
        print('MAE = {:.3f}, RMAE = {:.3f}'.format(rae, rmae))

        end = time.time()
        print('\nFinished {} model in {:.3f}s\n'.format(key, end-start))

        

In [154]:
# define models
def contentNN(content_size=None, neurals=[64, 64, 32], num_outputs=32):
    content_NN = tf.keras.models.Sequential()
    content_NN.add(tf.keras.layers.Input(shape=(content_size)))
    
    for neural in neurals:
        content_NN.add(tf.keras.layers.BatchNormalization())
        content_NN.add(tf.keras.layers.Dense(neural, activation='relu'))
    
    content_NN.add(tf.keras.layers.Dense(num_outputs, activation='linear'))
    
    return content_NN


def CBDeepLearning(neurals=[64, 64, 32], num_outputs = 32):

    tf.random.set_seed(1)
    
    user_NN = contentNN(user_size, neurals, num_outputs)
    product_NN = contentNN(product_size, neurals, num_outputs)

    # create the user input and point to the base network
    input_user = tf.keras.layers.Input(shape=(user_size))
    vu = user_NN(input_user)
    vu = tf.linalg.l2_normalize(vu, axis=1)

    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(product_size))
    vm = product_NN(input_item)
    vm = tf.linalg.l2_normalize(vm, axis=1)

    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])


    model = tf.keras.models.Model(inputs=[input_user, input_item], outputs=output)

    return model

In [155]:
num_outputs = 32
split_type='disjoint_split'
train_size=0.99
epochs=5
batch_size=1024
learning_rate=0.01
lr_schedule=False
callback=True
models = {
    'CBDeepLearning': CBDeepLearning(),
    }

In [None]:
# ModelPipeline(df, models=models, split_type=split_type, train_size=train_size, epochs=epochs, batch_size=batch_size, 
#               learning_rate=learning_rate, lr_schedule=True, callback=True)

In [156]:
def delaydot(num_dots=3,sleep_time=1, pattern='.'):
    for i in range(num_dots):
        sys.stdout.write(pattern)
        sys.stdout.flush()
        time.sleep(sleep_time)
    print()

In [157]:
def printRed(text):
    print('\033[31m' + str(text) + '\033[0m')

def printGreen(text):
    print('\033[32m' + str(text) + '\033[0m')

In [213]:
def ModelFineTuning(hyperparams, grid_number=20, train_size=0.99, epochs=3, batch_size=256,
                   learning_rate=0.01, lr_schedule=False, callback=True):

    results = {
        'params': [],
        'eval': [],
        'checkpoints': [],
    }

    print('Loading clean data...')
    cleandata = pd.read_csv(f'{input_clean}\\{name}-clean.csv')
    cleaner = DataCleaner(cleandata)
    cleaner.relative_price()
    cleandata = cleaner.data
    cleaner = DataCleaner(cleandata, recency_days=hyperparams['recency_days'], weight_decay=hyperparams['weight_decay'])
    df = cleaner.FeatureEngineering(return_merge=True)

    # grid search
    total_combinations = 1
    for values in hyperparams.values():
        total_combinations *= len(values)

    grid_number = min(total_combinations, grid_number)

    print('Running {} random searching'. format(grid_number), end='')
    delaydot()

    for key in range(grid_number):

        params = {k: random.choice(v) for k, v in hyperparams.items()}
        while params in results['params']:
            params = {k: random.choice(v) for k, v in hyperparams.items()}

        print('Fine Tuning Number {}, hyperparameters summary:'.format(key))
        printRed(params)
        
        print('1. Train-Dev-Test plitting proceesing', end='')
        delaydot()
        train, dev, test = disjoint_split(df, train_size=train_size)

        
        print('2. Bulding model...')

        start = time.time()
        currentmodel = CBDeepLearning(neurals=params['neurals'],num_outputs = params['num_outputs'])
        
        # model summary
        num_layers = len(currentmodel.layers)
        currentmodel.summary()

        # set up model
        setup = model_setup(model_name='CBDeepLearning', learning_rate=learning_rate, lr_schedule=lr_schedule, callback=callback)
        callbacks = setup['callbacks']
        currentmodel.compile(optimizer=setup['optimizer'], loss=setup['loss'], metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # train model
        print('3. Training processing...')
        hist = currentmodel.fit([train[user_attr], train[product_attr]], train[target],
                        validation_data = [[dev[user_attr], dev[product_attr]], dev[target]], 
                        epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[callbacks])
        
        # predict
        print('4. Evaluation processing...')

        preds = currentmodel.predict([test[user_attr], test[product_attr]])
        eval = currentmodel.evaluate([test[user_attr], test[product_attr]], test[target])
        printRed('Evalulation result (RMSE): {}'.format(eval))
        print('Saving results...')
        
        path = f"outputs//checkpoints//CBDeepLearning"
        checkpoint_ver = max([int(i.split('.')[0]) for i in os.listdir(path)]+[0])
        last_checkpoint = path + f"//{checkpoint_ver}"
        printRed('Saved to {}.'.format(last_checkpoint))

        results['params'] = results['params'] + [params]
        results['eval'] = results['eval'] + [eval]
        results['checkpoints'] = results['checkpoints'] + [last_checkpoint]

        end = time.time()
        printGreen('\nFinished model {} in {:.3f}s\n'.format(key, end-start))
        delaydot(num_dots=25, sleep_time=0.05, pattern='-+-')
    
    best_result = min(results['eval'],  key=lambda x: x[1])
    best_id = results['eval'].index(best_result)
    best_params = results['params'][best_id]
    best_eval = results['eval'][best_id]
    best_checkpoint = results['checkpoints'][best_id]
    printRed('Best params: ')
    printRed(best_params)
    printRed('Loss = {}; Checkpoint saved in {}'.format(best_eval, best_checkpoint))

    return results
        

In [214]:
def best_params(results):
    result = min(results['eval'], key=lambda x: x[1])
    id = results['eval'].index(result)
    best_result = {}
    best_result['params'] = results['params'][id]
    best_result['eval'] = results['eval'][id]
    best_result['checkpoints'] = results['checkpoints'][id]

    return best_result

In [215]:
train_size=0.99
epochs=3
batch_size=1024
learning_rate=0.01
lr_schedule=False
callback=True
grid_number=3
hyperparams = {
    'recency_days': [15],
    'weight_decay': [0.25],
    'neurals': [[64,16], [64,64,32], [64,64,32,32,16], [64,64,32,32,16,16], [256,64,64,32,32,16,16]],
    'num_outputs': [16],
    }

In [216]:
results_1 = ModelFineTuning(hyperparams, grid_number=grid_number, train_size=train_size, epochs=epochs, batch_size=batch_size,
                   learning_rate=learning_rate, lr_schedule=True, callback=True)

Loading clean data...
Calculating relative price...
End calculating relative price. Finished in 6.995s.
-------------------------------------------------------------
FEATURE ENGINEERING PROCESSING...
CREATING PRODUCT TABLE...
Basic features processing...
Interaction rates processing...
Create product table successfully. Finished in 31.950s.
-------------------------------------------------------------
CREATING USER TABLE...
Basic features processing...
Interaction features processing...
Interaction rates processing...
Check datatypes processing...
Create user table successfully. Finished in 47.957s.
-------------------------------------------------------------
CREATE USER-PRODUCT INTERACTION TABLE...
Calculating recency processing...
Calculating recency successfully. Finished in 1.290s.
-------------------------------------------------------------
Calculating basic interactions...
Calculating interaction scores...
Create interaction table successfully. Finished in 10.061s.
------------

In [217]:
grid_number=3
hyperparams = {
    'recency_days': [15],
    'weight_decay': [0.5],
    'neurals': [[64,64,32], [64,64,32,32,16]],
    'num_outputs': [16],
    }

In [218]:
results_2 = ModelFineTuning(hyperparams, grid_number=grid_number, train_size=train_size, epochs=epochs, batch_size=batch_size,
                   learning_rate=learning_rate, lr_schedule=True, callback=True)

Loading clean data...
Calculating relative price...
End calculating relative price. Finished in 7.646s.
-------------------------------------------------------------
FEATURE ENGINEERING PROCESSING...
CREATING PRODUCT TABLE...
Basic features processing...
Interaction rates processing...
Create product table successfully. Finished in 30.161s.
-------------------------------------------------------------
CREATING USER TABLE...
Basic features processing...
Interaction features processing...
Interaction rates processing...
Check datatypes processing...
Create user table successfully. Finished in 56.751s.
-------------------------------------------------------------
CREATE USER-PRODUCT INTERACTION TABLE...
Calculating recency processing...
Calculating recency successfully. Finished in 1.426s.
-------------------------------------------------------------
Calculating basic interactions...
Calculating interaction scores...
Create interaction table successfully. Finished in 8.625s.
-------------

KeyboardInterrupt: 

In [16]:
def save_model(model):
    print('Saving model...')
    path = 'outputs//models'
    if not os.path.exists(path):
        os.makedirs(path)
    model_ver = max([int(i.split('.')[0]) for i in os.listdir(path)]+[0]) + 1
    model.save(f'{path}//{model_ver}.keras')
    print(f"Saved to '{path}//{model_ver}.keras'")

In [None]:
# Load weights by lastest checkpoint
def load_latest_checkpoint(modelname):
    print('Loading latest checkpoint...')
    model = models[modelname]
    #model.summary()

    path = f"outputs//checkpoints//{modelname}"
    if not os.path.exists(path):
        os.makedirs(path)
    model_ver = max([int(i.split('.')[0]) for i in os.listdir(path)]+[0])

    if model_ver==0: print('There is no checkpoint')
    else: print('Found checkpoint ',model_ver)

    latest = tf.train.latest_checkpoint(f"{path}//{model_ver}")
    model.load_weights(latest)
    print('Loaded: ',latest)
    return model

def load_latest_model():
    print('Loading latest model...')
    path = f"outputs//models"
    if not os.path.exists(path):
        os.makedirs(path)
    model_ver = max([int(i.split('.')[0]) for i in os.listdir(path)]+[0])

    if model_ver==0: print('There is no model')
    else: print('Found model ',model_ver)

    return tf.keras.models.load_model(f'{path}//{model_ver}.keras')

In [113]:
model = load_latest_checkpoint('CBDeepLearning')
save_model(model)

Loading latest checkpoint...
Found checkpoint  27
Loaded:  outputs//checkpoints//CBDeepLearning//27\2.ckpt
Saving model...
Saved to 'outputs//models//2.keras'


In [19]:
model = load_latest_model()

Loading latest model...
Found model  2


In [20]:
def predict(model,input):
    output = model.predict(input)
    return output

In [21]:
def user_content(user_id):
    user_content = userTable.loc[userTable.user_id==user_id]
    return user_content

def product_content(product_id):
    product_content = productTable.loc[productTable.product_id==product_id]
    return product_content

In [22]:
id = 5881337
proda = product_content(id)
usera = user_content(4661182)

In [121]:
df.loc[(df.user_id==4661182)]

Unnamed: 0,user_id,product_id,interaction_score,views_user,carts_user,remove_from_carts_user,purchases_user,avg_view_price,avg_purchase_price,avg_view_relative_price,...,avg_price,relative_price,views_product,carts_product,remove_from_carts_product,purchases_product,cart_per_view_product,purchase_per_view_product,remove_per_cart_product,purchase_per_cart_product
567830,4661182,5805749,0.125992,3,2,2,0,21.536667,19.37,5.556445,...,19.37,6.816327,54,38,17,9,0.007094,0.005425,0.009254,0.005425
407479,4661182,4185,0.197926,3,2,2,0,21.536667,19.37,5.556445,...,19.37,6.816327,744,311,97,98,0.058055,0.059072,0.052803,0.059072
942157,4661182,5778934,0.065975,3,2,2,0,21.536667,19.37,5.556445,...,25.87,3.036683,218,19,8,3,0.003547,0.001808,0.004355,0.001808
1816305,4661182,5667096,0.0,3,2,2,0,21.536667,19.37,5.556445,...,3.81,0.294991,1,0,1,0,0.0,0.0,0.0,0.0


In [23]:
def userNN(model):
    # use model
    user_layer = model.layers[2]
    input_user = Input(shape=user_layer.input_shape[1:])
    output_user = user_layer(input_user)
    user_model = Model(inputs=input_user, outputs=output_user)

    return user_model

In [24]:
def productNN(model):
    # product model
    product_layer = model.layers[3]
    input_product = Input(shape=product_layer.input_shape[1:])
    output_product = product_layer(input_product)
    product_model = Model(inputs=input_product, outputs=output_product)

    return product_model

In [31]:
def productVector(model):
    product_model = productNN(model)
    productVectors = pd.DataFrame(columns=range(num_outputs+1))
    productVectors = productVectors.rename(columns={0: 'product_id'})
    print('Found {} products'.format(len(productTable.product_id)))
    for id, product_id in enumerate(productTable.product_id):
        print('Processing {}: product id {}...'.format(id, product_id))
        product = product_content(product_id)
        new_row = [product_id]
        new_row = new_row + list(product_model.predict(product.iloc[:,3:])[0])
        productVectors.loc[len(df)] = new_row

    return productVectors

In [24]:
def recommend_list(user_ids,model,top):
    RecList = {}
    for user_id in user_ids:
        user = user_content(id)
        currentuser = df.loc[df.user_id==user_id]
        RecList[user_id] = pd.DataFrame
        for _, row in productTable.iloc[:,:].iterrows():
            product_id = row.product_id
            product = product_content(product_id)
            exists = ((currentuser['product_id'] == product_id) & (currentuser['user_id'] == user_id)).any()
            if not exists:
                prediction = predict(model, [user.iloc[:,3:],product.iloc[:,3:]])
                if len(RecList[user_id])==top:
                    min_id = np.argmin(RecList[user_id])
                    if RecList[user_id][min_id] < prediction:
                        RecList[user_id][min_id] = product_id
                else: RecList[user_id] = RecList[user_id] + [product_id]
    return RecList

In [236]:
RecList = recommend_list(user_ids=[1180452], model=model, top=10)

ValueError: Data cardinality is ambiguous:
  x sizes: 0, 1
Make sure all arrays contain the same number of samples.

In [233]:
RecList

{1180452: []}