In [None]:
import itertools
import os
import time

import implicit
import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [None]:
class Implicit:
    """Implicit-feedback matrix factorization (ALS / logistic MF) with rank-based evaluation.

    On construction the training matrices, id lookups, test set and popularity
    vector are loaded from disk. After a model has been fitted with
    `alternating_least_squares` or `logistic_factorization`, per-user score
    vectors can be predicted and the mean normalized rank of the test-set
    interactions can be computed.

    Attributes set by this class:
        seed: value passed to the constructor; it is stored but not used by any
            method of this class.
        not_testable: number of test users that could not be scored.
        training_set_csr / training_set_coo: transposed training matrices, shape
            (num_items, num_users).
        user_ids / content_ids: arrays mapping matrix positions to ids.
        test_set: DataFrame; the columns `idUser` and `fullId` are read.
        data_popularity: popularity score per item position.
        model: the most recently fitted `implicit` model.
    """

    # Number of (ascending-sorted) scores kept per user, and the divisor that
    # turns a rank into a fraction.
    RANK_CUTOFF = 5000
    # Number of test rows between two progress messages.
    PROGRESS_STEP = 84650

    def __init__(self, seed = 941, environment = "offline"):
        """Store the seed and load all data files.

        Args:
            seed: stored on the instance.
            environment: "kaggle" reads from "/kaggle/input/"; any other value
                reads from "../".
        """
        self.seed = seed
        self.not_testable = 0

        if environment == "kaggle":
            self.read_matrices_timesplit("/kaggle/input/")
        else:
            self.read_matrices_timesplit("../")

    def read_matrices_timesplit(self, path):
        """Load training matrices, id arrays, test set and popularity data from `path`."""
        # Both matrices are transposed after loading, so rows are items and
        # columns are users (see the shape unpacking at the end).
        self.training_set_csr = sparse.load_npz(f"{path}/ratings_matrix_csr.npz")
        self.training_set_csr = self.training_set_csr.T
        self.training_set_coo = sparse.load_npz(f"{path}/ratings_matrix_coo.npz")
        self.training_set_coo = self.training_set_coo.T

        self.user_ids = np.load(f"{path}/user_ids.npy")
        self.content_ids = np.load(f"{path}/content_ids.npy")

        # SECURITY: read_pickle and allow_pickle=True execute pickled payloads;
        # only point `path` at trusted data.
        self.test_set = pd.read_pickle(f"{path}/test_set.pkl", compression="zip")
        self.data_popularity = np.load(f"{path}/popularity_data_mayjuly19.npy", allow_pickle=True)

        self.num_items, self.num_users = self.training_set_csr.shape

    def alternating_least_squares(self, iterations, factors, i_lambda, alpha):
        """Fit an ALS model on the training matrix scaled by `alpha`; stored in `self.model`."""
        self.model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=i_lambda, iterations=iterations)
        self.model.fit((self.training_set_csr * alpha).astype('double'))

    def logistic_factorization(self, iterations, factors, i_lambda, learning_parameter, alpha):
        """Fit a logistic MF model on the training matrix scaled by `alpha`; stored in `self.model`."""
        # Keyword arguments make the mapping of our names onto the library's
        # (learning_rate, regularization) explicit instead of positional.
        self.model = implicit.lmf.LogisticMatrixFactorization(
            factors=factors,
            learning_rate=learning_parameter,
            regularization=i_lambda,
            iterations=iterations,
        )
        self.model.fit((self.training_set_coo * alpha).astype('double'))

    def predict_user_byIndex(self, index):
        """Return the score vector (one entry per item) for the user at matrix position `index`."""
        return self.model.item_factors.dot(self.model.user_factors[index].T)

    def predict_user_byId(self, userId):
        """Return the score vector for `userId`.

        Raises:
            IndexError: if `userId` is not present in `self.user_ids`.
        """
        index = np.where(self.user_ids == userId)[0][0]
        return self.predict_user_byIndex(index)

    def predict_recommendations(self, userId):
        """Return the item positions of the first five entries of the ascending-sorted score vector.

        A stable argsort is used, so tied scores yield distinct item positions
        (in position order) instead of the same position repeated. Fewer than
        five positions are returned when there are fewer than five items.

        NOTE(review): the sort is ascending, so these are the five LOWEST
        scores. That matches the ascending convention used by `get_rank`, but
        it is presumably not what a "recommendation" should be - confirm before
        relying on the output.
        """
        vector = self.predict_user_byId(userId)
        return list(np.argsort(vector, kind="mergesort")[:5])

    def get_user_vectors(self, users):
        """Predict a score vector for every id in `users`.

        Ids that cannot be resolved are skipped and counted in
        `self.not_testable`.

        Returns:
            dict mapping user id -> score vector.
        """
        user_dict = dict()

        for user in users:
            try:
                user_indx = np.where(self.user_ids == user)[0][0]
                user_dict[user] = self.predict_user_byIndex(user_indx)
            except IndexError:
                # User id is unknown to the training data (or has no factors).
                self.not_testable += 1

        return user_dict

    def get_user_vectors_sorted(self, user_dict):
        """Return, per user, the ascending-sorted scores truncated to `RANK_CUTOFF` entries."""
        return {
            key: np.sort(vector, kind="mergesort")[:self.RANK_CUTOFF]
            for key, vector in user_dict.items()
        }

    def get_rank(self, item, user_vector, user_vector_sorted):
        """Return the position of `item`'s score inside `user_vector_sorted`.

        Raises:
            IndexError: if `item` is not in `self.content_ids`, or its score is
                not contained in `user_vector_sorted` (e.g. cut off by truncation).
        """
        item_indx = np.where(self.content_ids == item)[0][0]

        prob = user_vector[item_indx]
        return np.where(user_vector_sorted == prob)[0][0]

    def _print_progress(self, i, total, step):
        """Print a progress line every `step` rows; a falsy `step` disables printing."""
        if step and i % step == 0:
            print(f"Solved iteration: {i}. That's approximately {np.round((i/total)*100,2)}%.")

    @staticmethod
    def _mean_or_nan(values):
        """Mean of `values`, or NaN (without a NumPy warning) when nothing was ranked."""
        return np.mean(values) if values else np.nan

    def expected_percentile_ranking(self, step = None):
        """Compute the mean normalized rank of all test interactions under the fitted model.

        Test rows whose user or item cannot be resolved, or whose score is not
        within the truncated sorted vector, are skipped.

        Args:
            step: rows between progress messages; defaults to `PROGRESS_STEP`,
                0 disables the messages.

        Returns:
            (mar, accuracy): the mean and the list of per-row rank / RANK_CUTOFF.
        """
        step = self.PROGRESS_STEP if step is None else step
        accuracy = list()
        self.not_testable = 0
        users = self.test_set.idUser.values
        contents = self.test_set.fullId.values

        user_dict = self.get_user_vectors(self.test_set.idUser.unique())
        user_dict_sorted = self.get_user_vectors_sorted(user_dict)

        for i in range(len(users)):
            try:
                rank = self.get_rank(
                    contents[i],
                    user_vector=user_dict[users[i]],
                    user_vector_sorted=user_dict_sorted[users[i]],
                )
                accuracy.append(rank / self.RANK_CUTOFF)
            except (KeyError, IndexError):
                # KeyError: user could not be scored; IndexError: unknown item
                # or score outside the truncated ranking.
                pass

            self._print_progress(i, len(contents), step)

        return self._mean_or_nan(accuracy), accuracy

    def calculate_epr_popularity(self, step = None):
        """Compute the mean normalized rank of all test interactions under the popularity vector.

        The same popularity vector is used for every user; rows whose item
        cannot be resolved are skipped.

        Args:
            step: rows between progress messages; defaults to `PROGRESS_STEP`,
                0 disables the messages. (The previous code read an undefined
                name here and failed with NameError.)

        Returns:
            (mar, accuracy): the mean and the list of per-row rank / RANK_CUTOFF.
        """
        step = self.PROGRESS_STEP if step is None else step
        accuracy = list()
        users = self.test_set.idUser.values
        contents = self.test_set.fullId.values

        data_popularity_sorted = np.sort(self.data_popularity, kind="mergesort")

        for i in range(len(users)):
            try:
                rank = self.get_rank(
                    contents[i],
                    user_vector=self.data_popularity,
                    user_vector_sorted=data_popularity_sorted,
                )
                accuracy.append(rank / self.RANK_CUTOFF)
            except IndexError:
                # Item id is not part of content_ids.
                pass

            self._print_progress(i, len(contents), step)

        return self._mean_or_nan(accuracy), accuracy

In [None]:
class Studio:
    """Runs hyper-parameter grids against a factorizer and reports the mean rank per configuration.

    The factorizer must provide `alternating_least_squares`,
    `logistic_factorization` and `expected_percentile_ranking` with the
    keyword arguments used below.
    """

    def __init__(self, seed, env, factorizer = ""):
        """Store the settings and create an `Implicit` factorizer unless one is supplied.

        Args:
            seed: stored, and forwarded to `Implicit` when one is created here.
            env: stored, and forwarded to `Implicit` when one is created here.
            factorizer: an existing factorizer; "" (the default) or None means
                "create a new Implicit(seed, env)".
        """
        self.counter = 0
        self.seed = seed
        self.env = env

        if factorizer is None or (isinstance(factorizer, str) and factorizer == ""):
            self.MF = Implicit(seed, env)
        else:
            self.MF = factorizer

    def save_model(self, user_vec, item_vec, parameters = ""):
        """Save both factor arrays with `np.save` and advance the file counter."""
        np.save(f"user_vec_{self.counter}_{parameters}", user_vec)
        np.save(f"item_vec_{self.counter}_{parameters}", item_vec)
        self.counter += 1

    def run_test_als(self, v_iterations, v_factors, v_lambdas, v_alphas):
        """Fit and evaluate ALS for every combination of the given values.

        Combinations are visited in the order iterations -> factors -> lambdas
        -> alphas (last varies fastest).

        Returns:
            list of [label, mar], one entry per configuration. The per-row
            rank list is not stored, to keep memory bounded across a grid.
        """
        model_acc = list()

        for it, factor, in_lambda, alpha in itertools.product(v_iterations, v_factors, v_lambdas, v_alphas):
            self.MF.alternating_least_squares(iterations=it, factors=factor, i_lambda=in_lambda, alpha=alpha)
            mar, _ = self.MF.expected_percentile_ranking()

            model_acc.append([f"model_als-iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}", mar])
            print(f"Finishing up Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}. Reported MAR: {mar}.")

        return model_acc

    def run_test_log(self, v_iterations, v_factors, v_lambdas, v_learning, v_alpha):
        """Fit and evaluate logistic MF for every combination of the given values.

        Combinations are visited in the order alpha -> iterations -> factors ->
        lambdas -> learning (last varies fastest).

        Returns:
            list of [label, mar], one entry per configuration. The per-row
            rank list is not stored, to keep memory bounded across a grid.
        """
        model_acc = list()

        grid = itertools.product(v_alpha, v_iterations, v_factors, v_lambdas, v_learning)
        for in_alpha, it, factor, in_lambda, in_learning in grid:
            self.MF.logistic_factorization(iterations=it, factors=factor, i_lambda=in_lambda, learning_parameter=in_learning, alpha=in_alpha)
            mar, _ = self.MF.expected_percentile_ranking()

            model_acc.append([f"model_log-iterations-{it}_factors-{factor}_lambda-{in_lambda}_learning_parameter-{in_learning}_alpha-{in_alpha}", mar])
            print(f"Finishing up Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_learning_parameter-{in_learning}_alpha-{in_alpha}. Reported MAR: {mar}.")

        return model_acc
                        

In [None]:
# Implicit matrix factorization model (Koren 2008).
# Constructing the object only loads the data from /kaggle/input/; no model is fitted yet.
Model_Implicit = Implicit(seed=941, environment="kaggle")
# Example single fit:
#Model_Implicit.alternating_least_squares(iterations = 20, factors = 30, i_lambda = 0.01, alpha = 15)

In [None]:
# Logistic matrix factorization model (Johnson 2014).
# A second, independent Implicit instance: the data files are loaded again.
Logistic = Implicit(seed=941, environment="kaggle")
# Example single fit:
#Logistic.logistic_factorization(iterations = 40, factors = 100, i_lambda = 0.01, learning_parameter = 0.1, alpha = 1)

In [None]:
# Popularity baseline: rank every test interaction against the popularity vector
# (no fitted model is needed). Returns the mean rank (`mar`) and the per-row values
# (`accuracy`); the bare `mar` on the last line displays the mean as the cell output.
mar, accuracy = Model_Implicit.calculate_epr_popularity()
mar

In [None]:
# The Studio class runs multiple tests at once; here it drives the ALS instance.
Analysis = Studio(seed=941, env="kaggle", factorizer=Model_Implicit)

In [None]:
# ALS grid: only the number of latent factors varies.
model_accuracy = Analysis.run_test_als(
    v_iterations=[40],
    v_factors=[20, 30, 40, 60, 80, 100, 150, 200, 300],
    v_lambdas=[0.01],
    v_alphas=[15],
)

In [None]:
# Point the Studio at the logistic-factorization instance (rebinds `Analysis`).
Analysis = Studio(seed=941, env="kaggle", factorizer=Logistic)

In [None]:
# Logistic MF grid: only alpha varies.
model_accuracy = Analysis.run_test_log(
    v_iterations=[40],
    v_factors=[40],
    v_lambdas=[0.01],
    v_learning=[0.1],
    v_alpha=[1, 10, 40],
)

In [None]:
# Look up one entry of the training matrix for a specific user/content pair.
# Input: idUser and idContent.
idUser = 298
idContent = "1_16946"
indexUser = np.where(Model_Implicit.user_ids==idUser)[0][0]
indexContent = np.where(Model_Implicit.content_ids==idContent)[0][0]
# Index the sparse matrix directly (rows are items, columns are users) instead of
# converting the whole matrix to a dense array just to read a single value.
Model_Implicit.training_set_csr[indexContent, indexUser]

In [None]:
# Recorded output of earlier grid runs, kept as a bare string for reference. The lines
# have the format printed by Studio.run_test_als / Studio.run_test_log.
# NOTE(review): "5000 Products" presumably refers to the catalogue size used for
# these runs - confirm.
"""
ALS, 5000 Products:
Finishing up Iteration: iterations-40_factors-15_lambda-0.01_alpha-15. Reported MAR: 0.5007135416666667.
Finishing up Iteration: iterations-40_factors-30_lambda-0.01_alpha-15. Reported MAR: 0.4913303225806451.
Finishing up Iteration: iterations-40_factors-40_lambda-0.01_alpha-15. Reported MAR: 0.506864739884393.
Finishing up Iteration: iterations-40_factors-40_lambda-0.01_alpha-15. Reported MAR: 0.520812426035503.
Finishing up Iteration: iterations-40_factors-60_lambda-0.01_alpha-15. Reported MAR: 0.4761294117647059.
Finishing up Iteration: iterations-40_factors-80_lambda-0.01_alpha-15. Reported MAR: 0.45779899497487436.
Finishing up Iteration: iterations-40_factors-100_lambda-0.01_alpha-15. Reported MAR: 0.4202738498789346.
Finishing up Iteration: iterations-40_factors-150_lambda-0.01_alpha-15. Reported MAR: 0.4120840476190476.
Finishing up Iteration: iterations-40_factors-200_lambda-0.01_alpha-15. Reported MAR: 0.4197281262646702.
Finishing up Iteration: iterations-40_factors-300_lambda-0.01_alpha-15. Reported MAR: 0.4236005339028297.


LOG, 5000 Products:
Finishing up Iteration: iterations-40_factors-20_lambda-0.01_learning_parameter-0.1_alpha-1. Reported MAR: 0.5301564245810055.
Finishing up Iteration: iterations-40_factors-30_lambda-0.01_learning_parameter-0.1_alpha-1. Reported MAR: 0.46028966725043785.
Finishing up Iteration: iterations-40_factors-30_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.4983864004317323.
Finishing up Iteration: iterations-40_factors-40_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.4961798309859155.
Finishing up Iteration: iterations-40_factors-40_lambda-0.01_learning_parameter-0.1_alpha-1. Reported MAR: 0.462060736196319.
Finishing up Iteration: iterations-40_factors-60_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.5014183713355049.
Finishing up Iteration: iterations-40_factors-60_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.5008075784487863.
Finishing up Iteration: iterations-40_factors-80_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.4905897535667964
Finishing up Iteration: iterations-40_factors-100_lambda-0.01_learning_parameter-0.1_alpha-1. Reported MAR: 0.5021024330900243
Finishing up Iteration: iterations-40_factors-150_lambda-0.01_learning_parameter-0.1_alpha-1. Reported MAR: 0.4807635846372688
Finishing up Iteration: iterations-40_factors-200_lambda-0.01_learning_parameter-0.01_alpha-1. Reported MAR: 0.51474453125
"""

In [None]:
# Explain the model's score for one user/content pair.
idUser = 298
idContent = "1_16946"
indexUser = np.where(Model_Implicit.user_ids==idUser)[0][0]
indexContent = np.where(Model_Implicit.content_ids==idContent)[0][0]
# training_set_csr is stored items x users, but `explain` looks up the user's row in a
# users x items matrix, so transpose back (and convert to CSR) before passing it in.
# NOTE(review): the model is fitted on the matrix scaled by alpha; the unscaled matrix
# is passed here as before - confirm whether the confidences should be scaled as well.
Model_Implicit.model.explain(indexUser, Model_Implicit.training_set_csr.T.tocsr(), indexContent)

In [None]:
def explain(userId):
    """Return the model's similar items for the first entry of `predict_recommendations(userId)`.

    Uses the notebook-level `Model_Implicit`, which must already hold a fitted model.
    """
    first_item = Model_Implicit.predict_recommendations(userId)[0]
    similar = Model_Implicit.model.similar_items(first_item)
    return similar

In [None]:
# Item positions returned for user 298 by the currently fitted model.
Model_Implicit.predict_recommendations(userId=298)

In [None]:
# Items similar to the first entry returned for user 298.
explain(userId=298)

In [None]:
# Full score vector (one entry per item) for user 298.
Model_Implicit.predict_user_byId(userId=298)

In [None]:
# Same ALS grid as before, extended by factors=15. Uses whichever factorizer
# `Analysis` currently wraps.
model_accuracy = Analysis.run_test_als(
    v_iterations=[40],
    v_factors=[15, 20, 30, 40, 60, 80, 100, 150, 200, 300],
    v_lambdas=[0.01],
    v_alphas=[15],
)