In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import os
import implicit
import time
from multiprocessing import Pool

In [17]:
class Implicit:
    """Experiment harness around the `implicit` matrix-factorization models.

    On construction the pre-computed rating matrices, id lookup arrays, test
    set and popularity vector are read from disk. A model is then trained with
    `alternating_least_squares` or `logistic_factorization` and scored with
    `expected_percentile_ranking`. `calculate_epr_popularity` scores the
    model-independent popularity vector with the same procedure.
    """

    # Width of the ranking window: the sorted score vector is truncated to this
    # many entries and every rank is divided by it.
    RANK_CUTOFF = 5000

    def __init__(self, seed = 941, environment = "offline"):
        """Store the settings and load all input files.

        Args:
            seed: kept on the instance as `self.seed`. Nothing in this class
                forwards it to a model, so training is not seeded by it.
            environment: "kaggle" reads from "/kaggle/input/"; any other value
                reads from "../".
        """
        self.seed = seed
        # Number of users that could not be found in `user_ids` during the
        # most recent `get_user_vectors` pass.
        self.not_testable = 0

        if environment == "kaggle":
            self.read_matrices_timesplit("/kaggle/input/")
        else:
            self.read_matrices_timesplit("../")

    def read_matrices_timesplit(self, path):
        """Load every input artifact from the directory `path`.

        Sets `training_set_csr`, `training_set_coo`, `user_ids`, `content_ids`,
        `test_set`, `data_popularity`, `num_users` and `num_items`.
        """
        def located(filename):
            # os.path.join copes with `path` given with or without a trailing
            # separator instead of producing "dir//file".
            return os.path.join(path, filename)

        self.training_set_csr = sparse.load_npz(located("ratings_matrix_csr.npz"))
        self.training_set_coo = sparse.load_npz(located("ratings_matrix_coo.npz"))

        self.user_ids = np.load(located("user_ids.npy"))
        self.content_ids = np.load(located("content_ids.npy"))

        # SECURITY: both loads below unpickle their input, which can execute
        # arbitrary code. Only point `path` at files you produced yourself.
        self.test_set = pd.read_pickle(located("test_set.pkl"), compression="zip")
        self.data_popularity = np.load(located("popularity_data_5000.npy"), allow_pickle=True)

        self.num_users, self.num_items = self.training_set_csr.shape

    def alternating_least_squares(self, iterations, factors, i_lambda, alpha):
        """Fit an ALS model on the CSR matrix scaled by `alpha`; stores it in `self.model`.

        Args:
            iterations: number of ALS iterations.
            factors: number of latent factors.
            i_lambda: regularization strength.
            alpha: multiplier applied to the training matrix before fitting.
        """
        # Import the submodule explicitly: `import implicit` on its own is not
        # guaranteed to make `implicit.als` available as an attribute.
        import implicit.als

        self.model = implicit.als.AlternatingLeastSquares(factors=factors, regularization = i_lambda, iterations = iterations)
        self.model.fit((self.training_set_csr*alpha).astype('double'))

    def logistic_factorization(self, iterations, factors, i_lambda, learning_parameter):
        """Fit a logistic MF model on the COO matrix; stores it in `self.model`.

        Args:
            iterations: number of training iterations.
            factors: number of latent factors.
            i_lambda: passed as the third positional argument of the model
                (the regularization slot — confirm against the installed
                `implicit` version).
            learning_parameter: passed as the second positional argument
                (the learning-rate slot — confirm likewise).
        """
        # Same reason as in alternating_least_squares: make sure the submodule
        # is actually loaded before it is accessed as an attribute.
        import implicit.lmf

        self.model = implicit.lmf.LogisticMatrixFactorization(factors, learning_parameter, i_lambda, iterations = iterations)
        self.model.fit((self.training_set_coo).astype('double'))

    def predict_user_byIndex(self, index):
        """Return `item_factors[index]` multiplied with every row of `user_factors`.

        NOTE(review): the row is taken from `item_factors` although callers
        pass a position in `user_ids`. That yields per-item scores for a user
        only if the library treats the rows of the fitted matrix as "items"
        (as older `implicit` releases did) — confirm for the installed version.
        """
        return self.model.item_factors[index].dot(self.model.user_factors.T)

    def get_user_vectors(self, users):
        """Build a `{user id: score vector}` mapping for `users`.

        Users that do not occur in `self.user_ids` are left out and counted in
        `self.not_testable`. Any other failure (for example calling this before
        a model has been fitted) is raised instead of being silently counted.
        """
        user_dict = {}

        for user in users:
            matches = np.where(self.user_ids == user)[0]
            if len(matches) == 0:
                self.not_testable += 1
                continue
            # First occurrence wins, as with the original `[0][0]` lookup.
            user_dict[user] = self.predict_user_byIndex(matches[0])

        return user_dict

    def get_rank(self, user, item, user_vector):
        """Return the position of `item`'s score inside the sorted score window.

        `user` is accepted for interface compatibility but not used.

        Raises:
            IndexError: if `item` is not in `self.content_ids`, or its score is
                not among the first `RANK_CUTOFF` entries of the sorted vector.

        NOTE(review): the sort is ascending, so position 0 is the lowest
        score — confirm this is the intended orientation of the metric.
        """
        item_indx = np.where(self.content_ids==item)[0][0]

        prob = user_vector[item_indx]
        window = np.sort(user_vector, kind="mergesort")[:self.RANK_CUTOFF]
        return np.where(window==prob)[0][0]

    def _percentile_ranks(self, vector_for_user, max_rows):
        """Score the first `max_rows` test rows; shared by both public metrics.

        `vector_for_user` maps a user id to the score vector to rank against
        and may raise KeyError for users without one.
        Returns `(mean of the normalised ranks, list of normalised ranks)`.
        """
        users = self.test_set.idUser.values
        contents = self.test_set.fullId.values
        total = min(max_rows, len(users))
        step = 8750 if 8750 < (max_rows/4) else 2550

        ranks = list()
        for i in range(total):
            try:
                vector = vector_for_user(users[i])
                ranks.append(self.get_rank(users[i], contents[i], user_vector = vector) / self.RANK_CUTOFF)
            except (IndexError, KeyError):
                # Expected misses only: unknown user, unknown item, or a score
                # outside the ranking window. Everything else, including
                # KeyboardInterrupt, propagates so the cell stays interruptible.
                pass

            if i % step == 0:
                print(f"Solved iteration: {i}. That's about {np.round((i/total)*100,2)}%.")

        # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
        mar = np.mean(ranks) if ranks else np.nan

        return mar, ranks

    def expected_percentile_ranking(self, max_rows = 10*10**6):
        """Score the fitted model on the test set.

        Args:
            max_rows: upper bound on the number of test rows evaluated.

        Returns:
            `(mar, accuracy)`: the mean of the per-row normalised ranks and
            the list of those ranks. `mar` is NaN when no row could be ranked.
        """
        self.not_testable = 0
        user_dict = self.get_user_vectors(self.test_set.idUser.unique())

        return self._percentile_ranks(user_dict.__getitem__, max_rows)

    def calculate_epr_popularity(self, max_rows = 100**4):
        """Score `self.data_popularity` on the test set instead of a model.

        Every test row is ranked against the same popularity vector.
        Arguments and return value are as for `expected_percentile_ranking`.
        """
        return self._percentile_ranks(lambda _user: self.data_popularity, max_rows)

In [3]:
class Studio:
    """Grid-search driver that trains and scores a factorizer for every
    combination of hyper-parameters and saves the collected results.
    """

    def __init__(self, seed, env, factorizer = ""):
        """
        Args:
            seed: stored as `self.seed` and forwarded to `Implicit` when a
                factorizer has to be created.
            env: stored as `self.env` and forwarded likewise.
            factorizer: an already constructed factorizer to reuse. With the
                default "" a new `Implicit(seed, env)` is created.
        """
        self.counter = 0
        self.seed = seed
        self.env = env

        if factorizer == "":
            self.MF = Implicit(seed, env)
            # Only call the optional preparation hook if the factorizer really
            # provides one; calling it unconditionally raises AttributeError
            # for factorizers that load their data in __init__.
            prepare = getattr(self.MF, "create_training_data", None)
            if callable(prepare):
                prepare()
        else:
            self.MF = factorizer

    def save_model(self, user_vec, item_vec, parameters = ""):
        """Save both factor matrices with `np.save` and advance `self.counter`.

        File names are `user_vec_{counter}_{parameters}` and
        `item_vec_{counter}_{parameters}`.
        """
        np.save(f"user_vec_{self.counter}_{parameters}", user_vec)
        np.save(f"item_vec_{self.counter}_{parameters}", item_vec)
        self.counter += 1

    def _save_results(self, model_acc):
        """Write the `[name, mar, accuracy]` records to `model_acc_<timestamp>`."""
        # The records are ragged (str, float, list), so they are packed into
        # an explicit object array; handing the raw list to np.save makes
        # recent NumPy versions raise ValueError.
        table = np.empty((len(model_acc), 3), dtype=object)
        for row, record in enumerate(model_acc):
            table[row, 0], table[row, 1], table[row, 2] = record
        np.save(f"model_acc_{time.time()}", table)

    def _run_grid(self, train, tag, extra_name, v_iterations, v_factors, v_lambdas, v_extra):
        """Train with `train` and score once per hyper-parameter combination.

        `extra_name` is both the keyword under which the fourth hyper-parameter
        is passed to `train` and the name used for it in labels.
        """
        from itertools import product

        model_acc = list()

        # product iterates in the same order as the nested loops it replaces:
        # iterations outermost, the extra parameter innermost.
        for it, factor, in_lambda, extra in product(v_iterations, v_factors, v_lambdas, v_extra):
            setting = f"iterations-{it}_factors-{factor}_lambda-{in_lambda}_{extra_name}-{extra}"
            print(f"Starting Iteration: {setting}")
            train(iterations = it, factors = factor, i_lambda = in_lambda, **{extra_name: extra})
            mar, accuracy = self.MF.expected_percentile_ranking()

            model_acc.append([f"{tag}-{setting}", mar, accuracy])
            print(f"Finishing up Iteration: {setting}. Reported MAR: {mar}.")

        self._save_results(model_acc)
        return model_acc

    def run_test_als(self, v_iterations, v_factors, v_lambdas, v_alphas):
        """Grid-search the ALS model.

        Each argument is an iterable of candidate values. Returns a list of
        `[label, mar, accuracy]` records, one per combination, and also saves
        them to disk. Labels start with "model_als-".
        """
        return self._run_grid(self.MF.alternating_least_squares, "model_als", "alpha",
                              v_iterations, v_factors, v_lambdas, v_alphas)

    def run_test_log(self, v_iterations, v_factors, v_lambdas, v_learning):
        """Grid-search the logistic factorization model.

        Same contract as `run_test_als`. Labels start with "model_log-" so the
        records can be told apart from ALS results.
        """
        return self._run_grid(self.MF.logistic_factorization, "model_log", "learning_parameter",
                              v_iterations, v_factors, v_lambdas, v_learning)
                        

In [18]:
# Factorizer instance for the implicit-feedback ALS model (Koren 2008);
# "kaggle" makes it read its inputs from /kaggle/input/.
Model_Implicit = Implicit(seed=941, environment="kaggle")

In [None]:
# Factorizer instance for the logistic factorization model (Johnson 2014);
# it loads the same input files a second time.
Logistic = Implicit(seed=941, environment="kaggle")

In [None]:
# Popularity baseline: every test row is ranked against the model-independent
# `data_popularity` vector. `mar` is the mean of the per-row normalised ranks
# and `accuracy` the list of those ranks; `mar` is displayed as the cell output.
mar, accuracy = Model_Implicit.calculate_epr_popularity()
mar

Solved iteration: 0. That's about 0.0%.
Solved iteration: 8750. That's about 4.14%.
Solved iteration: 17500. That's about 8.27%.


In [None]:
# Grid-search driver that reuses the already loaded `Logistic` factorizer
# instead of constructing (and loading) a new one.
Analysis = Studio(seed=941, env="kaggle", factorizer=Logistic)

In [None]:
# ALS sweep over the number of factors; it trains on the factorizer that
# `Analysis` wraps and returns one [label, mar, accuracy] record per setting.
model_accuracy = Analysis.run_test_als(
    v_iterations=[40],
    v_factors=[60, 80, 150],
    v_lambdas=[0.01],
    v_alphas=[15],
)

In [None]:
# Logistic factorization sweep over the number of factors. Each factor count
# is listed once: the previous list repeated 150 and 200, which trained and
# scored those two configurations a second time.
# NOTE: this rebinds `model_accuracy`, replacing the ALS results held under
# the same name (both result sets are also written to disk by the run).
model_accuracy = Analysis.run_test_log(v_iterations = [40], v_factors = [60,80,100,150,200], v_lambdas = [0.01], v_learning = [0.1])