In [61]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import os
import implicit
import time
from IPython.display import FileLink
from multiprocessing import Pool

In [62]:
class Implicit:
    """Wrapper around the `implicit` factorisation models with a sampled, rank-based evaluation.

    The constructor loads the training matrices, the id lookup arrays and the
    test set from disk. `alternating_least_squares` / `logistic_factorization`
    fit a model into `self.model`, and `expected_percentile_ranking` scores the
    fitted model on a random sample of test rows.
    """

    def __init__(self, seed = 941, environment = "offline"):
        """Store the seed and load all data files.

        Parameters
        ----------
        seed : int
            Seed used for the row sampling in `expected_percentile_ranking`.
        environment : str
            "kaggle" reads from "/kaggle/input/"; anything else reads from "../".
        """
        self.seed = seed
        self.not_testable = 0

        if environment == "kaggle":
            self.read_matrices_timesplit("/kaggle/input/")
        else:
            self.read_matrices_timesplit("../")

    def read_matrices_timesplit(self, path):
        """Load the sparse training matrices, id arrays and test set found under `path`."""
        self.training_set_csr = sparse.load_npz(f"{path}/ratings_matrix_csr.npz")
        self.training_set_coo = sparse.load_npz(f"{path}/ratings_matrix_coo.npz")

        self.user_ids = np.load(f"{path}/user_ids.npy")
        self.content_ids = np.load(f"{path}/content_ids.npy")

        # SECURITY: unpickling executes arbitrary code; only load a test_set.pkl
        # that comes from a trusted source.
        self.test_set = pd.read_pickle(f"{path}/test_set.pkl", compression="zip")

        self.num_users, self.num_items = self.training_set_csr.shape

    def alternating_least_squares(self, iterations, factors, i_lambda, alpha):
        """Fit an ALS model on the CSR training matrix scaled by `alpha`; stores it in `self.model`."""
        self.model = implicit.als.AlternatingLeastSquares(factors=factors, regularization = i_lambda, iterations = iterations)
        self.model.fit((self.training_set_csr*alpha).astype('double'))

    def logistic_factorization(self, iterations, factors, i_lambda, learning_parameter):
        """Fit a logistic matrix factorisation on the COO training matrix; stores it in `self.model`."""
        # Positional order passed here: factors, learning_parameter, i_lambda.
        self.model = implicit.lmf.LogisticMatrixFactorization(factors, learning_parameter, i_lambda, iterations = iterations)
        self.model.fit((self.training_set_coo).astype('double'))

    def predict_user_byIndex(self, index):
        """Return the score vector obtained from row `index` of `item_factors` against all `user_factors`.

        NOTE(review): a *user* index is used to pick a row of `item_factors`.
        That is only right if the library treated the rows of the fitted matrix
        as items (older `implicit` releases expect an item-by-user matrix) --
        confirm against the installed version.
        """
        return self.model.item_factors[index].dot(self.model.user_factors.T)

    def get_user_vectors(self, users):
        """Build a mapping from user id to that user's score vector.

        Users that do not occur in `self.user_ids` are skipped and counted in
        `self.not_testable`. Any other failure (for example calling this before
        a model was fitted) is raised instead of being counted as "not testable".
        """
        # Legacy placeholder entry, kept so the returned mapping is unchanged
        # for existing callers.
        user_dict = {"user": "values"}

        for user in users:
            matches = np.flatnonzero(self.user_ids == user)
            if matches.size == 0:
                self.not_testable += 1
                continue
            user_dict[user] = self.predict_user_byIndex(matches[0])

        return user_dict

    def get_rank(self, user, item, user_vector, top_n = 5000):
        """Return the zero-based position of `item`'s score among the ascending scores in `user_vector`.

        The position equals the number of scores strictly smaller than the
        item's score, which is what locating the first match in a stable
        ascending sort yields, without sorting on every call.

        Parameters
        ----------
        user : unused, kept for interface compatibility.
        item : id looked up in `self.content_ids`.
        user_vector : 1-D array of scores indexed like `self.content_ids`.
        top_n : int
            Size of the window of lowest scores that is considered (default 5000).

        Raises
        ------
        IndexError
            If `item` is unknown, its score is NaN, or its position is not
            inside the first `top_n` scores.

        NOTE(review): the ordering is ascending, so position 0 is the *lowest*
        score; the author left a descending variant (`[::-1]`) commented out --
        confirm which direction the metric is meant to use.
        """
        item_matches = np.flatnonzero(self.content_ids == item)
        if item_matches.size == 0:
            raise IndexError(f"item {item!r} is not present in content_ids")

        prob = user_vector[item_matches[0]]
        if np.isnan(prob):
            # NaN never compares equal, so it can never be located in the ranking.
            raise IndexError(f"score of item {item!r} is NaN")

        rank = int(np.count_nonzero(user_vector < prob))
        if rank >= top_n:
            raise IndexError(f"item {item!r} is outside the first {top_n} scores")
        return rank

    def expected_percentile_ranking(self, max_samples = 4500, top_n = 5000, report_every = 1550):
        """Estimate the mean rank fraction on a random sample of test rows.

        Rows are drawn with replacement using a generator seeded with
        `self.seed`, so repeated calls evaluate the same rows and different
        models can be compared on an identical sample.

        Parameters
        ----------
        max_samples : int
            Upper bound on the number of rows drawn (default 4500).
        top_n : int
            Window passed to `get_rank`, also used as the denominator (default 5000).
        report_every : int
            A progress line is printed every `report_every` draws (default 1550).

        Returns
        -------
        (mar, accuracy) : the mean of the per-row rank fractions (NaN when no
        row could be scored) and the list of those fractions.
        """
        self.stopExecute = False
        self.not_testable = 0
        users = self.test_set.idUser.values
        contents = self.test_set.fullId.values
        n_rows = len(users)
        n_draws = min(n_rows, max_samples)

        user_dict = self.get_user_vectors(self.test_set.idUser.unique())

        rng = np.random.RandomState(self.seed)
        accuracy = list()

        for i in range(n_draws):
            j = rng.randint(n_rows)

            try:
                accuracy.append(self.get_rank(users[j], contents[j], user_vector = user_dict[users[j]], top_n = top_n) / top_n)
            except (KeyError, IndexError):
                # KeyError: user has no vector; IndexError: item unknown or
                # outside the ranking window. Such rows are skipped.
                pass

            if i % report_every == 0:
                print(f"Solved iteration: {i}. That's about {np.round((i/n_draws)*100,2)}%.")

        mar = np.mean(accuracy) if accuracy else float("nan")

        return mar, accuracy

In [66]:
class Studio:
    """Hyper-parameter sweep driver around a factorizer object.

    The factorizer (`self.MF`) must provide `alternating_least_squares`,
    `logistic_factorization` and `expected_percentile_ranking`; the parallel
    evaluation additionally uses its `test_set`, `get_user_vectors`,
    `get_rank` and `num_items`.
    """

    def __init__(self, seed, env, factorizer = ""):
        """Keep `seed`/`env` and either adopt `factorizer` or build an `Implicit(seed, env)`."""
        self.counter = 0
        self.seed = seed
        self.env = env

        if factorizer == "":
            self.MF = Implicit(seed, env)
            # Only call the optional preparation hook when the factorizer has
            # one; calling it unconditionally raised AttributeError.
            prepare = getattr(self.MF, "create_training_data", None)
            if callable(prepare):
                prepare()
        else:
            self.MF = factorizer

    def save_model(self, user_vec, item_vec, parameters = ""):
        """Write both factor matrices to .npy files numbered by an internal counter."""
        np.save(f"user_vec_{self.counter}_{parameters}", user_vec)
        np.save(f"item_vec_{self.counter}_{parameters}", item_vec)
        self.counter += 1

    def _save_results(self, model_acc):
        """Persist sweep results as a timestamped object array (load with allow_pickle=True)."""
        # Rows mix a string, a float and a list; without dtype=object recent
        # NumPy versions refuse to build an array from such ragged input.
        np.save(f"model_acc_{time.time()}", np.array(model_acc, dtype=object))

    def run_test_als(self, v_iterations, v_factors, v_lambdas, v_alphas):
        """Fit and evaluate ALS for every combination of the given value lists.

        Returns a list of `[label, mar, accuracy]` rows and also saves it to a
        timestamped `model_acc_*.npy` file in the working directory.
        """
        model_acc = list()

        for it in v_iterations:
            for factor in v_factors:
                for in_lambda in v_lambdas:
                    for alpha in v_alphas:
                        print(f"Starting Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}")
                        self.MF.alternating_least_squares(iterations = it, factors = factor, i_lambda = in_lambda, alpha = alpha)
                        mar, accuracy = self.MF.expected_percentile_ranking()

                        # Factor matrices are not persisted here; call self.save_model(...) if they are needed.
                        model_acc.append([f"model_als-iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}", mar, accuracy])
                        print(f"Fishing up Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}. Reported MAR: {mar}.")
        self._save_results(model_acc)
        return model_acc

    def run_test_log(self, v_iterations, v_factors, v_lambdas, v_learning):
        """Fit and evaluate logistic factorisation for every combination of the given value lists.

        Returns a list of `[label, mar, accuracy]` rows and also saves it to a
        timestamped `model_acc_*.npy` file in the working directory.
        """
        model_acc = list()

        for it in v_iterations:
            for factor in v_factors:
                for in_lambda in v_lambdas:
                    for in_learning in v_learning:
                        print(f"Starting Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_learning_parameter-{in_learning}")
                        self.MF.logistic_factorization(iterations = it, factors = factor, i_lambda = in_lambda, learning_parameter = in_learning)
                        mar, accuracy = self.MF.expected_percentile_ranking()

                        # Factor matrices are not persisted here; call self.save_model(...) if they are needed.
                        # The label prefix is "model_log" so these rows are not mistaken for ALS results.
                        model_acc.append([f"model_log-iterations-{it}_factors-{factor}_lambda-{in_lambda}_learning_parameter-{in_learning}", mar, accuracy])
                        print(f"Fishing up Iteration: iterations-{it}_factors-{factor}_lambda-{in_lambda}_learning_parameter-{in_learning}. Reported MAR: {mar}.")
        self._save_results(model_acc)
        return model_acc

    def expected_percentile_ranking(self):
        """Score the test set in four worker processes and return one accuracy list per chunk.

        NOTE(review): the original author flagged this path as not working
        because `self.user_dict` was reported missing inside the workers; that
        has not been re-verified here.
        """
        def split_set(users, contents, splits):
            # Cut both arrays into `splits` consecutive slices; the last slice
            # takes whatever remains after integer division.
            output = list()
            length = int(len(users) / splits)
            for i in range(splits):
                if i != (splits -1):
                    output.append([str(i+1), users[length * i: length * (i + 1)], contents[length * i: length * (i + 1)]])
                else:
                    output.append([str(i+1), users[length * i:], contents[length * i:]])

            return output

        self.not_testable = 0
        users = self.MF.test_set.idUser.values
        contents = self.MF.test_set.fullId.values

        self.user_dict = self.MF.get_user_vectors(self.MF.test_set.idUser.unique())

        # Multiprocessing unit
        num_cores = 4
        all_chunks = split_set(users, contents, num_cores)
        # The context manager shuts the worker processes down once map returns.
        with Pool(num_cores) as pool:
            result = pool.map(self.process_chunk, all_chunks)

        return result

    def process_chunk(self, chunk):
        """Score up to the first 3001 rows of one chunk.

        `chunk` is `[label, users, contents]`. Rows whose user has no vector
        (KeyError) or whose item cannot be ranked (IndexError) are skipped.

        NOTE(review): the rank is divided by `self.MF.num_items`, whereas
        `Implicit.expected_percentile_ranking` divides by its ranking-window
        size -- confirm which denominator is intended.
        """
        accuracy = list()
        subtrain = chunk[0]
        users = chunk[1]
        contents = chunk[2]

        print(f"Subtrain: {subtrain}. Reporting running.")

        for i in range(len(users)):
            if i > 3000:
                break

            try:
                accuracy.append(self.MF.get_rank(users[i], contents[i], user_vector = self.user_dict[users[i]]) / self.MF.num_items)
            except (KeyError, IndexError):
                pass

            if i % 1000 == 0:
                print(f"Subtrain: {subtrain}. Reporting iteration: {i} of {len(users)}.")

        return accuracy
                        

In [None]:
# Implicit-feedback ALS model (Koren 2008), loaded from the Kaggle input
# directory and fitted once with a fixed configuration.
Model_Implicit = Implicit(seed=941, environment="kaggle")
Model_Implicit.alternating_least_squares(
    iterations=25,
    factors=60,
    i_lambda=0.1,
    alpha=30,
)

In [67]:
# Logistic matrix factorisation (Johnson 2014), loaded from the Kaggle input
# directory and fitted with a single iteration and a single factor.
Logistic = Implicit(seed=941, environment="kaggle")
Logistic.logistic_factorization(
    iterations=1,
    factors=1,
    i_lambda=0.1,
    learning_parameter=0.005,
)

100%|██████████| 1/1 [00:00<00:00,  3.79it/s]


In [None]:
# Evaluate the fitted ALS model on a sample of the test set; `mar` is the mean
# of the per-row values in `accuracy` and is displayed as the cell output.
mar, accuracy = Model_Implicit.expected_percentile_ranking()
mar

In [68]:
# Sweep driver that reuses the already-loaded Logistic instance as its factorizer.
Analysis = Studio(seed=941, env="kaggle", factorizer=Logistic)

In [None]:
# ALS sweep; each list holds the candidate values for one hyper-parameter.
model_accuracy = Analysis.run_test_als(
    v_iterations=[20],
    v_factors=[40],
    v_lambdas=[0.1],
    v_alphas=[5],
)

In [None]:
# Logistic-factorisation sweep over four learning-parameter values.
model_accuracy = Analysis.run_test_log(
    v_iterations=[20],
    v_factors=[40],
    v_lambdas=[0.1],
    v_learning=[0.01, 0.1, 0.2, 1],
)

Starting Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.01


100%|██████████| 20/20 [00:55<00:00,  2.79s/it]


Solved iteration: 0. That's about 0.0%.
Solved iteration: 1550. That's about 34.44%.
Solved iteration: 3100. That's about 68.89%.
Fishing up Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.01. Reported MAR: 0.6041.
Starting Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.1


100%|██████████| 20/20 [00:51<00:00,  2.56s/it]


Solved iteration: 0. That's about 0.0%.
Solved iteration: 1550. That's about 34.44%.
Solved iteration: 3100. That's about 68.89%.
Fishing up Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.1. Reported MAR: 0.5609999999999999.
Starting Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.2


100%|██████████| 20/20 [00:49<00:00,  2.49s/it]


Solved iteration: 0. That's about 0.0%.
Solved iteration: 1550. That's about 34.44%.
Solved iteration: 3100. That's about 68.89%.
Fishing up Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-0.2. Reported MAR: 0.53832.
Starting Iteration: iterations-20_factors-40_lambda-0.1_learning_parameter-1


 30%|███       | 6/20 [00:14<00:34,  2.48s/it]

In [None]:
# Spot-check of the training matrix: look up one (user, content) pair by id and
# display the stored value.
# `Factorizer` is not defined anywhere in this notebook, so the check uses
# Model_Implicit (every Implicit instance loads the same matrices).
idUser = 298
idContent = "1_16946"
indexUser = np.where(Model_Implicit.user_ids==idUser)[0][0]
indexContent = np.where(Model_Implicit.content_ids==idContent)[0][0]
# Index the sparse matrix directly instead of densifying it with .toarray().
Model_Implicit.training_set_csr[indexUser, indexContent]

In [None]:
# Display the user factor matrix of the fitted ALS model.
Model_Implicit.model.user_factors

### Download Links

<a href="./model_acc.npy"> Download Accuracy Numpy File</a>

In [None]:
# Names of the non-notebook entries in the working directory; filled by get_directory().
directory = list()

def get_directory():
    """Refresh `directory` with the sorted non-notebook entries of the working directory.

    Entries whose name contains ".ipynb" are left out. The list is replaced in
    place on every call, so calling this repeatedly does not accumulate
    duplicates, and it is sorted so that positional access such as
    `directory[4]` does not depend on the arbitrary order of `os.listdir`.

    Returns the module-level `directory` list.
    """
    directory[:] = sorted(
        filename for filename in os.listdir("./") if ".ipynb" not in filename
    )
    return directory

In [None]:
# Populate `directory` and display how many entries it holds.
get_directory()
len(directory)

In [None]:
# Render a download link for model_acc.npy.
# NOTE(review): the sweep methods in this notebook save to timestamped
# "model_acc_<time>" files -- confirm that a plain model_acc.npy exists.
FileLink(r'model_acc.npy')

In [None]:
# Generate the download link for one file of the directory listing manually.
# The index 4 is picked by hand; change it to link a different entry.
FileLink(directory[4])

In [None]:
# Scratch log of hand-recorded results (presumably MAR values -- TODO confirm).
# NOTE(review): "500 reverse" presumably means a 500-entry ranking window with
# the sort order reversed; the numbers in the comments below look like the
# value of one hyper-parameter per run -- confirm which one.
# 500 reverse
# 30
0.5104545454545455
# 3
0.4364545454545455
# 5
0.4955
#60
0.49937499999999996

In [None]:
# Scratch log of hand-recorded results (presumably MAR values -- TODO confirm).
# NOTE(review): "5000 normal" presumably means a 5000-entry ranking window with
# the default ascending sort; the numbers in the comments below look like the
# value of one hyper-parameter per run -- confirm which one.
# 5000 normal
# 80
0.40148333333333336
# 5
0.6035428571428572