In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import os
import implicit
import pickle
from multiprocessing import Pool

In [13]:
class Implicit:
    def __init__(self, seed = 941, environment = "offline"):
        self.seed = seed
        
        if environment == "kaggle":
            self.read_matrices_timesplit("/kaggle/input/")
        else:
            self.read_matrices_timesplit("../")
        
    def read_matrices_timesplit(self, path):
        self.training_set_csr = sparse.load_npz(f"{path}/ratings_matrix_csr.npz")
        self.training_set_coo = sparse.load_npz(f"{path}/ratings_matrix_coo.npz")
        
        self.user_ids = np.load(f"{path}/user_ids.npy")
        self.content_ids = np.load(f"{path}/content_ids.npy")
        
        self.test_set = pd.read_pickle(f"{path}/test_set.pkl", compression="zip")

        self.num_users, self.num_items = self.training_set_csr.shape
    
    def alternating_least_squares(self, iterations, factors, i_lambda, alpha):
        self.model = implicit.als.AlternatingLeastSquares(factors=factors, regularization = i_lambda, iterations = iterations)
        self.model.fit((self.training_set_csr*alpha).astype('double'))
                
    def logistic_factorization(self, iterations, factors, i_lambda, learning_parameter):
        self.model = implicit.lmf.LogisticMatrixFactorization(factors, learning_parameter, i_lambda, iterations = iterations)
        self.model.fit((self.training_set_coo).astype('double'))
    
    def predict_user_byIndex(self, index):
        return self.model.item_factors[index].dot(self.model.user_factors.T)
    
    def get_user_vectors(self, users):
        user_dict = {"user": "values"}
        
        for i in range(len(users)):
            try:
                user_indx = np.where(self.user_ids==users[i])[0][0]
                user_dict[users[i]] = self.predict_user_byIndex(user_indx)
            except:
                self.not_testable += 1
                
        return user_dict
    
    def get_rank(self, user, item, user_vector = None):
        # If a user_vector is specificied as input, the calculation is done before.
        
        user_indx = np.where(self.user_ids==user)[0][0]
        item_indx = np.where(self.content_ids==item)[0][0]
        
        if user_vector == None:
            user_vector = self.predict_user_byIndex(user_indx)
            
        prob = user_vector[item_indx]
        
        return np.where(np.sort(user_vector, kind="mergesort")[::-1]==prob)[0][0]
    
    def expected_percentile_ranking(self):
        accuracy = list()
        mar = 0   
        self.not_testable = 0        
        users = self.test_set.idUser.values
        contents = self.test_set.fullId.values
        breakpoint = 4000
        reportpoint = breakpoint if breakpoint < len(self.test_set.idUser) else len(self.test_set.idUser)
        step = int(len(self.test_set.idUser) / 12) if breakpoint == None else int(breakpoint / 16.5)
        
        user_dict = self.get_user_vectors(self.test_set.idUser.unique())
        
        for i in range(len(self.test_set.idUser)):
            if i > breakpoint:
                break
                
            try:
                accuracy.append(self.get_rank(users[i], contents[i], user_vector = user_dict[users[i]]) / self.num_items)
            except:
                pass
                
            if i % step == 0:
                print(f"Solved iteration: {i}. That's about {np.round((i/reportpoint)*100,2)}%.")
            
        mar = np.mean(accuracy)
            
        return mar, accuracy

In [30]:
class Studio:
    def __init__(self, seed, env, factorizer = ""):
        self.counter = 0
        self.seed = seed
        self.env = env
        
        if factorizer == "":
            self.MF = Implicit(seed, env)
            self.MF.create_training_data()
        else:
            self.MF = factorizer
            
    def save_model(self, user_vec, item_vec, parameters = ""):
        np.save(f"user_vec_{self.counter}_{parameters}", user_vec)
        np.save(f"item_vec_{self.counter}_{parameters}", item_vec)
        self.counter += 1
        
    def run_test_als(self, v_iterations, v_factors, v_lambdas, v_alphas):
        model_acc = list()
        
        for it in v_iterations:
            for factor in v_factors:
                for in_lambda in v_lambdas:
                    for alpha in v_alphas:
                        print(f"iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}")
                        self.MF.alternating_least_squares(iterations = it, factors = factor, i_lambda = in_lambda, alpha = alpha)
                        mar, accuracy = self.MF.expected_percentile_ranking()
                        
                        self.save_model(self.MF.model.user_factors, self.MF.model.item_factors, f"iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}")
                        model_acc.append([f"model_als-iterations-{it}_factors-{factor}_lambda-{in_lambda}_alpha-{alpha}", mar, accuracy])
                        pickle.dump(model_acc,open("model_acc","w"))
                        
        return model_acc
                        
    def expected_percentile_ranking(self):
        def split_set(users, contents, splits):
            output = list()
            length = len(users) / splits
            for i in range(splits):
                output.append(str(i), [users[length * i: length * i + 1], contents[length * i: length * i + 1]])
                
            return output
        
        def process_chunk(chunk):
            # each chunk contains a list of users and items.
            subtrain = chunk[0]
            users = chunk[1]
            contents = chunk[2]
            
            for i in range(len(users)):
                if i > 5000:
                    break

                try:
                    accuracy.append(self.get_rank(users[i], contents[i], user_vector = self.user_dict[users[i]]) / self.num_items)
                except:
                    pass

                if i % 1000 == 0:
                    print(f"Subtrain: {subtrain}. Reporting iteration: {i} of {len(users)}.")
        
        accuracy = list()
        mar = 0   
        self.not_testable = 0        
        users = self.MF.test_set.idUser.values
        contents = self.MF.test_set.fullId.values

        self.user_dict = self.MF.get_user_vectors(users)

        # Multiprocessing unit
        num_cores = 4
        pool = Pool(num_cores)
        all_chunks = expected_percentile_ranking.split_set(users, contents, num_cores)
        result = pool.map(process_chunk, all_chunks)

        #mar = np.mean(accuracy)

        return result
                        

In [16]:
# Implicit Alternating Least Squares Model Koren 2008.
Implicit = Implicit(941, "kaggle")
#Implicit.alternating_least_squares(iterations = 15, factors = 40, i_lambda = 0.15, alpha = 20)

In [15]:
# Logistic Factorization Johnson 2014.
Logistic = Implicit(941, "kaggle")
Logistic.logistic_factorization(iterations = 15, factors = 60, i_lambda = 0.15, learning_parameter = 30)

100%|██████████| 15/15 [00:50<00:00,  3.35s/it]


In [None]:
mar, accuracy = Implicit.expected_percentile_ranking()
mar

In [26]:
Analysis = Studio(941, "kaggle", Implicit)

In [27]:
Analysis.run_test_als(v_iterations = [10], v_factors = [10, 20], v_lambdas = [0.1, 0.05, 0.15, 0.3], v_alphas = [10, 20, 40, 100])

iterations-10_factors-10_lambda-0.1_alpha-10


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Solved iteration: 0. That's about 0.0%.
Solved iteration: 242. That's about 6.05%.
Solved iteration: 484. That's about 12.1%.
Solved iteration: 726. That's about 18.15%.
Solved iteration: 968. That's about 24.2%.
Solved iteration: 1210. That's about 30.25%.
Solved iteration: 1452. That's about 36.3%.
Solved iteration: 1694. That's about 42.35%.
Solved iteration: 1936. That's about 48.4%.
Solved iteration: 2178. That's about 54.45%.
Solved iteration: 2420. That's about 60.5%.
Solved iteration: 2662. That's about 66.55%.
Solved iteration: 2904. That's about 72.6%.
Solved iteration: 3146. That's about 78.65%.
Solved iteration: 3388. That's about 84.7%.
Solved iteration: 3630. That's about 90.75%.
Solved iteration: 3872. That's about 96.8%.


AttributeError: 'tuple' object has no attribute 'append'

In [None]:
# Training matrix works!
idUser = 298
idContent = "1_16946"
indexUser = np.where(Factorizer.user_ids==idUser)[0][0]
indexContent = np.where(Factorizer.content_ids==idContent)[0][0]
Factorizer.training_set_csr.toarray()[indexUser][indexContent]

In [None]:
# 2.500 its, als
0.10589308363675712
# 15000 its, als
0.11361161798583706

In [28]:
import os