In [None]:
# fast als by flipboard
# NOTE(review): this cell reads Factorizer.training_set, but `Factorizer` is only
# created (and create_training_data() only called) in a later cell of this
# notebook. On a fresh kernel, run that setup cell before this one.
# NOTE(review): `implicit` is a third-party package imported here rather than in
# the notebook's import cell.
import implicit
# Confidence scaling applied to the raw interaction counts before fitting.
alpha = 15
# Fit with the `implicit` library; the training matrix is scaled by alpha and cast to float64.
# These names (user_vecs, item_vecs) are also assigned by later cells in this notebook.
user_vecs, item_vecs = implicit.alternating_least_squares((Factorizer.training_set*alpha).astype('double'), factors=20, regularization = 0.1, iterations = 50)
# Scores of the first user against every item — presumably item_vecs is
# (n_items, factors) here, hence the transpose; TODO confirm against the library.
user_vecs[0,:].dot(item_vecs.transpose())

In [1]:
import pandas as pd
import numpy as np
import time
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn import metrics
import random
import os

In [2]:
class Matrix_Factorizer:
    """Implicit-feedback matrix factorisation with alternating least squares.

    The interaction matrix is a scipy sparse matrix whose rows are users and
    whose columns are items. Typical usage: call one of the ``read_matrices_*``
    methods, then ``create_training_data()``, then
    ``alternating_least_squares(...)`` or ``test_parameters(...)``.
    """

    def __init__(self, seed):
        """Store ``seed``; it drives both the train/test split and the factor initialisation."""
        self.seed = seed

    def _load_matrix(self, path):
        """Load the sparse interaction matrix from ``path`` and cache its dimensions."""
        self.matrix = sparse.load_npz(path)
        self.num_users, self.num_items = self.matrix.shape

    def read_matrices_small(self):
        """Load the 500-sample interaction matrix (``matrix_asFrame`` is not loaded here)."""
        self._load_matrix("../ratings_matrix_500_full.npz")

    def read_matrices_full(self, size):
        """Load the 10000-sample data frame and interaction matrix for the given ``size`` suffix."""
        self.matrix_asFrame = pd.read_excel(f"../data_10000_{size}.xlsx")
        self._load_matrix(f"../ratings_matrix_10000_{size}.npz")

    def read_matrices_kaggle(self, size):
        """Same as ``read_matrices_full`` but reading from the ``/kaggle/input`` directory."""
        self.matrix_asFrame = pd.read_excel(f"/kaggle/input/data_10000_{size}.xlsx")
        self._load_matrix(f"/kaggle/input/ratings_matrix_10000_{size}.npz")

    def save_model(self, user_vec, item_vec, additional = ""):
        """Persist both factor matrices under ``../saved_models``.

        The files are named ``user_vec_<unix time>_<additional>`` and
        ``item_vec_<unix time>_<additional>``. The string
        ``"<unix time>_<additional>"`` is what ``load_model`` expects later.

        Args:
            user_vec: sparse user-factor matrix.
            item_vec: sparse item-factor matrix.
            additional: free-text suffix, e.g. a description of the hyperparameters.
        """
        time_int = int(time.time())
        sparse.save_npz(f"../saved_models/user_vec_{time_int}_{additional}", user_vec, compressed=True)
        sparse.save_npz(f"../saved_models/item_vec_{time_int}_{additional}", item_vec, compressed=True)

    def load_model(self, time):
        """Load a factor pair previously written by ``save_model``.

        Args:
            time: identifier embedded in the file names, i.e. the
                ``"<unix time>_<additional>"`` part. The parameter name shadows
                the ``time`` module inside this method only.

        Returns:
            Tuple ``(user_vec, item_vec)`` of sparse matrices.
        """
        user_vec = sparse.load_npz(f"../saved_models/user_vec_{time}.npz")
        item_vec = sparse.load_npz(f"../saved_models/item_vec_{time}.npz")

        return user_vec, item_vec

    def create_training_data(self, percent_test_set = 0.15):
        """Split ``self.matrix`` into a masked training set and a binary test set.

        A seeded random sample of the non-zero interactions is set to zero in
        the training set. The test set keeps every interaction of the full
        matrix, binarised to 1.

        Args:
            percent_test_set: fraction of the non-zero entries to hide from training.

        Sets:
            self.training_set: CSR copy of the matrix with the sampled entries removed.
            self.test_set: CSR copy of the matrix with every non-zero value set to 1.
            self.indices: distinct user rows that had at least one entry hidden.
        """
        # CSR copies: element assignment and row slicing need an indexable format.
        training_set = self.matrix.tocsr().copy()
        test_set = self.matrix.tocsr().copy()
        # The test set should only measure whether the item is bought, not how often.
        test_set.data[test_set.data != 0] = 1

        positive_indices = training_set.nonzero()
        positive_values = list(zip(positive_indices[0], positive_indices[1]))
        random.seed(self.seed)
        n_samples = int(percent_test_set*len(positive_values))
        samples = random.sample(positive_values, n_samples)
        # Split the sampled (user, item) pairs into parallel index lists.
        user_indices = [index[0] for index in samples]
        item_indices = [index[1] for index in samples]
        # Every sampled interaction must look "unseen" in the training set.
        if samples:  # fancy assignment with empty index lists is not valid
            training_set[user_indices, item_indices] = 0
            training_set.eliminate_zeros()

        self.training_set = training_set
        self.test_set = test_set
        self.indices = list(set(user_indices))

    def alternating_least_squares(self, k, alpha, v_lambda, n):
        """Factorise ``self.training_set`` with implicit-feedback ALS.

        Each iteration first re-solves every user vector with the item factors
        held fixed, then re-solves every item vector with the updated user
        factors held fixed.

        Args:
            k: number of latent factors.
            alpha: confidence scaling; the confidence matrix is
                ``alpha * training_set`` (the identity is added inside the solve).
            v_lambda: L2 regularisation strength.
            n: number of ALS iterations.

        Returns:
            Tuple ``(X, Y.T)``: sparse user factors of shape ``(n_users, k)`` and
            sparse transposed item factors of shape ``(k, n_items)``.
        """
        # Confidence matrix; CSR for the per-user row slices, CSC for the per-item column slices.
        confidence = (alpha*self.training_set).tocsr()
        confidence_by_item = confidence.tocsc()
        n_user, n_items = confidence.shape

        # One generator for both factor matrices, so X and Y do not start from the same draws.
        rstate = np.random.RandomState(self.seed)
        X = sparse.csr_matrix(rstate.normal(size = (n_user, k)))
        Y = sparse.csr_matrix(rstate.normal(size = (n_items, k)))
        # Identity matrices used to turn (C - I) back into C in the right-hand sides.
        X_e = sparse.eye(n_user)
        Y_e = sparse.eye(n_items)
        # The regularisation term.
        lambda_e = v_lambda * sparse.eye(k)

        i = 0
        length = n * n_user
        # At least 1, otherwise the modulo below divides by zero for small problems.
        report_step = max(1, length // 30)

        for _ in range(n):
            # Minimise over X_u, keeping Y fixed.
            Y_tY = Y.T.dot(Y)
            for user in range(n_user):
                confidence_user = confidence[user,:].toarray()
                preference = confidence_user.copy()
                preference[preference != 0] = 1
                C_uI = sparse.diags(confidence_user, [0])
                y_tC_uIY = Y.T.dot(C_uI).dot(Y)
                y_tC_up_u = Y.T.dot(C_uI + Y_e).dot(preference.T)
                X[user] = spsolve(Y_tY + y_tC_uIY + lambda_e, y_tC_up_u)

                # Progress status (counts user solves across all iterations).
                i += 1
                if i % report_step == 0:
                    print(f"This is iteration: {i} of {length}.")

            # Minimise over Y_i, keeping the freshly updated X fixed. The Gram
            # matrix is computed here so that it matches the X used below.
            X_tX = X.T.dot(X)
            for item in range(n_items):
                confidence_item = confidence_by_item[:,item].T.toarray()
                preference = confidence_item.copy()
                preference[preference != 0] = 1
                C_iI = sparse.diags(confidence_item, [0])
                x_tC_iTX = X.T.dot(C_iI).dot(X)
                x_tC_iP_i = X.T.dot(C_iI + X_e).dot(preference.T)
                Y[item] = spsolve(X_tX + x_tC_iTX + lambda_e, x_tC_iP_i)

        return X, Y.T

    def mean_percentage_ranking(self, user_vector, item_vector):
        """Print, for every test user, the predicted scores sorted in descending order.

        NOTE: despite its name this does not compute a metric yet and returns
        nothing. Planned steps:
          1. sort all calculated ratings,
          2. for each bought item in the test set, check where it was ranked (in percent).

        Args:
            user_vector: sparse user factors, indexed by user row.
            item_vector: sparse item factors, already transposed so that
                ``user_vector[u, :].dot(item_vector)`` yields one score per item.
        """
        indices = self.indices

        for user_index in indices:
            print(np.sort(user_vector[user_index,:].dot(item_vector).toarray()[0,:])[::-1])

    def test_parameters(self, in_k, in_alpha, in_v_lambda, in_n):
        """Fit and save one model for every combination of the given hyperparameters.

        Args:
            in_k: iterable of latent-factor counts.
            in_alpha: iterable of confidence scalings.
            in_v_lambda: iterable of regularisation strengths.
            in_n: iterable of iteration counts.

        Each fitted model is written with ``self.save_model``; the suffix encodes
        the parameters used. Nothing is returned.
        """
        length = len(in_k) * len(in_alpha) * len(in_v_lambda) * len(in_n)
        iteration = 1

        for n in in_n:
            for k in in_k:
                for alpha in in_alpha:
                    for v_lambda in in_v_lambda:
                        user_vecs, item_vecs = self.alternating_least_squares(k=k, alpha=alpha, v_lambda=v_lambda, n=n)
                        # Save through this instance, not through a notebook-level global.
                        self.save_model(user_vecs, item_vecs, f"k={k}_alpha={alpha}_lambda={v_lambda}_n={n}")

                        print(f"Solved set: {iteration} of {length}. Parameters were K={k}, alpha={alpha}, lambda={v_lambda} and n={n}.")
                        iteration += 1

In [6]:
# Local setup: 941 is the seed used for the train/test split and the factor initialisation.
Factorizer = Matrix_Factorizer(941)
# Loads ../data_10000_extra_small.xlsx and ../ratings_matrix_10000_extra_small.npz.
Factorizer.read_matrices_full("extra_small")
# Hides the default 15% of the non-zero interactions from the training set.
Factorizer.create_training_data()

In [None]:
# In Kaggle Kernels
# Alternative to the local setup cell: rebinds `Factorizer`, reading the "small"
# data set from /kaggle/input instead. Run only one of the two setup cells.
Factorizer = Matrix_Factorizer(941)
Factorizer.read_matrices_kaggle("small")
Factorizer.create_training_data()

In [None]:
# Run one model
# k = latent factors, alpha = confidence scaling, v_lambda = regularisation, n = iterations.
# item_vecs is returned already transposed (the method returns X, Y.T).
user_vecs, item_vecs = Factorizer.alternating_least_squares(k=40, alpha=15, v_lambda=0.1, n=5)

In [None]:
# Test models with a set of parameters
# 3 values of k x 1 alpha x 1 lambda x 2 iteration counts = 6 fits; each fit is saved via save_model.
Factorizer.test_parameters(in_k=[10,20,50], in_alpha=[10], in_v_lambda=[0.1], in_n=[5, 10])

In [None]:
# NOTE(review): toarray() materialises the whole sparse interaction matrix as a
# dense array just for display; consider inspecting a slice or .shape instead.
Factorizer.matrix.toarray()

In [None]:
# Dense view of the user factors; requires user_vecs from a model-fitting or load cell.
user_vecs.toarray()

In [None]:
# Dense view of the item factors; requires item_vecs from a model-fitting or load cell.
item_vecs.toarray()

In [13]:
%%time
# Highest predicted score for the first user. item_vecs is used without a
# transpose here, i.e. in the (factors, items) layout returned by
# Matrix_Factorizer.alternating_least_squares.
# NOTE(review): depends on user_vecs/item_vecs left in the kernel by an earlier cell.
np.max(user_vecs[0,:].dot(item_vecs).toarray()[0,:])

Wall time: 2 ms


0.11300903804113793

In [14]:
%%time
# Identifiers of previously saved models, in the "<unix time>_<suffix>" form that load_model expects.
saved_models = ["1568052491_k=10_alpha=15_lambda=0.1_n=5", "1568079022_k=20_alpha=15_lambda=0.1_n=5", "1568108649_k=50_alpha=15_lambda=0.1_n=5"]
# Load the first saved model; this overwrites user_vecs/item_vecs from any earlier fit.
user_vecs, item_vecs = Factorizer.load_model(saved_models[0])
# Evaluation call left disabled by the author.
#Factorizer.mean_percentage_ranking(user_vecs, item_vecs)

Wall time: 34 ms
