In [None]:
# fast blackbox als by flipboard
import implicit
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), factors=20, regularization = 0.1, iterations = 50)

In [1]:
import pandas as pd
import numpy as np
import time
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn import metrics
import random

In [18]:
class Matrix_Factorizer:
    def __init__(self, seed):
        self.seed = seed
        
    def read_matrices_small(self):        
        #self.matrix_asFrame = pd.read_excel("../data_500_full.xlsx")
        self.matrix = sparse.load_npz("../ratings_matrix_500_full.npz")
        
        self.num_users, self.num_items = self.matrix.shape
        
    def read_matrices_full(self, size):        
        self.matrix_asFrame = pd.read_excel(f"../data_10000_{size}.xlsx")
        #self.matrix_asFrame = self.matrix_asFrame.drop(labels="Unnamed: 0", axis=1)
        self.matrix = sparse.load_npz(f"../ratings_matrix_10000_{size}.npz")
        
        self.num_users, self.num_items = self.matrix.shape
        
    def save_model(self, user_vec, item_vec):
        time_int = int(time.time())
        sparse.save_npz(f"../user_vec_{time_int}", user_vec, compressed=True)
        sparse.save_npz(f"../item_vec_{time_int}", item_vec, compressed=True)
        
    def load_model(self, time):
        user_vec = sparse.load_npz(f"../user_vec_{time}.npz")
        item_vec = sparse.load_npz(f"../item_vec_{time}.npz")
        
        return user_vec, item_vec
    
    def create_training_data(self, percent_test_set = 0.15):
        training_set = self.matrix.copy()
        positive_indices = training_set.nonzero()
        positive_values = list(zip(positive_indices[0], positive_indices[1]))
        test_set = self.matrix.copy()
        # The test set should only measure, if the item is going to be bought, not the frequency.
        test_set[test_set != 0] = 1
        random.seed(self.seed)
        n_samples = int(percent_test_set*len(positive_values))
        samples = random.sample(positive_values, n_samples)
        # Put all chosen samples into lists.
        user_indices = [index[0] for index in samples]
        item_indices = [index[1] for index in samples]
        # All users and items that are used in the test set, should be unviewed in the training set.
        training_set[user_indices, item_indices] = 0
        training_set.eliminate_zeros()
        
        self.training_set = training_set
        self.test_set = test_set
        self.indices = list(set(user_indices))
        
    def alternating_least_squares(self, k, alpha, v_lambda, n):
        # K - Latent Factors, Alpha - Learning rate, V_lambda - Regulation, N - Iterations
        training_set = self.training_set
        test_set = self.test_set
        
        # Starting the confidence matrix
        confidence = (alpha*training_set)
        n_user = confidence.shape[0]
        n_items = confidence.shape[1]
        
        X = sparse.csr_matrix(np.random.RandomState(self.seed).normal(size = (n_user, k)))
        Y = sparse.csr_matrix(np.random.RandomState(self.seed).normal(size = (n_items, k)))
        # sparse.eye is creating the eigenmatrix
        X_e = sparse.eye(n_user)
        Y_e = sparse.eye(n_items)
        # The regulization term
        lambda_e = v_lambda * sparse.eye(k)
        
        i = 0
        length = n * n_user
        report_step = int(legth/20)
        
        for iteration in range(n):
            Y_tY = Y.T.dot(Y)
            X_tX = X.T.dot(X)
            
            for user in range(n_user):
                confidence_sample = confidence[user,:].toarray()
                preference = confidence_sample.copy()
                preference[preference != 0] = 1
                C_uI = sparse.diags(confidence_sample, [0])
                y_tC_uIY = Y.T.dot(C_uI).dot(Y)
                y_tC_up_u = Y.T.dot(C_uI + Y_e).dot(preference.T)
                X[user] = spsolve(Y_tY + y_tC_uIY + lambda_e, y_tC_up_u)


                # Progress Status
                i += 1
                if i % report_step == 0:
                    print(f"This is iteration: {i} of {length}.")
                
                for item in range(n_items):
                    confidence_sample = confidence[:,item].T.toarray()
                    preference = confidence_sample.copy()
                    preference[preference != 0] = 1
                    C_iI = sparse.diags(confidence_sample, [0])
                    x_tC_iTX = X.T.dot(C_iI).dot(X)
                    x_tC_iP_i = X.T.dot(C_iI + X_e).dot(preference.T)
                    Y[item] = spsolve(X_tX + x_tC_iTX + lambda_e, x_tC_iP_i)
                    
        return X, Y.T
    
    def get_progress(self, percentage, printed):
        val = printed
        pct = ""
        if percentage > 0.95 and printed < 0.95:
            pct = "95%"
            val = 0.95
        elif percentage > 0.90 and printed < 0.9:
            pct = "90%"
            val = 0.90
        elif percentage > 0.75 and printed < 0.75:
            pct = "75%"
            val = 0.75
        elif percentage > 0.50 and printed < 0.5:
            pct = "50%"
            val = 0.5
        elif percentage > 0.25 and printed < 0.25:
            pct = "25%"
            val = 0.25
        elif percentage > 0.10 and printed < 0.1:
            pct = "10%"
            val = 0.1
        elif percentage > 0.01 and printed < 0.01:
            pct = "1%"
            val = 0.01
    
        if pct != "":
            print(f"More than: {pct} done.")
        return val
    
    def mean_percentage_ranking(self, user_vector, item_vector):
        # for each item in the test set that was bought, check how hight it is in our predictions.
        # 1. sort all calculated ratings
        # 2. go through each bought one in the test set and check where we predicted it (in percent)
        indices = self.indices
        
        for user_index in indices:
            print(np.sort(user_vector[user_index,:].dot(item_vector).toarray()[0,:])[::-1])
    
    def test_parameters(self, in_k, in_alpha, in_v_lambda, in_n):
        all_user_vecs = list()
        all_item_vecs = list()
        
        length = len(in_k) * len(in_alpha) * len(in_v_lambda) * len(in_n)
        iteration = 1
        
        for n in in_n:
            for k in in_k:
                for alpha in in_alpha:
                    for v_lambda in in_v_lambda:
                        user_vecs, item_vecs = self.alternating_least_squares(k=k, alpha=alpha, v_lambda=v_lambda, n=in_n)
                        all_user_vecs.append(user_vecs)
                        all_item_vecs.append(item_vecs)

                        iteration += 1
                        print(f"Solved set: {iteration} of {length}. Parameters were K={k}, alpha={alpha}, lambda={v_lambda} and n={n}.")

                        self.all_user_vecs = all_user_vecs
                        self.all_item_vecs = all_item_vecs

In [23]:
Factorizer = Matrix_Factorizer(941)
Factorizer.read_matrices_full("extra_small")
Factorizer.create_training_data()

In [None]:
user_vecs, item_vecs = Factorizer.alternating_least_squares(k=40, alpha=15, v_lambda=0.1, n=5)

In [None]:
Factorizer.matrix.toarray()

In [None]:
user_vecs.toarray()

In [None]:
item_vecs.toarray()

In [6]:
np.max(user_vecs[0,:].dot(item_vecs).toarray()[0,:])

1.1214366122778239

In [None]:
#Factorizer.test_parameters(in_k=[10,20,50], in_alpha=[10], in_v_lambda=[0.1], in_n=[5, 10])

In [22]:
Factorizer.save_model(user_vecs, item_vecs)

In [20]:
user_vecs, item_vecs = Factorizer.load_model("1568023640")
Factorizer.mean_percentage_ranking(user_vecs, item_vecs)

[ 1.12143661  0.9253988   0.88924219  0.88323715  0.70173664  0.6630911
  0.62384431  0.60807457  0.60807457  0.60807457  0.60807457  0.60807457
  0.60807457  0.60807457  0.60807457  0.60807457  0.60807457  0.60807457
  0.57640679  0.57640679  0.57640679  0.57640679  0.57640679  0.57640679
  0.57640679  0.57640679  0.57640679  0.57640679  0.57640679  0.57640679
  0.56574975  0.5538477   0.52422589  0.51576225  0.51576225  0.49917708
  0.49917708  0.49917708  0.49917708  0.49917708  0.49917708  0.49917708
  0.49817005  0.48940342  0.47614231  0.47614231  0.47606124  0.46453306
  0.46453306  0.46453306  0.46034775  0.45739843  0.40470399  0.39272095
  0.39272095  0.35593688  0.33746797  0.32576321  0.32576321  0.32576321
  0.32576321  0.32576321  0.32576321  0.32576321  0.32576321  0.32576321
  0.32576321  0.32576321  0.32576321  0.32576321  0.32576321  0.31525481
  0.31525481  0.31525481  0.31525481  0.30872944  0.30872944  0.30872944
  0.30462601  0.30462601  0.30095593  0.28375262  0.

[ 1.12628086  1.07136943  1.07136943  1.07136943  0.99441161  0.99441161
  0.92602492  0.90427996  0.90427996  0.90427996  0.90427996  0.90427996
  0.90427996  0.90427996  0.86255152  0.86255152  0.80148163  0.80148163
  0.80148163  0.80148163  0.80148163  0.78979675  0.74859153  0.74859153
  0.74859153  0.74859153  0.74532922  0.73938177  0.73938177  0.72453129
  0.72335281  0.72335281  0.71680932  0.71463797  0.68736966  0.67548082
  0.67102951  0.58934692  0.58934692  0.58552528  0.58552528  0.58552528
  0.58552528  0.56873485  0.56402149  0.56402149  0.56402149  0.56402149
  0.55652297  0.55538903  0.55103615  0.52312751  0.51688936  0.51688936
  0.51688936  0.51688936  0.48919194  0.48919194  0.48557022  0.48557022
  0.48557022  0.48557022  0.48557022  0.48557022  0.47989082  0.47729629
  0.47507158  0.45898193  0.45898193  0.45898193  0.45898193  0.45898193
  0.45898193  0.45667267  0.45667267  0.45667267  0.45667267  0.45667267
  0.45667267  0.45667267  0.45667267  0.45667267  0