In [4]:
import pandas as pd
import numpy as np
import os
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

In [33]:
class Matrix_Factorizer:
    def __init__(self, seed):
        self.seed = seed
        
    def read_matrices_small(self):        
        #self.matrix_asFrame = pd.read_excel("../data_500_full.xlsx")
        self.matrix = sparse.load_npz("../ratings_matrix_500_full.npz")
        
        self.num_users, self.num_items = self.matrix.shape
        
    def read_matrices_full(self):        
        self.matrix_asFrame = pd.read_excel("../data_10000_full.xlsx")
        #self.matrix_asFrame = self.matrix_asFrame.drop(labels="Unnamed: 0", axis=1)
        self.matrix = sparse.load_npz("../ratings_matrix_10000_full.npz")
        
        self.num_users, self.num_items = self.matrix.shape
    
    def create_training_data(self, percent_test_set = 0.15):
        training_set = self.matrix.copy()
        positive_indices = training_set.nonzero()
        positive_values = list(zip(positive_indices[0], positive_indices[1]))
        test_set = self.matrix.copy()
        test_set[test_set != 0] = 1
        random.seed(self.seed)
        n_samples = int(percent_test_set*len(positive_values))
        samples = random.sample(positive_values, n_samples)
        user_indices = [index[0] for index in samples]
        item_indices = [index[1] for index in samples]
        training_set[user_indices, item_indices] = 0
        training_set.eliminate_zeros()
        
        self.training_set = training_set
        self.test_set = test_set
        self.indices = list(set(user_indices))
        
    def alternating_least_squares(self, k, alpha, v_lambda, n):
        # K - Latent Factors, Alpha - Learning rate, V_lambda - Regulation, N - Iterations
        training_set = self.training_set
        test_set = self.test_set
        
        confidence = (alpha*training_set)
        n_user = confidence.shape[0]
        n_items = confidence.shape[1]
        
        X = sparse.csr_matrix(np.random.RandomState(self.seed).normal(size = (n_user, k)))
        Y = sparse.csr_matrix(np.random.RandomState(self.seed).normal(size = (n_items, k)))
        # sparse.eye erzeugt eine Eigenmatrix
        X_eye = sparse.eye(n_user)
        Y_eye = sparse.eye(n_items)
        # The regulization term
        lambda_eye = v_lambda * sparse.eye(k)
        
        for iteration in range(n):
            Y_tY = Y.T.dot(Y)
            X_tX = X.T.dot(X)
            
            for user in range(n_user):
                confidence_sample = confidence[user,:].toarray()
                preference = confidence_sample.copy()
                preference[preference != 0] = 1
                CuI = sparse.diags(confidence_sample, [0])
                yTCuIY = Y.T.dot(CuI).dot(Y)
                yTCupu = Y.T.dot(CuI + Y_eye).dot(preference.T)
                X[user] = spsolve(Y_tY + yTCuIY + lambda_eye, yTCupu)
                
                for item in range(n_items):
                    confidence_sample = confidence[:,item].T.toarray()
                    preference = confidence_sample.copy()
                    preference[preference != 0] = 1
                    CiI = sparse.diags(confidence_sample, [0])
                    xTCiTX = X.T.dot(CiI).dot(X)
                    xTCiPi = X.T.dot(CiI + X_eye).dot(preference.T)
                    Y[item] = spsolve(X_tX + xTCiTX + lambda_eye, xTCiPi)
            self.printed = self.get_progress(iteration, n, self.printed)
        return X, Y.T
    def get_progress(self, i, n, printed):
        val = printed
        if i/n > 0.95 and printed < 0.95:
            pct = "95%"
            val = 0.95
            break
        elif i/n > 0.90:
            pct = "90%"
            val = 0.90
            break
        elif i/n > 0.75:
            pct = "75%"
            val = 0.75
            break
        elif i/n > 0.50:
            pct = "50%"
            val = 0.5
            break
        elif i/n > 0.25:
            pct = "25%"
            val = 0.25
            break
        elif i/n > 0.10:
            pct = "10%"
            val = 0.1
            break
        elif i/n > 0.01:
            pct = "1%"
            val = 0.01
            break
    
        print(f"More than: {pct} done.")
        return val

In [34]:
Factorizer = Matrix_Factorizer(941)
Factorizer.read_matrices_small()
Factorizer.create_training_data()

In [35]:
Factorizer.matrix.toarray()

array([[1.2759e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [2.2000e+01, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [6.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [None]:
user_vecs, item_vecs = Factorizer.alternating_least_squares(10, 15, 0.1, 1)

In [None]:
user_vecs

In [None]:
item_vecs

In [38]:
# fast blackbox als by flipboard
import implicit
alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), factors=20, regularization = 0.1, iterations = 50)

ModuleNotFoundError: No module named 'implicit'