In [5]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import scipy.linalg as la
from tqdm import tqdm
import scipy.sparse as sp

In [3]:
def create_sparse_matrices(df):
    
    N = len(df['userid'].unique())
    M = len(df['movieid'].unique())

    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df['userid']).astype('str'), list(range(N))))
    movie_mapper = dict(zip(np.unique(df['movieid']).astype('str'), list(range(M))))

    # Map indices to IDs
    user_inv_mapper = dict(zip([str(i) for i in range(N)], [int(i) for i in np.unique(df['userid'])]))
    movie_inv_mapper = dict(zip([str(i) for i in range(M)], np.unique(df['movieid'])))

    user_index = [user_mapper[str(i)] for i in df['userid']]
    movie_index = [movie_mapper[str(i)] for i in df['movieid']]

    sparse_movie_user = coo_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
    sparse_user_movie = coo_matrix((df["rating"], (user_index, movie_index)), shape=(N, M))

    return sparse_movie_user, sparse_user_movie, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

def low_rank_matrix_factorization(X_sparse, k, niters=8, lam=10., seed = 0):
    """
    Factor a rating matrix into user-features and movie-features.

    args:
        X_sparse (sp.coo_matrix[num_users, num_movies]) : the ratings matrix, assumed sparse in COO format
        k (int) : the number of features in the lower-rank matrices U and V
        niters (int) : number of iterations to run
        lam (float) : regularization parameter, shown as lambda
        seed (int) : the seed for numpy random generator

    return : Tuple(U, V)
        U : np.array[num_users,  k] -- the user-feature matrix
        V : np.array[k, num_movies] -- the movie-feature matrix
    """
    np.random.seed(seed)
    I = np.eye(k)
    X_sparse_shape = X_sparse.shape
    U = np.random.normal(loc = 0, scale = 0.1, size = (X_sparse_shape[0],k))

    V = np.random.normal(loc = 0, scale = 0.1, size = (k,X_sparse_shape[1]))


    X_rows = X_sparse.tocsr()
    X_cols = X_sparse.tocsc()

    for _ in tqdm(range(niters)):
        for i in tqdm(range(V.shape[1])):
            if i % 5000 == 0:
              print(i)

            ori_col = X_cols.getcol(i).toarray()
            indicator_col = (ori_col!=0)


            B = (ori_col * U).sum(axis = 0)

            A =  U.T @ (indicator_col * U)

            V[:,i] = la.solve(A + lam * I, B)



        for s in range(U.shape[0]):
            if s % 5000 == 0:
                print(s)

            ori_row = X_rows.getrow(s).toarray()
            indicator_row = (ori_row!=0)

            B = (ori_row * V).sum(axis = 1)
            A = (indicator_row *V ) @  V.T
            U[s] = (la.solve(A + lam * I, B))


    return (U,V)

In [None]:
data_rate_path = 'data/rate.csv'
data_movie_path = 'data/movies.csv'
n_data = 3000

In [8]:
df_rate = pd.read_csv(data_rate_path)[:n_data]
df_movie = pd.read_csv(data_movie_path)  
(sparse_movie_user, sparse_user_movie, 
user_mapper, movie_mapper, 
user_inv_mapper, movie_inv_mapper) = create_sparse_matrices(df_rate)

X_check = sparse_user_movie
U_check, V_check = low_rank_matrix_factorization(X_check, 4,niters = 5)

np.save('exp_data/U_check_new.npy', U_check)
np.save('exp_data/V_check_new.npy', V_check)
sp.save_npz('exp_data/X_check_new.npz', X_check)

  0%|          | 0/5 [00:00<?, ?it/s]
 68%|██████▊   | 995/1466 [00:00<00:00, 4981.66it/s]

0


100%|██████████| 1466/1466 [00:00<00:00, 4964.88it/s]
 20%|██        | 1/5 [00:00<00:01,  2.67it/s]

0




0


100%|██████████| 1466/1466 [00:00<00:00, 5000.22it/s]
 40%|████      | 2/5 [00:00<00:01,  2.71it/s]

0




0


100%|██████████| 1466/1466 [00:00<00:00, 5461.59it/s]
 60%|██████    | 3/5 [00:01<00:00,  2.82it/s]

0




0


100%|██████████| 1466/1466 [00:00<00:00, 5003.30it/s]
 80%|████████  | 4/5 [00:01<00:00,  2.78it/s]

0




0


100%|██████████| 1466/1466 [00:00<00:00, 5100.42it/s]
100%|██████████| 5/5 [00:01<00:00,  2.76it/s]

0



