In [None]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
import random

Mounted at /content/gdrive


Read the ratings from the dataset and choose the rows (movies) and columns (users) used to build the R matrix.

In [None]:
# Load the raw ratings; the file is read without a header row, so the three
# column names are assigned by hand.
netflix_table = pd.read_csv('netflix_2_set.txt', index_col=None, header=None)
netflix_table.columns = ['movieId', 'userId', 'rating']

# Keep a small sub-sample: at most 20 distinct users (columns of R) and at most
# 50 distinct movies (rows of R). The ids are taken from a set, so which ones
# are kept follows the set's iteration order, not the order in the file.
columns = list(set(netflix_table['userId']))[:20]
index = list(set(netflix_table['movieId']))[:50]

In [None]:
def create_R_matrix(netflix_table: pd.DataFrame, movie_ids=None, user_ids=None) -> pd.DataFrame:
  ''' Build the movie x user rating matrix R from the long ratings table.

  Parameters
  ----------
  netflix_table : DataFrame with 'movieId', 'userId' and 'rating' columns.
  movie_ids : unique row labels of R; defaults to the module-level `index`.
  user_ids : unique column labels of R; defaults to the module-level `columns`.

  Returns
  -------
  Integer DataFrame with one row per movie id and one column per user id.
  A cell holds the rating truncated to int, or 0 when there is no usable
  rating for that (movie, user) pair.

  Notes
  -----
  A pair that occurs more than once in the table is ambiguous and stays 0,
  and so does a rating that is missing or not numeric.
  '''
  rows = pd.Index(index if movie_ids is None else movie_ids)
  cols = pd.Index(columns if user_ids is None else user_ids)

  # One pass over the table instead of one full scan per matrix cell.
  selected = netflix_table['movieId'].isin(rows) & netflix_table['userId'].isin(cols)
  known = netflix_table.loc[selected, ['movieId', 'userId', 'rating']]
  known = known.drop_duplicates(subset=['movieId', 'userId'], keep=False)
  known = known.assign(rating=pd.to_numeric(known['rating'], errors='coerce'))
  known = known.dropna(subset=['rating'])

  # Translate ids to matrix positions; every id left in `known` is present in
  # rows/cols, so get_indexer never returns -1 here.
  row_pos = rows.get_indexer(known['movieId'])
  col_pos = cols.get_indexer(known['userId'])

  values = np.zeros((len(rows), len(cols)), dtype=int)
  values[row_pos, col_pos] = known['rating'].astype(int).to_numpy()
  return pd.DataFrame(values, index=rows, columns=cols)

'\n  for c in columns:\n    for i in index:\n      if R.at[i, c] == 0:\n        if  np.random.random() < 0.3:\n          R.at[i, c] = np.random.randint(5)+1\n\n  return R\n'

# Recommendation system

In [None]:
def error(R, P, Q):
  ''' Sum of squared differences between R and the product P.Q, taken only
  over the entries where R is greater than 0. Returns 0 when there are none. '''
  squared_residuals = (
      (R[row][col] - np.dot(P[row, :], Q[:, col])) ** 2
      for row in range(len(R))
      for col in range(len(R[0]))
      if R[row][col] > 0
  )
  # sum() starts at 0 and adds the terms in row-major order.
  return sum(squared_residuals)


def factorize_matrix(R: np.ndarray, K: int = 2, regularization: bool = True, alpha = 0.002, beta = 0.2, steps = 10000, tol = 0.06, seed = None):
  ''' Factorise R ~ P.Q by stochastic gradient descent (not a true SVD).

  Only entries with R > 0 are treated as known ratings; every other entry is
  ignored during training.

  Parameters
  ----------
  R : 2-D rating matrix (N x M); values <= 0 mean "unknown".
  K : number of latent factors.
  regularization : when True, add an L2 penalty weighted by `beta`.
  alpha : learning rate.
  beta : regularisation weight (used only when `regularization` is True).
  steps : maximum number of passes over the known ratings.
  tol : stop early once the summed squared error on the known ratings drops
        below this value (the same quantity `error(R, P, Q)` computes).
  seed : optional seed for a private random generator; when None the global
         `np.random` state is used, as before.

  Returns
  -------
  (P, Q) with shapes (N, K) and (K, M).

  Raises
  ------
  ValueError if R is not 2-dimensional.
  '''
  R = np.asarray(R, dtype=float)
  if R.ndim != 2:
    raise ValueError('R must be a 2-D matrix')
  N, M = R.shape

  if seed is None:
    P = np.random.rand(N, K)
    Q = np.random.rand(K, M)
  else:
    rng = np.random.default_rng(seed)
    P = rng.random((N, K))
    Q = rng.random((K, M))

  # The set of known ratings never changes, so locate it once (row-major order).
  known_rows, known_cols = np.nonzero(R > 0)
  known_ratings = R[known_rows, known_cols]
  penalty = beta if regularization else 0.0

  for _ in range(steps):
    for x, i, rating in zip(known_rows, known_cols, known_ratings):
      residual = rating - np.dot(P[x, :], Q[:, i])
      # d/dP[x,k] (r - p.q)^2 = -2 * residual * Q[k,i]  (and symmetrically for Q).
      # Both updates use the values from before this step, hence the copy.
      p_old = P[x, :].copy()
      P[x, :] += alpha * (2 * residual * Q[:, i] - penalty * P[x, :])
      Q[:, i] += alpha * (2 * residual * p_old - penalty * Q[:, i])

    predicted = np.sum(P[known_rows, :] * Q[:, known_cols].T, axis=1)
    if np.sum((known_ratings - predicted) ** 2) < tol:
      break

  return P, Q

In [None]:
def compare_R_and_S_matrices(R: np.ndarray, S: np.ndarray) -> float:
  ''' Distance between R and S measured only on the known values of R.

  An entry of R equal to 0 is treated as unknown and skipped. The result is
  the square root of the summed squared differences over the other entries.

  Parameters
  ----------
  R : 2-D matrix of ratings, 0 meaning "unknown".
  S : matrix of the same shape to compare against R.

  Returns
  -------
  The distance as a float (0.0 when R has no known entry).
  '''
  R = np.asarray(R, dtype=float)
  S = np.asarray(S, dtype=float)
  # A boolean mask can be reused for every entry, unlike a one-shot zip
  # iterator, which is exhausted by the first membership test.
  known = R != 0
  return np.sqrt(np.sum((R[known] - S[known]) ** 2))

Create the R matrix from the data, factorize it (SVD-style matrix factorization), and build the output matrix S as the product of the two factors.

In [None]:
# Build the rating table and keep only its values as a NumPy array.
R = create_R_matrix(netflix_table).to_numpy()

In [None]:
# Learn the two factors of R, then multiply them back into the output matrix S.
P, Q = factorize_matrix(
    R,
    K=2,
    regularization=True,
    alpha=0.002,
    beta=0.2,
    steps=10000,
)
S = P.dot(Q)

Comparison of the matrices R and S.

In [None]:
# Frobenius norm of the difference R - S over all entries. Passing the tuple
# (R, S) would instead take the norm of the two matrices stacked together.
print(np.linalg.norm(np.asarray(R, dtype=float) - S))
# Distance reported by compare_R_and_S_matrices (meant to cover only the known values of R).
print(compare_R_and_S_matrices(R, S))

<zip object at 0x7f1b48b4b5a0>
23.095455178795206
