# SVD

## Import libraries

In [1]:
from lib.models import RecommendSystemModel

from typing import List, Any, Tuple,Union
from numpy.typing import NDArray
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# import tensorflow as tf

## Function to update class in Jupyter Notebook 
https://stackoverflow.com/questions/45161393/jupyter-split-classes-in-multiple-cells

In [2]:
import functools
def update_class(
    main_class=None, exclude=("__module__", "__name__", "__dict__", "__weakref__")
):
    """Class decorator. Adds all methods and members from the wrapped class to main_class

    Args:
    - main_class: class to which to append members. Defaults to the class with the same name as the wrapped class
    - exclude: black-list of members which should not be copied
    """

    def decorates(main_class, exclude, appended_class):
        if main_class is None:
            main_class = globals()[appended_class.__name__]
        for k, v in appended_class.__dict__.items():
            if k not in exclude:
                setattr(main_class, k, v)
        return main_class

    return functools.partial(decorates, main_class, exclude)

### Example

In [3]:
class MyClass:
    def method1(self):
        print("method1")
me = MyClass()


In [5]:
@update_class()
class MyClass:
    def method2(self):
        print("method2")
me.method1()
me.method2()

method1
method2


## The ML model

In [6]:
class SVDModel(RecommendSystemModel):
    def __init__(self, mode:str=None, features: int = None, lr: float = None, epochs: int = None, weight_decay: float = None, stopping: float = None, momentum: float = None) -> None:
        # Data frame
        self.data:pd.DataFrame
        # # Training data 
        self.train:NDArray
        # # Validating Data
        self.valid:NDArray
        self.test:NDArray
        # SVD mode 
        self.mode: str = mode or 'funk'
        # Number of features
        self.features: int = features or 10
        # Learning rate
        self.lr: float = lr or 0.0002
        # Number of total epochs
        self.epochs: int = epochs or 101
        # the weight decay 
        self.weight_decay: float = weight_decay or 0.02
        self.stopping: float = stopping or 0.001
        self.momentum: float = momentum or 0.0
        # Tensor SGD optimizer
        # self.optimizer = tf.keras.optimizers.SGD(learning_rate=self.lr, momentum=self.momentum,)
        
        # # Rating matrix
        # self.R: NDArray
        # # User matrix
        # self.P: NDArray
        # # Item matrix
        # self.Q: NDArray
        
        # Rating matrix
        # self._R = self.R.copy()
        # User latent matrix
        self._P: NDArray# = np.random.rand(self.n_users, features) * 0.1
        # Item latent matrix
        self._Q: NDArray # = np.random.rand(self.n_items, features) * 0.1
        
        
        super().__init__()

In [7]:
@update_class()
class SVDModel(RecommendSystemModel):
    def split(self, ratio_train_test: float, ratio_train_valid: float, tensor: bool = False) -> List[NDArray]:
        userItemMatrix = self.convertToUserItemMatrix(self.data, self.n_users, self.n_items)
        
        trainBeforeSplit = np.zeros((len(userItemMatrix), len(userItemMatrix[0]))).tolist()
        self.train = np.zeros((len(userItemMatrix), len(userItemMatrix[0]))).tolist()
        self.valid = np.zeros((len(userItemMatrix), len(userItemMatrix[0]))).tolist()
        self.test = np.zeros((len(userItemMatrix), len(userItemMatrix[0]))).tolist()

        for i in range(len(userItemMatrix)):
            for j in range(len(userItemMatrix[i])):
                if userItemMatrix[i][j] > 0:
                    if np.random.binomial(1, ratio_train_test, 1):
                        trainBeforeSplit[i][j] = userItemMatrix[i][j]
                    else:
                        self.test[i][j] = userItemMatrix[i][j]
        
        for i in range(len(trainBeforeSplit)):
            for j in range(len(trainBeforeSplit[i])):
                if trainBeforeSplit[i][j] > 0:
                    if np.random.binomial(1, ratio_train_valid, 1):
                        self.train[i][j] = trainBeforeSplit[i][j]
                    else:
                        self.valid[i][j] = trainBeforeSplit[i][j]


In [8]:
@update_class()
class SVDModel(RecommendSystemModel):
    def data_loader(self, path:str=None, nrows:int=None, skiprows=None, data:pd.DataFrame=None, n_users: int = None, n_items = None) -> None:
        if not path and data.empty:
            raise 'Error: one of path or data frame should be provided'
        if data.empty:
            self.data = pd.read_csv(path,low_memory=False,nrows=nrows,skiprows=skiprows)
        elif not path:
            self.data = data
        self.n_users = n_users
        self.n_items = n_items

In [9]:
@update_class()
class SVDModel(RecommendSystemModel):
    def train(self) -> Tuple[NDArray, NDArray, float, float]:
        loss_train = []
        loss_valid = []
        errors = []

        # self.n_users = len(self.train)
        # self.n_items = len(self.valid)
        self._P = np.random.rand(self.n_users, self.features) * 0.1
        self._Q = np.random.rand(self.n_items, self.features) * 0.1

        # Johnny
        for e in range(self.epochs):
            for id_user in range(self.n_users):
                for id_item in range(self.n_items):
                    if self.train[id_user][id_item] > 0:
                        
                        predict = self.prediction(self._P, self._Q, id_user, id_item)
                        
                        error = self.train[id_user][id_item] - predict
                        errors.append(error)
                        
                        self.optimize(error, id_user, id_item, self.weight_decay)
            trainLoss = self.loss(self.train, self._P, self._Q)
            validLoss = self.loss(self.valid, self._P, self._Q)
            loss_train.append(trainLoss)
            loss_valid.append(validLoss)
            if e % 10 == 0:
                print('Epoch : ', "{:3.0f}".format(e+1), ' | Train :', "{:3.3f}".format(trainLoss), 
                    ' | Valid :', "{:3.3f}".format(validLoss))
                
            # TODO stopping criterion
            if (trainLoss - loss_train[-2]) < self.stopping:
                break
        return loss_train, loss_valid, errors
        # return super().learn_to_recommend(data, features, lr, epochs, weight_decay, stopping)

In [33]:
@update_class()
class SVDModel(RecommendSystemModel):
    def convertToUserItemMatrix(self, data, n_users, n_movies):
        data = np.array(data,dtype=int)
        userItemMatrix = []
        for id_user in range(1, n_users + 1):
            if id_user % 1000 == 0: print('{}/{}'.format(id_user, n_users))
            # id_movies = np.array([ratingRecord[1] for ratingRecord in data if ratingRecord[0] == id_user],dtype=int)
            # # print(id_movies.dtype)
            # id_ratings = [ratingRecord[2] for ratingRecord in data if ratingRecord[0] == id_user]
            id_movies = data[:, 1][data[:, 0] == id_user]
            id_ratings = data[:, 2][data[:, 0] == id_user]
            ratings = np.zeros(n_movies)
            ratings[id_movies - 1] = id_ratings
            # try:
            #     ratings[id_movies - 1] = id_ratings
            # except:
            #     print(id_user)
            #     raise 'Error:'
            userItemMatrix.append(list(ratings))
        return userItemMatrix

In [11]:
@update_class()
class SVDModel(RecommendSystemModel):
    def prediction(self, P: NDArray, Q: NDArray, u: int, i: int) -> float:
        # Woody
        return np.dot(P[u: ], Q[ :i])
        # print(321)
        # return super().prediction(P, Q, u, i)

In [12]:
@update_class()
class SVDModel(RecommendSystemModel):
    def loss(self, groundTruthData, P: NDArray, Q: NDArray) -> float:
        # Woody
        squaredErrors = 0.0
        numOfPrediction = 0
        # nb_users, nb_items = len(data), len(data[0])

        for u in range(self.n_users):
            for i in range(self.n_items):
                if groundTruthData[u][i] > 0:
                    squaredErrors += pow(groundTruthData[u][i] - self.prediction(P, Q, u, i), 2)
                    numOfPrediction += 1
                    
        return squaredErrors / numOfPrediction
        print(654)
        # return super().loss(data, P, Q)

In [13]:
@update_class()
class SVDModel(RecommendSystemModel):
    def optimize(self, error:float, id_user:int, id_item:int,weight_decay):
        # Johnny
        # P[id_user] = self.optimizer.minimize(P[id_user], [error])
        # Q[id_item] = self.optimizer.minimize()
        # return super().svd()
        
        self._P[:, id_user] += self.lr * (error * self._Q[:, id_item] - weight_decay * self._P[:, id_user])
        self._Q[:, id_item] += self.lr * (error * self._P[:, id_user] - weight_decay * self._Q[:, id_item])

In [37]:
svd = SVDModel()
# svd.svd()
# svd.learn_to_recommend(2)

In [4]:
# define the paths
data_path = './data/MovieLens25M/'
df_rating = pd.read_csv(data_path+'ratings.csv')
df_movies = pd.read_csv(data_path+'movies.csv')
m_movies = df_movies['movieId'].nunique()
n_users = df_rating['userId'].nunique()
n_ratings = len(df_rating)

In [5]:
print(m_movies)
print(n_users)
print(df_rating)
print(df_movies)

62423
162541
          userId  movieId  rating   timestamp
0              1      296     5.0  1147880044
1              1      306     3.5  1147868817
2              1      307     5.0  1147868828
3              1      665     5.0  1147878820
4              1      899     3.5  1147868510
...          ...      ...     ...         ...
25000090  162541    50872     4.5  1240953372
25000091  162541    55768     2.5  1240951998
25000092  162541    56176     2.0  1240950697
25000093  162541    58559     4.0  1240953434
25000094  162541    63876     5.0  1240952515

[25000095 rows x 4 columns]
       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157              

In [6]:
df_movies['movieId'].max()

209171

In [7]:
df_rating['userId'].max()

162541

In [8]:
df_movies.insert(1, 'newMovieId', value=range(1,len(df_movies)+1))

In [9]:
newMovieIdDict = {}
for index, movie in df_movies.iterrows():
  newMovieIdDict[movie['movieId']] = movie['newMovieId']

In [10]:
newMovieIdDict[209159]

62420

In [102]:
df_rating.replace({"movieId": newMovieIdDict}, inplace=True)

: 

: 

In [11]:
for index, rating in df_rating.iterrows():
  if index % 100000 == 0: print('{}/{}'.format(index, len(df_rating)))
  rating['movieId'] = newMovieIdDict[rating['movieId']]

0/25000095
100000/25000095
200000/25000095
300000/25000095
400000/25000095
500000/25000095
600000/25000095
700000/25000095
800000/25000095
900000/25000095
1000000/25000095
1100000/25000095
1200000/25000095
1300000/25000095
1400000/25000095
1500000/25000095
1600000/25000095
1700000/25000095
1800000/25000095
1900000/25000095
2000000/25000095
2100000/25000095
2200000/25000095
2300000/25000095
2400000/25000095
2500000/25000095
2600000/25000095
2700000/25000095
2800000/25000095
2900000/25000095
3000000/25000095
3100000/25000095
3200000/25000095
3300000/25000095
3400000/25000095
3500000/25000095
3600000/25000095
3700000/25000095
3800000/25000095
3900000/25000095
4000000/25000095
4100000/25000095
4200000/25000095
4300000/25000095


KeyboardInterrupt: 

In [None]:
for index, movie in df_movies.iterrows():
  for index, rating in df_rating.iterrows():
    if rating['movieId'] == movie['movieId']:
      rating['movieId'] = index+1

In [62]:
for index, movie in df_movies.iterrows():
  c = df_rating['movieId'].eq(movie['movieId'])
  df_rating.loc[c,'movieId']=index+1

In [32]:
data = np.array(df_rating,dtype=int)
u3 = data[:, 1][data[:, 0] == 3]
print(len(u3))
print(u3)

656
[     1     29     32     50    111    172    173    214    260    293
    296    318    356    442    480    527    541    589    593    741
    745    778    780    858    912    913    924   1089   1127   1148
   1196   1198   1200   1201   1206   1208   1210   1213   1214   1217
   1221   1222   1240   1252   1270   1274   1320   1356   1527   1584
   1653   1676   1682   1732   1748   1882   1909   1960   2019   2021
   2105   2329   2542   2571   2628   2641   2858   2938   2959   3091
   3156   3285   3300   3301   3328   3408   3452   3471   3484   3527
   3534   3535   3578   3646   3702   3730   3744   3745   3751   3753
   3755   3785   3793   3798   3825   3827   3863   3879   3948   3968
   3969   3977   3979   3980   3986   3988   3991   3993   3994   3996
   3999   4011   4018   4019   4022   4023   4025   4027   4034   4069
   4148   4167   4226   4232   4234   4246   4262   4306   4308   4340
   4343   4344   4366   4367   4368   4369   4383   4388   4447   4448
  

In [38]:

df_rating['rating'] = df_rating['rating'].astype(np.int64)
df_rating.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype
---  ------     -----
 0   userId     int64
 1   movieId    int64
 2   rating     int64
 3   timestamp  int64
dtypes: int64(4)
memory usage: 762.9 MB


In [17]:
df_rating.empty

False

In [39]:
svd.data_loader(data=df_rating, n_items = m_movies, n_users=n_users)

In [40]:
print(svd.n_items)
print(svd.__dict__)
# svd.n_items = 10

62423
{'mode': 'funk', 'features': 10, 'lr': 0.0002, 'epochs': 101, 'weight_decay': 0.02, 'stopping': 0.001, 'momentum': 0.0, 'data':           userId  movieId  rating   timestamp
0              1      296       5  1147880044
1              1      306       3  1147868817
2              1      307       5  1147868828
3              1      665       5  1147878820
4              1      899       3  1147868510
...          ...      ...     ...         ...
25000090  162541    50872       4  1240953372
25000091  162541    55768       2  1240951998
25000092  162541    56176       2  1240950697
25000093  162541    58559       4  1240953434
25000094  162541    63876       5  1240952515

[25000095 rows x 4 columns], 'valid': None, 'train': None, 'test': None, 'n_users': 162541, 'n_items': 62423}


In [41]:
svd.split(0.6,0.8)

IndexError: index 62848 is out of bounds for axis 0 with size 62423

In [None]:
print(len(svd.test), len(svd.test[0]))

AttributeError: 'SVDModel' object has no attribute 'test'