In [1]:
from SVDModel import SVDModel
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

# Fix the seed of NumPy's global random generator so repeated runs of the
# notebook start from the same random state.
np.random.seed(3362)

In [13]:
# Notebook-wide configuration: dataset folders plus the candidate values of
# each hyper-parameter that the grid cell further down iterates over.
PARAM = {
  # Root folder of each rating dataset (relative to the notebook).
  'path': {
    'ml': './data/MovieLens25M/',
    'netflix': './data/NetflixPrize/',
  },
  'features': [5, 10, 50, 100, 150, 200, 250, 300],
  'lr': [0.1, 0.01, 0.001, 0.0001, 0.00001],
  'epochs': [11, 101, 201, 301, 501, 1001],
  'weight_decay': [0.02],
  'stopping': [0.001],
}

# Last expression of the cell: display the assembled configuration.
PARAM

{'path': {'ml': './data/MovieLens25M/', 'netflix': './data/NetflixPrize/'},
 'features': [5, 10, 50, 100, 150, 200, 250, 300],
 'lr': [0.1, 0.01, 0.001, 0.0001, 1e-05],
 'epochs': [11, 101, 201, 301],
 'weight_decay': [0.02],
 'stopping': [0.001]}

In [3]:
# MovieLens: a random sample of 2**16 ratings plus the complete movie catalogue.
ML_25M = {
  'ratings': pd.read_csv(PARAM['path']['ml']+'ratings.csv', dtype={
    'userId': np.int32,
    'movieId': np.int32,
    'rating': np.float64,
    'timestamp':np.float64}).sample(2**16),
  'movies': pd.read_csv(PARAM['path']['ml']+'movies.csv', dtype= {
    'movieId':np.int32,
    'title': str,
    'genres': str
  })
}
# m_movies counts distinct ids in the whole catalogue, whereas n_users and
# n_ratings are taken from the sampled ratings only.
ML_25M['m_movies'] = ML_25M['movies']['movieId'].nunique()
ML_25M['n_users'] = ML_25M['ratings']['userId'].nunique()
ML_25M['n_ratings'] = len(ML_25M['ratings'])


# Number the catalogue rows 1..len(movies) in a new 'newMovieId' column and
# build the original-id -> new-id lookup in one vectorised pass (no iterrows).
ML_25M['movies'].insert(1, 'newMovieId', value=range(1,len(ML_25M['movies'])+1))
newMovieIdDict = dict(zip(
  ML_25M['movies']['movieId'].tolist(),
  ML_25M['movies']['newMovieId'].tolist()
))

# Series.map performs one dictionary lookup per rating; DataFrame.replace with
# a dictionary holding one key per catalogue movie is far slower.
# Ids missing from the catalogue come back as NaN from map, so they are filled
# with their original value (the same outcome replace gave) before casting the
# column back to the int32 dtype it was read with.
original_movie_ids = ML_25M['ratings']['movieId']
ML_25M['ratings']['movieId'] = (
  original_movie_ids.map(newMovieIdDict)
  .fillna(original_movie_ids)
  .astype(np.int32)
)
ML_25M['ratings']['rating'] = ML_25M['ratings']['rating'].astype(np.float16)

ML_25M

{'ratings':           userId  movieId  rating     timestamp
 15290174   99074     7953     3.5  1.255155e+09
 12937849   83777    23250     4.5  1.482952e+09
 13923369   90257    11857     5.0  1.535852e+09
 22119882  143800     1266     3.0  9.961158e+08
 12257633   79385      504     4.0  1.081312e+09
 ...          ...      ...     ...           ...
 3370359    22263      904     4.5  1.347882e+09
 21814823  141830     1708     5.0  1.548413e+09
 12732594   82334     1071     4.0  8.909923e+08
 16051202  104038     5184     5.0  1.142117e+09
 22732412  147792    11729     4.5  1.488756e+09
 
 [65536 rows x 4 columns],
 'movies':        movieId  newMovieId                               title  \
 0            1           1                    Toy Story (1995)   
 1            2           2                      Jumanji (1995)   
 2            3           3             Grumpier Old Men (1995)   
 3            4           4            Waiting to Exhale (1995)   
 4            5           5

In [4]:
# Report the element count (DataFrame.size) of the MovieLens ratings sample
# and of the movie catalogue, in that order.
for frame_key in ('ratings', 'movies'):
  print(ML_25M[frame_key].size)

262144
249692


In [11]:
# Column layout applied to each of the four Netflix Prize rating files.
names = ['userId','rating','timestamp','movieId']
# Read data_1.txt .. data_4.txt in a loop and stack them. Row labels of each
# part are kept as read (no ignore_index), exactly as before.
df_netflix = pd.concat([
  pd.read_csv(PARAM['path']['netflix']+'data_'+str(part)+'.txt', names=names, low_memory=False)
  for part in range(1, 5)
])
# Carry the last seen movieId forward onto rows where it is missing, then drop
# every row that still has a missing field. `.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
df_netflix['movieId'] = df_netflix['movieId'].ffill()
df_netflix = df_netflix.dropna()
df_netflix['rating'] = df_netflix['rating'].astype(np.float64)
df_netflix['movieId'] = df_netflix['movieId'].astype(np.int16)
df_netflix['userId'] = df_netflix['userId'].astype(np.int32)
# Whole seconds since the Unix epoch. Subtracting the epoch and floor-dividing
# by one second does not depend on the datetime resolution pandas picks, unlike
# reinterpreting the values as int64 nanoseconds with the deprecated Series.view.
# NOTE(review): assumes the parsed dates are timezone-naive - confirm against the data files.
df_netflix['timestamp'] = (
  pd.to_datetime(df_netflix['timestamp']) - pd.Timestamp('1970-01-01')
) // pd.Timedelta(seconds=1)

# 'ratings' is a random sample of 2**16 rows; the three counters below are
# computed on the full cleaned frame, not on that sample.
NETFLIX = {
  'ratings': df_netflix[['userId','movieId','rating','timestamp']].sample(2**16),
  'movies': pd.read_csv(PARAM['path']['netflix']+'movie_titles.csv',encoding='latin-1',names=['movieId','release_year','title'],usecols=range(3)),
  'm_movies': df_netflix['movieId'].nunique(),
  'n_users': df_netflix['userId'].nunique(),
  'n_ratings': len(df_netflix)
}

In [12]:
# Report the element count (DataFrame.size) of the Netflix ratings sample
# and of the movie-title table, in that order.
for frame_key in ('ratings', 'movies'):
  print(NETFLIX[frame_key].size)

262144
53310


In [20]:
# Full hyper-parameter grid: one [features, lr, epochs, weight_decay, stopping]
# list per combination, with 'features' varying slowest and 'stopping' fastest.
rows = [
  [features, lr, epochs, weight_decay, stopping]
  for features in PARAM['features']
  for lr in PARAM['lr']
  for epochs in PARAM['epochs']
  for weight_decay in PARAM['weight_decay']
  for stopping in PARAM['stopping']
]

[[5, 0.1, 11, 0.02, 0.001],
 [5, 0.1, 101, 0.02, 0.001],
 [5, 0.1, 201, 0.02, 0.001],
 [5, 0.1, 301, 0.02, 0.001],
 [5, 0.01, 11, 0.02, 0.001],
 [5, 0.01, 101, 0.02, 0.001],
 [5, 0.01, 201, 0.02, 0.001],
 [5, 0.01, 301, 0.02, 0.001],
 [5, 0.001, 11, 0.02, 0.001],
 [5, 0.001, 101, 0.02, 0.001],
 [5, 0.001, 201, 0.02, 0.001],
 [5, 0.001, 301, 0.02, 0.001],
 [5, 0.0001, 11, 0.02, 0.001],
 [5, 0.0001, 101, 0.02, 0.001],
 [5, 0.0001, 201, 0.02, 0.001],
 [5, 0.0001, 301, 0.02, 0.001],
 [5, 1e-05, 11, 0.02, 0.001],
 [5, 1e-05, 101, 0.02, 0.001],
 [5, 1e-05, 201, 0.02, 0.001],
 [5, 1e-05, 301, 0.02, 0.001],
 [10, 0.1, 11, 0.02, 0.001],
 [10, 0.1, 101, 0.02, 0.001],
 [10, 0.1, 201, 0.02, 0.001],
 [10, 0.1, 301, 0.02, 0.001],
 [10, 0.01, 11, 0.02, 0.001],
 [10, 0.01, 101, 0.02, 0.001],
 [10, 0.01, 201, 0.02, 0.001],
 [10, 0.01, 301, 0.02, 0.001],
 [10, 0.001, 11, 0.02, 0.001],
 [10, 0.001, 101, 0.02, 0.001],
 [10, 0.001, 201, 0.02, 0.001],
 [10, 0.001, 301, 0.02, 0.001],
 [10, 0.0001, 11, 0.02, 