In [2]:
from SVDModel import SVDModel
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
from joblib import Parallel, delayed
import pickle
import os.path
from pathlib import Path

# Seed NumPy's global RNG so random draws made later in this notebook repeat across runs.
np.random.seed(3362)

In [3]:
# Notebook-wide settings. 'path' maps a dataset key to the folder holding its files.
PARAM = {
    'path': {
        'ml': './data/MovieLens25M/',
        'netflix': './data/NetflixPrize/',
    },
}

PARAM

{'path': {'ml': './data/MovieLens25M/', 'netflix': './data/NetflixPrize/'}}

In [4]:
# Read the four Netflix Prize rating files (data_1.txt .. data_4.txt) and stack them.
# Row labels are kept as read, so they repeat across files.
names = ['userId','rating','timestamp','movieId']
netflix_dir = PARAM['path']['netflix']
df_netflix = pd.concat(
    [pd.read_csv(f'{netflix_dir}data_{part}.txt', names=names, low_memory=False)
     for part in range(1, 5)]
)

# Clean-up, in order:
#   1. forward-fill movieId down the frame,
#   2. drop every row that still has a missing field,
#   3. downcast the id/rating columns to compact integer dtypes,
#   4. turn the date column into whole seconds since 1970-01-01.
# Step 4 subtracts the epoch and floor-divides by one second, which does not
# depend on the datetime resolution pandas picks when parsing.
df_netflix = (
    df_netflix
    .assign(movieId=lambda frame: frame['movieId'].ffill())
    .dropna()
    .astype({'rating': np.int8, 'movieId': np.int16, 'userId': np.int32})
    .assign(
        timestamp=lambda frame: (
            (pd.to_datetime(frame['timestamp']) - pd.Timestamp('1970-01-01'))
            // pd.Timedelta('1s')
        )
    )
)

# Title lookup table; only the first three CSV columns are read.
df_movies = pd.read_csv(PARAM['path']['netflix']+'movie_titles.csv',
                        encoding='latin-1',
                        names=['movieId','release_year','title'],
                        usecols=range(3))

NETFLIX = {
  # Random 2**16-row subsample. No random_state is passed, so the draw comes
  # from NumPy's global RNG.
  'ratings': df_netflix[['userId','movieId','rating','timestamp']].sample(2**16),
  'movies': df_movies,
  # The three counts below describe the full cleaned frame, not the subsample.
  'm_movies': df_netflix['movieId'].nunique(),
  'n_users': df_netflix['userId'].nunique(),
  'n_ratings': len(df_netflix)
}


In [5]:
# Show the shape of the sampled ratings frame, then of the movie-title table.
for table in ('ratings', 'movies'):
    print(NETFLIX[table].shape)

(65536, 4)
(17770, 3)


In [6]:
# Hyper-parameter values explored by the grid search below.
PARAM.update({
    'features': [3,30,150,300],
    'lr': [1e-2,1e-3,1e-4],
    'epochs': [11,101,201,301,501],
    'weight_decay': [0.02,0.2,2],
    'stopping': 0.001,
})

# Dataset switch: 'ML_25M' selects MovieLens, anything else selects Netflix.
data = 'NETFLIX'
resultDir = 'model_movielens' if data=='ML_25M' else 'model_netflix'

print('Creating SVD Model')
svd = SVDModel()
print("Created\nLoading Data")
# NOTE(review): ML_25M is not defined in the cells shown here; with
# data = 'NETFLIX' only the NETFLIX dict is ever looked up.
dataset = ML_25M if data=='ML_25M' else NETFLIX
svd.data_loader(data=dataset['ratings'],
                n_items=dataset['m_movies'],
                n_users=dataset['n_users'])
print("Loaded Data\nSpliting Data")
# NOTE(review): the meaning of the two 0.8 arguments is defined inside SVDModel.split - confirm there.
svd.split(0.8,0.8)

def train_model(features, lr, epochs, data, weight_decay):
    """Train the shared ``svd`` model for one hyper-parameter combination and pickle the result.

    The run is skipped when its result file already exists, so a grid search
    that was interrupted can be restarted without repeating finished runs.

    Parameters
    ----------
    features, lr, epochs, weight_decay
        Assigned to the attributes of the same name on the module-level
        ``svd`` object before ``svd.training()`` is called. They also form
        the result file name.
    data
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    tuple or None
        ``(svd, result)`` after a training run, where ``result`` is whatever
        ``svd.training()`` returned. ``None`` when the run was skipped because
        its result file already exists.
    """
    # NOTE(review): the "0.001" stopping value in the file name and in the
    # banner is hard-coded; nothing in this function passes a stopping
    # threshold to svd - confirm it matches what SVDModel actually uses.
    out_root = Path(f'./{resultDir}/funk')
    result_file = out_root / 'result' / f'{features}_{lr}_{epochs}_{weight_decay}_0.001.pkl'

    if os.path.isfile(result_file):
        return None

    print("==============================")
    print(f'features: {features}\nlearning rate: {lr}\nEpochs: {epochs}\nweight decay: {weight_decay}\nstopping: 0.001\n')

    # The model object is shared across calls; only these four attributes are reset per run.
    svd.features = features
    svd.lr = lr
    svd.epochs = epochs
    svd.weight_decay = weight_decay

    print("Splitted Data\nTraining...")
    start = time.perf_counter()
    result = svd.training()
    print('Time used =', time.perf_counter()-start)

    print('Saving Model')
    # parents=True creates the intermediate folders as well. The model folder
    # is created even though only the training result is pickled here.
    (out_root / 'model').mkdir(parents=True, exist_ok=True)
    result_file.parent.mkdir(parents=True, exist_ok=True)

    # The with-block closes the file; no explicit close() is needed.
    with open(result_file, 'wb') as out:
        pickle.dump(result, out, pickle.HIGHEST_PROTOCOL)
    print("==============================")
    return svd, result

Creating SVD Model
Created
Loading Data
Loaded Data
Spliting Data
User Item Matrix Shape: userItemMatrix.shape
User Reference length: self.n_users
Item Reference length: self.n_items


In [None]:
# Walk the full hyper-parameter grid lazily. Epochs is the slowest-varying
# axis, so configurations are visited in the order the epoch counts are
# listed in PARAM['epochs'].
grid = (
    (n_feat, rate, n_epochs, dataset_name, decay)
    for n_epochs in PARAM['epochs']
    for n_feat in PARAM['features']
    for rate in PARAM['lr']
    for decay in PARAM['weight_decay']
    for dataset_name in ['NETFLIX']
)
for features, lr, epochs, data, weight_decay in grid:
    train_model(features, lr, epochs, data, weight_decay)