## Neural Collaborative Filtering
* Pytorch version
* rating prediction (korean movie recommendation dataset)

In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

import math
from torch import nn, optim
import torch
import torch.nn.functional as F
from torch.autograd import Variable

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
path = './kmrd/kmr_dataset/datafile/kmrd-small/'
df = pd.read_csv(path + 'rates.csv')
df.columns = ['userId','movieId','rating','time']
df.head()

Unnamed: 0,userId,movieId,rating,time
0,0,10003,7,1494128040
1,0,10004,7,1467529800
2,0,10018,9,1513344120
3,0,10021,9,1424497980
4,0,10022,7,1427627340


In [3]:
train_df , val_df = train_test_split(df, test_size=0.2, random_state=123)

In [4]:
print(train_df.shape)
print(val_df.shape)

(112568, 4)
(28142, 4)


In [5]:
train_df['rating'].describe()

count    112568.000000
mean          8.952820
std           2.103876
min           1.000000
25%           9.000000
50%          10.000000
75%          10.000000
max          10.000000
Name: rating, dtype: float64

In [6]:
movies_df = pd.read_csv(path+'movie_info.csv')
movies_df.head()

Unnamed: 0,movie,title,title_eng,year,grade,genres,people
0,10001,시네마 천국,"Cinema Paradiso , 1988",2013.0,전체 관람가,드라마/멜로/로맨스,[ 4374 178 3241 47952 47953 19538 18991 479...
1,10002,빽 투 더 퓨쳐,"Back To The Future , 1985",2015.0,12세 관람가,SF/코미디,[1076 4603 917 8637 5104 9986 7470 9987]
2,10003,빽 투 더 퓨쳐 2,"Back To The Future Part 2 , 1989",2015.0,12세 관람가,SF/코미디,[1076 4603 917 5104 391 5106 5105 5107 580 ...
3,10004,빽 투 더 퓨쳐 3,"Back To The Future Part III , 1990",1990.0,전체 관람가,서부/SF/판타지/코미디,[ 1076 4603 1031 5104 10001 5984 10002 100...
4,10005,스타워즈 에피소드 4 - 새로운 희망,"Star Wars , 1977",1997.0,PG,판타지/모험/SF/액션,[1007 535 215 1236 35]


In [7]:
# Define DataLoader
def read_data(data_path):
    df = pd.read_csv(os.path.join(data_path,'rates.csv'))
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=1234, shuffle=True)
    return train_df, val_df


class DatasetLoader:
    def __init__(self, data_path):
        self.train_df, val_temp_df = read_data(data_path)

        self.min_rating = min(self.train_df.rate)
        self.max_rating = self.train_df.rate.max()

        self.unique_users = self.train_df.user.unique()
        self.num_users = len(self.unique_users)
        self.user_to_index = {original: idx for idx, original in enumerate(self.unique_users)}
        # 0 1 0 0 0 ... 0

        self.unique_movies = self.train_df.movie.unique()
        self.num_movies = len(self.unique_movies)
        self.movie_to_index = {original: idx for idx, original in enumerate(self.unique_movies)}

        self.val_df = val_temp_df[val_temp_df.user.isin(self.unique_users) & val_temp_df.movie.isin(self.unique_movies)]

    def generate_trainset(self):
        # user 0, 0, 0, 1,2, 3,3, -> movie: 0,0,0,0,0,0,
        X_train = pd.DataFrame({'user': self.train_df.user.map(self.user_to_index),
                     'movie': self.train_df.movie.map(self.movie_to_index)})
        y_train = self.train_df['rate'].astype(np.float32)

        return X_train, y_train

    def generate_valset(self):
        X_val = pd.DataFrame({'user': self.val_df.user.map(self.user_to_index),
                              'movie': self.val_df.movie.map(self.movie_to_index)})
        y_val = self.val_df['rate'].astype(np.float32)
        return X_val, y_val

* User Vector : 전체 영화 데이터에서 영화를 평가한 userid를 onehot vector로 나타낸 형태
* Item Vector : 전체 영화 데이터에 등장하는 영화의 id를 onehot vector로 나타낸 형태

In [8]:
class FeedForwardEmbedNN(nn.Module):

    def __init__(self, n_users, n_movies, hidden, dropouts, n_factors, embedding_dropout):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.movie_emb = nn.Embedding(n_movies, n_factors)
        self.drop = nn.Dropout(embedding_dropout)
        self.hidden_layers = nn.Sequential(*list(self.generate_layers(n_factors*2, hidden, dropouts)))
        self.fc = nn.Linear(hidden[-1], 1)

    def generate_layers(self, n_factors, hidden, dropouts):
        assert len(dropouts) == len(hidden)

        idx = 0
        while idx < len(hidden):
            if idx == 0:
                yield nn.Linear(n_factors, hidden[idx])
            else:
                yield nn.Linear(hidden[idx-1], hidden[idx])
            yield nn.ReLU()
            yield nn.Dropout(dropouts[idx])

            idx += 1

    def forward(self, users, movies, min_rating=0.5, max_rating=5):
        concat_features = torch.cat([self.user_emb(users), self.movie_emb(movies)], dim=1)
        x = F.relu(self.hidden_layers(concat_features))
        # 0과 1사이의 숫자로 나타낸다
        out = torch.sigmoid(self.fc(x))
        # rating으로 변환한다
        out = (out * (max_rating - min_rating)) + min_rating

        return out

    def predict(self, users, movies):
        # return the score
        output_scores = self.forward(users, movies)
        return output_scores

In [9]:
class BatchIterator:

    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)

        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_batches = int(math.ceil(X.shape[0] // batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k * bs:(k + 1) * bs], self.y[k * bs:(k + 1) * bs]

In [10]:
def batches(X, y, bs=32, shuffle=True):
    for x_batch, y_batch in BatchIterator(X, y, bs, shuffle):
        x_batch = torch.LongTensor(x_batch)
        y_batch = torch.FloatTensor(y_batch)
        yield x_batch, y_batch.view(-1, 1)

* 데이터셋과 모델 학습에 필요한 configuration을 입력하고, 학습을 하는 함수 정의
    * configuration을 바꾸면서 모델의 성능을 측정해볼 수 있다.

In [11]:
def model_train(ds, config):
    device = torch.device('mps' if torch.mps.is_available() else 'cpu')

    X_train, y_train = ds.generate_trainset()
    X_valid, y_valid = ds.generate_valset()
    print(f'TrainSet Info: {ds.num_users} users, {ds.num_movies} movies')

    model = FeedForwardEmbedNN(
        n_users=ds.num_users, n_movies=ds.num_movies,
        n_factors=config['num_factors'], hidden=config['hidden_layers'],
        embedding_dropout=config['embedding_dropout'], dropouts=config['dropouts']
    )
    model.to(device)

    batch_size = config['batch_size']
    num_epochs = config['num_epochs']
    max_patience = config['total_patience']
    num_patience = 0
    best_loss = np.inf

    criterion = nn.MSELoss(reduction='sum')
    criterion.to(device)
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    result = dict()
    for epoch in tqdm(range(num_epochs)):
        training_loss = 0.0
        for batch in batches(X_train, y_train, shuffle=True, bs=batch_size):
            x_batch, y_batch = [b.to(device) for b in batch]
            optimizer.zero_grad()
            # with torch.no_grad() 와 동일한 syntax 입니다
            with torch.set_grad_enabled(True):
                outputs = model(x_batch[:, 0], x_batch[:, 1], ds.min_rating, ds.max_rating)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
            training_loss += loss.item()
        result['train'] = training_loss / len(X_train)

        # Apply Early Stopping criteria and save best model params
        val_outputs = model(torch.LongTensor(X_valid.user.values).to(device),
                            torch.LongTensor(X_valid.movie.values).to(device), ds.min_rating, ds.max_rating)
        val_loss = criterion(val_outputs.to(device), torch.FloatTensor(y_valid.values).view(-1, 1).to(device))
        result['val'] = float((val_loss / len(X_valid)).data)

        if val_loss < best_loss:
            print('Save new model on epoch: %d' % (epoch + 1))
            best_loss = val_loss
            result['best_loss'] = val_loss
            torch.save(model.state_dict(), config['save_path'])
            num_patience = 0
        else:
            num_patience += 1

        print(f'[epoch: {epoch+1}] train: {result["train"]} - val: {result["val"]}')

        if num_patience >= max_patience:
            print(f"Early Stopped after epoch {epoch+1}")
            break

    return result

In [15]:
config = {
  "num_factors": 50,
  "hidden_layers": [64, 32, 16],
  "embedding_dropout": 0.05,
  "dropouts": [0.3, 0.3, 0.3],
  "learning_rate": 1e-3,
  "weight_decay": 1e-5,
  "batch_size": 64,
  "num_epochs": 12,
  "total_patience": 5,
  "save_path": "params.data"
}

In [16]:
def model_valid(user_id_list, movie_id_list, data_path):
    dataset = DatasetLoader(data_path)
    processed_test_input_df = pd.DataFrame({
        'user_id': [dataset.user_to_index[x] for x in user_id_list],
        'movie_id': [dataset.movie_to_index[x] for x in movie_id_list]
    })

    # 학습한 모델 load하기 
    my_model = FeedForwardEmbedNN(dataset.num_users, dataset.num_movies,
                       config['hidden_layers'], config['dropouts'], config['num_factors'], config['embedding_dropout'])
    my_model.load_state_dict(torch.load('params.data'))
    prediction_outputs = my_model.predict(users=torch.LongTensor(processed_test_input_df.user_id.values),
                     movies=torch.LongTensor(processed_test_input_df.movie_id.values))

    return prediction_outputs

In [17]:
dataset = DatasetLoader(path)

In [18]:
model_train(dataset, config)

TrainSet Info: 44453 users, 597 movies


  8%|▊         | 1/12 [00:11<02:01, 11.01s/it]

Save new model on epoch: 1
[epoch: 1] train: 4.5949572042042695 - val: 4.1170125007629395


 17%|█▋        | 2/12 [00:21<01:47, 10.74s/it]

Save new model on epoch: 2
[epoch: 2] train: 3.977288518139366 - val: 3.7995035648345947


 25%|██▌       | 3/12 [00:32<01:37, 10.82s/it]

Save new model on epoch: 3
[epoch: 3] train: 3.5550281194454145 - val: 3.6596181392669678


 33%|███▎      | 4/12 [00:42<01:25, 10.70s/it]

Save new model on epoch: 4
[epoch: 4] train: 3.195357079072494 - val: 3.6428141593933105


 42%|████▏     | 5/12 [00:53<01:13, 10.49s/it]

Save new model on epoch: 5
[epoch: 5] train: 2.859014451397854 - val: 3.6325838565826416


 50%|█████     | 6/12 [01:03<01:01, 10.29s/it]

[epoch: 6] train: 2.6054250454396026 - val: 3.681940793991089


 58%|█████▊    | 7/12 [01:12<00:50, 10.16s/it]

[epoch: 7] train: 2.4314276158500436 - val: 3.7278892993927


 67%|██████▋   | 8/12 [01:22<00:40, 10.07s/it]

[epoch: 8] train: 2.2812083346800853 - val: 3.727844476699829


 75%|███████▌  | 9/12 [01:32<00:30, 10.01s/it]

[epoch: 9] train: 2.170528189153714 - val: 3.731149911880493


 75%|███████▌  | 9/12 [01:42<00:34, 11.38s/it]

[epoch: 10] train: 2.1009224397644255 - val: 3.7462754249572754
Early Stopped after epoch 10





{'train': 2.1009224397644255,
 'val': 3.7462754249572754,
 'best_loss': tensor(73581.6172, device='mps:0', grad_fn=<MseLossBackward0>)}

In [19]:
val_df.head()

Unnamed: 0,userId,movieId,rating,time
86381,14566,10213,10,1082734020
78663,12048,10113,10,1225357740
22101,1477,10181,10,1108375800
5615,170,10142,10,1190847960
53717,5803,10341,10,1309855740


In [43]:
user_id = 105 # sample user

In [44]:
pd.merge(
    train_df.loc[train_df['userId']==user_id].sort_values(by='rating', ascending=False), 
movies_df.rename(columns={'movie':'movieId'}), on='movieId',how='left'
)[['title','year','genres']].dropna().head(20)

Unnamed: 0,title,year,genres
0,바람과 함께 사라지다,1995.0,멜로/로맨스/드라마/전쟁
1,내일을 향해 쏴라,1970.0,드라마/서부/범죄
2,아마데우스,2015.0,드라마
3,델마와 루이스,2017.0,드라마
4,대부,2010.0,드라마/스릴러/범죄
5,티파니에서 아침을,2012.0,멜로/로맨스/드라마
6,싸이코,1962.0,스릴러/공포/미스터리
7,사운드 오브 뮤직,2017.0,멜로/로맨스/뮤지컬/드라마
8,베를린 천사의 시,1993.0,멜로/로맨스/판타지/드라마
9,아마데우스,2015.0,드라마


In [45]:
watched_movieId = train_df.loc[train_df['userId']==user_id,'movieId'].unique()
nonwatched_movieId = train_df.loc[~train_df['movieId'].isin(watched_movieId),'movieId'].unique()
len(nonwatched_movieId)

541

In [46]:
watched_movieId = train_df.loc[train_df['userId']==user_id,'movieId'].unique()
movie_id_list = val_df.loc[~val_df['movieId'].isin(watched_movieId),'movieId'].unique().tolist()
user_id_list = [user_id] * len(movie_id_list)
pred_results = [float(x) for x in model_valid(user_id_list, movie_id_list, path)]

result_df = pd.DataFrame({
    'userId': user_id_list,
    'movieId': movie_id_list,
    'movieName': movies_df.loc[movies_df['movie'].isin(movie_id_list),'title'].values,
    'genres': movies_df.loc[movies_df['movie'].isin(movie_id_list),'genres'].values,
    'pred_ratings': pred_results
})

result_df.sort_values(by='pred_ratings', ascending=False).head(20)

Unnamed: 0,userId,movieId,movieName,genres,pred_ratings
116,105,10301,007 황금총을 가진 사나이,액션,4.65966
53,105,10940,영자의 전성시대,멜로/로맨스/드라마,4.522726
41,105,10033,라 밤바,드라마,4.512043
450,105,10152,아파치,서부,4.458166
17,105,10173,록키 2,드라마/액션,4.453176
470,105,10462,나일 살인사건,미스터리/범죄,4.43723
47,105,10629,프레데터,SF/액션/스릴러/공포,4.409897
315,105,10822,블랙 레인,드라마/액션/범죄,4.398178
86,105,10767,강시번생,코미디/공포,4.392677
144,105,10185,터미네이터 2:오리지널,SF/액션/스릴러,4.353568
