In [1]:
import os
import math
import time
import sys
import random
import warnings 
from torch.nn.modules.loss import _WeightedLoss
from pickle import dump, load

import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
import scipy as sp
import numpy as np
import pandas as pd
import RecDataset
import RecTestDataset

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau


# Suppress every library warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
import matplotlib.pyplot as plt

In [3]:
class CFG:
    """Run settings and hyper-parameters read by the training and inference cells."""

    # Reproducibility
    seed = 2019

    # Data loading
    num_workers = 4
    batch_size = 32

    # Model
    embedding_size = 64

    # Optimisation
    epochs = 10
    lr = 1e-4
    weight_decay = 1e-6
    gradient_accumulation_steps = 1

    # Learning-rate schedule
    scheduler = 'CosineAnnealingWarmRestarts'
    T_0 = 10
    min_lr = 1e-6

In [4]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's hash seed environment variable, the `random` module,
    NumPy's global generator and PyTorch (CPU and CUDA), and puts cuDNN into
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these takes the seed as its only argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(seed=2019)

In [5]:
# Load the ratings table (columns: userId, movieId, rating, timestamp).
# NOTE(review): absolute, machine-specific Windows path — the notebook will not
# run on another machine without editing it.
train = pd.read_csv(r'D:\Dataset\recommendation\ml-latest-small\ratings.csv')

In [6]:
# Load the movie metadata table; it is joined on movieId later to attach
# title and genres to the displayed results.
# NOTE(review): absolute, machine-specific Windows path — see the ratings load.
movies = pd.read_csv(r'D:\Dataset\recommendation\ml-latest-small\movies.csv')

In [7]:
# Peek at the first rows to confirm the columns loaded as expected.
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# (rows, columns) of the ratings frame.
train.shape

(100836, 4)

In [9]:
# Map raw user / movie ids onto contiguous integer indices (0 .. n_classes - 1)
# so they can be used directly as row indices into the embedding tables.
user_lbl = preprocessing.LabelEncoder()
movie_lbl = preprocessing.LabelEncoder()

# NOTE: this overwrites the id columns of `train` in place. Re-running the cell
# on the already-encoded frame would refit the encoders on encoded values and
# lose the mapping back to the raw ids — reload `train` before re-running.
train.userId = user_lbl.fit_transform(train.userId.values)
train.movieId = movie_lbl.fit_transform(train.movieId.values)

# Persist the fitted encoders. Context managers guarantee the pickle files are
# flushed and closed instead of leaking the handles until garbage collection.
with open('user_lbl.pkl', 'wb') as fh:
    dump(user_lbl, fh)
with open('movie_lbl.pkl', 'wb') as fh:
    dump(movie_lbl, fh)

In [10]:
class RecSysModel(nn.Module):
    """Embedding-based rating predictor for (user, movie) pairs.

    Each user and movie has a learned embedding vector and a learned scalar
    bias. The two embeddings and two biases are concatenated and passed through
    a single linear layer; both biases are then added to that layer's output.

    Args:
        num_user: Number of distinct user indices (rows of the user tables).
        num_movies: Number of distinct movie indices (rows of the movie tables).
        embedding_size: Width of each user / movie embedding vector.
    """

    def __init__(self, num_user, num_movies, embedding_size):
        super().__init__()
        self.user_embed = nn.Embedding(num_user, embedding_size)
        self.movie_embed = nn.Embedding(num_movies, embedding_size)
        self.user_bias = nn.Embedding(num_user, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)
        # Head input = two embedding vectors + two scalar biases. Deriving the
        # width from `embedding_size` (130 for the default 64) keeps the layer
        # consistent with `forward` for any embedding width.
        self.out = nn.Linear(2 * embedding_size + 2, 1)

    def forward(self, user_id, movie_id):
        """Predict a rating for each (user, movie) index pair.

        Args:
            user_id: 1-D integer tensor of user indices.
            movie_id: 1-D integer tensor of movie indices, same length.

        Returns:
            Tensor of shape (batch, 1) with the predicted ratings.
        """
        user_embeds = self.user_embed(user_id)
        movie_embeds = self.movie_embed(movie_id)
        user_bias = self.user_bias(user_id)
        movie_bias = self.movie_bias(movie_id)
        x = torch.cat([user_embeds, movie_embeds, user_bias, movie_bias], dim=1)
        x = self.out(x)
        # Biases feed the linear head above and are also added directly here.
        output = x + user_bias + movie_bias

        return output

In [11]:
def train_fn(model, train_dataloader, optimizer, scheduler, loss_fn=None, fp16=False):
    """Run one training pass over `train_dataloader`.

    Args:
        model: Module called as `model(user, movie)`.
        train_dataloader: Iterable yielding `(user, movie, ratings)` batches.
        optimizer: Optimizer updating `model`'s parameters.
        scheduler: LR scheduler; stepped once per optimizer update.
        loss_fn: Callable `loss_fn(output, target)`. Must be provided even
            though the default is None.
        fp16: When True, run the forward pass under CUDA autocast and use a
            gradient scaler; when False, train in full precision.

    Returns:
        Mean of the per-batch loss values, taken before the loss is divided
        for gradient accumulation.
    """
    model.train()

    # Mixed precision is opt-in: both the scaler and autocast follow `fp16`.
    # Previously autocast was always on, so fp16=False still ran the forward
    # pass in reduced precision but back-propagated without loss scaling.
    scaler = torch.cuda.amp.GradScaler(enabled=fp16)

    train_loss = 0

    for step, (user, movie, ratings) in enumerate(train_dataloader):
        user, movie, ratings = user.to(device), movie.to(device), ratings.to(device)
        with torch.cuda.amp.autocast(enabled=fp16):
            output = model(user, movie)
            # Targets are reshaped to a column to match the model output.
            loss = loss_fn(output, ratings.view(-1, 1))

        train_loss += loss.item()

        if CFG.gradient_accumulation_steps > 1:
            # Average the gradient over the accumulated micro-batches.
            loss = loss / CFG.gradient_accumulation_steps

        if fp16:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        # Only update weights once every `gradient_accumulation_steps` batches.
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            if fp16:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()

            optimizer.zero_grad()

    return train_loss / len(train_dataloader)


def valid_fn(model, valid_dataloader, loss_fn=None):
    """Evaluate `model` on a validation loader without tracking gradients.

    Args:
        model: Module called as `model(user, movie)`.
        valid_dataloader: Iterable yielding `(user, movie, ratings)` batches.
        loss_fn: Callable `loss_fn(output, target)`; must be provided.

    Returns:
        Tuple `(mean_loss, predictions)`: the mean of the per-batch losses and
        the model outputs of all batches concatenated along the first axis.
    """
    model.eval()

    batch_outputs = []
    running_loss = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            user, movie, ratings = (tensor.to(device) for tensor in batch)
            output = model(user, movie)
            # Targets are reshaped to a column to match the model output.
            running_loss += loss_fn(output, ratings.view(-1, 1)).item()
            batch_outputs.append(output.detach().cpu().numpy())

    mean_loss = running_loss / len(valid_dataloader)
    return mean_loss, np.concatenate(batch_outputs)


def inference_fn(model, dataloader):
    """Run `model` over a loader of `(user, movie)` batches and collect outputs.

    Args:
        model: Module called as `model(user, movie)`.
        dataloader: Iterable yielding `(user, movie)` batches.

    Returns:
        NumPy array with the squeezed model outputs of every batch,
        concatenated in loader order. An empty loader yields an empty
        float32 array.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for user, movie in dataloader:
            user, movie = user.to(device), movie.to(device)
            output = model(user, movie)
            # squeeze() collapses a one-sample batch to a 0-d array, which
            # np.concatenate rejects; atleast_1d keeps such a batch at length 1.
            batch_preds = np.atleast_1d(output.squeeze().detach().cpu().numpy())
            predictions.append(batch_preds)

    if not predictions:
        # Nothing to score: return an empty result instead of letting
        # np.concatenate raise on an empty list.
        return np.empty(0, dtype=np.float32)

    return np.concatenate(predictions)


In [12]:
def rmse_score(predictions, targets):
    """Root-mean-square error between predictions and targets.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of true values.

    Returns:
        The RMSE as a NumPy scalar.

    When the two inputs hold the same number of elements but differ in shape
    (e.g. an (N, 1) column of model outputs against an (N,) vector of targets),
    both are flattened first. Without this, NumPy would broadcast the
    difference to (N, N) and return a meaningless score.
    """
    predictions = np.asarray(predictions)
    targets = np.asarray(targets)

    if predictions.shape != targets.shape and predictions.size == targets.size:
        predictions = predictions.reshape(-1)
        targets = targets.reshape(-1)

    return np.sqrt(((predictions - targets) ** 2).mean())

In [13]:
def train_model(df, seed=42):
    """Train `RecSysModel` on a ratings frame and checkpoint the best epoch.

    The frame is split 90/10 into train/validation sets, stratified on the
    rating value. After every epoch the validation loss is compared with the
    best value seen so far, and the model weights are written to
    'RecSysModel.pth' only when it improves.

    Args:
        df: DataFrame with `userId`, `movieId` and `rating` columns. The id
            columns are expected to be label-encoded already, since the model
            is sized from `user_lbl` / `movie_lbl` — verify against the caller.
        seed: Seed passed to `seed_everything`. The train/validation split
            itself uses `CFG.seed`.
    """
    best_loss = np.inf

    seed_everything(seed)

    df_train, df_valid = train_test_split(
        df,
        test_size=0.1,
        stratify=df.rating.values,
        random_state=CFG.seed
    )

    train_dataset = RecDataset.RecSysDataset(
        usr_id=df_train.userId.values,
        mov_id=df_train.movieId.values,
        ratings=df_train.rating.values
    )

    valid_dataset = RecDataset.RecSysDataset(
        usr_id=df_valid.userId.values,
        mov_id=df_valid.movieId.values,
        ratings=df_valid.rating.values
    )

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    model = RecSysModel(len(user_lbl.classes_), len(movie_lbl.classes_), CFG.embedding_size)
    model.to(device)

    criterion = nn.MSELoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # Schedule settings come from CFG rather than duplicated literals (the
    # values are unchanged). train_fn steps the scheduler once per optimizer
    # update, so T_0 is counted in updates rather than epochs.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1
    )

    for epoch in range(CFG.epochs):
        train_loss = train_fn(model, train_loader, optimizer, scheduler, loss_fn=criterion, fp16=True)
        # The per-sample predictions are not needed here. The former unused
        # RMSE computation subtracted an (N,) target vector from them and so
        # materialised an (N, N) array every epoch.
        valid_loss, _ = valid_fn(model, valid_loader, loss_fn=criterion)

        print(f'epochs {epoch+1} - avg_train_loss: {train_loss:.4f}  avg_val_loss: {valid_loss:.4f}')

        if valid_loss < best_loss:
            # Record the new best so later, worse epochs cannot overwrite the
            # checkpoint (previously `best_loss` was never updated).
            best_loss = valid_loss
            print(f'epoch {epoch+1} - save best score: {best_loss:.4f} save model!')
            torch.save(model.state_dict(), 'RecSysModel.pth')
    torch.cuda.empty_cache()
     

In [14]:
# Guarded entry point: train on the encoded ratings frame with the configured seed.
if __name__ == '__main__':
    train_model(train, CFG.seed) 


epochs 1 - avg_train_loss: 14.2476  avg_val_loss: 11.9280
epoch 1 - save best score: 11.9280 save model!
epochs 2 - avg_train_loss: 9.7542  avg_val_loss: 7.9243
epoch 2 - save best score: 7.9243 save model!
epochs 3 - avg_train_loss: 6.4942  avg_val_loss: 5.3401
epoch 3 - save best score: 5.3401 save model!
epochs 4 - avg_train_loss: 4.5049  avg_val_loss: 3.7968
epoch 4 - save best score: 3.7968 save model!
epochs 5 - avg_train_loss: 3.2816  avg_val_loss: 2.8236
epoch 5 - save best score: 2.8236 save model!
epochs 6 - avg_train_loss: 2.4869  avg_val_loss: 2.1807
epoch 6 - save best score: 2.1807 save model!
epochs 7 - avg_train_loss: 1.9569  avg_val_loss: 1.7515
epoch 7 - save best score: 1.7515 save model!
epochs 8 - avg_train_loss: 1.6031  avg_val_loss: 1.4648
epoch 8 - save best score: 1.4648 save model!
epochs 9 - avg_train_loss: 1.3657  avg_val_loss: 1.2723
epoch 9 - save best score: 1.2723 save model!
epochs 10 - avg_train_loss: 1.2053  avg_val_loss: 1.1432
epoch 10 - save best s

In [15]:
def predict(df1):
    """Score every (user, movie) row of `df1` with the saved checkpoint.

    Args:
        df1: DataFrame with `userId` and `movieId` columns. The ids are
            presumably label-encoded indices, since the model is sized from
            `user_lbl` / `movie_lbl` — verify against the caller.

    Returns:
        NumPy array of predicted ratings, one per row of `df1`, in row order.
    """
    test_dataset = RecTestDataset.RecSysDataset(
        usr_id=df1.userId.values,
        mov_id=df1.movieId.values
    )

    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    model = RecSysModel(len(user_lbl.classes_), len(movie_lbl.classes_), CFG.embedding_size)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written during a CUDA run still loads on a CPU-only machine.
    state_dict = torch.load('RecSysModel.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)

    predictions = inference_fn(model, test_loader)
    return predictions

In [16]:
# User to build recommendations for.
# NOTE: `train.userId` was label-encoded earlier in the notebook, so this is the
# encoded user index, not the raw id from ratings.csv.
user_id = 609

# Every rating row of the chosen user.
test = train.loc[train.userId == user_id, :].copy()

# Candidates: each distinct movie the user has no rating for, paired with the user.
already_rated = train["movieId"].isin(test.movieId.values)
movies_not_watched = (
    train.loc[~already_rated, "movieId"]
    .drop_duplicates()
    .to_frame()
    .assign(userId=user_id)
)

predicted = predict(movies_not_watched)

# Keep the 100 candidates with the highest predicted rating.
movies_not_watched = (
    movies_not_watched
    .assign(preds=predicted)
    .sort_values(by=['preds'], ascending=False)
    .head(100)
)

In [17]:
# Convert encoded movie indices back to the raw movieId values so both frames
# can be joined with `movies`.
test['movieId'] = movie_lbl.inverse_transform(test['movieId'].values)
movies_not_watched['movieId'] = movie_lbl.inverse_transform(movies_not_watched['movieId'].values)

In [18]:
# First 10 movies the user has rated, with title and genres attached from `movies`.
# Rows are in their original order (not ranked by rating); rows with any missing
# value after the left join are dropped.
pd.merge(test, movies, how='left', on=['movieId']).dropna().head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,609,1,5.0,1479542900,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,609,6,5.0,1493850345,Heat (1995),Action|Crime|Thriller
2,609,16,4.5,1479542171,Casino (1995),Crime|Drama
3,609,32,4.5,1479543331,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,609,47,5.0,1479545853,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
5,609,50,4.0,1493844757,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
6,609,70,4.0,1495959282,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
7,609,95,3.5,1479542004,Broken Arrow (1996),Action|Adventure|Thriller
8,609,110,4.5,1479545829,Braveheart (1995),Action|Drama|War
9,609,111,5.0,1479542162,Taxi Driver (1976),Crime|Drama|Thriller


In [20]:
# Top 10 recommendations: `movies_not_watched` is already sorted by predicted
# rating (descending), so the first rows after attaching title and genres are
# the highest-scored candidates. Rows with any missing value are dropped.
pd.merge(movies_not_watched, movies, how='left', on=['movieId']).dropna().head(10)

Unnamed: 0,movieId,userId,preds,title,genres
0,4458,609,5.263389,Africa: The Serengeti (1994),Documentary|IMAX
1,105351,609,5.049533,Runner Runner (2013),Crime|Drama|Thriller
2,7487,609,4.898856,Henry & June (1990),Drama
3,27193,609,4.864571,Taxi 2 (2000),Action|Comedy
4,1304,609,4.856672,Butch Cassidy and the Sundance Kid (1969),Action|Western
5,6938,609,4.82211,Billabong Odyssey (2003),Documentary
6,1916,609,4.812134,Buffalo '66 (a.k.a. Buffalo 66) (1998),Drama|Romance
7,3932,609,4.797697,"Invisible Man, The (1933)",Horror|Sci-Fi
8,2629,609,4.789517,"Love Letter, The (1999)",Comedy|Romance
9,499,609,4.78177,Mr. Wonderful (1993),Comedy|Romance
