In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!cp /content/gdrive/My\ Drive/RecSys -r ./
%cd RecSys

Mounted at /content/gdrive
/content/RecSys


Data: movielens-1m

Metrics: NDCG & HR

Reason: it is used in [this](https://arxiv.org/pdf/1708.05031.pdf) NCF paper (implemented below) and in most MF models. As far as I understand, movielens is a quite common dataset for recsys tasks.

In [3]:
!unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
!pip install implicit

In [6]:
import implicit
import pandas as pd
import random
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.utils.data as t_data

In [7]:
ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
        names=['uid', 'mid', 'rating', 'timestamp'], engine='python')

In [8]:
movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
        names=['mid', 'name', 'category'], engine='python')

Reindexing

In [9]:
uids = ratings[['uid']].drop_duplicates().reindex()
uids['user_id'] = np.arange(len(uids))
ratings = pd.merge(ratings, uids, on=['uid'], how='left')

mids = movie_info[['mid']].drop_duplicates()
mids['movie_id'] = np.arange(len(mids))
ratings = pd.merge(ratings, mids, on=['mid'], how='left')
movie_info = pd.merge(movie_info, mids, on=['mid'], how='left')

ratings = ratings[['user_id', 'movie_id', 'rating', 'timestamp']]
movie_info = movie_info[['movie_id', 'name', 'category']]

In [10]:
n_users = len(uids)
n_movies = len(mids)
n_users, n_movies

(6040, 3883)

According to NCF paper any interaction with an item is an implicit feedback. They also use leave-one-out method to split the dataset.

In [11]:
max_timestamps = ratings.groupby('user_id')['timestamp'].apply(max).reset_index().rename(
            columns={'timestamp': 'max_timestamp'})
ratings = pd.merge(ratings, max_timestamps, on=['user_id'], how='left')

extra_test = ratings[ratings['max_timestamp'] == ratings['timestamp']]
test = extra_test.drop_duplicates(subset='user_id', keep="last").reset_index()
extra_test = extra_test[~extra_test.index.isin(test.index)]

train = ratings[ratings['max_timestamp'] != ratings['timestamp']]
train = pd.concat([train, extra_test]).reset_index()
train, test = train[['user_id', 'movie_id', 'rating']], test[['user_id', 'movie_id', 'rating']]

del extra_test
del max_timestamps
assert train['user_id'].nunique() == test['user_id'].nunique()
ratings.drop('max_timestamp',axis='columns', inplace=True)

In [None]:
test.head(10)

Unnamed: 0,user_id,movie_id,rating
0,0,47,5
1,1,1848,3
2,2,2012,4
3,3,1885,5
4,4,285,2
5,5,593,5
6,6,3038,3
7,7,3188,3
8,8,2225,4
9,9,2183,5


In [12]:
train_csr = sp.coo_matrix((np.ones_like(train['user_id']), (train['user_id'], train['movie_id'])))
train_t_csr = train_csr.T.tocsr()
train_csr = train_csr.tocsr()

In [13]:
train_csr.shape

(6040, 3883)

Now let's find all the possible negtaive items and set negative samples for test.

In [14]:
movies_set = set(mids['movie_id'].to_list())

negatives = ratings.groupby('user_id')['movie_id'].apply(set).reset_index().rename(
    columns={'movie_id': 'watched'})
negatives['nonwatched'] = negatives['watched'].apply(lambda x: list(movies_set - x))
negatives.drop('watched', axis='columns', inplace=True)
negatives.head()

Unnamed: 0,user_id,nonwatched
0,0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,1,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,4,"[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


In [15]:
test['negatives'] = pd.merge(test, negatives, on='user_id')['nonwatched'].apply(lambda x: np.random.choice(x, 99))
test.head()

Unnamed: 0,user_id,movie_id,rating,negatives
0,0,47,5,"[710, 1554, 1914, 1255, 1390, 1716, 1461, 2145..."
1,1,1848,3,"[319, 1147, 2363, 3273, 550, 3070, 2089, 3632,..."
2,2,2012,4,"[230, 805, 1354, 3295, 3336, 3630, 2176, 3305,..."
3,3,1885,5,"[768, 201, 1082, 1813, 339, 1715, 22, 2988, 29..."
4,4,285,2,"[1727, 1098, 3373, 1808, 999, 64, 3109, 1430, ..."


In [16]:
train = pd.merge(train, negatives, on='user_id')

In [124]:
train.head()

Unnamed: 0,user_id,movie_id,rating,nonwatched
0,0,1176,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,0,655,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,0,902,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
3,0,3339,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,0,2286,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


Functions to view results from hw1

In [17]:
get_similars = lambda item_id, model : [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                        for x in model.similar_items(item_id)]

In [18]:
get_user_history = lambda user_id, implicit_ratings : [movie_info[movie_info["movie_id"] == x]["name"].to_string() 
                                            for x in implicit_ratings[implicit_ratings["user_id"] == user_id]["movie_id"]]

In [19]:
get_recommendations = lambda user_id, model : [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() 
                                               for x in model.recommend(user_id, train_csr)]

All further recommendations will be for user #1. As you can see, they mostly watch animated movies. Also some drama and scifi.

In [20]:
USER_REC = 0
get_user_history(USER_REC, ratings)

["1176    One Flew Over the Cuckoo's Nest (1975)",
 '655    James and the Giant Peach (1996)',
 '902    My Fair Lady (1964)',
 '3339    Erin Brockovich (2000)',
 "2286    Bug's Life, A (1998)",
 '1179    Princess Bride, The (1987)',
 '1267    Ben-Hur (1959)',
 '2735    Christmas Story, A (1983)',
 '590    Snow White and the Seven Dwarfs (1937)',
 '907    Wizard of Oz, The (1939)',
 '591    Beauty and the Beast (1991)',
 '926    Gigi (1958)',
 '2329    Miracle on 34th Street (1947)',
 "2849    Ferris Bueller's Day Off (1986)",
 '1022    Sound of Music, The (1965)',
 '2722    Airplane! (1980)',
 '2618    Tarzan (1999)',
 '1949    Bambi (1942)',
 '3036    Awakenings (1990)',
 '2728    Big (1988)',
 '2252    Pleasantville (1998)',
 '711    Wallace & Gromit: The Best of Aardman Animatio...',
 '1250    Back to the Future (1985)',
 "523    Schindler's List (1993)",
 '2271    Meet Joe Black (1998)',
 '47    Pocahontas (1995)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '1672    Titanic (19

Similar movies will be check on a movie #1. It's an animated movie.

In [21]:
MOVIE_REC = 0
movie_info.head(1)

Unnamed: 0,movie_id,name,category
0,0,Toy Story (1995),Animation|Children's|Comedy


NDCG and HR score are used, following NCF paper.

In [22]:
from sklearn.metrics import ndcg_score

def get_metrics(model, n_top=10):
    predictions = model.predict(test)
    hr, ndcg = [], []
    true_sample = [0] * 99 + [1]
    for scores in predictions['scores']:
        ndcg.append(ndcg_score([true_sample], [scores], k=n_top))
        scores = np.argsort(scores)[::-1][:n_top]
        hr.append(99 in scores)
    print(len(hr))
    return np.mean(hr), np.mean(ndcg)

In [24]:
class RecSysModule():
    def __init__(self, model):
        self.model = model
        self.user_factors = None
        self.item_factors = None
        self.item_bias = None
        self.global_bias = 0.

    def similar_items(self, item_id, n_top=10):
        scores = np.dot(self.item_factors, self.item_factors[item_id])
        scores = scores / np.linalg.norm(self.item_factors[item_id])
        scores = scores / np.linalg.norm(self.item_factors, axis=-1)
        predictions = list(enumerate(scores))
        predictions.sort(key=lambda pr: -pr[1])
        return predictions[:n_top]

    def recommend(self, user_id, user_items, n_top=10):
        predictions = []
        watched = user_items[user_id].indices
        n_items = user_items.shape[1]
        for item_id in range(n_items):
            if item_id in watched:
                continue
            score = self.predict_score(user_id, item_id)
            predictions.append((item_id, score))
        predictions.sort(key=lambda pr: -pr[1])
        return predictions[:n_top]
    
    def predict(self, df):
        #df: user_id: int, movie_id: int, negatives: List[int]
        #result_df: user_id: int, scores: List[float] where scores[i] is for negatives[i] and scores[0] is for movie_id
        raise NotImplementedError()

# BPR

In [32]:
EMBED_DIM = 64

In [None]:
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=EMBED_DIM, learning_rate=0.05, use_gpu=False, iterations=100)

In [None]:
bpr_model.fit(train_t_csr)

In [None]:
len(bpr_model.item_factors), len(bpr_model.user_factors)

(3883, 6040)

In [None]:
n_movies, n_users

(3883, 6040)

In [None]:
class BPRModule(RecSysModule):
    def __init__(self, model):
        super().__init__(model)
        self.model = model
        self.user_factors = model.user_factors[:, :-1]
        self.user_bias = np.zeros((len(self.user_factors),), dtype=np.float32)
        self.item_factors = model.item_factors[:, :-1]
        self.item_bias = model.item_factors[:, -1]

    def predict_score(self, user_id, item_id):
        res = np.dot(self.user_factors[user_id], self.item_factors[item_id])
        res += self.global_bias + self.user_bias[user_id] + self.item_bias[item_id]
        return res
    
    def predict(self, df):
        def get_scores(user_id, movie_id, negs):
            res = []
            for neg in negs:
                res.append(self.predict_score(user_id, neg))
            res.append(self.predict_score(user_id, movie_id))
            return res


        result_df = df[['user_id']].copy()
        result_df['scores'] = df.apply(lambda row: 
                                        get_scores(row['user_id'], row['movie_id'], row['negatives']), axis=1)
        return result_df

In [None]:
bpr = BPRModule(bpr_model)

In [None]:
get_similars(MOVIE_REC, bpr)

['0    Toy Story (1995)',
 '33    Babe (1995)',
 '12    Balto (1995)',
 '47    Pocahontas (1995)',
 '38    Clueless (1995)',
 '31    Twelve Monkeys (1995)',
 '53    Big Green, The (1995)',
 '3045    Toy Story 2 (1999)',
 '20    Get Shorty (1995)',
 '360    Lion King, The (1994)']

In [97]:
get_recommendations(USER_REC, bpr)

['360    Lion King, The (1994)',
 '3090    Fantasia 2000 (1999)',
 '3414    Road to El Dorado, The (2000)',
 '2027    Sleeping Beauty (1959)',
 '2692    Iron Giant, The (1999)',
 '1656    Good Will Hunting (1997)',
 '592    Pinocchio (1940)',
 '2068    Charlottes Web (1973)',
 '258    Little Women (1994)',
 '259    Little Princess, A (1995)']

In [None]:
hr_bpr, ndcg_bpr = get_metrics(bpr)
hr_bpr, ndcg_bpr

6040


(0.8177152317880795, 0.5372090403136356)

Similars look ok. Recommendations are pretty good. Metrics look kinda high, idk, probably I did something wrong

# NCF

Let's use dataloader

In [25]:
class Dataset(t_data.Dataset):
    def __init__(self, users, items, ratings):
        self.users = users
        self.items = items
        self.ratings = ratings

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

    def __len__(self):
        return self.users.size(0)

In [27]:
import random

# if n_negs == -1 then negatives are already in dataframe
def get_dataloader(df, batch_size, n_negs = 4, vocal=True):
    if n_negs != -1:
        df['negatives'] = df['nonwatched'].apply(lambda x: random.sample(x, n_negs))
        if vocal:
            print("Generated negatives")
    users, items, ratings = [], [], []

    def add_(user, item, rating):
        users.append(user)
        items.append(item)
        ratings.append(rating)

    for user, item, rating, negs in zip(df['user_id'], df['movie_id'], df['rating'], df['negatives']):
        add_(user, item, rating)
        for neg in negs:
            add_(user, neg, 0.)
    df.drop('negatives', axis='columns', inplace=True)
    if vocal:
        print("Loaded samples", len(users))

    dataset = Dataset(torch.LongTensor(users), torch.LongTensor(items), torch.FloatTensor(ratings))
    return t_data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [28]:
tmp_df = train[:20000].copy()
dl = get_dataloader(tmp_df, 10)
print("Batches number", len(dl))

Generated negatives
Loaded samples 100000
Batches number 10000


Now let's create basic NCF class with trainer and MLP, GMF, NeuMF components with forward methods.

In [29]:
from tqdm.notebook import tqdm

class NCF(nn.Module):
    def __init__(self, n_users, n_items, n_emb):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_emb = n_emb
        self.user_factors = nn.Embedding(num_embeddings=self.n_users, embedding_dim=n_emb)
        self.item_factors = nn.Embedding(num_embeddings=self.n_items, embedding_dim=n_emb)
        self.f_loss = torch.nn.BCELoss()
    
    def fit(self, train_df, n_epochs=100, batch_size=128, lr=0.001):
        self.opt = torch.optim.Adam(self.parameters(), lr=lr)

        for epoch in tqdm(range(n_epochs)):
            data_loader = get_dataloader(train_df, batch_size, vocal=(epoch % 10 == 0))
            total_loss = 0.

            for users, items, ratings in data_loader:
                self.opt.zero_grad()
                predictions = self(users, items)
                loss = self.f_loss(predictions.view(-1), ratings)
                loss.backward()
                self.opt.step()
                loss = loss.item()
                total_loss += loss
        
    def _user_factors(self):
        return self.user_factors.weight.detach().cpu().numpy()

    def _item_factors(self):
        return self.item_factors.weight.detach().cpu().numpy()


In [146]:
class GMF(NCF):
    def __init__(self, n_users, n_items, n_emb):
        super().__init__(n_users, n_items, n_emb)
        self.predictor = nn.Sequential(nn.Linear(n_emb, 1), nn.Sigmoid())
    
    def forward(self, user_ids, item_ids):
        z = self.forward_without_result(user_ids, item_ids)
        ratings = self.predictor(z)
        return ratings
    
    def forward_without_result(self, user_ids, item_ids):
        z = torch.mul(self.user_factors(user_ids), self.item_factors(item_ids))
        return z

In [None]:
gmf = GMF(n_users, n_movies, EMBED_DIM // 2)
gmf.fit(train, n_epochs=30, batch_size=512)

In [152]:
torch.save(gmf.state_dict(), 'gmf.pcl')
gmf_copy = GMF(n_users, n_movies, EMBED_DIM // 2)
gmf_copy.load_state_dict(torch.load('gmf.pcl'))

<All keys matched successfully>

In [36]:
class MLP(NCF):
    def __init__(self, n_users, n_items, n_emb, hidden_outs):
        super().__init__(n_users, n_items, n_emb)
        self.hidden_outs = hidden_outs #list
        layers = []
        for n_in, n_out in zip([2 * n_emb] + hidden_outs[:-1], hidden_outs):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        self.model = nn.Sequential(*layers)
        self.predictor = nn.Sequential(nn.Linear(hidden_outs[-1], 1), nn.Sigmoid())
    
    def forward(self, user_ids, item_ids):
        out = self.forward_without_result(user_ids, item_ids)
        ratings = self.predictor(out)
        return ratings
        
    def forward_without_result(self, user_ids, item_ids):
        z = torch.cat([self.user_factors(user_ids), self.item_factors(item_ids)], dim=-1) 
        out = self.model(z)
        return out
        

In [44]:
mlp = MLP(n_users, n_movies, EMBED_DIM // 2, [EMBED_DIM // 2, EMBED_DIM // 4, EMBED_DIM // 8])
mlp.fit(train, n_epochs=20, batch_size=256, lr=0.001)

In [None]:
torch.save(mlp.state_dict(), 'mlp.pcl')
mlp_copy = MLP(n_users, n_movies, EMBED_DIM // 2, [EMBED_DIM // 2, EMBED_DIM // 4, EMBED_DIM // 8])
mlp_copy.load_state_dict(torch.load('mlp.plc'))

In [46]:
class NeuMF(NCF):
    def __init__(self, gmf, mlp, n_emb, n_hidden_last):
        super().__init__(0, 0, n_emb) # NeuMF doesn't need it's own embeddings
        self.gmf = gmf
        self.mlp = mlp
        self.predictor = nn.Sequential(nn.Linear(n_emb + n_hidden_last, 1), nn.Sigmoid())
    
    def forward(self, user_ids, item_ids):
        z_gmf = self.gmf.forward_without_result(user_ids, item_ids, early_stop=True)
        z_mlp = self.mlp.forward_without_result(user_ids, item_ids, early_stop=True)
        z = torch.cat([z_gmf, z_mlp], dim=-1)
        ratings = self.predictor(z)
        return ratings

In [None]:
neumf = NeuMF(gmf_copy, mlp_copy, EMBED_DIM // 2, EMBED_DIM // 8)
neumf.fit(train, batch_size=1024)

In [48]:
class NCFModule(RecSysModule):
    def __init__(self, model):
        super().__init__(model)
        self.user_factors = model._user_factors()
        self.user_bias = np.zeros((len(self.user_factors),), dtype=np.float32)
        self.item_factors = model._item_factors()
        self.item_bias = np.zeros((len(self.item_factors),), dtype=np.float32)
    
    def predict_score(self, user_id, item_id):
        res = self.model(torch.LongTensor([user_id]), torch.LongTensor([item_id]))
        return res.item()
    
    def predict(self, df):
        def get_scores(user_id, item_id, negs):
            n_neg = len(negs)
            users = torch.LongTensor([user_id] * (n_neg + 1))
            items = torch.LongTensor(negs + [item_id])
            res = self.model(users, items).detach().numpy()
            return res


        result_df = df[['user_id']].copy()
        result_df['scores'] = df.apply(lambda row:
                                       get_scores(row['user_id'], row['movie_id'], row['negatives']), axis=1)
        return result_df

In [None]:
ncf = NCFModule(neumf)

In [92]:
get_similars(MOVIE_REC, ncf)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '612    Aristocats, The (1970)',
 '38    Clueless (1995)',
 '1220    Terminator, The (1984)',
 '33    Babe (1995)',
 '360    Lion King, The (1994)',
 '35    Dead Man Walking (1995)',
 '47    Pocahontas (1995)',
 '2068    Charlottes Web (1973)']

In [101]:
get_recommendations(USER_REC, ncf)

['592    Pinocchio (1940)',
 '360    Lion King, The (1994)',
 '2027    Sleeping Beauty (1959)',
 '109    Taxi Driver (1976)',
 '3414    Road to El Dorado, The (2000)',
 '2018    Peter Pan (1953)',
 '768    Trainspotting (1996)',
 '1642    Anastasia (1997)',
 '72    Misarables, Les (1995)',
 '461    Heaven & Earth (1993)']

In [102]:
hr_bpr, ndcg_bpr = get_metrics(bpr)
hr_bpr, ndcg_bpr

(0.7412564312388091, 0.4500163403546231)

Similars are mostly animations, which is good. Recommendations are also mostly animations. HR & NDCG look ok