# Collaborative Filtering model on MovieLens

Download the 20m [movielens dataset](http://files.grouplens.org/datasets/movielens/ml-20m.zip)

You can use the aria2c or wget to download

In [1]:
# %cd /data
# !!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !!unzip ml-latest-small.zip

In [1]:
import pandas as pd
import numpy as np
import os
import torch
from ray.matchbox import Trainer,Arr_Dataset
from torch.utils.data import DataLoader,Dataset

In [2]:
DATA = "/data/ml-latest-small/"
DIM = 100
CUDA = torch.cuda.is_available()

In [3]:
files = os.listdir(DATA)
files

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [4]:
data = dict()
for f in files:
    if f[-3:]=="csv":
        data[f.split(".")[0]] = pd.read_csv(DATA+f)

### Check Data

In [5]:
from IPython.display import display
list(display(k,v.sample(5)) for k,v in data.items())

'links'

Unnamed: 0,movieId,imdbId,tmdbId
7369,72129,1233227,22804.0
6622,50798,799949,9760.0
4163,5489,79641,6404.0
7672,81847,398286,38757.0
1688,2130,80388,23954.0


'tags'

Unnamed: 0,userId,movieId,tag,timestamp
300,346,3265,martial arts,1159734552
526,364,118997,funny,1444530106
656,431,4641,thora birch,1140455465
368,364,1176,lyrical,1444528947
1206,547,114662,toplist14,1423131235


'ratings'

Unnamed: 0,userId,movieId,rating,timestamp
98937,664,4887,3.5,1393891251
82566,562,2959,4.5,1167428248
32147,232,2688,4.0,955093825
56747,408,1059,5.0,933036132
21460,150,180,3.0,1114306408


'movies'

Unnamed: 0,movieId,title,genres
6409,44788,This Film Is Not Yet Rated (2006),Documentary
3666,4676,Troop Beverly Hills (1989),Comedy
6598,50005,Curse of the Golden Flower (Man cheng jin dai ...,Action|Drama
781,963,"Inspector General, The (1949)",Musical
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary


[None, None, None, None]

## Model on rating

In [6]:
data["ratings"].sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
13986,91,78041,3.5,1448798561
95981,636,85,5.0,855227416
26924,197,3701,4.0,975429504
80637,547,99089,4.0,1356000054
86360,577,3996,5.0,1111476682


In [7]:
len(data["ratings"])

100004

In [8]:
userId = list(set(data["ratings"]["userId"]))
movieId = list(set(data["ratings"]["movieId"]))
print(len(userId),len(movieId))

671 9066


### Mapping
user to index, movie to index, index to user, index to movie

In [22]:
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

In [10]:
data["ratings"]["movie_idx"] = data["ratings"]["movieId"].apply(lambda x:m2i[x])
data["ratings"]["user_idx"] = data["ratings"]["userId"].apply(lambda x:u2i[x])

### Separate train/valid dataset

In [11]:
train_pick = np.random.rand(len(data["ratings"]))>.2
valid_pick = ~train_pick

In [12]:
train_pick,valid_pick

(array([False,  True,  True, ...,  True,  True,  True]),
 array([ True, False, False, ..., False, False, False]))

In [13]:
train_df = data["ratings"][train_pick].reset_index()
valid_df = data["ratings"][valid_pick].reset_index()

### Data generator

In [14]:
train = Arr_Dataset(train_df["user_idx"].values,
                    train_df["movie_idx"].values
                    ,train_df["rating"].values/5
                    ,bs=512)
valid = Arr_Dataset(valid_df["user_idx"].values,
                    valid_df["movie_idx"].values
                    ,valid_df["rating"].values/5
                    ,bs=512)

## Basic Cross Filtering

In [15]:
from torch import nn

In [16]:
class cf(nn.Module):
    def __init__(self):
        """
        Cross Filtering Module
        """
        super(cf,self).__init__()
        self.emb_u = nn.Embedding(len(userId), DIM)
        self.emb_m = nn.Embedding(len(movieId), DIM)
    
    def forward(self,u,m):
        u_vec = self.emb_u(u)
        m_vec = self.emb_m(m)
        return u_vec * m_vec
    
class cfnn(nn.Module):
    def __init__(self):
        super(cfnn,self).__init__()
        self.cf = cf()
        self.fcb = nn.Sequential(*[
                                    nn.Dropout(p=.3),
                                    nn.Linear(DIM,512,bias=False),
                                   nn.BatchNorm1d(512),
                                   nn.LeakyReLU(inplace=True),
                                   nn.Linear(512,1,bias=False),
                                   nn.BatchNorm1d(1),
#                                    nn.Sigmoid()
                                  ],
                                )
    
    def forward(self,u,m):
        x = self.cf(u,m)
        return self.fcb(x)

In [17]:
cfmodel = cfnn()

In [18]:
from torch.optim import Adam
mse = nn.MSELoss()
opt = Adam(cfmodel.parameters())
if CUDA:
    cfmodel.cuda()

Step function for train and valid

In [19]:
def action(*args,**kwargs):
    u,m,y = args[0]
    u,m,y = u.squeeze(),m.squeeze(),y.squeeze()
    opt.zero_grad()
    if CUDA:
        u,m,y  = u.cuda(),m.cuda(),y.cuda()
        
    y_ = cfmodel(u,m) # prediction
    
    loss = mse(y_,y.unsqueeze(-1).float())
    
    loss.backward()
    opt.step()
    
    return {"mse":loss.item()}

def val_action(*args,**kwargs):
    u,m,y = args[0]
    u,m,y = u.squeeze(),m.squeeze(),y.squeeze()
    y_ = cfmodel(u,m)
    loss = mse(y_,y.unsqueeze(-1).float())
    
    return {"mse":loss.item()}

In [20]:
trainer = Trainer(train, val_dataset=valid, batch_size=1, print_on = 5)

trainer.action = action
trainer.val_action = val_action

In [21]:
trainer.train(20)

⭐[ep_0_i_154]	mse	0.325: 100%|██████████| 156/156 [00:06<00:00, 24.18it/s]
😎[val_ep_0_i_39]	mse	0.362: 100%|██████████| 40/40 [00:00<00:00, 60.48it/s]
⭐[ep_1_i_154]	mse	0.211: 100%|██████████| 156/156 [00:06<00:00, 23.77it/s]
😎[val_ep_1_i_39]	mse	0.231: 100%|██████████| 40/40 [00:00<00:00, 59.66it/s]
⭐[ep_2_i_154]	mse	0.134: 100%|██████████| 156/156 [00:06<00:00, 23.87it/s]
😎[val_ep_2_i_39]	mse	0.147: 100%|██████████| 40/40 [00:00<00:00, 59.83it/s]
⭐[ep_3_i_154]	mse	0.099: 100%|██████████| 156/156 [00:07<00:00, 21.23it/s]
😎[val_ep_3_i_39]	mse	0.097: 100%|██████████| 40/40 [00:00<00:00, 63.21it/s]
⭐[ep_4_i_154]	mse	0.081: 100%|██████████| 156/156 [00:06<00:00, 23.28it/s]
😎[val_ep_4_i_39]	mse	0.069: 100%|██████████| 40/40 [00:00<00:00, 58.81it/s]
⭐[ep_5_i_154]	mse	0.045: 100%|██████████| 156/156 [00:06<00:00, 23.21it/s]
😎[val_ep_5_i_39]	mse	0.055: 100%|██████████| 40/40 [00:00<00:00, 61.73it/s]
⭐[ep_6_i_154]	mse	0.044: 100%|██████████| 156/156 [00:06<00:00, 22.42it/s]
😎[val_ep_6_i_39]	ms