# Factorization Machines: 

## Alleviating Choice Paralysis for Cinephiles 
### by Jordy Klustner 


In [None]:
!pip install rankfm

Collecting rankfm
  Downloading rankfm-0.2.5.tar.gz (145 kB)
[K     |████████████████████████████████| 145 kB 5.1 MB/s 
Building wheels for collected packages: rankfm
  Building wheel for rankfm (setup.py) ... [?25l[?25hdone
  Created wheel for rankfm: filename=rankfm-0.2.5-cp37-cp37m-linux_x86_64.whl size=386145 sha256=ed0f6e8a7c4b1894fae326dcc8ef49fbddf4111f4cf1b8abf7018d96084da55d
  Stored in directory: /root/.cache/pip/wheels/99/5f/9d/caa74d8a3cad3dcc3ed9e02d27e7bc18d0ccd1dd5ed1fcdb99
Successfully built rankfm
Installing collected packages: rankfm
Successfully installed rankfm-0.2.5


### Import Required Libraries and Read Ratings Dataset

In [None]:
import pandas as pd
import numpy as np
from rankfm.rankfm import RankFM 
import warnings
warnings.filterwarnings('ignore')

ratings = pd.read_csv('ratings.csv')

In [None]:
ratings['rating'] = ratings['rating']/5

In [None]:
ratings = ratings.drop(columns= 'timestamp')
ratings['random'] = np.random.random(len(ratings['rating']))

### Create Data For Roommate's Movie Preferences

In [None]:
c = np.array([[42069, 310131, .8],
              [42069, 16392, .6],
              [42069, 15371, 1],
              [42069, 146233, .6],
              [42069, 421, 1],
              [42069, 97367, 1],
              [42069, 489, .8],
              [42069, 278, .8],
              [42069, 9647, .6],
              [42069, 279690, .6],
              [42069, 55931, .6],
              [42069, 24128, .8],
              [42069, 84892, .4],
              [42069, 25750, .2],
              [42069, 149, .8],
              [42069, 2291, .4],
              [42069, 339403, .6],
              [42069, 861, .6],
              [42069, 9741, .4],
              [42069, 19908, .2],
              [42069, 153, .8]])
chris = pd.DataFrame(c, columns= ["userId", "movieId", "rating"])
chris["random"] = np.random.random(len(chris['rating']))
chris

Unnamed: 0,userId,movieId,rating,random
0,42069.0,310131.0,0.8,0.117112
1,42069.0,16392.0,0.6,0.857393
2,42069.0,15371.0,1.0,0.79689
3,42069.0,146233.0,0.6,0.575762
4,42069.0,421.0,1.0,0.774184
5,42069.0,97367.0,1.0,0.076446
6,42069.0,489.0,0.8,0.192059
7,42069.0,278.0,0.8,0.917216
8,42069.0,9647.0,0.6,0.482191
9,42069.0,279690.0,0.6,0.071241


### Get Sample of Users

In [None]:
all_users = ratings.userId.unique()
s_users = np.random.choice(all_users, size=20000, replace=False)

In [None]:
s_ratings = ratings[ratings.userId.isin(s_users)].copy()
s_ratings = s_ratings.append(chris)

In [None]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,random
0,1,110,0.2,0.620213
1,1,147,0.9,0.881993
2,1,858,1.0,0.909298
3,1,1221,1.0,0.957572
4,1,1246,1.0,0.204583
5,1,1968,0.8,0.186593
6,1,2762,0.9,0.494464
7,1,2918,1.0,0.642935
8,1,2959,0.8,0.739074
9,1,4226,0.8,0.515519


In [123]:
ratings.shape

(1903042, 4)

In [124]:
s_ratings.shape

(1903042, 4)

In [None]:
s_train = s_ratings[s_ratings['random']<.7]
s_valid = s_ratings[(s_ratings['random'] >= .7) & (s_ratings['random'] < .85)]
s_test = s_ratings[s_ratings['random']>=.85]

In [None]:
s_train.shape

(1330775, 4)

In [None]:
s_test.shape

(287047, 4)

In [None]:
s_valid.shape

(285220, 4)

### Create Movie Features 

In [None]:
movie_feat = pd.read_csv('movies_metadata.csv')
movie_feat = movie_feat.drop([19730,29503,35587])
movie_feat['movieId'] = movie_feat['id'].astype(str).astype(int)
movie_feat = movie_feat[['movieId', 'title','genres', 'vote_average', 'vote_count']]


In [None]:
train_movie = s_train.movieId.unique()
movie_feat_train = movie_feat[movie_feat.movieId.isin(train_movie)]
test_movie = np.sort(s_test.movieId.unique())
movie_feat_test = movie_feat[movie_feat.movieId.isin(test_movie)]
len(train_movie)

20323

### Fit Factorization Machine

In [116]:
model = RankFM(factors=6, loss='warp', max_samples=40, learning_schedule='invscaling', learning_rate= .02)
model.fit(s_train[["userId","movieId"]], sample_weight= s_train["rating"], 
          epochs=100, verbose=True)


training epoch: 0
log likelihood: -654682.4375

training epoch: 1
log likelihood: -636492.375

training epoch: 2
log likelihood: -639471.375

training epoch: 3
log likelihood: -641625.1875

training epoch: 4
log likelihood: -642573.1875

training epoch: 5
log likelihood: -643519.3125

training epoch: 6
log likelihood: -644098.375

training epoch: 7
log likelihood: -644754.75

training epoch: 8
log likelihood: -644959.75

training epoch: 9
log likelihood: -644384.0625

training epoch: 10
log likelihood: -642744.125

training epoch: 11
log likelihood: -640669.125

training epoch: 12
log likelihood: -637692.6875

training epoch: 13
log likelihood: -635160.3125

training epoch: 14
log likelihood: -633967.5625

training epoch: 15
log likelihood: -633841.5625

training epoch: 16
log likelihood: -632286.8125

training epoch: 17
log likelihood: -632480.625

training epoch: 18
log likelihood: -631743.4375

training epoch: 19
log likelihood: -631014.875

training epoch: 20
log likelihood: -6302

### Get Training Metrics

In [117]:
from rankfm.evaluation import hit_rate, precision, recall

model_hrt = hit_rate(model, s_train[['userId','movieId']], k=10)
model_pre = precision(model, s_train[['userId','movieId']], k=10)
model_rec = recall(model, s_train[['userId','movieId']], k=10)

In [118]:
model_hrt,model_pre,model_rec

(0.6728858857344562, 0.2773334678029348, 0.08050620539409406)

### Get Validation Metrics

In [119]:
model_hrt = hit_rate(model, s_valid[['userId','movieId']], k=10)
model_pre = precision(model, s_valid[['userId','movieId']], k=10)
model_rec = recall(model, s_valid[['userId','movieId']], k=10)

In [120]:
model_hrt,model_pre,model_rec

(0.4076168302012307, 0.06280281612062753, 0.07534429237551644)

### Recommend Movies To My Cinephile Roommate 

In [121]:
rec = model.recommend(users = [42069], n_items = 10)
chris_rec = list(rec.iloc[0,])
movie_feat.title[movie_feat['movieId'].isin(chris_rec)]

286                     Once Were Warriors
302                      Three Colors: Red
938                           The 39 Steps
3382                               Solaris
4020              The Million Dollar Hotel
5004                       Monsoon Wedding
5325                       Men in Black II
6388    Terminator 3: Rise of the Machines
Name: title, dtype: object

### Get Testing Set Metrics 

In [122]:
model_hrt = hit_rate(model, s_test[['userId','movieId']], k=10)
model_pre = precision(model, s_test[['userId','movieId']], k=10)
model_rec = recall(model, s_test[['userId','movieId']], k=10)

model_hrt, model_pre, model_rec

(0.40890395731436197, 0.062311027123165844, 0.07464185489482321)