# Factorization Machine example

In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from reco.datasets import loadMovieLens100k
from reco.recommender import FM

### First, we'll test with only the bare minimum columns: *userId*, *itemId* and *rating*.

In [14]:
# Load the MovieLens 100k train/test split. The loader returns four values;
# only the first two (the train and test frames) are used in this notebook.
train, test, _, _ = loadMovieLens100k(train_test_split=True)
# Summary statistics of the training frame (userId, itemId, rating).
train.describe()

Unnamed: 0,userId,itemId,rating
count,90570.0,90570.0,90570.0
mean,461.494038,428.104891,3.523827
std,266.004364,333.088029,1.126073
min,1.0,1.0,1.0
25%,256.0,174.0,3.0
50%,442.0,324.0,4.0
75%,682.0,636.0,4.0
max,943.0,1682.0,5.0


So we have the user ids, item ids and the respective ratings in the three columns. Next we need to separate out the *rating* column, since that is the value we are going to predict. We also need to explicitly set the data type of *userId* and *itemId* to string so that the model treats them as categorical variables rather than integers. We'll do this for both the train and test sets.

In [15]:
# Pull the target column out of each split; `pop` returns the Series and
# removes it from the frame in a single step.
y_train = train.pop('rating')
y_test = test.pop('rating')

# Cast the id columns to str in both splits so they are handled as
# categorical variables rather than integers.
for split in (train, test):
    for id_col in ('userId', 'itemId'):
        split[id_col] = split[id_col].astype('str')

Next we'll train the model. We choose 60 iterations here. Tweak the hyperparameters to get the best performance.

In [16]:
# Hyperparameters for the factorization machine, gathered in one place so
# they are easy to tweak.
fm_params = dict(k=10, iterations=60, learning_rate=0.003, regularizer=0.005)

# Build the model and train it on the prepared training frame and ratings.
f = FM(**fm_params)
f.fit(X=train, y=y_train)

epoch 0 time 0.6695690155029297 mse 1.103635986616477
epoch 1 time 0.6391921043395996 mse 0.9915833703974116
epoch 2 time 0.6295511722564697 mse 0.9439610213225902
epoch 3 time 0.6386301517486572 mse 0.9179175311761331
epoch 4 time 0.6657459735870361 mse 0.9010386788424799
epoch 5 time 0.6070492267608643 mse 0.8888732411739291
epoch 6 time 0.6132230758666992 mse 0.879493405204137
epoch 7 time 0.7943868637084961 mse 0.8719075818141927
epoch 8 time 0.818760871887207 mse 0.8655493500405546
epoch 9 time 0.6596410274505615 mse 0.8600582403325009
epoch 10 time 0.6409389972686768 mse 0.8551827870234563
epoch 11 time 0.6452288627624512 mse 0.8507407151685272
epoch 12 time 0.6655309200286865 mse 0.8465936124655455
epoch 13 time 0.658170223236084 mse 0.8426295615217765
epoch 14 time 0.7007808685302734 mse 0.8387568052885036
epoch 15 time 0.6789200305938721 mse 0.8348947658203087
epoch 16 time 0.6594047546386719 mse 0.8309731470754939
epoch 17 time 0.7430441379547119 mse 0.8269270674085164
epoch 

In [17]:
# Predict ratings for the test split and report the mean squared error
# against the true test ratings.
y_pred = f.predict(test)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(test_mse))

MSE: 0.917503526572731


### Now we'll try with all the columns, training our model on the full set of features.

In [18]:
# Reload the train/test split, this time requesting every available column
# (all_columns=True). As before, only the first two returned values are used.
train, test, _, _ = loadMovieLens100k(train_test_split=True, all_columns=True)

# End the cell with the frame itself rather than print(...): Jupyter then
# renders it as a rich table instead of a truncated, line-wrapped text dump,
# matching how `train.describe()` is displayed earlier in the notebook.
train.head()

  userId itemId  rating  age gender  occupation  5  6  7  8  ...  14  15  16  \
0      1      1     5.0   24      M  technician  0  0  0  1  ...   0   0   0   
1      2      1     4.0   53      F       other  0  0  0  1  ...   0   0   0   
2      6      1     4.0   42      M   executive  0  0  0  1  ...   0   0   0   
3     10      1     4.0   53      M      lawyer  0  0  0  1  ...   0   0   0   
4     13      1     3.0   47      M    educator  0  0  0  1  ...   0   0   0   

   17  18  19  20  21  22  23  
0   0   0   0   0   0   0   0  
1   0   0   0   0   0   0   0  
2   0   0   0   0   0   0   0  
3   0   0   0   0   0   0   0  
4   0   0   0   0   0   0   0  

[5 rows x 25 columns]


This time, we also need to change the data type of the columns *gender* and *occupation* to string so that they are treated as categorical variables and hence one-hot encoded.

In [19]:
# Pull the target column out of each split; `pop` returns the Series and
# removes it from the frame in a single step.
y_train = train.pop('rating')
y_test = test.pop('rating')

# Columns that must be strings so they are treated as categorical variables.
categorical_cols = ['userId', 'itemId', 'gender', 'occupation']
for split in (train, test):
    for col in categorical_cols:
        split[col] = split[col].astype('str')

Before training, we also need to normalize the *age* column, since its values are on a very different scale from the other columns and would otherwise hamper the performance of the model. We choose min-max normalization here.

In [20]:
# Min-max scale the age column. The scaling statistics are computed on the
# training split only and then reused for the test split: this keeps a given
# raw age mapped to the same feature value in both splits and avoids leaking
# test-set statistics into preprocessing. Test ages outside the training
# range will simply fall slightly outside [0, 1].
age_min = train['age'].min()
age_range = train['age'].max() - age_min

train['age'] = (train['age'] - age_min) / age_range
test['age'] = (test['age'] - age_min) / age_range

In [21]:
# Same hyperparameters as the first run, gathered in one place so they are
# easy to tweak.
fm_params = dict(k=10, iterations=60, learning_rate=0.003, regularizer=0.005)

# Build a fresh model and train it on the full-feature training frame.
f = FM(**fm_params)
f.fit(X=train, y=y_train)

epoch 0 time 2.8004140853881836 mse 1.000345198100816
epoch 1 time 2.876314163208008 mse 0.926330813517465
epoch 2 time 2.8077518939971924 mse 0.8956873431878055
epoch 3 time 2.8738458156585693 mse 0.8774932518050282
epoch 4 time 2.8108808994293213 mse 0.8641899815328599
epoch 5 time 2.795496940612793 mse 0.8532059220237216
epoch 6 time 2.9185550212860107 mse 0.8434474024775377
epoch 7 time 2.8622257709503174 mse 0.8344279848713625
epoch 8 time 2.8102173805236816 mse 0.8259066922947447
epoch 9 time 2.8238000869750977 mse 0.8177314068749388
epoch 10 time 2.7962749004364014 mse 0.8097984702929173
epoch 11 time 2.821711778640747 mse 0.8020400103147397
epoch 12 time 2.8062620162963867 mse 0.794413201353103
epoch 13 time 2.8096868991851807 mse 0.7868945339707508
epoch 14 time 2.7938878536224365 mse 0.7794747118789921
epoch 15 time 2.7710390090942383 mse 0.7721539719413844
epoch 16 time 2.8009660243988037 mse 0.7649335354446439
epoch 17 time 2.789910078048706 mse 0.7578306061799868
epoch 18 

In [22]:
# Predict ratings for the full-feature test split and report the mean
# squared error against the true test ratings.
y_pred = f.predict(test)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(test_mse))

MSE: 1.0451905865379454
