In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k

In [14]:
df = pd.read_csv('ratings.csv')

# EDA

In [12]:
df['movieId'].value_counts()[:5]

356    341
296    324
318    311
593    304
260    291
Name: movieId, dtype: int64

In [24]:
df['rating'].value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [25]:
# average amount of movies rated by user

df.groupby('userId')['movieId'].count().mean()

149.03725782414307

In [17]:
dff = pd.read_csv('movies.csv')

In [18]:
dff

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy


In [26]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [29]:
ratings = np.array(df['rating'])
users = np.array(df['userId'])
items = np.array(df['movieId'])

In [31]:
# Encoders
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# preparation for the csr matrix
u = user_encoder.fit_transform(users)
i = item_encoder.fit_transform(items)
lu = len(np.unique(u))
li = len(np.unique(i))

In [33]:
interactions = csr_matrix((ratings, (u, i)), shape=(lu, li))

In [35]:
interactions.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [39]:
from lightfm.cross_validation import random_train_test_split

In [40]:
train, test = random_train_test_split(interactions, test_percentage=0.2)

In [47]:
model = LightFM(loss='warp')
model.fit(train, epochs=500)

<lightfm.lightfm.LightFM at 0x7f018fa85d10>

In [48]:
auc_score(model, test).mean()

0.91882855