In [1]:
from fastai.collab import *
from fastai.tabular import *

## Collaborative filtering example

`collab` models use data in a `DataFrame` of user, items, and ratings.

In [2]:
user,item,title = 'userId','movieId','title'

In [3]:
path = untar_data(URLs.ML_SAMPLE)
path

WindowsPath('C:/Users/Gerst/.fastai/data/movie_lens_sample')

In [4]:
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

   userId  movieId  rating   timestamp
0      73     1097     4.0  1255504951
1     561      924     3.5  1172695223
2     157      260     3.5  1291598691
3     358     1210     5.0   957481884
4     130      316     2.0  1138999234

That's all we need to create and train a model:

In [5]:
data = CollabDataBunch.from_df(ratings, seed=42)

In [6]:
y_range = [0,5.5]

In [7]:
learn = collab_learner(data, n_factors=50, y_range=y_range)

In [8]:
learn.fit_one_cycle(3, 5e-3)

epoch,train_loss,valid_loss
1,1.590603,0.932732
2,0.852269,0.679162
3,0.652138,0.666001


## Movielens 100k

Let's try with the full Movielens 100k data dataset, available from http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [9]:
path=Config.data_path()/'ml-100k'

In [10]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=[user,item,'rating','timestamp'])
ratings.head()

FileNotFoundError: [Errno 2] File b'C:\\Users\\Gerst\\.fastai\\data\\ml-100k\\u.data' does not exist: b'C:\\Users\\Gerst\\.fastai\\data\\ml-100k\\u.data'

In [None]:
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1', header=None,
                    names=[item, 'title', 'date', 'N', 'url', *[f'g{i}' for i in range(19)]])
movies.head()

In [None]:
len(ratings)

In [None]:
rating_movie = ratings.merge(movies[[item, title]])
rating_movie.head()

In [None]:
data = CollabDataBunch.from_df(rating_movie, seed=42, pct_val=0.1, item_name=title)

In [None]:
data.show_batch()

In [None]:
y_range = [0,5.5]

In [None]:
learn = collab_learner(data, n_factors=40, y_range=y_range, wd=1e-1)

In [None]:
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [None]:
learn.fit_one_cycle(5, 5e-3)

In [None]:
learn.save('dotprod')

Here's [some benchmarks](https://www.librec.net/release/v1.3/example.html) on the same dataset for the popular Librec system for collaborative filtering. They show best results based on RMSE of 0.91, which corresponds to an MSE of `0.91**2 = 0.83`.

## Interpretation

### Setup

In [None]:
learn.load('dotprod');

In [None]:
learn.model

In [None]:
g = rating_movie.groupby(title)['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_movies[:10]

### Movie bias

In [None]:
movie_bias = learn.bias(top_movies, is_item=True)
movie_bias.shape

In [None]:
mean_ratings = rating_movie.groupby(title)['rating'].mean()
movie_ratings = [(b, i, mean_ratings.loc[i]) for i,b in zip(top_movies,movie_bias)]

In [None]:
item0 = lambda o:o[0]

In [None]:
sorted(movie_ratings, key=item0)[:15]

In [None]:
sorted(movie_ratings, key=lambda o: o[0], reverse=True)[:15]

### Movie weights

In [None]:
movie_w = learn.weight(top_movies, is_item=True)
movie_w.shape

In [None]:
movie_pca = movie_w.pca(3)
movie_pca.shape

In [None]:
fac0,fac1,fac2 = movie_pca.t()
movie_comp = [(f, i) for f,i in zip(fac0, top_movies)]

In [None]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
movie_comp = [(f, i) for f,i in zip(fac1, top_movies)]

In [None]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
idxs = np.random.choice(len(top_movies), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()