In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *
from fastai.column_data import *

ModuleNotFoundError: No module named 'fastai.learner'

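Data: the MovieLens "latest-small" dataset — http://files.grouplens.org/datasets/movielens/ml-latest-small.zip. Download and unzip it so that `ratings.csv` and `movies.csv` sit under `data/movie-lens/`.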

In [None]:
# Directory the notebook reads ratings.csv and movies.csv from.
path = "data/movie-lens/"

In [None]:
# Load the ratings table (later cells use its userId, movieId, rating and
# timestamp columns) and preview the first rows.
ratings = pd.read_csv(os.path.join(path, "ratings.csv"))
ratings.head()

In [None]:
# Load movies.csv and preview the first rows.
movies = pd.read_csv(os.path.join(path, "movies.csv"))
movies.head()

In [None]:
# Row indices held out for validation. get_cv_idxs comes from the fastai
# star-import; presumably it draws a random subset of range(len(ratings)) —
# TODO confirm the split fraction and seed in the installed fastai version.
val_idxs = get_cv_idxs(len(ratings))

## Collaborative filtering from scratch

### dot product example

In [None]:
# Two small 2x2 examples used to illustrate a row-wise dot product.
# T is presumably fastai's tensor constructor — TODO confirm.
a = T([[1,2],[3,4]])
b = T([[2,2],[10,10]])
a, b

In [None]:
# Element-wise product of the two example tensors.
a*b

In [None]:
# Summing the element-wise product over dim 1 gives one dot product per row.
(a*b).sum(1)

In [None]:
class DotProduct(nn.Module):
    """Parameter-free module that computes a dot product per row of its two inputs."""

    def forward(self, u, m):
        """
        :param u: first batch of vectors, one per row
        :param m: second batch of vectors, same shape as ``u``
        :return: for each row, the sum over dim 1 of the element-wise product
        """
        elementwise = u * m
        return elementwise.sum(1)

In [None]:
# Instantiate the toy module; DotProduct defines no parameters of its own.
model = DotProduct()

In [None]:
# Calling the module runs DotProduct.forward(a, b), i.e. (a*b).sum(1).
model(a, b)

### dot product collaborative filtering

In [None]:
# Replace raw user ids with contiguous positions 0..n_users-1, numbered in
# order of first appearance, so they can index embedding rows directly.
u_uniq = ratings.userId.unique()
user2idx = dict(zip(u_uniq, range(len(u_uniq))))
ratings.userId = ratings.userId.map(user2idx)

# Same re-indexing for movie ids.
m_uniq = ratings.movieId.unique()
movie2idx = dict(zip(m_uniq, range(len(m_uniq))))
ratings.movieId = ratings.movieId.map(movie2idx)

# Number of distinct users / movies; used as the embedding table sizes.
n_users  = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()

In [None]:
class EmbeddingDot(nn.Module):
    """Collaborative-filtering model whose prediction is the dot product of a
    user embedding and a movie embedding."""

    def __init__(self, n_users, n_movies, n_factors):
        """
        :param n_users: number of unique users (rows of the user embedding)
        :type  n_users: int

        :param n_movies: number of unique movies (rows of the movie embedding)
        :type  n_movies: int

        :param n_factors: embedding width shared by users and movies
        :type  n_factors: int
        """
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        # Replace the default initialisation with small non-negative values.
        for table in (self.u, self.m):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, cats, conts):
        """
        :param cats: 2-D integer index tensor; column 0 holds user indices and
                     column 1 holds movie indices
        :param conts: continuous inputs; unused here, accepted only because the
                      data generator supplies it
        :return: one dot product per row of ``cats``
        """
        user_vecs = self.u(cats[:, 0])
        movie_vecs = self.m(cats[:, 1])
        return (user_vecs * movie_vecs).sum(1)

In [None]:
# Model inputs: every ratings column except the target and the timestamp.
x = ratings.drop(['rating', 'timestamp'], axis=1)
# Regression target: the rating, cast to float32.
y = ratings['rating'].astype(np.float32)

In [None]:
# Build the fastai data object from the frame, holding out val_idxs for
# validation. userId and movieId are passed as the categorical fields;
# 64 is presumably the batch size — TODO confirm against
# ColumnarModelData.from_data_frame in the installed fastai version.
data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)

In [None]:
# Embedding width used for both users and movies.
n_factors = 40
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = EmbeddingDot(n_users, n_movies, n_factors).cuda()

In [None]:
# Weight decay and initial learning rate for the plain dot-product model.
wd = 1e-5
lr = .1
# SGD with momentum over all model parameters.
opt = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=.9)

In [None]:
# Train with MSE loss using fastai's fit loop; the 3 is presumably the number
# of epochs — TODO confirm against fastai's fit signature.
fit(model, data, 3, opt, F.mse_loss)

In [None]:
# Lower the optimizer's learning rate to 0.01 and continue training the same
# model (weights carry over from the previous fit call).
set_lrs(opt, 0.01)
fit(model, data, 3, opt, F.mse_loss)

### Add bias

In [None]:
# Observed rating range; later models use it to rescale sigmoid outputs.
min_rating, max_rating = ratings.rating.min(), ratings.rating.max()
min_rating, max_rating

In [None]:
def get_embed(ni, nf):
    """Create an embedding table with small, zero-centred initial weights.

    :param ni: number of unique indices (rows of the table)
    :param nf: number of factors (width of each embedding vector)
    :return: an ``nn.Embedding`` whose weights are drawn uniformly from
             [-0.02, 0.02]
    """
    layer = nn.Embedding(num_embeddings=ni, embedding_dim=nf)
    layer.weight.data.uniform_(-0.02, 0.02)
    return layer

In [None]:
class EmbeddingDotBias(nn.Module):
    """Dot-product collaborative filtering with per-user and per-item biases.

    The raw score (user . item + user bias + item bias) is passed through a
    sigmoid and rescaled to lie between ``min_rating`` and ``max_rating``.
    """

    def __init__(self, n_users, n_items, n_factors, min_rating, max_rating):
        """
        :param n_users: number of unique user indices
        :param n_items: number of unique item indices
        :param n_factors: embedding width shared by users and items
        :param min_rating: lower bound of the predicted rating range
        :param max_rating: upper bound of the predicted rating range
        """
        super().__init__()
        self.min_rating = min_rating
        self.max_rating = max_rating
        # Factor tables have n_factors columns; bias tables have a single column.
        embeds = [(n_users, n_factors), (n_users,1), (n_items, n_factors), (n_items,1)]
        (self.u, self.ub, self.m, self.mb) = [get_embed(*e) for e in embeds]

    def forward(self, cats, _):
        """
        :param cats: 2-D integer index tensor; column 0 holds user indices and
                     column 1 holds item indices
        :param _: continuous inputs, ignored
        :return: one predicted rating per row of ``cats``
        """
        users,items = cats[:,0],cats[:,1]
        res = (self.u(users) * self.m(items)).sum(1)
        # Bias lookups have a trailing dimension of size 1; drop exactly that
        # dimension so the biases line up with the per-row dot products.
        res = res + self.ub(users).squeeze(1) + self.mb(items).squeeze(1)
        # Scale with the bounds given to the constructor rather than with
        # notebook-level globals, so the model honours its own arguments.
        res = F.sigmoid(res) * (self.max_rating - self.min_rating) + self.min_rating
        return res

In [None]:
# Weight decay for the bias model (2e-4, larger than the 1e-5 used for the
# plain dot-product model).
wd=2e-4
# Fresh model with bias terms on the GPU, plus a new SGD optimizer
# (learning rate 1e-1, momentum 0.9) over its parameters.
model = EmbeddingDotBias(n_users, n_movies, n_factors, min_rating, max_rating).cuda()
opt = optim.SGD(model.parameters(), 1e-1, weight_decay=wd, momentum=0.9)

In [None]:
# Train the bias model with MSE loss; the 3 is presumably the number of
# epochs — TODO confirm against fastai's fit signature.
fit(model, data, 3, opt, F.mse_loss)

### Neural Network



In [None]:
class EmbeddingNet(nn.Module):
    """Neural-network collaborative filtering model.

    User and item embeddings are concatenated and fed through one hidden
    layer; per-user and per-item biases are added to the output, which is then
    squashed with a sigmoid and rescaled to the rating range widened by 0.5 on
    each side.
    """

    def __init__(self, n_users, n_items, n_factors, nh=10, p1=0.5, p2=0.5,
                 min_rating=None, max_rating=None):
        """
        :param n_users: number of unique user indices
        :param n_items: number of unique item indices
        :param n_factors: embedding width shared by users and items
        :param nh: number of hidden units
        :param p1: dropout probability applied to the concatenated embeddings
        :param p2: dropout probability applied after the hidden layer
        :param min_rating: lower bound of the rating range; when None, the
                           module-level ``min_rating`` is looked up at forward time
        :param max_rating: upper bound of the rating range; when None, the
                           module-level ``max_rating`` is looked up at forward time
        """
        super().__init__()
        self.min_rating = min_rating
        self.max_rating = max_rating
        # Factor tables have n_factors columns; bias tables have a single column.
        embeds = [(n_users, n_factors), (n_users,1), (n_items, n_factors), (n_items,1)]
        (self.u, self.ub, self.m, self.mb) = [get_embed(*e) for e in embeds]
        self.lin1 = nn.Linear(n_factors*2, nh)
        self.lin2 = nn.Linear(nh, 1)
        self.drop1 = nn.Dropout(p1)
        self.drop2 = nn.Dropout(p2)

    def _rating_bounds(self):
        """Return (low, high), preferring constructor values over module-level globals."""
        low = min_rating if self.min_rating is None else self.min_rating
        high = max_rating if self.max_rating is None else self.max_rating
        return low, high

    def forward(self, cats, conts):
        """
        :param cats: 2-D integer index tensor; column 0 holds user indices and
                     column 1 holds item indices
        :param conts: continuous inputs, unused
        :return: predicted ratings with a trailing dimension of size 1
        """
        users,items = cats[:,0],cats[:,1]
        x = self.drop1(torch.cat([self.u(users),self.m(items)], dim=1))
        x = self.drop2(F.relu(self.lin1(x)))
        x = self.lin2(x) + self.ub(users) + self.mb(items)
        low, high = self._rating_bounds()
        # Output spans (low - 0.5, high + 0.5) so the extreme ratings are
        # reachable without saturating the sigmoid.
        return F.sigmoid(x) * (high-low+1) + low-0.5

In [None]:
# Weight decay for the neural-network model.
wd=1e-6
# EmbeddingNet is built without explicit rating bounds here, so its forward
# pass relies on the notebook-level min_rating / max_rating being defined.
model = EmbeddingNet(n_users, n_movies, n_factors=40, nh=10).cuda()
# Adam with learning rate 1e-3 over all model parameters.
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)

In [None]:
# Train the neural-network model with MSE loss; the 3 is presumably the
# number of epochs — TODO confirm against fastai's fit signature.
fit(model, data, 3, opt, F.mse_loss)

In [None]:
# Lower the learning rate to 1e-4 and continue training the same model.
set_lrs(opt, 1e-4)
fit(model, data, 3, opt, F.mse_loss)

In [None]:
# Square root of 0.77328, i.e. the RMSE if that number is an MSE. The value is
# hard-coded — presumably copied from a validation loss printed by a training
# run; update it after re-training.
np.sqrt(0.77328)