In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *
from fastai.column_data import *

## Data: 

http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [2]:
# Directory holding the MovieLens CSV files (ratings.csv, movies.csv) read below.
path = "data/movie-lens/"

In [3]:
# Load the ratings table (userId, movieId, rating, timestamp) and preview it.
ratings = pd.read_csv(os.path.join(path, "ratings.csv"))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Load the movie metadata table (movieId, title, genres) and preview it.
movies = pd.read_csv(os.path.join(path, "movies.csv"))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Row indices of `ratings` reserved for validation.
# NOTE(review): presumably a random subset chosen by fastai's get_cv_idxs;
# confirm the default fraction and seed against the fastai source.
val_idxs = get_cv_idxs(len(ratings))

## Collaborative filtering from scratch

### dot product example

In [6]:
# Two small 2x2 tensors used to illustrate a row-wise dot product.
a = T([[1,2],[3,4]])
b = T([[2,2],[10,10]])
a, b

(
  1  2
  3  4
 [torch.cuda.LongTensor of size 2x2 (GPU 0)], 
   2   2
  10  10
 [torch.cuda.LongTensor of size 2x2 (GPU 0)])

In [7]:
# Element-wise product of the two tensors.
a*b


  2   4
 30  40
[torch.cuda.LongTensor of size 2x2 (GPU 0)]

In [8]:
# Summing the element-wise product over dim 1 yields one dot product per row.
(a*b).sum(1)


  6
 70
[torch.cuda.LongTensor of size 2 (GPU 0)]

In [9]:
class DotProduct(nn.Module):
    """Parameter-free module that computes a dot product for each pair of rows."""

    def forward(self, u, m):
        """
        :param u: first batch of vectors, one vector per row
        :param m: second batch of vectors, one vector per row
        :return: 1-D tensor with the dot product of each row of ``u`` and ``m``
        """
        elementwise = u * m
        return elementwise.sum(dim=1)

In [10]:
# Instantiate the parameter-free dot-product module.
model = DotProduct()

In [11]:
# Calling the module runs forward(); the result matches (a*b).sum(1).
model(a, b)


  6
 70
[torch.cuda.LongTensor of size 2 (GPU 0)]

### dot product collaborative filtering

In [12]:
# Re-label raw user ids as contiguous integers 0..n_users-1 (in order of first
# appearance) so they can be used directly as embedding row indices.
u_uniq = ratings["userId"].unique()
user2idx = {raw_id: pos for pos, raw_id in enumerate(u_uniq)}
ratings["userId"] = ratings["userId"].map(user2idx)

# Same contiguous re-labelling for movie ids.
m_uniq = ratings["movieId"].unique()
movie2idx = {raw_id: pos for pos, raw_id in enumerate(m_uniq)}
ratings["movieId"] = ratings["movieId"].map(movie2idx)

# Sizes of the two embedding tables.
n_users = ratings["userId"].nunique()
n_movies = ratings["movieId"].nunique()

In [13]:
class EmbeddingDot(nn.Module):
    """
    Collaborative-filtering model whose prediction is the dot product of a
    learned user embedding and a learned movie embedding.
    """

    def __init__(self, n_users, n_movies, n_factors):
        """
        :param n_users: number of unique users
        :type  n_users: int

        :param n_movies: number of unique movies
        :type  n_movies: int

        :param n_factors: length of each user / movie embedding vector
        :type  n_factors: int
        """
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        # Replace the default initialisation with small non-negative weights.
        for table in (self.u, self.m):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, cats, conts):
        """
        :param cats: 2-D index tensor; column 0 holds user indices and
                     column 1 holds movie indices
        :param conts: continuous inputs; unused here, accepted only because
                      the data generator supplies them
        :return: 1-D tensor with one predicted score per row of ``cats``
        """
        user_ids = cats[:, 0]
        movie_ids = cats[:, 1]
        user_vecs = self.u(user_ids)
        movie_vecs = self.m(movie_ids)
        return (user_vecs * movie_vecs).sum(dim=1)

In [14]:
# Inputs: every column except the target and the timestamp.
x = ratings.drop(['rating', 'timestamp'], axis=1)
# Target: the rating, cast to float32.
y = ratings['rating'].astype(np.float32)

In [15]:
# Wrap the frames in fastai's columnar model data; userId and movieId are
# listed as the categorical columns.
# NOTE(review): 64 is presumably the batch size — confirm against
# ColumnarModelData.from_data_frame.
data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], 64)

In [16]:
# Embedding width shared by the user and movie tables.
n_factors = 40
# Build the dot-product model and move it to the GPU.
model = EmbeddingDot(n_users, n_movies, n_factors).cuda()

In [17]:
# Optimizer hyper-parameters: weight decay and learning rate.
wd = 1e-5
lr = .1
# Plain SGD with momentum over all model parameters.
opt = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=.9)

In [18]:
# Train for 3 epochs with mean-squared-error loss.
fit(model, data, 3, opt, F.mse_loss)

epoch      trn_loss   val_loss                                 
    0      1.79038    1.665684  
    1      1.124391   1.321408                                 
    2      0.907507   1.240982                                  



[1.2409823]

In [19]:
# Lower the learning rate to 0.01 and train 3 more epochs with the same optimizer.
set_lrs(opt, 0.01)
fit(model, data, 3, opt, F.mse_loss)

epoch      trn_loss   val_loss                                  
    0      0.728844   1.160673  
    1      0.699707   1.1492                                    
    2      0.703547   1.141901                                  



[1.1419011]

### Add bias

In [20]:
# Observed rating bounds; used later to scale the sigmoid output of the model.
min_rating, max_rating = ratings.rating.min(), ratings.rating.max()
min_rating, max_rating

(0.5, 5.0)

In [21]:
def get_embed(ni, nf):
    """
    Build an embedding table with small, zero-centred uniform weights.

    :param ni: number of unique indices (rows of the table)
    :param nf: number of factors (columns of the table)
    :return: ``nn.Embedding`` whose weights are drawn from U(-0.02, 0.02)
    """
    init_bound = 0.02
    layer = nn.Embedding(ni, nf)
    layer.weight.data.uniform_(-init_bound, init_bound)
    return layer

In [22]:
class EmbeddingDotBias(nn.Module):
    """
    Dot-product collaborative-filtering model with per-user and per-item
    bias terms, whose output is squashed into ``[min_rating, max_rating]``.
    """

    def __init__(self, n_users, n_items, n_factors, min_rating, max_rating):
        """
        :param n_users: number of unique users
        :param n_items: number of unique items
        :param n_factors: length of each user / item embedding vector
        :param min_rating: lower bound of the predicted rating range
        :param max_rating: upper bound of the predicted rating range
        """
        super().__init__()
        self.min_rating = min_rating
        self.max_rating = max_rating
        # (rows, cols) for: user factors, user bias, item factors, item bias.
        embeds = [(n_users, n_factors), (n_users, 1), (n_items, n_factors), (n_items, 1)]
        (self.u, self.ub, self.m, self.mb) = [get_embed(*e) for e in embeds]

    def forward(self, cats, _):
        """
        :param cats: 2-D index tensor; column 0 holds user indices and
                     column 1 holds item indices
        :param _: continuous inputs, ignored
        :return: 1-D tensor with one rating prediction per row of ``cats``
        """
        users, items = cats[:, 0], cats[:, 1]
        res = (self.u(users) * self.m(items)).sum(1)
        # Bias lookups have shape (batch, 1); drop only that trailing dimension
        # so a single-row batch keeps its batch dimension.
        res = res + self.ub(users).squeeze(1) + self.mb(items).squeeze(1)
        # Use the bounds stored on the instance rather than notebook globals,
        # so the constructor arguments actually control the output range.
        rating_span = self.max_rating - self.min_rating
        return F.sigmoid(res) * rating_span + self.min_rating

In [23]:
# Weight decay for the bias model.
wd=2e-4
# Fresh bias-aware model on the GPU, bounded by the observed rating range.
model = EmbeddingDotBias(n_users, n_movies, n_factors, min_rating, max_rating).cuda()
# New SGD optimizer (learning rate 1e-1, momentum 0.9) bound to the new model's parameters.
opt = optim.SGD(model.parameters(), 1e-1, weight_decay=wd, momentum=0.9)

In [24]:
# Train the bias model for 3 epochs with mean-squared-error loss.
fit(model, data, 3, opt, F.mse_loss)

epoch      trn_loss   val_loss                                  
    0      0.845015   0.835147  
    1      0.843948   0.819057                                  
    2      0.770014   0.809659                                  



[0.8096589]