<a href="https://colab.research.google.com/github/MaggiePN92/fastai/blob/master/fastai_chap8_CollaborativeFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Collaborative Filtering Deep Dive

Recommending items and products to users by learning latent factors in the data. 

In [1]:
!pip install fastai --upgrade

Collecting fastai
[?25l  Downloading https://files.pythonhosted.org/packages/5b/53/edf39e15b7ec5e805a0b6f72adbe48497ebcfa009a245eca7044ae9ee1c6/fastai-2.3.0-py3-none-any.whl (193kB)
[K     |█▊                              | 10kB 19.7MB/s eta 0:00:01[K     |███▍                            | 20kB 23.1MB/s eta 0:00:01[K     |█████                           | 30kB 12.0MB/s eta 0:00:01[K     |██████▊                         | 40kB 9.9MB/s eta 0:00:01[K     |████████▌                       | 51kB 7.5MB/s eta 0:00:01[K     |██████████▏                     | 61kB 8.0MB/s eta 0:00:01[K     |███████████▉                    | 71kB 7.9MB/s eta 0:00:01[K     |█████████████▌                  | 81kB 8.2MB/s eta 0:00:01[K     |███████████████▏                | 92kB 8.0MB/s eta 0:00:01[K     |█████████████████               | 102kB 8.2MB/s eta 0:00:01[K     |██████████████████▋             | 112kB 8.2MB/s eta 0:00:01[K     |████████████████████▎           | 122kB 8.2MB/s eta 0:

In [2]:
from fastai.collab import *
from fastai.tabular.all import *

In [3]:
path = untar_data(URLs.ML_100k)

In [4]:
ratings = pd.read_csv(path/'u.data', delimiter="\t", header=None, names=["user", "movie", "rating", "timestamp"])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
#x0 = sci-fi, x1 = action, x2 = old movie
last_skywalker = np.array([0.98, 0.9, -0.9])
#user1 likes scifi, action, but not old movies
user1 = np.array([0.9,0.8,-0.6])
#match is given by dot product of the two arrays:
(user1*last_skywalker).sum()

2.1420000000000003

In [6]:
#representation of casablanca:
casablanca = np.array([-0.99, -0.3, 0.8])
#dot product
(user1*casablanca).sum()

-1.611

#Learning the latent factors; optimizing parameters with SDG
1. Randomly initialize parameters. Also need to consider how many parameters to use.
2. Calculate predictions by doing dot product of each user and each movie. 
3. Calculate loss. 



In [7]:
movies = pd.read_csv(path/"u.item", delimiter="|", encoding='latin-1',
                     usecols=(0,1), names=('movie', 'title'), header=None)

In [8]:
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [9]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [10]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,856,Contact (1997),4
1,23,Snow White and the Seven Dwarfs (1937),4
2,437,"Clockwork Orange, A (1971)",4
3,268,Legends of the Fall (1994),3
4,26,Toy Story (1995),3
5,257,Farewell My Concubine (1993),5
6,32,"Close Shave, A (1995)",3
7,790,"Piano, The (1993)",3
8,390,Seven Years in Tibet (1997),3
9,595,Sabrina (1995),3


In [11]:
n_users = len(dls.classes["user"])
n_movies = len(dls.classes["title"])
n_factors = 5

# We represent all the users as a n_users X n_factors matrix.
# We have set n_factors = 5, this means that there are 5 underlying
# factors that will decide wether the user likes the movie or not
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [12]:
# Instead of looking up in an index we can 
# replace indices with one hot encoded vectors
one_hot_3 = one_hot(3, n_users).float()
one_hot_3

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [13]:
user_factors.t() @ one_hot_3

tensor([-1.4917, -0.3704, -1.4252,  0.0949,  0.3254])

In [14]:
user_factors[3]

tensor([-1.4917, -0.3704, -1.4252,  0.0949,  0.3254])

##Embeddings
In practice one hot encoded matrices are wastefull as they use alot of memory. Instead we use an embedding layer. This lets us index into a vector using an integer. Embeddings also calculates its derivative such that it equals derivatives from one-hot encoded matrices. 

The main purpose of embeddings is to store values of our learnable parameters. These parameters are updated by for example stochastic gradient descent. 

In [15]:
class DotProduct(Module):
  def __init__(self, n_users, n_movies, n_factors):
    self.user_factors = Embedding(n_users, n_factors)
    self.move_factors = Embedding(n_movies, n_factors)

  def forward(self, x):
    ''' x is a tensor of shape batch_size x 2, where
    x[:,0] is user_ids, and x[:,1] is movie_ids'''
    users = self.user_factors(x[:,0])
    movies = self.move_factors(x[:,1])
    return (users * movies).sum(dim=1)

In [16]:
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [17]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [18]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.303441,1.301044,00:07
1,1.102998,1.093542,00:07
2,0.981838,0.980346,00:07
3,0.862055,0.895499,00:07
4,0.80303,0.876709,00:07


In [19]:
# Improve model by squishing preds to be in range 0,5.5
class DotProduct(Module):
  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    self.user_factors = Embedding(n_users, n_factors)
    self.move_factors = Embedding(n_movies, n_factors)
    self.y_range = y_range

  def forward(self, x):
    ''' x is a tensor of shape batch_size x 2, where
    x[:,0] is user_ids, and x[:,1] is movie_ids'''
    users = self.user_factors(x[:,0])
    movies = self.move_factors(x[:,1])
    return sigmoid_range((users * movies).sum(dim=1), *self.y_range)

model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.991631,0.990609,00:07
1,0.878195,0.898601,00:07
2,0.682354,0.85743,00:07
3,0.497177,0.861834,00:07
4,0.372969,0.866411,00:07


In [20]:
# Improve model by including biases. This will acount for factors
# such as some movies being bad, some users being very picky, etc. 
class DotProduct(Module):
  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    self.user_factors = Embedding(n_users, n_factors)
    self.user_bias = Embedding(n_users, 1)
    self.move_factors = Embedding(n_movies, n_factors)
    self.movie_bias = Embedding(n_movies, 1)
    self.y_range = y_range

  def forward(self, x):
    ''' x is a tensor of shape batch_size x 2, where
    x[:,0] is user_ids, and x[:,1] is movie_ids'''
    users = self.user_factors(x[:,0])
    movies = self.move_factors(x[:,1])
    res = (users * movies).sum(dim=1, keepdim=True)
    res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
    return sigmoid_range(res, *self.y_range)

model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.933391,0.926575,00:08
1,0.846912,0.847258,00:08
2,0.612961,0.856949,00:08
3,0.425557,0.880454,00:08
4,0.281849,0.887169,00:08


## Weight Decay/L2 Regularization

Basically just adding the sum of the squared weights to the loss function. This will encourage your weights to be as small as possible. This will in turn prevent overfitting because smaller weights allow for less complex function. A complex function will learn every nook and cranny of the training data, in other words, overfit. 

In code loss will look like this: 
```
  loss_with_wd = loss + wd * (parameters**2).sum()
```
But in practice this will inefficient and maybe numerically unstable to compute, instead we do:
```
  parameters.grad += wd * 2 * parameters
```
We could also just drop the 2, and make wd twice as big. 

In [21]:
#how to use wd in fastai
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.945662,0.930266,00:08
1,0.850719,0.864835,00:08
2,0.751393,0.825384,00:08
3,0.588174,0.811431,00:08
4,0.494911,0.812531,00:08


## Creating our own embedding module

In [22]:
#without parameter method:
class T(Module):
  def __init__(self): self.a = torch.ones(3)

L(T().parameters())

(#0) []

In [23]:
#with parameter method:
class T(Module):
  def __init__(self): self.a = nn.Parameter(torch.ones(3))

L(T().parameters())

(#1) [Parameter containing:
tensor([1., 1., 1.], requires_grad=True)]

In [24]:
class T(Module):
  def __init__(self): self.a = nn.Linear(1, 3, bias=False)

t = T()
L(t.parameters())

(#1) [Parameter containing:
tensor([[ 0.1355],
        [-0.5475],
        [-0.6349]], requires_grad=True)]

In [25]:
def create_params(size):
  return nn.Parameter(torch.zeros(*size).normal_(0, 0.01))

In [26]:
class DotProductBias(Module):
  def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
    self.user_factors = create_params([n_users, n_factors])
    self.user_bias = create_params([n_users])
    self.movie_factors = create_params([n_movies, n_factors])
    self.movie_bias = create_params([n_movies])
    self.y_range = y_range

  def forward(self, x):
    users = self.user_factors[x[:, 0]]
    movies = self.movie_factors[x[:,1]]
    res = (users*movies).sum(dim=1)
    res += self.user_bias[x[:,0]] + self.movie_bias[x[:,1]]
    return sigmoid_range(res, *self.y_range)

In [27]:
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.966748,0.941589,00:08
1,0.854382,0.86417,00:08
2,0.72963,0.824773,00:08
3,0.566922,0.810274,00:08
4,0.480777,0.811169,00:08


## Interpreting Embeddings and biases

In [28]:
# Easiest to interpret biases
# Five least popular movies based on biases
movie_bias = learn.model.movie_bias.squeeze()
idxs = movie_bias.argsort()[:5]
[dls.classes["title"][i] for i in idxs]

['Children of the Corn: The Gathering (1996)',
 'Crow: City of Angels, The (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Vampire in Brooklyn (1995)',
 'Mortal Kombat: Annihilation (1997)']

In [29]:
# Easiest to interpret biases
# Five most popular movies based on biases
movie_bias = learn.model.movie_bias.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes["title"][i] for i in idxs]

['Titanic (1997)',
 'Silence of the Lambs, The (1991)',
 'Good Will Hunting (1997)',
 'Star Wars (1977)',
 'Shawshank Redemption, The (1994)']

In [30]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.932994,0.940473,00:08
1,0.855167,0.870247,00:08
2,0.744216,0.824168,00:08
3,0.581505,0.806946,00:08
4,0.510952,0.807026,00:08


In [31]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [32]:
# Easiest to interpret biases
# Five most popular movies based on biases
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes["title"][i] for i in idxs]

['Titanic (1997)',
 "Schindler's List (1993)",
 'Silence of the Lambs, The (1991)',
 'Star Wars (1977)',
 'L.A. Confidential (1997)']

In [33]:
# Finding movie similar to Silence of the lambs
movie_factors = learn.model.i_weight.weight
idx = dls.classes["title"].o2i["Silence of the lambs, The (1991)"]
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
idx = distances.argsort(descending=True)[1]
dls.classes["title"][idx]

'Perez Family, The (1995)'

## Deep learning for Collaborative Filtering

In [34]:
# Finds recommended embedding sizes with heuristics
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [40]:
class CollabNN(Module):
  def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
    self.user_factors = Embedding(*user_sz)
    self.item_factors = Embedding(*item_sz)
    self.layers = nn.Sequential(
        # in-features=74*102, out-features=100
        nn.Linear(user_sz[1]+item_sz[1], n_act),
        nn.ReLU(),
        nn.Linear(n_act, 1)
    )
    self.y_range = y_range

  def forward(self, x):
    embs = self.user_factors(x[:,0]), self.item_factors(x[:,1])
    x = self.layers(torch.cat(embs, dim=1))
    return sigmoid_range(x, *self.y_range)

In [41]:
model = CollabNN(*embs)

In [42]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.940103,0.942958,00:08
1,0.887098,0.89397,00:08
2,0.863678,0.867159,00:08
3,0.802532,0.85911,00:08
4,0.754143,0.859439,00:08


In [48]:
learn = collab_learner(dls, use_nn=True, y_range=(0,5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.979215,0.954394,00:10
1,0.918499,0.892874,00:10
2,0.856404,0.868337,00:10
3,0.809979,0.854968,00:10
4,0.738982,0.8622,00:10


In [47]:
n_users

944