# Checking embeddings
In this notebook we check whether the embeddings learned by the model are meaningful.

Steps:  
1. Import data
1. Import model
1. Instantiate the model
1. Load model weights
1. Get embeddings from the "Embedding" layer
1. Find similar movies (dot, Cosine similarity, knn)

## Data

In [1]:
import pandas as pd

In [240]:
# Where the MovieLens dump lives on disk
workspace_path = 'data/movielens'
dataset_name = 'ml-latest'

# Both tables are read from the same dataset folder
dataset_dir = f"{workspace_path}/{dataset_name}"
ratings = pd.read_csv(f"{dataset_dir}/ratings.csv")
movies = pd.read_csv(f"{dataset_dir}/movies.csv")

# Number of distinct ids; these become the embedding table sizes of the model.
# Users are counted from the ratings table, movies from the movies table.
user_count = ratings["userId"].nunique()
movie_count = movies["movieId"].nunique()

In [241]:
import numpy as np

# Inputs are the (user id, movie id) pairs, the label is the rating cast to float32
X = ratings[['userId','movieId']]
Y = ratings['rating'].astype(np.float32)

from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings; the fixed seed makes the split repeatable
random_state = 7
test_size = 0.2

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=random_state
)
datasets = {'train': (X_train, Y_train), 'test': (X_test, Y_test)}

# Report the absolute size of each split and its share of the full dataset
total = len(X)
for label, part in (('Training', X_train), ('Test', X_test)):
    print(f'{label} dataset sample size: {len(part):,} positive samples ({len(part)/total*100:.0f}%)')
print(f'Total dataset sample size: {total:,} positive samples (100%)')

Training dataset sample size: 22,202,755 positive samples (80%)
Test dataset sample size: 5,550,689 positive samples (20%)
Total dataset sample size: 27,753,444 positive samples (100%)


## Model

In [242]:
import torch
from torch import nn

class NeuralColabFilteringNet(nn.Module):
    """
    Neural Collaborative Filtering (NCF) model: a user embedding and a movie
    embedding are concatenated and passed through an MLP that ends in a sigmoid.

    Args:
    user_count(int): Number of rows in the user embedding table; user ids are
        mapped to a row with ``id % user_count``
    movie_count(int): Number of rows in the movie embedding table; movie ids are
        mapped to a row with ``id % movie_count``
    embedding_size(int)[Optional]: Width of each embedding vector, defaults to 32
    hidden_layers(tuple)[Optional]: Widths of the MLP layers. The first entry is
        the MLP input width and must equal ``2 * embedding_size``; defaults to (64,32,16,8)
    dropout_rate(float)[Optional]: Dropout probability applied to the concatenated
        embeddings and after every hidden ReLU; no dropout when falsy, defaults to None
    output_range(tuple)[Optional]: Pair of rating bounds, defaults to (1, 5).
        The sigmoid output is scaled by ``abs(a - b) + 1`` and shifted by
        ``min(a, b)``, so with the default the result lies in the open interval (1, 6)
    """
    def __init__(self,
                 user_count,
                 movie_count,
                 embedding_size=32,
                 hidden_layers=(64, 32, 16, 8),
                 dropout_rate=None,
                 output_range=(1, 5)):
        super().__init__()

        # Sizes used by forward() to fold arbitrary ids into valid table rows
        self.user_hash_size = user_count
        self.movie_hash_size = movie_count

        # Sub-modules (their attribute names are the prefixes of the state-dict keys)
        self.user_embedding = nn.Embedding(user_count, embedding_size)
        self.movie_embedding = nn.Embedding(movie_count, embedding_size)
        self.MLP = self._gen_MLP(embedding_size, hidden_layers, dropout_rate)
        if dropout_rate:
            self.dropout = nn.Dropout(dropout_rate)

        # Scale/shift applied to the sigmoid output
        assert output_range and len(output_range) == 2, "output_range has to be a tuple with two integers"
        first_bound, second_bound = output_range
        self.norm_min = min(first_bound, second_bound)
        self.norm_range = abs(first_bound - second_bound) + 1

        self._init_params()

    def _gen_MLP(self, embedding_size, hidden_layers_units, dropout_rate):
        "Builds the MLP: Linear+ReLU(+Dropout) per consecutive width pair, then Linear(…, 1)+Sigmoid"

        assert (embedding_size * 2) == hidden_layers_units[0], "First input layer number of units has to be equal to twice the embedding size!"

        stack = []
        # Walk over consecutive (input width, output width) pairs
        for fan_in, fan_out in zip(hidden_layers_units[:-1], hidden_layers_units[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
            if dropout_rate:
                stack.append(nn.Dropout(dropout_rate))

        # Single-unit head squashed into (0, 1)
        stack += [nn.Linear(hidden_layers_units[-1], 1), nn.Sigmoid()]
        return nn.Sequential(*stack)

    def _init_params(self):
        "Re-initializes the embeddings (uniform) and the Linear layers (Xavier weights, 0.01 bias)"
        for table in (self.user_embedding, self.movie_embedding):
            table.weight.data.uniform_(-0.05, 0.05)

        for layer in self.MLP.modules():
            if type(layer) is nn.Linear:
                torch.nn.init.xavier_uniform_(layer.weight)
                layer.bias.data.fill_(0.01)

    def forward(self, user_id, movie_id):
        "Returns the scaled prediction for each (user_id, movie_id) pair"
        # Fold ids into the table range, then look up both embeddings
        user_rows = user_id % self.user_hash_size
        movie_rows = movie_id % self.movie_hash_size
        features = torch.cat(
            [self.user_embedding(user_rows), self.movie_embedding(movie_rows)],
            dim=1,
        )
        # The input dropout only exists when a dropout rate was configured
        if hasattr(self, 'dropout'):
            features = self.dropout(features)
        return self.MLP(features) * self.norm_range + self.norm_min

In [243]:
# Instantiate the NCF model. Its weights are freshly initialized here;
# the trained weights are loaded from a checkpoint further down.

n1 = 8 # 2^8 = 256
n2 = 4 # 2^4 = 16

# MLP widths are the powers of two from 2**n1 down to 2**n2
hidden_layers = [2 ** exponent for exponent in reversed(range(n2, n1 + 1))]

# Each embedding is half as wide as the first MLP layer (two are concatenated)
embedding_size = 2 ** (n1 - 1)

ncf = NeuralColabFilteringNet(
    user_count,
    movie_count,
    embedding_size=embedding_size,
    hidden_layers=hidden_layers,
    dropout_rate=0.2,
)

print(f"Our model architecture:\n\n{ncf}\n")

# Let's look at the model size (4 bytes per parameter)
num_params = 0
for param in ncf.parameters():
    num_params += param.numel()
size_mb = num_params*4/(1024**2)
print(f'Number of model parameters: {num_params:,}, model training size: {size_mb:.2f} MB')

Our model architecture:

NeuralColabFilteringNet(
  (user_embedding): Embedding(283228, 128)
  (movie_embedding): Embedding(58098, 128)
  (MLP): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=32, out_features=16, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=16, out_features=1, bias=True)
    (13): Sigmoid()
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

Number of model parameters: 43,733,505, model training size: 166.83 MB


In [244]:
# We define an iterator to go over the dataset in batches
import numpy as np
import math

class DatasetBatchIterator:
    """
    Iterates over a labeled dataset in batches.

    Args:
    X: Array-like of integer features, one row per sample
    Y: Array-like of float labels, aligned with the rows of X
    batch_size(int): Maximum number of samples per batch (the last batch may be smaller)
    shuffle(bool)[Optional]: Shuffle the samples once at construction, defaults to True

    Each iteration yields ``(batch_number, X_batch, Y_batch)`` where batch_number
    starts at 1, X_batch is a LongTensor and Y_batch is a FloatTensor of shape (n, 1).
    """
    def __init__(self, X, Y, batch_size, shuffle=True):
        self.X = np.asarray(X)
        self.Y = np.asarray(Y)

        if shuffle:
            # Apply ONE permutation to the stored arrays so every row keeps its label.
            # (Previously only the local names X/Y were rebound, so the data that
            # next() reads from self.X/self.Y was never actually shuffled.)
            index = np.random.permutation(self.X.shape[0])
            self.X = self.X[index]
            self.Y = self.Y[index]

        self.batch_size = batch_size
        self.n_batches = int(math.ceil(self.X.shape[0] / batch_size))
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        "Returns the next (batch_number, X_batch, Y_batch) triple or raises StopIteration"
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        X_batch = torch.LongTensor(self.X[k*bs:(k + 1)*bs])
        Y_batch = torch.FloatTensor(self.Y[k*bs:(k + 1)*bs])

        # Labels are returned as a column vector
        return self._current, X_batch, Y_batch.view(-1, 1)

## Load model weights

In [245]:
import os

# Make sure the checkpoint folder exists. os.makedirs is the portable,
# pure-Python replacement for the `!mkdir` shell escape and is a no-op when
# the folder is already there.
os.makedirs("model_weights", exist_ok=True)

# weights_path = os.getcwd() + "/model_weights/NCF_Drop_32emb_64lin.pt"

weights_path = os.getcwd() + "/model_weights/NCF_Drop_128emb_256_16lin.pt"
# # Save model weights
# torch.save(min_loss_model_weights, weights_path)

# Load model weights.
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; the model in this notebook is built on the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
min_loss_model_weights = torch.load(weights_path, map_location="cpu")

In [249]:
# Copy the loaded checkpoint tensors into the model instantiated earlier
ncf.load_state_dict(min_loss_model_weights)
# Put the model in evaluation mode (this is what disables its Dropout layers);
# as the last expression of the cell, the returned module is also displayed
ncf.eval()

NeuralColabFilteringNet(
  (user_embedding): Embedding(283228, 128)
  (movie_embedding): Embedding(58098, 128)
  (MLP): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=32, out_features=16, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=16, out_features=1, bias=True)
    (13): Sigmoid()
  )
  (dropout): Dropout(p=0.2, inplace=False)
)

## Embeddings

In [250]:
# Number of rows in the user embedding table
len(ncf.user_embedding.weight)

283228

In [251]:
# Number of distinct user ids in the ratings table, for comparison with the table size
user_count

283228

In [252]:
# Number of rows in the movie embedding table
len(ncf.movie_embedding.weight)

58098

In [253]:
# Number of distinct movie ids in the movies table, for comparison with the table size
movie_count

58098

In [255]:
# Distinct user ids that occur in the ratings table
uniq_users = ratings.userId.unique()

In [256]:
# Distinct movie ids that occur in the ratings table
# (note: movie_count was computed from the movies table, not from ratings)
uniq_movies = ratings.movieId.unique()

In [258]:
# Pull the embedding matrices out of the model as NumPy arrays (one row per table slot).
# detach() drops the autograd graph so that .numpy() is allowed on the weights.
user_emb = ncf.user_embedding.weight.detach().numpy()
movie_emb = ncf.movie_embedding.weight.detach().numpy()

In [259]:
# Display the user embedding matrix (NumPy abbreviates large arrays with "...")
user_emb

array([[-0.09503482, -0.06502121,  0.06571186, ...,  0.0398813 ,
        -0.09698401, -0.01278059],
       [-0.0548203 , -0.10764199, -0.08189886, ..., -0.01252179,
         0.1079265 ,  0.0142182 ],
       [-0.06652274,  0.0210144 , -0.06497038, ..., -0.09766809,
         0.08952854, -0.02556563],
       ...,
       [ 0.15745182, -0.05319775,  0.05585574, ..., -0.01980898,
         0.05177536, -0.16711544],
       [ 0.1082719 ,  0.0763229 , -0.06817226, ...,  0.08068144,
        -0.07056607, -0.03356112],
       [-0.02867287,  0.1243482 ,  0.04882769, ...,  0.01821621,
        -0.07035436, -0.01609253]], dtype=float32)

In [260]:
# Map every user id from the ratings to its embedding row. The row is picked
# with the same modulo rule the model uses in forward(), so ids that share a
# residue share a row.
user_hash_size = user_count
user_emb_dict = {user: user_emb[user % user_hash_size] for user in uniq_users}


In [261]:
# Display the user id -> embedding mapping.
# NOTE(review): this renders the entire dictionary; consider showing a small sample instead.
user_emb_dict

{1: array([-5.48202991e-02, -1.07641988e-01, -8.18988606e-02, -3.47185768e-02,
        -5.08594066e-02,  2.50858646e-02,  1.78463366e-02,  8.39128867e-02,
        -4.76862192e-02, -7.92239085e-02,  4.42954898e-03,  8.97774771e-02,
         2.90049016e-02, -7.17941858e-03, -8.02199394e-02, -1.07205426e-02,
         1.57282669e-02,  5.00116199e-02,  6.46709055e-02, -1.54023906e-02,
        -3.01640797e-02, -2.45362483e-02, -7.58340359e-02, -5.52736148e-02,
        -4.74666990e-02,  5.53004481e-02, -8.96308422e-02, -6.86216727e-02,
         7.32083842e-02, -8.22049901e-02,  4.92582247e-02,  1.89782847e-02,
        -5.14841564e-02, -2.93716155e-02,  1.47585079e-01,  9.26424749e-03,
         7.37884864e-02, -1.25384450e-01, -1.19941585e-01,  2.77117863e-02,
        -2.69949213e-02,  8.00201222e-02, -1.40423253e-01,  2.62270067e-02,
        -5.82778975e-02,  9.37420055e-02, -6.60758391e-02,  1.17089234e-01,
         1.14291273e-02, -1.22144513e-01,  1.20970950e-01,  4.35110889e-02,
        -

In [262]:
# Map every rated movie id to its embedding row, using the same modulo rule as
# the model's forward(). Movie ids that share a residue share a row and will
# therefore look identical in every similarity search below.
movie_hash_size = movie_count
movie_emb_dict = {movie: movie_emb[movie % movie_hash_size] for movie in uniq_movies}

In [265]:
# List every title containing "Avengers" so one of them can be picked as the query movie

is_avengers_title = movies['title'].str.contains("Avengers")
movies[is_avengers_title]

Unnamed: 0,movieId,title,genres
2070,2153,"Avengers, The (1998)",Action|Adventure
10894,44020,Ultimate Avengers (2006),Action|Animation|Children|Sci-Fi
17997,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
23463,110132,Avengers Confidential: Black Widow & Punisher ...,Action|Animation|Sci-Fi
25032,115727,Crippled Avengers (Can que) (Return of the 5 D...,Action|Adventure
27550,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
27559,122912,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi
32775,135979,Next Avengers: Heroes of Tomorrow (2008),Action|Animation|Children|Sci-Fi
32870,136257,Avengers Grimm (2015),Action|Adventure|Fantasy
36904,145676,3 Avengers (1964),(no genres listed)


In [266]:
# Query movie used by all similarity searches in this notebook
movie_id = 122892
# Show its row in the movies table to confirm which title the id refers to
movies[movies['movieId'] == movie_id]

Unnamed: 0,movieId,title,genres
27550,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi


In [267]:
# Embedding vector of the query movie
movie_emb_dict[movie_id]

array([ 0.03812801, -0.09669143, -0.01079082, -0.16399263,  0.08767818,
        0.2260411 ,  0.13418286, -0.15750699,  0.10643953, -0.05058208,
       -0.19586489, -0.05596394,  0.07928997, -0.05433825,  0.2511212 ,
       -0.10453089,  0.26081708, -0.02745582, -0.23292144, -0.438127  ,
        0.379992  ,  0.06842461,  0.16827725, -0.22062752,  0.12868688,
        0.01663423, -0.07367929,  0.26103544, -0.4287415 , -0.13763827,
       -0.21020289, -0.1564632 ,  0.14350657,  0.12034833, -0.2077343 ,
        0.01168218, -0.03688817,  0.06990851, -0.06230417,  0.4892541 ,
       -0.31348854,  0.31845132, -0.33465606, -0.3364028 , -0.1984845 ,
       -0.36999276, -0.0296915 ,  0.14359878, -0.03445637,  0.07700819,
        0.39304447,  0.11496862, -0.03274806,  0.27618366, -0.297685  ,
        0.15564801,  0.10036033,  0.10022227, -0.14922427,  0.2677797 ,
       -0.24883488, -0.02545203,  0.10679349, -0.26274318,  0.05473942,
        0.00377311,  0.0983139 ,  0.13237806,  0.00702369, -0.19

## Find similar movies

### Dot

In [268]:
# Score every movie by the dot product between its embedding and the query embedding
vec = movie_emb_dict[movie_id]

recomend_dict = {movie: vec.dot(emb) for movie, emb in movie_emb_dict.items()}


In [269]:
from operator import itemgetter
# (movie id, score) pairs ordered from the highest to the lowest dot product
recomend_list_sort = list(recomend_dict.items())
recomend_list_sort.sort(key=itemgetter(1), reverse=True)

#### Not bad, but it could be better :) It seems better to use other methods to find similar movies

In [270]:
# Print the title of each of the 11 highest-scoring movies
for candidate_id, _score in recomend_list_sort[:11]:
    print(movies.loc[movies['movieId'] == candidate_id, 'title'])

27550    Avengers: Age of Ultron (2015)
Name: title, dtype: object
6587    Bollywood/Hollywood (2002)
Name: title, dtype: object
17997    Avengers, The (2012)
Name: title, dtype: object
9755    2009: Lost Memories (2002)
Name: title, dtype: object
37799    Starring Adam West (2013)
Name: title, dtype: object
27559    Avengers: Infinity War - Part I (2018)
Name: title, dtype: object
6607    Daisy Miller (1974)
Name: title, dtype: object
6430    Pirates of the Caribbean: The Curse of the Bla...
Name: title, dtype: object
27477    Night of Dark Shadows (1971)
Name: title, dtype: object
52437    The Humanity Bureau (2017)
Name: title, dtype: object
13983    Harry Potter and the Half-Blood Prince (2009)
Name: title, dtype: object


### Cosine similarity

In [271]:

from scipy import spatial


recomend_dict = {}

vec = movie_emb_dict[movie_id]

# scipy's cosine distance divides by the product of the two vector norms. The
# recorded run of this cell emitted a RuntimeWarning from that division, i.e.
# at least one embedding has a (numerically) zero norm and yields a non-finite
# similarity. The warning is expected here, so it is silenced locally.
with np.errstate(divide="ignore", invalid="ignore"):
    for movie, emb in movie_emb_dict.items():
        similarity = 1 - spatial.distance.cosine(vec, emb)
        # NaN compares as neither smaller nor larger than any number, so keeping
        # it would make the order produced by the later sort unreliable.
        # Movies without a well-defined cosine similarity are left out instead.
        if np.isfinite(similarity):
            recomend_dict[movie] = similarity



  dist = 1.0 - uv / np.sqrt(uu * vv)


In [272]:
from operator import itemgetter
# (movie id, similarity) pairs ordered from the most to the least similar
recomend_list_sort = list(recomend_dict.items())
recomend_list_sort.sort(key=itemgetter(1), reverse=True)

#### Cosine similarity works better already

In [273]:
# Print the title of each of the 11 most similar movies
for candidate_id, _score in recomend_list_sort[:11]:
    print(movies.loc[movies['movieId'] == candidate_id, 'title'])

27550    Avengers: Age of Ultron (2015)
Name: title, dtype: object
6587    Bollywood/Hollywood (2002)
Name: title, dtype: object
17138    Thor (2011)
Name: title, dtype: object
36374    Crying Fist (2005)
Name: title, dtype: object
21101    Iron Man 3 (2013)
Name: title, dtype: object
43015    Future Kick (1991)
Name: title, dtype: object
22251    Thor: The Dark World (2013)
Name: title, dtype: object
17997    Avengers, The (2012)
Name: title, dtype: object
9755    2009: Lost Memories (2002)
Name: title, dtype: object
37799    Starring Adam West (2013)
Name: title, dtype: object
17615    Captain America: The First Avenger (2011)
Name: title, dtype: object


### KNN

In [274]:
# Reminder of which movie is used as the query
movies[movies['movieId'] == movie_id]

Unnamed: 0,movieId,title,genres
27550,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi


In [275]:
from sklearn.neighbors import NearestNeighbors

k = 10

# Index every row of the movie embedding matrix (default metric of NearestNeighbors)
knn = NearestNeighbors(n_neighbors=k).fit(movie_emb)

# The query is the embedding of the selected movie
vec = movie_emb_dict[movie_id]

# kneighbors returns (distances, row indices); the indices refer to rows of
# movie_emb, not to movie ids
neighbours = knn.kneighbors([vec], return_distance = True)
print(neighbours)

(array([[0.        , 1.27364165, 1.30933016, 1.37235365, 1.37338674,
        1.3875308 , 1.50298672, 1.55686629, 1.56729938, 1.59410331]]), array([[ 6696, 28234, 44027, 19463, 47974, 30042,  1101, 56059,  1608,
        31647]]))


In [276]:
# Translate the neighbour rows returned by kneighbors back into movie ids.
#
# movie_emb_dict was filled with movie_emb[movie % movie_hash_size], so a movie
# belongs to a neighbour row exactly when its row index is one of the returned
# indices. Looking the index up in a dict replaces the previous element-wise
# comparison of every movie embedding against every neighbour embedding.
neighbour_distance = {
    int(row): distance
    for row, distance in zip(neighbours[1][0], neighbours[0][0])
}

recomend_dict = {}
for movie in movie_emb_dict:
    row = int(movie % movie_hash_size)
    if row in neighbour_distance:
        # Several movie ids can share one row, so k rows may yield more than k movies
        recomend_dict[movie] = neighbour_distance[row]

In [277]:
from operator import itemgetter
# (movie id, distance) pairs ordered from the nearest to the farthest
recomend_list_sort = list(recomend_dict.items())
recomend_list_sort.sort(key=itemgetter(1))

#### KNN performs at least as well as cosine similarity

In [278]:
# Print the title of each of the (up to) 11 nearest movies
for candidate_id, _distance in recomend_list_sort[:11]:
    print(movies.loc[movies['movieId'] == candidate_id, 'title'])

27550    Avengers: Age of Ultron (2015)
Name: title, dtype: object
6587    Bollywood/Hollywood (2002)
Name: title, dtype: object
17138    Thor (2011)
Name: title, dtype: object
36374    Crying Fist (2005)
Name: title, dtype: object
21101    Iron Man 3 (2013)
Name: title, dtype: object
43015    Future Kick (1991)
Name: title, dtype: object
15271    Iron Man 2 (2010)
Name: title, dtype: object
32628    Nirvana: Live! Tonight! Sold Out!! (1994)
Name: title, dtype: object
58036    Francis Bacon: A Brush with Violence (2017)
Name: title, dtype: object
22251    Thor: The Dark World (2013)
Name: title, dtype: object
17615    Captain America: The First Avenger (2011)
Name: title, dtype: object


### Looking at the results, we can say that the learned embeddings really do capture meaningful information about the films