In [3]:
!pip install pytorch-lightning

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/e6/13/fb401b8f9d9c5e2aa08769d230bb401bf11dee0bc93e069d7337a4201ec8/pytorch_lightning-1.2.7-py3-none-any.whl (830kB)
[K     |████████████████████████████████| 839kB 18.1MB/s 
Collecting future>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 39.7MB/s 
Collecting fsspec[http]>=0.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/62/11/f7689b996f85e45f718745c899f6747ee5edb4878cadac0a41ab146828fa/fsspec-0.9.0-py3-none-any.whl (107kB)
[K     |████████████████████████████████| 112kB 41.9MB/s 
[?25hCollecting PyYAML!=5.4.*,>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/64/c2/b80047c7ac2478f9501676c988a5411ed5572f35d1beff9cae07d321512c/PyYAML-5.3.1.tar.gz (269kB)
[K     |████████████████████████████████| 276kB

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import pytorch_lightning as pl

In [None]:
# The ratings CSV lives on Google Drive, so Drive has to be mounted before it is
# read. Mounting here keeps this cell runnable in top-to-bottom order; calling
# drive.mount on an already-mounted path is harmless.
from google.colab import drive
drive.mount('/content/drive')

# Ratings table (the path is relative to the working directory).
ratings = pd.read_csv('drive/MyDrive/Colab Notebooks/data/ratings_new.csv')

# Lookup tables used to translate raw ids into integer indices.
# NOTE(review): these are read from the current working directory, not from
# Drive — confirm the .pkl files are placed there before running.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted source.
with open('movie_to_index.pkl', 'rb') as movie_mapping:
    movie_to_index = pickle.load(movie_mapping)
with open('user_to_index.pkl', 'rb') as user_mapping:
    user_to_index = pickle.load(user_mapping)

In [1]:
# Mount Google Drive at /content/drive (Colab only). Any cell that reads or
# writes paths under drive/MyDrive needs this mount to be in place first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Translate raw ids into integer indices via the lookup tables; an id that is
# missing from its table raises KeyError.
# NOTE: the columns are overwritten in place, so this cell is not idempotent —
# re-running it would try to look up already-translated indices.
ratings['userId'] = ratings['userId'].apply(lambda raw_id: user_to_index[raw_id])
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movie_to_index[raw_id])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.0,1256677210
1,0,1,3.5,1256677486
2,1,2,3.5,1113766176
3,1,3,4.5,1113766820
4,1,4,3.5,1113766824


In [None]:
# Distinct user / movie counts (later used as embedding-table sizes) and the
# observed rating range (later used to min-max scale the targets).
# NOTE(review): using nunique() as a table size presumes the indices are the
# contiguous range 0..n-1 — confirm against how the index mappings were built.
n_users = int(ratings['userId'].nunique())
n_movies = int(ratings['movieId'].nunique())
min_rating = ratings['rating'].min()
max_rating = ratings['rating'].max()

In [None]:
# One-line summary of the dataset statistics computed above.
print("Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
    n_users,
    n_movies,
    min_rating,
    max_rating,
))

Number of users: 181664, Number of Movies: 21639, Min rating: 0.5, Max rating: 5.0


In [None]:
# Work on a copy so this cell never modifies `ratings` itself. No explicit
# pre-shuffle is needed: train_test_split shuffles by default.
df = ratings.copy()

# Features are the (user index, movie index) pairs.
X = df[['userId', 'movieId']]

# Min-max scale the ratings into [0, 1]. Vectorised column arithmetic gives the
# same values as a per-row apply() but without a Python call per row.
y = (df["rating"] - min_rating) / (max_rating - min_rating)

# Hold out 20% for test, then 25% of the remaining 80% for validation,
# i.e. a 60 / 20 / 20 train / val / test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
dataset_sizes = {'train': len(X_train), 'val': len(X_val)}

In [None]:
# Sanity check: number of distinct movie indices in the working copy.
df['movieId'].nunique()

21639

In [None]:
# Sanity check: number of distinct user indices in the working copy.
df['userId'].nunique()

181664

In [None]:
class MovieDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned sequences.

    Item ``i`` is the tuple ``(users[i], movies[i], ratings[i])``. The dataset
    length is the length of ``users``; the three sequences are not checked
    against each other.
    """

    def __init__(self, users, movies, ratings):
        # Keep references to the sequences as given; nothing is copied.
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        # The number of samples is defined by the `users` sequence.
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        movie = self.movies[idx]
        rating = self.ratings[idx]
        return user, movie, rating

In [None]:
# Wrap each split's numpy columns in a MovieDataset.
train_data = MovieDataset(X_train.userId.values, X_train.movieId.values, y_train.values)
valid_data = MovieDataset(X_val.userId.values, X_val.movieId.values, y_val.values)
test_data = MovieDataset(X_test.userId.values, X_test.movieId.values, y_test.values)
datasets = {'train': train_data, 'val': valid_data}

# Reshuffle the training set every epoch so mini-batches differ between epochs;
# the validation loader keeps a fixed order.
dataloaders = {
    'train': DataLoader(datasets['train'], batch_size=64, shuffle=True, num_workers=2),
    'val': DataLoader(datasets['val'], batch_size=64, num_workers=2),
}



In [5]:
class LightningEmbeddingModel(pl.LightningModule):
    """Embedding-based rating predictor.

    A user embedding and a movie embedding are concatenated and passed through
    two hidden linear layers (128 and 64 units, ReLU, dropout) and a final
    single-unit layer with a sigmoid, so every prediction lies in (0, 1).
    Training and validation both use binary cross-entropy, which requires the
    targets to lie in [0, 1] as well.

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        n_factors: Size of each embedding vector.
        embedding_dropout: Dropout probability applied to the concatenated
            embeddings.
        dropouts: Base dropout probability; the first hidden layer uses
            ``2 * dropouts`` and the second uses ``dropouts``.
    """

    def __init__(self, num_users, num_movies, n_factors=100,
                 embedding_dropout=0.5, dropouts=0.2):
        super().__init__()
        self.user_embed = nn.Embedding(num_users, n_factors)
        self.movie_embed = nn.Embedding(num_movies, n_factors)
        self.drop_embedding = nn.Dropout(embedding_dropout)
        self.drop_1 = nn.Dropout(dropouts*2)
        self.drop_2 = nn.Dropout(dropouts)
        # The input to fc1 is the user and movie embeddings side by side.
        self.fc1 = nn.Linear(in_features=2*n_factors, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.output = nn.Linear(in_features=64, out_features=1)

    def forward(self, users, movies):
        """Return sigmoid predictions for index tensors ``users`` and ``movies``.

        The output has the shape of the inputs plus a trailing dimension of 1.
        """
        user_embedded = self.user_embed(users)
        movie_embedded = self.movie_embed(movies)
        vector = torch.cat([user_embedded, movie_embedded], dim=-1)
        vector = self.drop_embedding(vector)
        vector = self.drop_1(torch.relu(self.fc1(vector)))
        vector = self.drop_2(torch.relu(self.fc2(vector)))
        return torch.sigmoid(self.output(vector))

    def _bce_loss(self, batch):
        """Binary cross-entropy of the predictions for one (user, movie, target) batch."""
        user, movie, target = batch
        output = self(user, movie)
        # Reshape targets to a column so they line up with the (N, 1) predictions.
        target = target.view(-1, 1)
        return nn.functional.binary_cross_entropy(output.float(), target.float())

    def training_step(self, train_batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the loss for one training batch."""
        loss = self._bce_loss(train_batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute and log (as ``val_loss``) the loss for one validation batch."""
        loss = self._bce_loss(val_batch)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        """Return Adam (lr=1e-3, weight decay 1e-5) over this module's parameters."""
        wd = 1e-5
        # Must be self.parameters(): referring to a global `model` would optimise
        # whatever object that name happens to be bound to, or raise NameError.
        return optim.Adam(self.parameters(), lr=0.001, weight_decay=wd)

In [None]:
# Build a model sized from the dataset statistics, train it for 3 epochs with
# gpus=-1 using the 'train' loader (validating on 'val'), then save a checkpoint.
model = LightningEmbeddingModel(num_users=n_users, num_movies=n_movies)
trainer = pl.Trainer(gpus=-1, max_epochs=3)
trainer.fit(
    model,
    dataloaders['train'],
    dataloaders['val'],
)
trainer.save_checkpoint("drive/MyDrive/Colab Notebooks/data/model_3.ckpt")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embed     | Embedding | 18.2 M
1 | movie_embed    | Embedding | 2.2 M 
2 | drop_embedding | Dropout   | 0     
3 | drop_1         | Dropout   | 0     
4 | drop_2         | Dropout   | 0     
5 | fc1            | Linear    | 25.7 K
6 | fc2            | Linear    | 8.3 K 
7 | output         | Linear    | 65    
---------------------------------------------
20.4 M    Trainable params
0         Non-trainable params
20.4 M    Total params
81.457    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [None]:
# Export the learned movie embedding matrix (one row per movie index).
# detach() drops the autograd graph and cpu() copies the weights to host
# memory, so this works whether or not the model is still on a GPU;
# calling .numpy() directly on a CUDA tensor raises a TypeError.
movie_embeddings = pd.DataFrame(model.movie_embed.weight.detach().cpu().numpy())
movie_embeddings.to_pickle("drive/MyDrive/Colab Notebooks/data/nn_embeddings_3.pkl")

In [None]:
# Continue training from a saved checkpoint with max_epochs=4, then save again.
# NOTE(review): the logged run below lists a network without `fc2` and with a
# 129-parameter output layer, which differs from the current model class —
# confirm that example.ckpt matches the current architecture before resuming.
trainer = pl.Trainer(max_epochs=4, gpus=-1, resume_from_checkpoint="drive/MyDrive/Colab Notebooks/data/example.ckpt")
# The model defines no train/val dataloader hooks, so the loaders have to be
# passed to fit() explicitly.
trainer.fit(model, dataloaders['train'], dataloaders['val'])
trainer.save_checkpoint("drive/MyDrive/Colab Notebooks/data/model_2.ckpt")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embed     | Embedding | 18.2 M
1 | movie_embed    | Embedding | 2.2 M 
2 | drop_embedding | Dropout   | 0     
3 | drop_1         | Dropout   | 0     
4 | drop_2         | Dropout   | 0     
5 | fc1            | Linear    | 25.7 K
6 | output         | Linear    | 129   
---------------------------------------------
20.4 M    Trainable params
0         Non-trainable params
20.4 M    Total params
81.425    Total estimated model params size (MB)
Restored states from the checkpoint file at drive/MyDrive/Colab Notebooks/data/example.ckpt


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…


