In [98]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime


In [99]:
# lr = 0.00001
batch_size = 512
n_epochs = 50

hidden_size = 1024

train_ratio = 0.8

embedding_dim = 10


In [100]:
df = pd.read_csv(
    "../../data/MovieRatings/rating.csv",
    sep=",",
    header=0,
)

df.head()


In [None]:
user_ids = df.userId.astype("category").cat.codes.values
movie_ids = df.movieId.astype("category").cat.codes.values
targets = df["rating"].values - 2.5


In [None]:
amount_users = len(set(user_ids))
amount_movies = len(set(movie_ids))

print(f'Total amount of users: {amount_users}')
print(f'Total amount of movies: {amount_movies}')

Total amount of users: 138493
Total amount of movies: 26744


In [None]:
n_samples = df.shape[0]

shuffle = np.random.permutation(n_samples)
user_ids, movie_ids, targets = user_ids[shuffle], movie_ids[shuffle], targets[shuffle]


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device {device} is being used")


Device cuda is being used


In [None]:
n_to_train = int(n_samples * train_ratio)

train_users = user_ids[:n_to_train]
test_users = user_ids[n_to_train:]
train_movies = movie_ids[:n_to_train]
test_movies = movie_ids[n_to_train:]

train_targets = targets[:n_to_train]
test_targets = targets[n_to_train:]

train_data = (train_users, train_movies, train_targets)
test_data = (test_users, test_movies, train_targets)

In [None]:
class Recommender(nn.Module):
    def __init__(
        self,
        users_num,
        movies_num,
        embedding_dim,
        hidden_size
    ):
        super(Recommender, self).__init__()
        self.embed_user = nn.Embedding(
            num_embeddings=users_num,
            embedding_dim=embedding_dim,
        )
        self.embed_movie = nn.Embedding(
            num_embeddings=movies_num,
            embedding_dim=embedding_dim,
        )
        
        self.fc1 = nn.Linear(2 * embedding_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, users, movies):
        users_emb = self.embed_user(users)
        movies_emb = self.embed_movie(movies)

        merged_inp = torch.concat((users_emb, movies_emb), dim=1)
        
        outputs = self.fc1(merged_inp)
        outputs = F.relu(outputs)
        
        outputs = self.fc2(outputs)

        return outputs


In [None]:
model = Recommender(
    users_num=amount_users,
    movies_num=amount_movies,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
)

model.to(device)


Recommender(
  (embed_user): Embedding(138493, 10)
  (embed_movie): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()


In [None]:
def train(train_data, test_data, batch_size=1024, n_epochs=100):
    train_users, train_movies, train_targets = train_data
    test_users, test_movies, test_targets = test_data

    train_losses = np.zeros(n_epochs)
    test_losses = np.zeros(n_epochs)

    batches_per_epoch_train = int(np.floor(len(train_users) / batch_size))
    batches_per_epoch_test = int(np.floor(len(test_users) / batch_size))

    for it in range(n_epochs):

        start_time = datetime.now()

        shuffle = np.random.permutation(len(train_users))
        train_users, train_movies, train_targets = (
            train_users[shuffle],
            train_movies[shuffle],
            train_targets[shuffle],
        )

        train_loss = []

        for j in range(batches_per_epoch_train):

            users = train_users[j * batch_size : (j + 1) * batch_size]
            movies = train_movies[j * batch_size : (j + 1) * batch_size]
            targets = train_targets[j * batch_size : (j + 1) * batch_size]

            users = torch.from_numpy(users).long()
            movies = torch.from_numpy(movies).long()
            targets = torch.from_numpy(targets.astype(np.float32))

            targets = targets.view(-1, 1)

            users = users.to(device)
            movies = movies.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()

            outputs = model(users, movies)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        test_loss = []

        for j in range(batches_per_epoch_test):

            users = test_users[j * batch_size : (j + 1) * batch_size]
            movies = test_movies[j * batch_size : (j + 1) * batch_size]
            targets = test_targets[j * batch_size : (j + 1) * batch_size]

            users = torch.from_numpy(users).long()
            movies = torch.from_numpy(movies).long()
            targets = torch.from_numpy(targets.astype(np.float32))

            targets = targets.view(-1, 1)

            users = users.to(device)
            movies = movies.to(device)
            targets = targets.to(device)

            outputs = model(users, movies)
            loss = criterion(outputs, targets)
            test_loss.append(loss.item())

        train_losses[it] = np.mean(train_loss)
        test_losses[it] = np.mean(test_loss)

        end_time = datetime.now()

        print(
            f"Iteration: {it+1:4.0f}/{n_epochs}\tTrain Loss: {train_losses[it]:.6f}\tTest Loss: {test_losses[it]:.6f}\tDuration: {end_time-start_time}"
        )

    return train_losses, test_losses


In [None]:
train_losses, test_losses = train(
    train_data=train_data,
    test_data=test_data,
    batch_size=batch_size,
    n_epochs=n_epochs,
)


Iteration:    1/50	Train Loss: 1.256336	Test Loss: 1.137572	Duration: 0:00:16.553168
Iteration:    2/50	Train Loss: 1.127809	Test Loss: 1.133870	Duration: 0:00:10.177979
Iteration:    3/50	Train Loss: 1.121054	Test Loss: 1.131502	Duration: 0:00:10.836483
Iteration:    4/50	Train Loss: 1.115034	Test Loss: 1.129196	Duration: 0:00:10.387449
Iteration:    5/50	Train Loss: 1.109368	Test Loss: 1.127372	Duration: 0:00:10.400712
Iteration:    6/50	Train Loss: 1.104193	Test Loss: 1.125761	Duration: 0:00:10.630368
Iteration:    7/50	Train Loss: 1.099587	Test Loss: 1.124830	Duration: 0:00:10.409916
Iteration:    8/50	Train Loss: 1.095595	Test Loss: 1.124383	Duration: 0:00:10.476919
Iteration:    9/50	Train Loss: 1.092209	Test Loss: 1.124360	Duration: 0:00:10.537162
Iteration:   10/50	Train Loss: 1.089273	Test Loss: 1.125045	Duration: 0:00:10.479528
Iteration:   11/50	Train Loss: 1.086628	Test Loss: 1.126184	Duration: 0:00:10.361758


KeyboardInterrupt: 

In [None]:
plt.plot(train_losses, label="Train Loss")
plt.plot(test_losses, label="Test loss")
plt.legend()


NameError: name 'plt' is not defined