# Matrix Factorization

This notebook implements matrix factorization for the MovieLens dataset.

## Imports and Libraries

In [1]:
import torch
import pandas as pd
import numpy as np
import time
import os

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import time
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from torch import nn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
seed_value = 42
np.random.seed(seed_value)

## Section 1: Load the Movielens data, do the needed preprocessing and merging

In [2]:
#Get all the data paths
current_directory = os.getcwd()
#general_path = os.path.join(current_directory, 'project2_Data')
general_path = os.path.join(current_directory)

movies_path = os.path.join(general_path, 'movies.csv')
links_path = os.path.join(general_path, 'links.csv')
tags_path = os.path.join(general_path, 'tags.csv')

In [3]:
# Training and test ratings sets
training_path = os.path.join(general_path, 'train_ratings.csv')
test_path = os.path.join(general_path, 'test_set_no_ratings.csv')

In [4]:
movies = pd.read_csv(movies_path)
links = pd.read_csv(links_path)

movies = pd.concat([movies,links],axis=1)   # concatenate the two infromation dataframes for the movies
movies = movies.loc[:, ~movies.columns.duplicated()] # remove the duplicate columns
movies.head(1)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0


In [5]:
tags = pd.read_csv(tags_path)
tags.rename(columns={'userId':'user_tag_id'},inplace=True)
grouped_tags = tags.groupby(['user_tag_id', 'movieId']).agg(list).reset_index()
grouped_tags.drop(columns='timestamp',inplace=True)
grouped_tags.head(1)

Unnamed: 0,user_tag_id,movieId,tag
0,2,60756,"[funny, Highly quotable, will ferrell]"


In [6]:
#Prepare the training dataset
train = pd.read_csv(training_path)
train.drop(columns='timestamp',inplace=True)
test = pd.read_csv(test_path)

test.drop(columns='Id',inplace=True)
test['rating'] = -1

frames=[train,test]
ratings = pd.concat(frames)

merged_train = ratings.merge(movies,on='movieId',how='left')

test = pd.read_csv(test_path)

#Add the corresponding tags for each user-movieid combination
merged_full = pd.merge(merged_train, grouped_tags, how='left', left_on=['userId', 'movieId'], right_on=['user_tag_id', 'movieId'])
merged_full.drop(columns='user_tag_id',inplace=True)
merged_full['rating_count_per_user']=merged_full.groupby('userId')['rating'].transform("count")

In [7]:
merged_full.drop(columns=['genres','tmdbId'],inplace=True)
merged_full

Unnamed: 0,userId,movieId,rating,title,imdbId,tag,rating_count_per_user
0,509,7347,3.0,Secret Window (2004),363988,,467
1,326,71462,4.0,"Cove, The (2009)",1313104,,152
2,57,2115,3.0,Indiana Jones and the Temple of Doom (1984),87469,,476
3,610,1127,4.0,"Abyss, The (1989)",96754,,1302
4,462,2409,2.0,Rocky II (1979),79817,,455
...,...,...,...,...,...,...,...
100831,380,5048,-1.0,Snow Dogs (2002),281373,,1218
100832,434,54272,-1.0,"Simpsons Movie, The (2007)",462538,,233
100833,226,5989,-1.0,Catch Me If You Can (2002),264464,,507
100834,607,1320,-1.0,Alien³ (a.k.a. Alien 3) (1992),103644,,187


In [8]:
full_movies_path = os.path.join(general_path, 'full_movielens_genres.csv')
full_movies = pd.read_csv(full_movies_path)
full_movies = full_movies.groupby('imdbId')['genres'].first().reset_index()
full_movies

Unnamed: 0,imdbId,genres
0,0,carousel productions|vision view entertainment...
1,1,documentary
2,3,comedy|animation
3,5,drama
4,8,documentary
...,...,...
45412,7068896,comedy
45413,7078780,horror|science fiction
45414,7078926,action|science fiction|war
45415,7104950,documentary


In [9]:
merged_full = pd.merge(merged_full, full_movies, on='imdbId', how='left')
merged_full

Unnamed: 0,userId,movieId,rating,title,imdbId,tag,rating_count_per_user,genres
0,509,7347,3.0,Secret Window (2004),363988,,467,thriller|mystery
1,326,71462,4.0,"Cove, The (2009)",1313104,,152,documentary
2,57,2115,3.0,Indiana Jones and the Temple of Doom (1984),87469,,476,adventure|action
3,610,1127,4.0,"Abyss, The (1989)",96754,,1302,adventure|action|thriller|science fiction
4,462,2409,2.0,Rocky II (1979),79817,,455,drama
...,...,...,...,...,...,...,...,...
100831,380,5048,-1.0,Snow Dogs (2002),281373,,1218,adventure|comedy|family
100832,434,54272,-1.0,"Simpsons Movie, The (2007)",462538,,233,animation|comedy|family
100833,226,5989,-1.0,Catch Me If You Can (2002),264464,,507,drama|crime
100834,607,1320,-1.0,Alien³ (a.k.a. Alien 3) (1992),103644,,187,science fiction|action|horror


In [10]:
print(f"Number of unique user IDs: {merged_full['userId'].unique().size}")
print(f"Number of unique movie IDs: {merged_full['movieId'].unique().size}")

Number of unique user IDs: 610
Number of unique movie IDs: 9724


In [11]:
merged_full['rating_count_per_user'].describe()

count    100836.000000
mean        603.892816
std         653.007108
min          20.000000
25%         148.000000
50%         385.000000
75%         836.000000
max        2698.000000
Name: rating_count_per_user, dtype: float64

In [12]:
average_ratings_per_user = merged_full['rating_count_per_user'].mean()
print(f"Average number of ratings per user: {average_ratings_per_user:.2f}")

Average number of ratings per user: 603.89


## Section 2.2 Matrix Factorization Implementation 2


In [13]:
test.drop(columns='Id',inplace=True)
test['rating'] = -1

frames=[train,test]
ratings = pd.concat(frames) #concatenat e train and test in order to create the matrix later
ratings = ratings.merge(movies, on='movieId', how='left') #merge with movies
ratings.drop(columns=['title','imdbId','tmdbId'],inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,genres
0,509,7347,3.0,Mystery|Thriller
1,326,71462,4.0,Documentary
2,57,2115,3.0,Action|Adventure|Fantasy
3,610,1127,4.0,Action|Adventure|Sci-Fi|Thriller
4,462,2409,2.0,Action|Drama


In [14]:
ratings

Unnamed: 0,userId,movieId,rating,genres
0,509,7347,3.0,Mystery|Thriller
1,326,71462,4.0,Documentary
2,57,2115,3.0,Action|Adventure|Fantasy
3,610,1127,4.0,Action|Adventure|Sci-Fi|Thriller
4,462,2409,2.0,Action|Drama
...,...,...,...,...
100831,380,5048,-1.0,Adventure|Children|Comedy
100832,434,54272,-1.0,Animation|Comedy
100833,226,5989,-1.0,Crime|Drama
100834,607,1320,-1.0,Action|Horror|Sci-Fi|Thriller


In [15]:
no_genres_rows = ratings[ratings['genres'] == '(no genres listed)']

# Iterate through rows and update 'genres' based on 'merged_full'
for index, row in no_genres_rows.iterrows():
    movie_id = row['movieId']
    corresponding_genre = merged_full.loc[merged_full['movieId'] == movie_id, 'genres'].values
    new_genre = corresponding_genre[0] if len(corresponding_genre) > 0 and not pd.isna(corresponding_genre[0]) else 'Action'
    ratings.at[index, 'genres'] = new_genre

In [16]:
ratings['genres'] = ratings['genres'].str.lower()
ratings

Unnamed: 0,userId,movieId,rating,genres
0,509,7347,3.0,mystery|thriller
1,326,71462,4.0,documentary
2,57,2115,3.0,action|adventure|fantasy
3,610,1127,4.0,action|adventure|sci-fi|thriller
4,462,2409,2.0,action|drama
...,...,...,...,...
100831,380,5048,-1.0,adventure|children|comedy
100832,434,54272,-1.0,animation|comedy
100833,226,5989,-1.0,crime|drama
100834,607,1320,-1.0,action|horror|sci-fi|thriller


In [17]:
d = defaultdict(LabelEncoder)
cols_cat = ['userId', 'movieId']
for c in cols_cat:
    d[c].fit(ratings[c].unique())
    ratings[c] = d[c].transform(ratings[c])
ratings.head(3)

Unnamed: 0,userId,movieId,rating,genres
0,508,4893,3.0,mystery|thriller
1,325,7127,4.0,documentary
2,56,1575,3.0,action|adventure|fantasy


In [18]:
grouped_rating = ratings.groupby('movieId')['genres'].first().reset_index()
grouped_rating

Unnamed: 0,movieId,genres
0,0,adventure|animation|children|comedy|fantasy
1,1,adventure|children|fantasy
2,2,comedy|romance
3,3,comedy|drama|romance
4,4,comedy
...,...,...
9719,9719,action|animation|comedy|fantasy
9720,9720,animation|comedy|fantasy
9721,9721,drama
9722,9722,action|animation


In [19]:
n_genres = set('|'.join(grouped_rating['genres']).split('|'))
n_genres

{'action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'film-noir',
 'horror',
 'imax',
 'music',
 'musical',
 'mystery',
 'romance',
 'sci-fi',
 'science fiction',
 'thriller',
 'tv movie',
 'war',
 'western'}

In [20]:
# Create a dictionary mapping movie IDs to genre vectors using one-hot encoding
movie_to_genre_vector = {}

# Convert genres column to a list of lists
genres_list = [genres.split('|') for genres in grouped_rating['genres']]


# Use MultiLabelBinarizer for one-hot encoding
mlb = MultiLabelBinarizer()
genre_vectors = mlb.fit_transform(genres_list)

for idx, row in grouped_rating.iterrows():
    movie_id = row['movieId']
    genre_vector = torch.FloatTensor(genre_vectors[idx])


    # Update the dictionary
    movie_to_genre_vector[movie_id] = genre_vector


In [21]:
train_ratings = ratings[ratings.rating != -1]
test_ratings = ratings[ratings.rating == -1]

In [22]:
train_ratings.reset_index(drop=True,inplace=True)
test_ratings.reset_index(drop=True,inplace=True)
train_ratings

Unnamed: 0,userId,movieId,rating,genres
0,508,4893,3.0,mystery|thriller
1,325,7127,4.0,documentary
2,56,1575,3.0,action|adventure|fantasy
3,609,855,4.0,action|adventure|sci-fi|thriller
4,461,1808,2.0,action|drama
...,...,...,...,...
80663,41,2986,4.0,action|adventure|thriller
80664,363,116,4.0,comedy
80665,479,4601,4.0,comedy|drama
80666,5,748,3.0,drama


In [23]:
from sklearn.model_selection import train_test_split
#df_train, df_val = train_test_split(train_ratings, test_size=0.25, random_state=42)
df_train = train_ratings
#print(df_train.shape, df_val.shape)

#### Create MovieDataset and Dataloader for training

In [24]:
class MovieRatingDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

        self.x_user_movie = list(zip(self.dataframe.userId.values, self.dataframe.movieId.values))
        self.y_rating = self.dataframe.rating.values
        #vectorizer = CountVectorizer(tokenizer=lambda x: x.split('|'), binary=True)
        #genre_vectors = vectorizer.fit_transform(self.dataframe['genres']).toarray()
        #self.x_genres = torch.from_numpy(genre_vectors).float()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        #taking the index instead of the id
        #user_movie_pair = torch.from_numpy(np.array([self.user_id_to_index[self.dataframe.iloc[idx, 0]], self.movie_id_to_index[self.dataframe.iloc[idx, 1]]]))
        #rating = torch.FloatTensor([self.dataframe.iloc[idx, 2]])
        movie_id = self.x_user_movie[idx][1]
        genre_vector = movie_to_genre_vector[movie_id.item()]
        #return user_movie_pair, rating
        return self.x_user_movie[idx], self.y_rating[idx],  genre_vector#self.x_genres[idx]


# Create the DataLoader
batch_size = 16

train_dataset = MovieRatingDataset(df_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

#validation_dataset = MovieRatingDataset(df_val)
#validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

test_dataset = MovieRatingDataset(test_ratings)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [25]:
xb, yb ,gb= next(iter(train_dataloader))
print(xb)
print(yb)
print(gb)

[tensor([ 97,  81, 598, 245, 380, 121, 112,  47, 371, 461, 607, 447, 121, 110,
        524, 473]), tensor([5213, 4416, 2898, 7888, 1485, 8871,  191, 3814,  811,  981,  464, 3757,
        3520, 1939,  789, 1043])]
tensor([3.5000, 3.0000, 2.5000, 4.0000, 3.0000, 3.5000, 3.0000, 4.0000, 3.0000,
        4.5000, 1.5000, 1.5000, 4.5000, 4.0000, 2.5000, 5.0000],
       dtype=torch.float64)
tensor([[0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0.],
        [1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

#### Create the Matrix Class

In [26]:

def sigmoid_range(x, low, high):
    """ Sigmoid function with range (low, high) """
    return torch.sigmoid(x) * (high-low) + low
class MatrixFactorization(nn.Module):

    def __init__(self, n_users, n_movies, n_factors,n_genre):
        super(MatrixFactorization, self).__init__()

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.genre_factors = nn.Linear(n_genre, n_factors)

        self.user_bias = nn.Parameter(torch.zeros(n_users).normal_(0, 0.01))
        self.movie_bias = nn.Parameter(torch.zeros(n_movies).normal_(0, 0.01))

        self.offset = nn.Parameter(torch.zeros(1).normal_(0, 0.01))

        self.user_factors.weight.data.uniform_(0., 0.05)
        self.movie_factors.weight.data.uniform_(0., 0.05)
        self.genre_factors.weight.data.uniform_(0., 0.05)

    def forward(self, user, item , genre_vector):

        user_emb = self.user_factors(user)
        item_emb = self.movie_factors(item)
        genre_emb = self.genre_factors(genre_vector)

        item_emb += genre_emb

        element_product = (user_emb*item_emb).sum(1)

        user_b = self.user_bias[user]
        item_b = self.movie_bias[item]

        element_product += user_b + item_b + self.offset

        return sigmoid_range(element_product, 0,5.5)


#### Training Loop

#### Adjust N_FACTORS, LR, and WEIGHT_DECAY to achieve a validation RMSE better than the current 0.850, which gives in the testing set 0.840
.


In [27]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR,ReduceLROnPlateau

n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())
n_genres = len(set('|'.join(grouped_rating['genres']).split('|')))

n_factors = 15

model = MatrixFactorization(n_users, n_movies, n_factors,n_genres)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.06)
scheduler = ReduceLROnPlateau(optimizer, verbose=True)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
#0.827 num_factors = 10, lr=0.001 , wd=0.05 batch 32
#0.826 num_factors = 15, lr=0.001 , wd=0.06 batch 32
#0.825 num_factors = 15, lr=0.001 , wd=0.06 batch 16
#0.8   num_factors = 15, lr=0.001 , wd=0.06 batch 16 #scheduler


MatrixFactorization(
  (user_factors): Embedding(610, 15)
  (movie_factors): Embedding(9724, 15)
  (genre_factors): Linear(in_features=22, out_features=15, bias=True)
)

In [28]:
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

epoch_train_losses, epoch_val_losses, epoch_val_rmse = [], [], []

# Training loop
num_epochs = 7

for epoch in tqdm(range(num_epochs), desc='Training', unit='epoch'):
    train_losses, val_losses = [], []
    predicted_ratings, true_ratings = [], []

    for batch_x, batch_y,batch_genres in train_dataloader:

        users = batch_x[0].to(device)
        movies = batch_x[1].to(device)

        genres = batch_genres.to(device)
        batch_y = batch_y.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(users,movies,genres)
        # Create a mask for non-zero ratings
        #mask = (batch_y != 0).float()

        # Filter out rows with zero ratings using the mask
        #non_zero_outputs = outputs[mask.squeeze() == 1]
        #non_zero_batch_y = batch_y[mask.squeeze() == 1]
        #total_training_samples += non_zero_outputs.shape[0]

        # Calculate the loss
        #loss = criterion(non_zero_outputs, non_zero_batch_y)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    scheduler.step(loss)

    '''
    lpreds, lratings = [], []

    with torch.no_grad():
        for batch_x_val, batch_y_val,batch_genres_val in validation_dataloader:

            users = batch_x_val[0].to(device)
            movies = batch_x_val[1].to(device)

            genres = batch_genres_val.to(device)

            batch_y_val = batch_y_val.to(device, dtype=torch.float)

            # Forward pass
            val_outputs = model(users,movies,genres)

            # Create a mask for non-zero ratings
            #val_mask = (batch_y_val != 0).float()

            # Filter out rows with zero ratings using the mask
            #non_zero_val_outputs = val_outputs[val_mask.squeeze() == 1]
            #non_zero_val_batch_y = batch_y_val[val_mask.squeeze() == 1]
            #total_validation_samples+=non_zero_val_outputs.shape[0]

            # Calculate the validation loss
            #val_loss = criterion(non_zero_val_outputs,non_zero_val_batch_y)

            val_loss = criterion(val_outputs,batch_y_val)

            val_losses.append(val_loss.item())

            predicted_ratings.extend(val_outputs.cpu().numpy())
            true_ratings.extend(batch_y_val.cpu().numpy())

    val_rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))

    epoch_val_rmse.append(val_rmse)
    '''
    # Start logging
    epoch_train_loss = np.mean(train_losses)
    #epoch_val_loss = np.mean(val_losses)
    epoch_train_losses.append(epoch_train_loss)
    #epoch_val_losses.append(epoch_val_loss)

    s = (f'Epoch: {epoch}, Train Loss: {epoch_train_loss:0.3f}, ' )
    #     f'Val Loss: {epoch_val_loss:0.3f}, Val RMSE: {val_rmse:0.3f}')
    print(s)

print('Training complete!')
'''
VALIDATION
Training:  14%|█▍        | 1/7 [00:14<01:25, 14.22s/epoch]Epoch: 0, Train Loss: 0.873, Val Loss: 0.796, Val RMSE: 0.892
Training:  29%|██▊       | 2/7 [00:30<01:16, 15.22s/epoch]Epoch: 1, Train Loss: 0.741, Val Loss: 0.742, Val RMSE: 0.861
Training:  43%|████▎     | 3/7 [00:45<01:01, 15.49s/epoch]Epoch: 2, Train Loss: 0.671, Val Loss: 0.715, Val RMSE: 0.846
Training:  57%|█████▋    | 4/7 [01:01<00:46, 15.53s/epoch]Epoch: 3, Train Loss: 0.623, Val Loss: 0.703, Val RMSE: 0.838
Training:  71%|███████▏  | 5/7 [01:17<00:31, 15.53s/epoch]Epoch: 4, Train Loss: 0.583, Val Loss: 0.698, Val RMSE: 0.835
Training:  86%|████████▌ | 6/7 [01:32<00:15, 15.56s/epoch]Epoch: 5, Train Loss: 0.550, Val Loss: 0.696, Val RMSE: 0.834
Training: 100%|██████████| 7/7 [01:48<00:00, 15.54s/epoch]Epoch: 6, Train Loss: 0.521, Val Loss: 0.692, Val RMSE: 0.832

TRAINING
Training:  14%|█▍        | 1/7 [00:08<00:51,  8.52s/epoch]Epoch: 0, Train Loss: 0.868,
Training:  29%|██▊       | 2/7 [00:20<00:51, 10.36s/epoch]Epoch: 1, Train Loss: 0.735,
Training:  43%|████▎     | 3/7 [00:30<00:41, 10.31s/epoch]Epoch: 2, Train Loss: 0.666,
Training:  57%|█████▋    | 4/7 [00:40<00:30, 10.03s/epoch]Epoch: 3, Train Loss: 0.617,
Training:  71%|███████▏  | 5/7 [00:49<00:19,  9.94s/epoch]Epoch: 4, Train Loss: 0.579,
Training:  86%|████████▌ | 6/7 [00:59<00:09,  9.96s/epoch]Epoch: 5, Train Loss: 0.547,
Training: 100%|██████████| 7/7 [01:09<00:00,  9.99s/epoch]Epoch: 6, Train Loss: 0.518,

Training complete
'''

Training:  14%|█▍        | 1/7 [00:20<02:02, 20.42s/epoch]

Epoch: 0, Train Loss: 0.851, 


Training:  29%|██▊       | 2/7 [00:43<01:50, 22.18s/epoch]

Epoch: 1, Train Loss: 0.716, 


Training:  43%|████▎     | 3/7 [01:09<01:34, 23.70s/epoch]

Epoch: 2, Train Loss: 0.655, 


Training:  57%|█████▋    | 4/7 [01:35<01:13, 24.63s/epoch]

Epoch: 3, Train Loss: 0.613, 


Training:  71%|███████▏  | 5/7 [01:58<00:48, 24.11s/epoch]

Epoch: 4, Train Loss: 0.580, 


Training:  86%|████████▌ | 6/7 [02:25<00:25, 25.23s/epoch]

Epoch: 5, Train Loss: 0.554, 


Training: 100%|██████████| 7/7 [02:50<00:00, 24.30s/epoch]

Epoch: 6, Train Loss: 0.532, 
Training complete!





'\nVALIDATION\nTraining:  14%|█▍        | 1/7 [00:14<01:25, 14.22s/epoch]Epoch: 0, Train Loss: 0.873, Val Loss: 0.796, Val RMSE: 0.892\nTraining:  29%|██▊       | 2/7 [00:30<01:16, 15.22s/epoch]Epoch: 1, Train Loss: 0.741, Val Loss: 0.742, Val RMSE: 0.861\nTraining:  43%|████▎     | 3/7 [00:45<01:01, 15.49s/epoch]Epoch: 2, Train Loss: 0.671, Val Loss: 0.715, Val RMSE: 0.846\nTraining:  57%|█████▋    | 4/7 [01:01<00:46, 15.53s/epoch]Epoch: 3, Train Loss: 0.623, Val Loss: 0.703, Val RMSE: 0.838\nTraining:  71%|███████▏  | 5/7 [01:17<00:31, 15.53s/epoch]Epoch: 4, Train Loss: 0.583, Val Loss: 0.698, Val RMSE: 0.835\nTraining:  86%|████████▌ | 6/7 [01:32<00:15, 15.56s/epoch]Epoch: 5, Train Loss: 0.550, Val Loss: 0.696, Val RMSE: 0.834\nTraining: 100%|██████████| 7/7 [01:48<00:00, 15.54s/epoch]Epoch: 6, Train Loss: 0.521, Val Loss: 0.692, Val RMSE: 0.832\n\nTRAINING\nTraining:  14%|█▍        | 1/7 [00:08<00:51,  8.52s/epoch]Epoch: 0, Train Loss: 0.868, \nTraining:  29%|██▊       | 2/7 [00:20

### Generate Predictions

In [29]:
test_ratings

Unnamed: 0,userId,movieId,rating,genres
0,431,7316,-1.0,action|adventure|drama|romance|war
1,287,412,-1.0,action|thriller
2,598,3217,-1.0,action|crime|thriller
3,41,2248,-1.0,adventure|animation|children|comedy|crime|fant...
4,74,1210,-1.0,action|adventure|thriller
...,...,...,...,...
20163,379,3661,-1.0,adventure|children|comedy
20164,433,6513,-1.0,animation|comedy
20165,225,4153,-1.0,crime|drama
20166,606,1008,-1.0,action|horror|sci-fi|thriller


In [30]:
xb, yb ,gb= next(iter(test_dataloader))
print(xb)
print(yb)
print(gb)

[tensor([431]), tensor([7316])]
tensor([-1.], dtype=torch.float64)
tensor([[1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 1., 0.]])


In [31]:
len(test_dataloader)

20168

In [32]:
# Create a list to store the predicted ratings
predicted_ratings = []

# Iterate over rows in the test dataframe
model.eval()
for user_movie_Id,_ ,genre_vector in test_dataloader:
    user_id = user_movie_Id[0].to(device)
    movie_id = user_movie_Id[1].to(device)

    genre_vector_ = genre_vector.to(device)
    #user_movie_pair = torch.tensor([user_index, movie_index]).to(device)
    predicted_rating = model(user_id,movie_id,genre_vector_)

    # Append the predicted rating to the list
    predicted_ratings.append(predicted_rating.item())


In [33]:
test_ratings['rating'] = predicted_ratings


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ratings['rating'] = predicted_ratings


In [34]:
test_ratings

Unnamed: 0,userId,movieId,rating,genres
0,431,7316,3.436499,action|adventure|drama|romance|war
1,287,412,3.396884,action|thriller
2,598,3217,2.673477,action|crime|thriller
3,41,2248,3.624809,adventure|animation|children|comedy|crime|fant...
4,74,1210,3.548621,action|adventure|thriller
...,...,...,...,...
20163,379,3661,3.483441,adventure|children|comedy
20164,433,6513,3.648717,animation|comedy
20165,225,4153,4.131797,crime|drama
20166,606,1008,3.798532,action|horror|sci-fi|thriller


In [35]:
test_ratings

Unnamed: 0,userId,movieId,rating,genres
0,431,7316,3.436499,action|adventure|drama|romance|war
1,287,412,3.396884,action|thriller
2,598,3217,2.673477,action|crime|thriller
3,41,2248,3.624809,adventure|animation|children|comedy|crime|fant...
4,74,1210,3.548621,action|adventure|thriller
...,...,...,...,...
20163,379,3661,3.483441,adventure|children|comedy
20164,433,6513,3.648717,animation|comedy
20165,225,4153,4.131797,crime|drama
20166,606,1008,3.798532,action|horror|sci-fi|thriller


In [36]:
test_ratings.reset_index(inplace=True)
test_ratings.rename(columns={'index': 'Id'}, inplace=True)
test_ratings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ratings.rename(columns={'index': 'Id'}, inplace=True)


Unnamed: 0,Id,userId,movieId,rating,genres
0,0,431,7316,3.436499,action|adventure|drama|romance|war
1,1,287,412,3.396884,action|thriller
2,2,598,3217,2.673477,action|crime|thriller
3,3,41,2248,3.624809,adventure|animation|children|comedy|crime|fant...
4,4,74,1210,3.548621,action|adventure|thriller
...,...,...,...,...,...
20163,20163,379,3661,3.483441,adventure|children|comedy
20164,20164,433,6513,3.648717,animation|comedy
20165,20165,225,4153,4.131797,crime|drama
20166,20166,606,1008,3.798532,action|horror|sci-fi|thriller


In [37]:
test_ratings.drop(columns=['userId','movieId','genres'],inplace=True)
test_ratings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ratings.drop(columns=['userId','movieId','genres'],inplace=True)


Unnamed: 0,Id,rating
0,0,3.436499
1,1,3.396884
2,2,2.673477
3,3,3.624809
4,4,3.548621
...,...,...
20163,20163,3.483441
20164,20164,3.648717
20165,20165,4.131797
20166,20166,3.798532


In [37]:
test_ratings

Unnamed: 0,Id,rating
0,0,3.503304
1,1,3.267769
2,2,2.637213
3,3,3.433707
4,4,3.279942
...,...,...
20163,20163,3.512777
20164,20164,3.565756
20165,20165,4.130539
20166,20166,3.771762


In [38]:
test_ratings.to_csv('ratings_lluka32.csv', index=False)