<a href="https://colab.research.google.com/github/NaomiKemi/InClassAssignments/blob/main/Embeddings_for_Recommendation_%2B_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("ml-latest-small/ratings.csv")

len(df)

df.tail(100)

user_ids = df["userId"].unique().tolist()
movie_ids = df["movieId"].unique().tolist()

len(movie_ids)

#Non-sequential list of ids
movie_ids[:6]

#Manually making the dictionary
movie_id_to_index = {
    31: 1,
    1029: 2,
    1061: 3
}

#Use a movie id to look up an index
movie_id_to_index[31]

#Make a dictionary mapping ids (keys) to indexes (values)
user_id_to_index = {x: i for i, x in enumerate(user_ids)}
movie_id_to_index = {x: i for i, x in enumerate(movie_ids)}

#Make a new column in the dataframe which contains the appropriate index for each user and movie
df["user_index"] = [user_id_to_index[i] for i in df["userId"]]
df["movie_index"] = [movie_id_to_index[i] for i in df["movieId"]]

df.head(5)

df["rating"].describe()

df["rating"] = MinMaxScaler().fit_transform(df["rating"].values.reshape(-1, 1))

df["rating"].describe()

#Inputs
x = df[["user_index", "movie_index"]]
#Outputs
y = df["rating"]
#Get train-test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
!pip install torch torchvision torchaudio

#import library
import torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
class LouisNet(torch.nn.Module):

  #Override __init__()
    def __init__(self):
        super().__init__()
        print("__init__ called")

        #Override forward()
    def forward(self, inputs):
        print("\nforwards pass (new batch)")
        print(inputs,"\n")
        #return the output (its just the input, unchanged)
        return inputs

        #Make a new instance of LouisNet
louisNet = LouisNet()
loss_fn = torch.nn.MSELoss()

#Fake dataset
x = torch.FloatTensor([[1],[2],[3],[4]])
y = torch.FloatTensor([[2],[3],[4],[5]])

#Do a forwards pass
prediction = louisNet(x)
loss = loss_fn(prediction, y)

class RecommenderNet(torch.nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=20):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(num_users, embedding_size)
        self.user_bias = torch.nn.Embedding(num_users, 1)
        self.movie_embedding = torch.nn.Embedding(num_movies, embedding_size)
        self.movie_bias = torch.nn.Embedding(num_movies, 1)
        self.sig = torch.nn.Sigmoid()

    def forward(self, inputs):
        #Split out indexes
        user_indexes = inputs[:, 0]
        movie_indexes = inputs[:, 1]
        #Forward pass on embedding layer
        user_vector = self.user_embedding(user_indexes)
        user_bias = self.user_bias(user_indexes).flatten()
        movie_vector = self.movie_embedding(movie_indexes)
        movie_bias = self.movie_bias(movie_indexes).flatten()
        #Dot product
        dot = (user_vector * movie_vector).sum(1)
        with_bias = dot + user_bias + movie_bias
        #Activation function
        output = self.sig(with_bias)
        return output


__init__ called

forwards pass (new batch)
tensor([[1.],
        [2.],
        [3.],
        [4.]]) 



In [None]:
#Pick Embedding size
EMBEDDING_SIZE = 16
#Make new object (calls __init__())
num_users = len(user_ids)
num_movies = len(movie_ids)
model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

#Make a subclass to hold our dataset (movie - user pairs (input) and a rating (label))
class MoviesDataset(Dataset):
    def __init__(self, X,y):
        self.X = torch.IntTensor(X)
        self.y = torch.FloatTensor(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
#Use our train - validation split to make DataLoader objects
train_dl = DataLoader(MoviesDataset(x_train.values,y_train.values), batch_size=64, shuffle=True)
validation_dl = DataLoader(MoviesDataset(x_val.values,y_val.values), batch_size=64, shuffle=True)

epochs = 10
#Use Mean Squared Error as a loss function
loss_fn = torch.nn.MSELoss()
#Use the Adam algorithm to update the weights based on the loss
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)


In [None]:
#Use a for loop to repeat for the desired number of epochs
for i in range(epochs):

    model.train(True)

    #Use a for loop for each batch (provided by the Dataloader)
    running_loss = 0.0
    for (index, batch) in enumerate(train_dl):

        #Get batch
        inputs, labels = batch
        model.zero_grad()

        #Forward pass
        prediction = model(inputs)

        #Get Loss
        loss = loss_fn(prediction, labels)

        #Update weights (back prop)
        loss.backward()
        optimizer.step()
        running_loss += loss

    avg_loss = running_loss / (index + 1)

    model.train(False)

    #Now try with the validation set (no need to update weights, just get loss)
    running_vloss = 0.0
    for index, vdata in enumerate(validation_dl):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)
        vloss = loss_fn(voutputs, vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (index + 1)
    print('Loss {} Validation Loss {}'.format(avg_loss, avg_vloss))

Loss 0.17559689283370972 Validation Loss 0.12125011533498764
Loss 0.07939629256725311 Validation Loss 0.08459701389074326
Loss 0.049161769449710846 Validation Loss 0.07112640887498856
Loss 0.037231411784887314 Validation Loss 0.06597204506397247
Loss 0.031229011714458466 Validation Loss 0.06466044485569
Loss 0.027740823104977608 Validation Loss 0.06203359737992287
Loss 0.025415191426873207 Validation Loss 0.06271978467702866
Loss 0.023779675364494324 Validation Loss 0.06316044926643372
Loss 0.02260366827249527 Validation Loss 0.06383461505174637
Loss 0.021718613803386688 Validation Loss 0.0635344460606575


In [None]:
torch.save(model.state_dict(), 'model_weights.pth')

model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

num_users, EMBEDDING_SIZE, model.user_embedding

(671, 16, Embedding(671, 16))

In [None]:
#Get the movie data so we can map back to names
movie_data = pd.read_csv("ml-latest-small/movies.csv")
movie_data.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [None]:
def get_top_n(user = 0, n = 10):
    #Get Movie Names
    top_n_indexes = get_top_n_indexes(user, n)
    top_n = get_names_for_indexes(top_n_indexes)
    return top_n

def get_names_for_indexes(indexes):
    return [movie_data[movie_data["movieId"]==movie_ids[i]]["title"].item() for i in indexes]

def get_top_n_indexes(user = 0, n = 10):
    #For one user, make a pair with every movie index
    x = torch.IntTensor([[user, i] for i in np.arange(num_movies)])
    #Predict
    predicted_ratings = model(x)
    #Get Top-N indexes
    top_n_indexes = predicted_ratings.argsort()[-n:]
    return top_n_indexes

In [None]:
#Random users top 10
get_top_n(np.random.randint(num_users))

['Yu-Gi-Oh! (2004)',
 'Sympathy for Mr. Vengeance (Boksuneun naui geot) (2002)',
 'Witless Protection (2008)',
 'Cops (1922)',
 'Friends with Money (2006)',
 'Crossover (2006)',
 'Crimson Pirate, The (1952)',
 'All is Bright (2013)',
 'Gaslight (1940)',
 'Nacho Libre (2006)']

In [26]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Task 1 - Diversity
def calculate_diversity(user_top_10_movies):
    similarity_matrix = cosine_similarity(user_top_10_movies)
    diversity_matrix = 1 - similarity_matrix
    mean_diversity = np.mean(diversity_matrix)
    return mean_diversity

# Task 1 - Novelty
def calculate_novelty(user_top_10_movies, ratings_df):
    mean_ratings = ratings_df.loc[user_top_10_movies].mean()
    mean_novelty = mean_ratings.mean()
    return mean_novelty

all_user_top_10_movies = []
ratings_df = pd.read_csv("ml-latest-small/ratings.csv")

diversity_scores = []
novelty_scores = []

# Calculate diversity and novelty scores for each user's top 10 movies
for user_top_10_movies in all_user_top_10_movies:
    diversity_scores.append(calculate_diversity(user_top_10_movies))
    novelty_scores.append(calculate_novelty(user_top_10_movies, ratings_df))

# Calculate mean diversity and novelty for the whole dataset
mean_diversity = np.mean(diversity_scores)
mean_novelty = np.mean(novelty_scores)

print("Mean Diversity:", mean_diversity)
print("Mean Novelty:", mean_novelty)


Mean Diversity: nan
Mean Novelty: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [27]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Assuming you have a movie_embeddings matrix of shape (num_movies, embedding_dimension)
movie_titles = [...]  # List of movie titles corresponding to each embedding

# Filter for movies with average rating > 5
filtered_movies = ratings_df[ratings_df["average_rating"] > 5]
filtered_movie_ids = filtered_movies["movie_id"].tolist()

# Filter movie_embeddings and movie_titles based on filtered_movie_ids
filtered_embeddings = movie_embeddings[filtered_movie_ids]
filtered_titles = [movie_titles[i] for i in filtered_movie_ids]

# Perform dimensionality reduction using PCA
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(filtered_embeddings)

# Plot the top 30 best-rated movies on a 2D graph
num_movies_to_plot = 30
top_movies_embeddings = reduced_embeddings[:num_movies_to_plot]
top_movies_titles = filtered_titles[:num_movies_to_plot]

plt.figure(figsize=(10, 10))
plt.scatter(top_movies_embeddings[:, 0], top_movies_embeddings[:, 1])

# Label each point with the title
for i, title in enumerate(top_movies_titles):
    plt.annotate(title, (top_movies_embeddings[i, 0], top_movies_embeddings[i, 1]))

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Top 30 Best Rated Movies (PCA)")

plt.show()


KeyError: ignored