In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [8]:
import pandas as pd
import numpy as np

ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'rating'])

# movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
#                      usecols=['movie_id', 'title', 'genres'])

In [None]:
import pandas as pd
import numpy as np

ratings = pd.read_csv('/content/drive/MyDrive/Uni/MIR/ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])

movies = pd.read_csv('/content/drive/MyDrive/Uni/MIR/movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [None]:
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354


### Content Based

**Objective**: Build a Content-Based Recommendation system that computes similarity between movies based on movie genres. It will suggest movies that are most similar to a particular movie based on its genre.

**Dataset**:
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Split the `genres` column into separate genre strings.
   - Fill any missing values in the `genres` column and convert it to string.

2. **Compute TF-IDF Matrix**:
   - Use `TfidfVectorizer` to transform the genres into a TF-IDF matrix.

3. **Calculate Cosine Similarity**:
   - Compute cosine similarity between the TF-IDF matrices of movies.

4. **Recommend Movies**:
   - Create a function that gets movie recommendations based on the cosine similarity score of movie genres.

In [None]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [None]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def genre_recommendations(title, n=10):
    #TODO
    idx = movies[movies["title"]==title].index[0]
    sim_idx = cosine_sim[idx].argsort()[-(n+1):][::-1]
    sim_idx = sim_idx[sim_idx != idx]
    return movies.loc[sim_idx, "title"]

In [None]:
# Example
genre_recommendations('GoldenEye (1995)')

788                     Daylight (1996)
1467                    Anaconda (1997)
1513                     Con Air (1997)
978                 Maximum Risk (1996)
1693                   Firestorm (1998)
724                    Rock, The (1996)
825               Chain Reaction (1996)
345     Clear and Present Danger (1994)
543           Surviving the Game (1994)
3686          Perfect Storm, The (2000)
Name: title, dtype: object


### Collaborative Filtering

**Objective**: Implement a collaborative filtering movie recommendation system to recommend movies to a user based on similar users' spreferences.

**Dataset**:

- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Split `genres` column into separate rows.

2. **Create User-Genre Matrix**:
   - Create a matrix where rows are users and columns are genres.
   - Each cell represents the ratio of movies watched by the user in that genre to the total movies watched by the user.

3. **Calculate User Similarity**:
   - Use cosine similarity to calculate the similarity between users.

4. **Find Top N Similar Users**:
   - Find the top N users with the most similar preferences to the given user.

5. **Recommend Movies**:
   - Recommend movies that similar users have watched but the target user has not.
   - Return titles and genres of the top 10 recommended movies.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
ratings = pd.read_csv('/content/drive/MyDrive/Uni/MIR/ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])

movies = pd.read_csv('/content/drive/MyDrive/Uni/MIR/movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [None]:
data = pd.merge(ratings, movies, on='movie_id')

In [None]:
data.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
data = data.assign(genres=data['genres'].str.split('|')).explode('genres')

In [None]:
data.head(5)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
0,1,1193,5,0,1192,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,1,1192,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,11,1192,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,14,1192,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,16,1192,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
genre_dict = {g:i for i, g in enumerate(data["genres"].unique())}
user_genre = np.zeros(shape=[data["user_emb_id"].unique().shape[0], len(genres)])
user_genre_counts = data.groupby(['user_emb_id', 'genres']).size()
for i in range(user_genre.shape[0]):
    for g in user_genre_counts[i].keys():
        user_genre[i, genre_dict[g]] = user_genre_counts[i, g]
user_genre = np.apply_along_axis(lambda row: row/np.sum(row), 1, user_genre)
cosine_sim = cosine_similarity(user_genre)

In [None]:
cosine_sim.shape

(6040, 6040)

In [None]:
def get_top_n_similar_users(user_id, n=5):
    #TODO
    sim_idx = cosine_sim[user_id].argsort()[-(n+1):][::-1]
    sim_idx = sim_idx[sim_idx != user_id]
    return sim_idx

In [None]:
def recommend_movies(user_id, n_similar_users=5, n_recommendations=10):
    #TODO
    user_id -= 1
    sim_users = get_top_n_similar_users(user_id, n_similar_users)
    recoms = list()
    not_watched = ratings[~ratings['movie_id'].isin(ratings.loc[ratings['user_emb_id']==user_id, 'movie_id'])]
    for user in sim_users:
        recoms.extend(not_watched.loc[not_watched['user_emb_id'] == user , 'movie_id'].unique())
        if len(recoms) >= n_recommendations:
            break
    return movies.loc[movies['movie_id'].isin(recoms), ['title', 'genres']][:n_recommendations]

In [None]:
# Example
user_id = 1
recommended_movies = recommend_movies(user_id, n_similar_users=5, n_recommendations=10)

In [None]:
recommended_movies

Unnamed: 0,title,genres
0,Toy Story (1995),Animation|Children's|Comedy
2,Grumpier Old Men (1995),Comedy|Romance
7,Tom and Huck (1995),Adventure|Children's
20,Get Shorty (1995),Action|Comedy|Drama
33,Babe (1995),Children's|Comedy|Drama
59,"Indian in the Cupboard, The (1995)",Adventure|Children's|Fantasy
151,Batman Forever (1995),Action|Adventure|Comedy|Crime
156,Casper (1995),Adventure|Children's
206,Waterworld (1995),Action|Adventure
313,Stargate (1994),Action|Adventure|Sci-Fi


### SVD (Singular Value Decomposition)



**Objective**: Implement an SVD-based recommendation system to recommend movies to users by decomposing the user-item interaction matrix into latent factors.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Preprocess Data**:
   - Merge `ratings` and `movies` DataFrames on `movie_id`.
   - Create a user-item matrix where rows represent users, columns represent movies, and the values are the ratings.

2. **Decompose Matrix using SVD**:
   - Apply Singular Value Decomposition (SVD) to decompose the user-item matrix into three matrices: $U$, $\Sigma$, and $V^T$.

3. **Reconstruct Matrix**:
   - Reconstruct the user-item matrix using the top $k$ singular values to reduce dimensionality.

4. **Predict Ratings**:
   - Use the reconstructed matrix to predict ratings for all user-item pairs.

5. **Recommend Movies**:
   - Recommend the top 20 movies with the highest predicted ratings for a given user that the user hasn't rated yet.

In [None]:
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 6040 | Number of movies = 3706


Fill na elements.

In [None]:
Ratings = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
Ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# calculate rating matrix
Ratings.loc[Ratings.loc[:, 3941] != 0, 3941]

user_id
117     1.0
139     3.0
501     1.0
546     1.0
749     3.0
770     2.0
798     1.0
808     3.0
1017    3.0
1246    2.0
1461    1.0
1680    4.0
1755    1.0
1851    2.0
1941    1.0
2380    1.0
2414    1.0
2491    1.0
2635    5.0
2694    4.0
3824    3.0
4979    1.0
Name: 3941, dtype: float64

In [None]:
U, sigma, Vt = np.linalg.svd(Ratings)
sigma = np.diag(sigma)

In [None]:
print(U.shape)
print(Vt.shape)
print(sigma.shape)

(6040, 6040)
(3706, 3706)
(3706, 3706)


In [None]:
reduced_dataset = np.dot(np.dot(U[:, :k], sigma[:k, :k]), Vt[:k, :])
reduced_dataset.shape

(6040, 3706)

In [None]:
k=20
all_user_predicted_ratings = np.dot(np.dot(U[:, :k], sigma[:k, :k]), Vt[:k, :])

In [None]:
preds = pd.DataFrame(all_user_predicted_ratings, columns=sorted(ratings.movie_id.unique()))
preds.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,3.236188,0.691441,-0.004421,-0.013907,0.125572,-0.282993,-0.064685,0.167259,-0.100287,-0.048986,...,-0.013743,0.003684,0.039305,-0.006088,-0.073551,0.302123,-0.074654,-0.019404,-0.003208,0.069246
1,1.271428,0.424805,0.102387,0.057807,0.098287,0.738487,0.094775,0.045328,0.185549,1.402783,...,-0.045296,-0.002519,-0.003512,0.040962,-0.036473,0.157463,-0.176261,-0.022356,-0.00131,0.033546
2,1.279739,0.162583,0.102715,-0.069212,-0.021542,0.163654,-0.123966,0.013215,0.022383,0.551677,...,-0.028523,-0.002628,0.012135,0.004124,-0.029672,0.230783,-0.127389,-0.041387,-0.019541,-0.138831
3,0.258133,-0.112573,0.019786,0.060456,0.024741,0.295179,-0.006009,0.000252,-0.022433,0.080312,...,0.015833,-0.00035,-0.003163,-0.024258,-0.026182,-0.009095,0.046987,-0.019518,0.01512,-0.084392
4,1.073362,0.115839,-0.210225,0.142634,-0.228213,1.43893,-0.265505,-0.002904,-0.058044,0.265604,...,0.084047,-0.00199,-0.013753,-0.033778,0.011687,-0.011893,0.493642,0.022157,0.07439,0.168795


In [None]:
sorted_user_predictions = preds.loc[0].sort_values(ascending=False)[:20]
print(sorted_user_predictions)
print(sorted_user_predictions.index)


1       3.236188
3114    2.913051
595     2.739554
364     2.628260
588     2.509639
2355    2.394129
318     2.134559
1097    2.108181
34      2.071577
919     1.987473
527     1.918955
2081    1.910613
594     1.907050
593     1.900157
2762    1.896184
1282    1.762962
1022    1.729939
1907    1.703682
1028    1.668760
1270    1.619850
Name: 0, dtype: float64
Index([   1, 3114,  595,  364,  588, 2355,  318, 1097,   34,  919,  527, 2081,
        594,  593, 2762, 1282, 1022, 1907, 1028, 1270],
      dtype='int64')


In [None]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [None]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):

    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = predictions.loc[user_row_number].sort_values(ascending=False)

    user_data = original_ratings[original_ratings['user_id'] == userID]

    # merfe movies data with user_data
    user_full = pd.merge(user_data, movies, on='movie_id').sort_values('rating', ascending=False)

    print(f'User {userID} has already rated {user_full.shape[0]} movies.')
    print(f'Recommending highest {num_recommendations} predicted ratings movies not already rated.')

    top_not_rated = sorted_user_predictions.drop(user_full['movie_id'])[:num_recommendations]
    recommendations = movies[movies['movie_id'].isin(top_not_rated.index)]
    for i in recommendations.index:
        recommendations.loc[i, 'pred'] = top_not_rated[recommendations.loc[i, 'movie_id']]

    return user_full, recommendations

In [None]:
already_rated, predictions = recommend_movies(preds, 4375, movies, ratings, 20)

User 4375 has already rated 325 movies.
Recommending highest 20 predicted ratings movies not already rated.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations.loc[i, 'pred'] = top_not_rated[recommendations.loc[i, 'movie_id']]


In [None]:
predictions

Unnamed: 0,movie_id,title,genres,pred
10,11,"American President, The (1995)",Comedy|Drama|Romance,3.160617
108,110,Braveheart (1995),Action|Drama|War,3.63888
335,339,While You Were Sleeping (1995),Comedy|Romance,2.533779
535,539,Sleepless in Seattle (1993),Comedy|Romance,3.615341
584,588,Aladdin (1992),Animation|Children's|Comedy|Musical,2.309403
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller,3.116467
770,780,Independence Day (ID4) (1996),Action|Sci-Fi|War,2.359743
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi,2.55004
1201,1219,Psycho (1960),Horror|Thriller,2.262839
1239,1259,Stand by Me (1986),Adventure|Comedy|Drama,2.213104


In [None]:
ratings.head(20)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354
5,1,1197,3,0,1196
6,1,1287,5,0,1286
7,1,2804,5,0,2803
8,1,594,4,0,593
9,1,919,4,0,918


In [None]:
already_rated.head(20)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
195,4375,3250,5,4374,3249,Alive (1993),Drama
285,4375,175,5,4374,174,Kids (1995),Drama
199,4375,3257,5,4374,3256,"Bodyguard, The (1992)",Action|Drama|Romance|Thriller
65,4375,3809,5,4374,3808,What About Bob? (1991),Comedy
210,4375,2688,5,4374,2687,"General's Daughter, The (1999)",Drama|Thriller
270,4375,150,5,4374,149,Apollo 13 (1995),Drama
271,4375,2710,5,4374,2709,"Blair Witch Project, The (1999)",Horror
125,4375,349,5,4374,348,Clear and Present Danger (1994),Action|Adventure|Thriller
60,4375,288,5,4374,287,Natural Born Killers (1994),Action|Thriller
141,4375,47,5,4374,46,Seven (Se7en) (1995),Crime|Thriller


In [None]:
# Top 20 movies that User 4375 has rated
already_rated.head(20)

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id,title,genres
195,4375,3250,5,4374,3249,Alive (1993),['Drama']
285,4375,175,5,4374,174,Kids (1995),['Drama']
199,4375,3257,5,4374,3256,"Bodyguard, The (1992)","['Action', 'Drama', 'Romance', 'Thriller']"
65,4375,3809,5,4374,3808,What About Bob? (1991),['Comedy']
210,4375,2688,5,4374,2687,"General's Daughter, The (1999)","['Drama', 'Thriller']"
270,4375,150,5,4374,149,Apollo 13 (1995),['Drama']
271,4375,2710,5,4374,2709,"Blair Witch Project, The (1999)",['Horror']
125,4375,349,5,4374,348,Clear and Present Danger (1994),"['Action', 'Adventure', 'Thriller']"
60,4375,288,5,4374,287,Natural Born Killers (1994),"['Action', 'Thriller']"
141,4375,47,5,4374,46,Seven (Se7en) (1995),"['Crime', 'Thriller']"


#### Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

train_data, test_data = train_test_split(ratings, train_size=0.2)

train_ratings = train_data['rating']
R_train = train_ratings.values
user_ratings_mean_train = np.mean(R_train)
Ratings_demeaned_train = R_train - user_ratings_mean_train

train_data['rating'] = Ratings_demeaned_train

M_train = train_data.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)

U_train, sigma_train, Vt_train = np.linalg.svd(M_train)
sigma_train = np.diag(sigma_train)

k=20
all_user_predicted_ratings_train = np.dot(np.dot(U_train[:, :k], sigma_train[:k, :k]), Vt_train[:k, :])
preds_train = pd.DataFrame(all_user_predicted_ratings_train, columns=sorted(train_data.movie_id.unique()))

def predict_rating(user_id, movie_id):
    user_row_number = user_id - 1
    if movie_id in preds_train.columns and user_row_number in preds_train.index:
        pred = preds_train.loc[user_row_number, movie_id]
    else:
        pred = np.nan
    return pred

test_data['predicted_rating'] = test_data.apply(lambda x: predict_rating(x['user_id'], x['movie_id']), axis=1)

test_data.dropna(inplace=True)

rmse = sqrt(mean_squared_error(test_data['rating'], test_data['predicted_rating']))
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 3.746094029678399


### Nueral Network Model (Recommender Model)



**Objective**: Implement a Recommender model to recommend movies to a user based on similar users' preferences.

**Dataset**:
- **Ratings Data**: DataFrame with columns `user_id`, `movie_id`, and `rating`.
- **Movies Data**: DataFrame with columns `movie_id`, `title`, and `genres`.

**Steps**:

1. **Define Dataset and DataLoader**:
   - Create a custom PyTorch `Dataset` for ratings.
   - Create a DataLoader for batching and shuffling data.

2. **Define the Neural Network**:
   - Create a neural network with embedding layers for users and movies.

3. **Train the Model**:
   - Train the model using Mean Squared Error loss and Adam optimizer.
   - Save model checkpoints.

4. **Evaluate the Model**:
   - Calculate RMSE on the entire dataset.

5. **Predict Ratings for Unrated Movies**:
   - Predict and recommend top 10 unrated movies for a given user.

In [9]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [2]:
# Define the dataset
class RatingsDataset(Dataset):
    def __init__(self, ratings, device):
        #TODO
        self.users = torch.from_numpy(ratings['user_id'].to_numpy()).to(device)
        self.movies = torch.from_numpy(ratings['movie_id'].to_numpy()).to(device)
        self.ratings = torch.from_numpy(ratings['rating'].to_numpy()).to(device)

    def __len__(self):
        #TODO
        return len(self.ratings)

    def __getitem__(self, idx):
        #TODO
        return self.users[idx], self.movies[idx], self.ratings[idx]

# Define the neural network
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=50):
        super(RecommenderNet, self).__init__()
        #TODO

        self.u = nn.Embedding(num_users, embedding_size)
        self.m = nn.Embedding(num_movies, embedding_size)
        self.hidden = nn.Sequential(
            nn.Linear(embedding_size*2, embedding_size*2),
            nn.ReLU(),
            nn.Linear(embedding_size*2, embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size//2),
            nn.ReLU(),
        )
        self.fc = nn.Linear(embedding_size//2, 1)


    def forward(self, user, movie):
        #TODO
        users = torch.nn.functional.one_hot(user)
        movies = torch.nn.functional.one_hot(movie)
        features = torch.cat([self.u(user), self.m(movie)], dim=1)
        x = self.hidden(features)
        out = torch.sigmoid(self.fc(x))
        return out


# Create the dataset and dataloader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = RatingsDataset(ratings, device)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, loss function and optimizer

model = RecommenderNet(len(ratings.user_id.unique()), len(ratings.movie_id.unique()))
model.to(device)

lr = 1e-3
wd = 1e-5
n_epochs = 10
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

# Training loop
model.train()
for epoch in range(n_epochs):
    for X_batch_u, X_batch_m, y_batch in dataloader:
        y_pred = model(X_batch_u, X_batch_m)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


# Evaluation
model.eval()
y_pred = model(ratings['user_id'], ratings['movie_id'])
rmse = sqrt(mean_squared_error(ratings['rating'], y_pred))
print(f'Root Mean Squared Error: {rmse}')



Root Mean Squared Error: 2.482759101375018


In [None]:
# Example

sample_user_id = 1
sample_user_data = ratings[ratings['user_id'] == sample_user_id]
sample_dataset = RatingsDataset(sample_user_data)
sample_dataloader = DataLoader(sample_dataset, batch_size=1, shuffle=False)

print(f'Sample predictions for user ID {sample_user_id}:')
with torch.no_grad():
    for user, movie, rating in sample_dataloader:
        output = model(user, movie).squeeze()
        print(f'Movie ID: {movie.item()}, Predicted Rating: {output.item()}, Actual Rating: {rating.item()}')

In [None]:
def predict_unrated_movies(user_id, model, ratings, movies):
    #TODO
    not_watched = ratings.loc[~ratings['movie_id'].isin(ratings.loc[ratings['user_id']==user_id, 'movie_id']), 'movie_id']
    pred = movies[movies['movie_id'].isin(not_watched)]
    pred['predicted_rating'] = pred.apply(lambda x: model(user_id, x.movie_id))

# Example
user_id = 1
predictions = predict_unrated_movies(user_id, model, ratings, movies)
print(predictions)

      movie_id                             title             genres  \
1235      3338            For All Mankind (1989)        Documentary   
227       1704          Good Will Hunting (1997)              Drama   
1792      1797                    Everest (1998)        Documentary   
114        318  Shawshank Redemption, The (1994)              Drama   
2638       669                  Aparajito (1956)              Drama   
100        920         Gone with the Wind (1939)  Drama|Romance|War   
3341       598            Window to Paris (1994)             Comedy   
497        428              Bronx Tale, A (1993)              Drama   
74        1198    Raiders of the Lost Ark (1981)   Action|Adventure   
3359      2503           Apple, The (Sib) (1998)              Drama   

      predicted_rating  
1235          4.692185  
227           4.664774  
1792          4.663356  
114           4.659920  
2638          4.656652  
100           4.645355  
3341          4.635198  
497           4.63

### GMM (Gaussian Mixture Model)

**Objective**: Use a Gaussian Mixture Model to analyze and cluster the click data based on the number of clicks from different locations, aiming to identify distinct patterns of user behavior across 10 countries.

**Dataset**:
- **Click Data**: DataFrame with columns `link_id`, `location`, and `number_of_clicks`.

**Steps**:

1. **Preprocess Data**:
   - Ensure the dataset contains 10 distinct countries.
   - Create a matrix where rows represent different links and columns represent the number of clicks from each country.
   - Normalize the number of clicks to account for different scales.

2. **Fit GMM**:
   - Apply a Gaussian Mixture Model (GMM) to the click data matrix to identify clusters of links with similar click patterns across different countries.

3. **Evaluate Model**:
   - Analyze the resulting model by calculating the log lokelihood, BIC and AIC metrics.


In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

file_path = 'gmm-dataset.csv'
df = pd.read_csv(file_path)


gm = GaussianMixture(n_components=2, random_state=42).fit(df)

log_likelihood = # TODO
bic = # TODO
aic = # TODO

print(f"Log-Likelihood: {log_likelihood}")
print(f"BIC: {bic}")
print(f"AIC: {aic}")