In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw ratings and movie tables from the working directory
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Target vector is the rating column; the feature frame is every other ratings column
y = ratings["rating"].values
X = ratings.drop(columns=["rating"])

In [None]:
# Hold out 30 % of the ratings, then halve the hold-out into validation and test sets
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_and_test, y_val_and_test, test_size=0.5, random_state=101
)


In [None]:
# Training dataset: the training features with the target re-attached as a "rating" column
train_df = X_train.assign(rating=y_train)

train_df

Unnamed: 0,userId,movieId,timestamp,rating
75849,477,6708,1200941623,3.5
77555,483,3301,1415576202,4.5
79980,500,3083,1005528017,1.0
19382,125,65642,1474375988,4.5
15549,102,454,835876151,4.0
...,...,...,...,...
5695,41,1641,1458939146,2.0
73542,474,1544,974666859,1.0
83281,527,2742,1033173581,1.0
83467,531,593,1032961647,5.0


In [None]:
# Prepare the data
# Genres are stored as one pipe-delimited string per movie; turn them into a list
movies['genres'] = movies['genres'].str.split('|')
# The release year is the four-digit number in parentheses inside the title
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the "(1995)" suffix in the title
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

  movies['title'] = movies['title'].str.replace('(\(\d{4}\))', '').str.strip()


In [None]:
# Preview the first rows of the prepared movie table
movies.head(n=5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [None]:
# Create a DataFrame with one-hot encoded genres:
# one row per (movie, genre) pair -> indicator columns -> collapse back to one row per movie.
# `groupby(level=0).sum()` replaces `sum(level=0)`, which was deprecated and then removed in pandas 2.0.
genres_dummies = movies['genres'].explode().str.get_dummies().groupby(level=0).sum()

# Concatenate the new DataFrame with the original 'movies' DataFrame
movies_with_genres = pd.concat([movies[['movieId', 'title']], genres_dummies], axis=1)

movies_with_genres.head()


  genres_dummies = movies['genres'].apply(pd.Series).stack().str.get_dummies().sum(level=0)


Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The '(no genres listed)' column is a 0/1 flag, so its total is the number of flagged movies
no_genres_flags = movies_with_genres.loc[:, '(no genres listed)']
no_genres_count = no_genres_flags.sum()

print(f"Number of entries with 'no genres listed': {no_genres_count}")


Number of entries with 'no genres listed': 34


In [None]:
# Keep only the movies that have at least one real genre
has_genre = movies_with_genres['(no genres listed)'].eq(0)
movies_with_genres_filtered = movies_with_genres.loc[has_genre]

movies_with_genres_filtered.head()


Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The '(no genres listed)' flag is 0 for every remaining row, so the column carries no information
movies_with_genres = movies_with_genres_filtered.drop(columns=['(no genres listed)'])

movies_with_genres.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Generating a new dataframe with rating score count (cols) vs genres (rows)
# Genre indicator columns are every column except the two identifying ones
genres = [col for col in movies_with_genres.columns if col not in ('movieId', 'title')]
merged = pd.merge(ratings, movies_with_genres, on='movieId')

# Keep identifiers, rating, genre flags and title. The genre columns come from `genres`
# instead of a second hand-typed list that could drift out of sync with the data.
cleaned = merged[['movieId', 'userId', 'rating', *genres, 'title']]

movie_ratings = cleaned.sort_values('movieId')
rating_scale = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

genres

['Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [None]:
# Build one record per genre holding the number of ratings at each value of the rating scale
genre_rating_counts = []

for genre in genres:
    in_genre = movie_ratings[genre] == 1
    genre_dict = {'Genre': genre}
    genre_dict.update({
        f'Rated {rating}': len(movie_ratings.loc[in_genre & (movie_ratings['rating'] == rating)])
        for rating in rating_scale
    })
    genre_rating_counts.append(genre_dict)

genre_df = pd.DataFrame(genre_rating_counts)
genre_df

Unnamed: 0,Genre,Rated 0.5,Rated 1,Rated 1.5,Rated 2,Rated 2.5,Rated 3,Rated 3.5,Rated 4,Rated 4.5,Rated 5
0,Action,449,904,577,2548,1777,6331,4153,7678,2468,3750
1,Adventure,306,627,415,1769,1352,4838,3285,6392,2027,3150
2,Animation,80,116,96,346,365,1279,1051,1988,682,985
3,Children,169,301,161,721,530,2054,1205,2358,648,1061
4,Comedy,632,1317,895,3405,2530,8306,5086,9659,2794,4429
5,Crime,152,321,204,982,772,3116,2057,4621,1769,2687
6,Documentary,6,16,2,33,42,163,228,415,161,153
7,Drama,405,795,485,2339,1922,7541,5514,12360,4217,6350
8,Fantasy,178,286,214,893,719,2364,1634,2988,1040,1518
9,Film-Noir,8,6,4,33,17,106,108,292,116,180


In [None]:
# Long-format list of [genre, rating, count] triples
genres_rating_count = []
for genre in genres:
    # Filter once per genre; this selection does not depend on the rating being counted
    genre_list = movie_ratings.loc[movie_ratings[genre] == 1]
    for rating in rating_scale:
        genre_rated_count = len(genre_list.loc[genre_list['rating'] == rating])
        genres_rating_count.append([genre, rating, genre_rated_count])

# Function for extracting the count of a specific rating of all genres
def get_rating_count(score):
    """Count the ratings equal to ``score`` for every genre.

    Reads the module-level ``genres`` list and ``movie_ratings`` frame.

    Parameters
    ----------
    score : rating value to count (compared with ``==`` against the 'rating' column).

    Returns
    -------
    list of int, one entry per genre, in the order of ``genres``.
    """
    has_score = movie_ratings['rating'] == score
    return [len(movie_ratings.loc[(movie_ratings[genre] == 1) & has_score]) for genre in genres]

def get_total_rating_count():
    """Return the total number of ratings for every genre.

    Reads the module-level ``genres`` list and ``movie_ratings`` frame. A rating counts
    towards a genre when that genre's indicator column equals 1 for the rated movie.

    The previous implementation materialised one string per rating, padded the list and
    then rescanned it once per genre; the padding never affected the result, so the
    totals are simply the per-genre row counts computed here.

    Returns
    -------
    list of int, one entry per genre, in the order of ``genres``.
    """
    return [int((movie_ratings[genre] == 1).sum()) for genre in genres]

# Check if '(no genres listed)' is in the list of genres
if '(no genres listed)' in genres:
    genres.remove('(no genres listed)')

# Count every rating value once per genre and reuse the result below. Previously each
# column was recomputed three times: for the length check and for two identical
# DataFrame constructions.
rating_counts = {rating: get_rating_count(rating) for rating in rating_scale}
total_rating_counts = get_total_rating_count()

# Print the lengths of the arrays: every column must have exactly one entry per genre
for rating in rating_scale:
    print(f"Rated {rating} length:", len(rating_counts[rating]))
print("Total nr of ratings length:", len(total_rating_counts))

# Create the DataFrame (one row per genre, one 'Rated <value>' column per rating value)
genre_df = pd.DataFrame({'Genre': list(genres),
                         **{f'Rated {rating}': rating_counts[rating] for rating in rating_scale},
                         'Total nr of ratings': total_rating_counts})

# Average rating per genre = sum(rating value * count) / total number of ratings
weighted_rating_sum = sum(genre_df[f'Rated {rating}'] * rating for rating in rating_scale)
genre_df["Average rating"] = weighted_rating_sum / genre_df['Total nr of ratings']
genre_df

Rated 0.5 length: 19
Rated 1 length: 19
Rated 1.5 length: 19
Rated 2 length: 19
Rated 2.5 length: 19
Rated 3 length: 19
Rated 3.5 length: 19
Rated 4 length: 19
Rated 4.5 length: 19
Rated 5 length: 19
Total nr of ratings length: 19


Unnamed: 0,Genre,Rated 0.5,Rated 1,Rated 1.5,Rated 2,Rated 2.5,Rated 3,Rated 3.5,Rated 4,Rated 4.5,Rated 5,Total nr of ratings,Average rating
0,Action,449,904,577,2548,1777,6331,4153,7678,2468,3750,30635,3.447984
1,Adventure,306,627,415,1769,1352,4838,3285,6392,2027,3150,24161,3.508609
2,Animation,80,116,96,346,365,1279,1051,1988,682,985,6988,3.629937
3,Children,169,301,161,721,530,2054,1205,2358,648,1061,9208,3.412956
4,Comedy,632,1317,895,3405,2530,8306,5086,9659,2794,4429,39053,3.384721
5,Crime,152,321,204,982,772,3116,2057,4621,1769,2687,16681,3.658294
6,Documentary,6,16,2,33,42,163,228,415,161,153,1219,3.797785
7,Drama,405,795,485,2339,1922,7541,5514,12360,4217,6350,41928,3.656184
8,Fantasy,178,286,214,893,719,2364,1634,2988,1040,1518,11834,3.491001
9,Film-Noir,8,6,4,33,17,106,108,292,116,180,870,3.920115


In [None]:
# Attach the per-movie genre indicators to every training rating, then discard the
# columns that are not used as model inputs
content_train_df = (
    pd.merge(train_df, movies_with_genres, on='movieId')
    .drop(columns=['timestamp', 'movieId', 'title'])
)

content_train_df

Unnamed: 0,userId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,477,3.5,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,356,4.0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,226,4.0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,610,3.5,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,288,3.0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70547,567,1.5,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
70548,560,3.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
70549,125,3.5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
70550,462,5.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Creating a list of arrays with the target attribute (movie ratings), one array per user.
# Group by the bare column name: a one-element list key makes pandas yield 1-tuples as
# group keys (FutureWarning in pandas 1.5, actual behaviour from 2.0).
y_grouped_by_user = content_train_df.groupby("userId")
y_train_listed = [group["rating"].values for _, group in y_grouped_by_user]

y_train_listed[0]

  for i, j in y_grouped_by_user:


array([5., 4., 4., 5., 5., 4., 4., 4., 5., 5., 4., 5., 4., 5., 5., 5., 5.,
       5., 3., 5., 4., 5., 5., 5., 5., 4., 5., 5., 4., 4., 3., 4., 4., 4.,
       5., 5., 4., 3., 5., 4., 5., 3., 5., 5., 4., 5., 3., 5., 3., 4., 5.,
       5., 4., 4., 5., 4., 5., 4., 4., 5., 3., 5., 4., 5., 4., 3., 5., 5.,
       5., 5., 4., 5., 5., 5., 5., 4., 5., 4., 5., 5., 5., 5., 3., 3., 4.,
       5., 4., 4., 5., 5., 4., 3., 5., 2., 4., 3., 5., 4., 5., 5., 5., 3.,
       3., 5., 5., 5., 4., 5., 4., 5., 5., 5., 5., 2., 5., 5., 5., 5., 5.,
       4., 5., 4., 4., 4., 2., 5., 4., 4., 5., 5., 5., 5., 4., 5., 5., 5.,
       5., 5., 5., 3., 3., 4., 3., 5., 5., 5., 5., 5., 5., 5., 4., 5., 5.,
       4., 5., 5., 5., 4., 5., 4., 4., 4., 4., 4., 4.])

In [None]:
# Creating a list of dataframes with the feature set (genre indicators), one frame per user.
# Rebind instead of dropping in place, and tolerate an already-removed column, so the
# cell can be re-run without a KeyError.
content_train_df = content_train_df.drop(columns='rating', errors='ignore')
# Group by the bare column name so that group keys are scalar user IDs rather than
# 1-tuples (see the FutureWarning pandas emits for a one-element list key)
x_grouped_by_user = content_train_df.groupby("userId")
x_train_listed = [group.drop(columns='userId') for _, group in x_grouped_by_user]

x_train_listed[0]

  for user_id, group in x_grouped_by_user:


Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
327,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
448,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
986,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1012,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1263,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62661,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
63509,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
64143,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65354,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Feature matrix for every movie: the genre indicators without the identifying columns
all_movies = movies_with_genres.drop(columns=['movieId', 'title'])
all_movies

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Creating a 2 dimensional matrix for the validation data in order to make it easier to calculate RMSE.

# Listing the user ID's in the same order as in the grouped dataframes
user_ids = []
for user_id, group in x_grouped_by_user:
    user_ids.append(user_id)

# Listing the movie IDs in the same order as in the movies dataset
movie_ids = movies_with_genres["movieId"].values

# Creating the matrix
df_val = X_val.copy()
df_val["rating"] = y_val
validation_matrix = pd.DataFrame(index=user_ids, columns=movie_ids)
for array in df_val.to_records():
    user = array['userId']
    movie = array['movieId']
    true_rating = array['rating']
    validation_matrix.loc[user][movie] = true_rating

validation_matrix

  for user_id, group in x_grouped_by_user:


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,,,,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Assigning the different machine learning algorithms to be implemented in the models to a dictionary
ml_algorithms = {"Linear regression": LinearRegression(), "Lasso": Lasso(alpha=1.0, max_iter=10000),
                 "KNN_7": KNeighborsRegressor(n_neighbors=7),}

# Saving lists
CBF_models_listed = []
RMSE_CBF_listed = []

# The (user, movie) cells present in the validation data do not depend on the model,
# so the mask and the true ratings are extracted once instead of once per algorithm.
val_mask = validation_matrix.notna().to_numpy()
num_actual = validation_matrix.to_numpy()[val_mask].astype(float)

# For every machine learning algorithm in the dictionary:
for name, ml_alg in ml_algorithms.items():
    # Create an empty list for predictions
    CBF_predictions = []

    # For each user in the training dataset (features and targets share the same user order):
    for user_features, user_ratings in zip(x_train_listed, y_train_listed):
        # Fit a machine learning model on this user's rated movies only
        ml_alg.fit(user_features, user_ratings)
        # Predict all the ratings for this user for all movies
        prediction = ml_alg.predict(all_movies)
        prediction = np.clip(prediction, 0.5, 5)  # Predictions must be minimum 0.5, maximum 5

        CBF_predictions.append(prediction)

    df_predict = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

    # Keep only the predictions for the movie-user combinations that appear in the validation set
    num_predict = df_predict.to_numpy()[val_mask]

    # Calculate the RMSE for the content-based filtering model and add the result to the lists.
    # Arguments follow scikit-learn's (y_true, y_pred) order.
    RMSE_CBF_listed.append(sqrt(mean_squared_error(num_actual, num_predict)))
    CBF_models_listed.append(name)


# Printing the results
RMSE_CBF_df = pd.DataFrame({"Model": CBF_models_listed, "RMSE": RMSE_CBF_listed})
print("RMSE of different content-based filtering models without the year of release feature:")
RMSE_CBF_df

  model = cd_fast.enet_coordinate_descent(


RMSE of different content-based filtering models without the year of release feature:


Unnamed: 0,Model,RMSE
0,Linear regression,0.971877
1,Lasso,0.943488
2,KNN_7,0.947474


In [None]:
# Refit the Lasso configuration from the comparison above and keep its predictions
model = Lasso(alpha=1.0, max_iter=10000)
CBF_predictions = []

# One fit per user: train on that user's rated movies, then score every movie
for user_features, user_ratings in zip(x_train_listed, y_train_listed):
    model.fit(user_features, user_ratings)
    CBF_predictions.append(np.clip(model.predict(all_movies), 0.5, 5))

# Rows are users, columns are movies
CBF_model = pd.DataFrame(CBF_predictions, index=user_ids, columns=movie_ids)

  model = cd_fast.enet_coordinate_descent(


**COLLABORATIVE FILTERING**

In [None]:
# Preview of the training ratings used for collaborative filtering
train_df.head(n=5)

Unnamed: 0,userId,movieId,timestamp,rating
75849,477,6708,1200941623,3.5
77555,483,3301,1415576202,4.5
79980,500,3083,1005528017,1.0
19382,125,65642,1474375988,4.5
15549,102,454,835876151,4.0


In [None]:
# Preparing the user x movie rating matrix used for the Pearson distance between users
user_matrix = train_df.pivot(index='userId', columns='movieId', values='rating')

# Centre every user's ratings on that user's own mean rating,
# then treat unrated movies as 0.0 (NaN replaced after centring)
user_means = user_matrix.mean(axis=1)
user_matrix = user_matrix.sub(user_means, axis=0).fillna(0.0)

In [None]:
# Pearson distance between users = 1 - Pearson correlation of their rating rows
user_correlation = user_matrix.T.corr()
user_dist_matrix = 1 - user_correlation
user_dist_matrix


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.999096,1.006102,0.955555,0.925946,1.059580,1.026439,0.968421,1.007093,1.010789,...,0.985837,0.979998,1.007381,1.016413,1.024569,1.004491,0.956766,0.950406,0.966357,0.972688
2,0.999096,0.000000,1.000000,1.000000,1.000000,1.000000,1.016127,1.000000,1.000000,0.940668,...,0.991519,1.000000,0.999763,1.000000,1.000000,1.000098,1.000000,1.002658,1.000000,0.987964
3,1.006102,1.000000,0.000000,1.000000,1.000000,1.006681,1.000000,1.000000,1.000000,1.000000,...,0.981677,1.000000,0.984386,1.000000,1.000000,1.033552,1.018175,1.012319,1.000000,0.982514
4,0.955555,1.000000,1.000000,0.000000,1.058511,0.995839,0.976590,1.007364,1.000000,0.934624,...,1.068304,0.994397,1.020133,1.031062,0.981521,1.003005,0.948547,1.022033,1.023136,0.990882
5,0.925946,1.000000,1.000000,1.058511,0.000000,1.064905,0.994083,1.156730,1.000000,1.044650,...,1.018673,0.880664,0.959562,0.970350,1.048505,0.985872,0.959635,1.012449,1.000000,1.003414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,1.004491,1.000098,1.033552,1.003005,0.985872,0.999072,0.984814,0.979197,0.977637,1.034548,...,0.984614,0.983255,0.948565,0.966388,0.978032,0.000000,0.981128,0.958009,0.968046,0.947757
607,0.956766,1.000000,1.018175,0.948547,0.959635,0.964193,0.989806,0.956837,1.019892,0.985954,...,0.966814,0.896443,0.961341,0.983407,1.020324,0.981128,0.000000,0.971529,0.963761,0.977792
608,0.950406,1.002658,1.012319,1.022033,1.012449,0.969714,0.974284,0.952424,0.973807,1.059254,...,0.946803,0.969459,0.982870,0.986618,1.017981,0.958009,0.971529,0.000000,0.939487,0.960487
609,0.966357,1.000000,1.000000,1.023136,1.000000,0.951998,1.019909,0.911276,1.000000,1.060620,...,0.950630,0.922213,0.986043,0.827499,1.049777,0.968046,0.963761,0.939487,0.000000,1.000870


In [None]:
#Predicting ratings for every user with K-NN
# Every label matches the neighbour count actually used (the last entry was labelled
# "kNN-60" while running with 50 neighbours).
ml_algorithms = {'kNN-5': 5, 'kNN-10': 10, 'kNN-20': 20, 'kNN-30': 30, 'kNN-40': 40, "kNN-50": 50}

models_CF = []
RMSE_CF = []

# Training ratings of every movie as a Series indexed by userId. Built once, instead of
# rescanning the training frame for each validation row and each model.
train_ratings_by_movie = {
    movie_id: movie_rows.set_index('userId')['rating']
    for movie_id, movie_rows in train_df.groupby('movieId')
}

# Training the models and predicting for the users and movies in the validation data
for name, num_neighbours in ml_algorithms.items():
    predictions = []

    # For every rating in the validation data
    for index, row in X_val.iterrows():
        ratings_for_movie = train_ratings_by_movie.get(row['movieId'])
        # If the movie is in the training data (and the user has a row in the distance matrix)
        if ratings_for_movie is not None and row['userId'] in user_dist_matrix.index:
            # Sort the users who rated this movie by similarity (Pearson distance)
            users_sorted = user_dist_matrix.loc[row['userId'], ratings_for_movie.index].sort_values()
            nearest_neighbours = users_sorted.iloc[:num_neighbours]
            # Fetch the ratings in the SAME order as the neighbours. np.average pairs values
            # and weights by position, so ratings in training-row order would be weighted
            # with other users' distances.
            neighbour_ratings = ratings_for_movie.loc[nearest_neighbours.index]
            # Calculate the weighted average of the nearest neighbours' ratings
            nearest_neighbours_avg_rating = np.average(neighbour_ratings, axis=0,
                                                       weights=(1/nearest_neighbours))
        else:
            nearest_neighbours_avg_rating = 3.5

        # Appending the prediction to the list of predictions
        # (NaN arises e.g. from a zero or undefined distance; fall back to a constant)
        if not np.isnan(nearest_neighbours_avg_rating):
            predictions.append(nearest_neighbours_avg_rating)
        else:
            predictions.append(3)

    models_CF.append(name)
    RMSE_CF.append(sqrt(mean_squared_error(y_val, predictions)))


RMSE_CF_dict = {"Model": models_CF, "RMSE": RMSE_CF}
RMSE_CF_df = pd.DataFrame(RMSE_CF_dict)
RMSE_CF_df

Unnamed: 0,Model,RMSE
0,kNN-5,1.008812
1,kNN-10,0.992045
2,kNN-20,0.987813
3,kNN-30,0.988659
4,kNN-40,0.989698
5,kNN-60,0.990831


In [None]:
# Rerunning the best model so far (kNN-20)
best_CF_model = []
RMSE_best_CF = []

CF_predictions = []

# Training ratings of every movie as a Series indexed by userId (built once)
train_ratings_by_movie = {
    movie_id: movie_rows.set_index('userId')['rating']
    for movie_id, movie_rows in train_df.groupby('movieId')
}

# For every movie in the validation data
for index, row in X_val.iterrows():
    ratings_for_movie = train_ratings_by_movie.get(row['movieId'])
    # If that movie is in the training data (and the user has a row in the distance matrix)
    if ratings_for_movie is not None and row['userId'] in user_dist_matrix.index:
        users_sorted = user_dist_matrix.loc[row['userId'], ratings_for_movie.index].sort_values()
        # Select the nearest neighbours
        nearest_neighbours = users_sorted.iloc[:20]
        # Fetch the ratings in the SAME order as the neighbours. np.average pairs values and
        # weights by position, so ratings in training-row order would be weighted with
        # other users' distances.
        neighbour_ratings = ratings_for_movie.loc[nearest_neighbours.index]
        # Calculate the weighted average of the nearest neighbours' ratings
        nearest_neighbours_avg_rating = np.average(neighbour_ratings, axis=0,
                                                   weights=(1/nearest_neighbours))
    else:
        nearest_neighbours_avg_rating = 3.5

    if not np.isnan(nearest_neighbours_avg_rating):
        CF_predictions.append(nearest_neighbours_avg_rating)
    else:
        # Same NaN fallback (3) as the model evaluation loop, so that this really is a
        # rerun of the evaluated kNN-20 model (the fallback here used to be 4)
        CF_predictions.append(3)

**HYBRID**

In [None]:
# Look up the content-based prediction for every (user, movie) pair of the validation data
CBF_predictions = []
for userId, movieId in zip(X_val["userId"], X_val["movieId"]):
    # CBF_model has users as rows (index) and movies as columns, so the user is checked
    # against the index and the movie against the columns (the checks used to be swapped)
    if userId in CBF_model.index and movieId in CBF_model.columns:
        CBF_predictions.append(CBF_model.at[userId, movieId])
    else:
        CBF_predictions.append(np.nan)  # Add a placeholder for missing predictions

# Calculating the predictions for the different hybrid "models"
print("RMSE combined approach (Lasso and KNN-20):")
weighted_avgs = [(0.8, 0.2), (0.75, 0.25), (0.7, 0.3), (0.65, 0.35), (0.6, 0.4), (0.55, 0.45), (0.5, 0.5), (0.45, 0.55), (0.4, 0.6), (0.35, 0.65), (0.3, 0.7), (0.25, 0.75), (0.20, 0.80)]

# Keep the three vectors position-aligned: entry i of each refers to validation row i.
# Filtering NaNs out of each list separately and truncating to a common length pairs
# predictions with ratings that belong to different (user, movie) rows.
cbf_array = np.asarray(CBF_predictions, dtype=float)
cf_array = np.asarray(CF_predictions, dtype=float)
y_val_array = np.asarray(y_val, dtype=float)
both_valid = ~np.isnan(cbf_array) & ~np.isnan(cf_array)

valid_CBF_predictions = cbf_array[both_valid]
valid_filtered_CF_predictions = cf_array[both_valid]
y_val_valid = y_val_array[both_valid]

for weight in weighted_avgs:
    # Calculate combined predictions using the current weight
    combined_predictions = weight[0] * valid_CBF_predictions + weight[1] * valid_filtered_CF_predictions

    # Calculate RMSE only if there are valid predictions
    if len(combined_predictions) > 0:
        rmse_combined = sqrt(mean_squared_error(y_val_valid, combined_predictions))
        print(f"RMSE for combined approach with CBF weighted {weight[0]} and CF weighted {weight[1]}: \n", rmse_combined, "\n")
    else:
        print(f"No valid predictions for the current weight combination: CBF weighted {weight[0]} and CF weighted {weight[1]}.\n")


RMSE combined approach (Lasso and KNN-20):
RMSE for combined approach with CBF weighted 0.8 and CF weighted 0.2: 
 1.0432209008945907 

RMSE for combined approach with CBF weighted 0.75 and CF weighted 0.25: 
 1.0291536339518264 

RMSE for combined approach with CBF weighted 0.7 and CF weighted 0.3: 
 1.0162326688705503 

RMSE for combined approach with CBF weighted 0.65 and CF weighted 0.35: 
 1.0045022414479008 

RMSE for combined approach with CBF weighted 0.6 and CF weighted 0.4: 
 0.994004501830115 

RMSE for combined approach with CBF weighted 0.55 and CF weighted 0.45: 
 0.9847788721503914 

RMSE for combined approach with CBF weighted 0.5 and CF weighted 0.5: 
 0.9768613950810228 

RMSE for combined approach with CBF weighted 0.45 and CF weighted 0.55: 
 0.9702840945599751 

RMSE for combined approach with CBF weighted 0.4 and CF weighted 0.6: 
 0.9650743722133971 

RMSE for combined approach with CBF weighted 0.35 and CF weighted 0.65: 
 0.9612544639379719 

RMSE for combined 