In [None]:
import re

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the movie metadata and the encoded genre table from the working directory.
films_df, genres_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Films_metadata.csv', 'Genres_encoded.csv')
)

# Preview the first rows of the metadata.
films_df.head()

Unnamed: 0.1,Unnamed: 0,movieID,title,genres,imdb_link,tmdb_link,users_avg_ratings_to_movie
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862/,3.92093
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844/,3.431818
2,2,3,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602/,3.259615
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357/,2.357143
4,4,5,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862/,3.071429


In [None]:
# Preview the encoded genre table: 0/1 indicator columns, one per genre.
genres_df.head()

Unnamed: 0.1,Unnamed: 0,Mystery,Action,IMAX,Adventure,Sci-Fi,War,Thriller,Western,Crime,...,Romance,Horror,Documentary,Musical,Comedy,Drama,(no genres listed),Fantasy,Children,Animation
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,2,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Join metadata and genre flags on their shared 'Unnamed: 0' column,
# then key the combined table by movieID.
merged_df = films_df.merge(genres_df, on='Unnamed: 0').set_index('movieID')



In [None]:
# Check the merge result: metadata columns followed by the genre flags, indexed by movieID.
merged_df.head()

Unnamed: 0_level_0,Unnamed: 0,title,genres,imdb_link,tmdb_link,users_avg_ratings_to_movie,Mystery,Action,IMAX,Adventure,...,Romance,Horror,Documentary,Musical,Comedy,Drama,(no genres listed),Fantasy,Children,Animation
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://www.imdb.com/title/tt0114709/,https://www.themoviedb.org/movie/862/,3.92093,0,0,0,1,...,0,0,0,0,1,0,0,1,1,1
2,1,Jumanji (1995),Adventure|Children|Fantasy,https://www.imdb.com/title/tt0113497/,https://www.themoviedb.org/movie/8844/,3.431818,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
3,2,Grumpier Old Men (1995),Comedy|Romance,https://www.imdb.com/title/tt0113228/,https://www.themoviedb.org/movie/15602/,3.259615,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,3,Waiting to Exhale (1995),Comedy|Drama|Romance,https://www.imdb.com/title/tt0114885/,https://www.themoviedb.org/movie/31357/,2.357143,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
5,4,Father of the Bride Part II (1995),Comedy,https://www.imdb.com/title/tt0113041/,https://www.themoviedb.org/movie/11862/,3.071429,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Remove the merge key and the descriptive columns that are not used further on.
# Only columns that are actually present are dropped, so re-running is harmless.
columns_to_drop = ['Unnamed: 0', 'genres', 'imdb_link', 'tmdb_link']
present_columns = merged_df.columns.intersection(columns_to_drop)
merged_df.drop(columns=present_columns, inplace=True)


def extract_title_and_year(title):
    """Split a title such as ``"Toy Story (1995)"`` into its name and release year.

    Args:
        title: Raw title, expected to end with a four-digit year in parentheses.
            Whitespace after the closing parenthesis is tolerated.

    Returns:
        tuple: ``(name, year)`` with ``year`` as an ``int`` when the pattern matches;
        otherwise ``(title, None)`` with the input returned unchanged. Non-string
        input (for example a missing value) also yields ``(title, None)``.
    """
    # Missing titles arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(title, str):
        return title, None

    # The optional trailing whitespace lets titles like "Name (1994) " match
    # instead of being reported as having no year.
    match = re.search(r'^(.*)\s\((\d{4})\)\s*$', title)
    if match is None:
        return title, None
    return match.group(1), int(match.group(2))

# Replace 'title' with the bare name and add a 'year' column (missing where no year was found).
# NOTE(review): not idempotent - once 'title' has lost its "(YYYY)" suffix, re-running this
# line finds no year for any movie.
merged_df[['title', 'year']] = merged_df['title'].apply(lambda x: pd.Series(extract_title_and_year(x)))


# Only add the min-max scaled 'year_normalized' column when every movie has a year;
# otherwise just print a warning and leave the frame without it.
if merged_df['year'].notnull().all():
    scaler = MinMaxScaler()
    merged_df['year_normalized'] = scaler.fit_transform(merged_df[['year']])
else:
    print("Warning: Some titles are missing a valid year.")



In [None]:
# Count the movies for which no year could be extracted from the title.
merged_df['year'].isna().sum()

np.int64(24)

We wanted to use the release year as a feature and normalize it, but 24 titles have no extractable year, so we decided not to use it.

In [None]:
# The year feature is not used, so remove the column again.
# errors='ignore' keeps this cell re-runnable: a plain drop raises KeyError
# on a second run because 'year' is already gone.
merged_df = merged_df.drop(columns=['year'], errors='ignore')
print(merged_df.columns)

Index(['title', 'users_avg_ratings_to_movie', 'Mystery', 'Action', 'IMAX',
       'Adventure', 'Sci-Fi', 'War', 'Thriller', 'Western', 'Crime',
       'Film-Noir', 'Romance', 'Horror', 'Documentary', 'Musical', 'Comedy',
       'Drama', '(no genres listed)', 'Fantasy', 'Children', 'Animation'],
      dtype='object')


In [None]:
# Feature matrix: every remaining column except the title, i.e. the average user
# rating plus the 0/1 genre flags. Columns are selected by label rather than by
# position so the feature set does not depend on how many columns genres_df has.
genre_features = merged_df.drop(columns=['title'])

# Pairwise cosine similarity between all movies, labelled by movieID on both axes.
similarity_matrix = cosine_similarity(genre_features)
similarity_df = pd.DataFrame(similarity_matrix, index=merged_df.index, columns=merged_df.index)


In [None]:
# Inspect the feature matrix; note that its first column is the average rating, not a genre flag.
genre_features

Unnamed: 0_level_0,users_avg_ratings_to_movie,Mystery,Action,IMAX,Adventure,Sci-Fi,War,Thriller,Western,Crime,...,Romance,Horror,Documentary,Musical,Comedy,Drama,(no genres listed),Fantasy,Children,Animation
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.920930,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
2,3.431818,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,3.259615,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,2.357143,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
5,3.071429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,4.000000,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
193583,3.500000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
193585,3.500000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
193587,3.500000,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Inspect the movie-by-movie similarity table (rows and columns are both movieIDs).
similarity_df

movieID,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.948394,0.859251,0.775746,0.894580,0.795419,0.857502,0.917764,0.827341,0.835167,...,0.864134,0.852539,0.871208,0.896465,0.835246,0.925579,0.948748,0.835246,0.864095,0.896465
2,0.948394,1.000000,0.818983,0.719404,0.848882,0.817462,0.815933,0.963453,0.850268,0.866628,...,0.775116,0.807515,0.841685,0.866086,0.858392,0.856660,0.866741,0.858392,0.827725,0.866086
3,0.859251,0.818983,1.000000,0.931689,0.959439,0.840022,0.999964,0.823179,0.873734,0.822034,...,0.866325,0.829801,0.931249,0.889989,0.882082,0.883461,0.894278,0.882082,0.850569,0.958248
4,0.775746,0.719404,0.931689,1.000000,0.872086,0.737886,0.932700,0.723090,0.767499,0.722084,...,0.784470,0.831985,0.920910,0.781777,0.774832,0.797207,0.809782,0.868751,0.747150,0.864693
5,0.894580,0.848882,0.959439,0.872086,1.000000,0.870690,0.957895,0.853232,0.905633,0.852045,...,0.902386,0.860096,0.969460,0.922481,0.914286,0.919711,0.931503,0.914286,0.881622,0.997566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.925579,0.856660,0.883461,0.797207,0.919711,0.870893,0.881636,0.802583,0.920024,0.858776,...,0.942990,0.876460,0.895979,0.921954,0.860013,1.000000,0.973417,0.860013,0.947758,0.921954
193583,0.948748,0.866741,0.894278,0.809782,0.931503,0.820682,0.892625,0.804226,0.853618,0.803107,...,0.905218,0.887905,0.905357,0.931605,0.861774,0.973417,1.000000,0.861774,0.898822,0.931605
193585,0.835246,0.858392,0.882082,0.868751,0.914286,0.880444,0.878797,0.862790,0.915779,0.861590,...,0.834837,0.952563,0.971286,0.932815,0.924528,0.860013,0.861774,1.000000,0.891499,0.932815
193587,0.864095,0.827725,0.850569,0.747150,0.881622,0.910460,0.847402,0.831966,0.963799,0.898704,...,0.936442,0.918532,0.874147,0.963739,0.891499,0.947758,0.898822,0.891499,1.000000,0.899490


In [None]:
def recommend_movies(movie_id, top_n=5):
    """Return the titles of the movies most similar to ``movie_id``.

    Reads the notebook-level ``similarity_df`` and ``merged_df``.

    Args:
        movie_id: movieID of the query movie (a column label of ``similarity_df``).
        top_n: Number of recommendations to return.

    Returns:
        A one-column (``title``) DataFrame indexed by movieID, most similar first,
        or an error-message string when ``movie_id`` is not in ``similarity_df``.
    """
    if movie_id not in similarity_df:
        return f"Movie ID {movie_id} not found in dataset."

    ranked = similarity_df[movie_id].sort_values(ascending=False)

    # Remove the query movie by label. Skipping the first row is not reliable:
    # a movie with identical features also scores 1.0 and may sort ahead of the
    # query movie, which would then show up in its own recommendations.
    ranked = ranked.drop(movie_id).head(top_n)

    return merged_df.loc[ranked.index][['title']]


In [None]:
# Example query: the five movies most similar to movieID 10.
recommend_movies(movie_id=10, top_n=5)


Unnamed: 0_level_0,title
movieID,Unnamed: 1_level_1
3984,Diamonds Are Forever
2993,Thunderball
63113,Quantum of Solace
2989,For Your Eyes Only
3635,"Spy Who Loved Me, The"


Note: The similarity scores here are affected by including each movie's global average rating (averaged over all users) as a feature.

So in the final implementation we do not use ratings as a feature. You will find that in the inference_classes implementation.

Below is an implementation of the ContentBasedRecommender class, which does not take the global average ratings of movies into account.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class ContentBasedRecommender:
    """Genre-based movie recommender using pairwise cosine similarity.

    On construction the metadata and genre CSVs are merged, a movie-by-movie
    similarity table is computed from the genre indicator columns, and the
    ratings CSV is loaded.

    Expected inputs (as used by the code below):
        * metadata CSV: columns ``Unnamed: 0``, ``movieID`` and ``title``;
        * genres CSV: column ``Unnamed: 0`` plus the genre indicator columns;
        * ratings CSV: columns whose lower-cased names are ``userid``,
          ``movieid`` and ``rating``.

    Attributes:
        merged_df: Cleaned metadata + genre table indexed by movieID.
        similarity_df: Square similarity table, movieID on both axes.
        ratings_df: Ratings with lower-cased column names; ratings of 0 are NaN.
    """

    # Columns of merged_df that must not feed the similarity computation. The
    # average rating is excluded so that similarity reflects genres only.
    _NON_FEATURE_COLUMNS = ('title', 'users_avg_ratings_to_movie')

    # Column layout of every recommendation table returned by this class.
    _REC_COLUMNS = ['movieId', 'title', 'similarity_score']

    def __init__(self, metadata_path, genres_path, ratings_path):
        """Store the three CSV locations and eagerly build all internal tables."""
        self.metadata_path = metadata_path
        self.genres_path = genres_path
        self.ratings_path = ratings_path

        self.merged_df = None
        self.similarity_df = None
        self.ratings_df = None

        self._load_and_prepare_data()
        self._compute_similarity_matrix()
        self._load_ratings()

    def _load_and_prepare_data(self):
        """Read metadata and genres, merge them and keep only the useful columns."""
        films_df = pd.read_csv(self.metadata_path)
        genres_df = pd.read_csv(self.genres_path)

        merged = pd.merge(films_df, genres_df, on='Unnamed: 0').set_index('movieID')

        # Drop the merge key and descriptive columns; absent columns are skipped.
        merged = merged.drop(
            columns=['Unnamed: 0', 'genres', 'imdb_link', 'tmdb_link'],
            errors='ignore',
        )
        merged['title'] = merged['title'].astype(str)

        self.merged_df = merged

    def _compute_similarity_matrix(self):
        """Build ``similarity_df`` from the genre indicator columns only."""
        genre_features = self.merged_df.drop(
            columns=list(self._NON_FEATURE_COLUMNS), errors='ignore'
        )

        similarity_matrix = cosine_similarity(genre_features)
        self.similarity_df = pd.DataFrame(
            similarity_matrix,
            index=self.merged_df.index,
            columns=self.merged_df.index
        )

    def _load_ratings(self):
        """Read the ratings CSV, lower-case its column names and blank out 0 ratings."""
        ratings = pd.read_csv(self.ratings_path)
        ratings.columns = [col.lower() for col in ratings.columns]

        # A rating of 0 is treated as "not rated". mask() writes NaN, so the
        # column stays float64 instead of degrading to object dtype.
        rating_values = ratings['rating'].astype(float)
        ratings['rating'] = rating_values.mask(rating_values == 0.0)

        self.ratings_df = ratings

    def get_recs(self, movie_id, n=10):
        """Return the ``n`` movies most similar to ``movie_id``.

        Args:
            movie_id: movieID of the query movie.
            n: Maximum number of recommendations.

        Returns:
            DataFrame with columns ``movieId``, ``title`` and ``similarity_score``,
            most similar first, with a fresh 0..k-1 index.

        Raises:
            ValueError: If ``movie_id`` is not a column of the similarity table.
        """
        if movie_id not in self.similarity_df:
            raise ValueError(f"Movie ID {movie_id} not found in similarity matrix.")

        ranked = (
            self.similarity_df[movie_id]
            .sort_values(ascending=False)
            .drop(movie_id)  # never recommend the query movie itself
            .head(n)
        )

        return pd.DataFrame({
            'movieId': ranked.index.to_numpy(),
            'title': self.merged_df.loc[ranked.index, 'title'].to_numpy(),
            'similarity_score': ranked.to_numpy(),
        })[self._REC_COLUMNS]

    def get_user_recs(self, user_id, n=10, top_rated=3):
        """Recommend unseen movies similar to a user's highest-rated movies.

        Args:
            user_id: Value looked up in the ``userid`` column of the ratings.
            n: Maximum number of recommendations to return.
            top_rated: How many of the user's best-rated movies are used as seeds.

        Returns:
            DataFrame with columns ``movieId``, ``title`` and ``similarity_score``,
            sorted by descending similarity. It is empty (with these columns) when
            none of the seed movies is present in the similarity table.

        Raises:
            ValueError: If the user has no rows in the ratings data.
        """
        user_mask = self.ratings_df['userid'] == user_id
        if not user_mask.any():
            raise ValueError(f"User ID {user_id} not found in ratings data.")

        user_ratings = self.ratings_df.loc[user_mask].dropna(subset=['rating'])
        user_rated_ids = set(user_ratings['movieid'])

        seed_ids = (
            user_ratings
            .sort_values(by='rating', ascending=False)['movieid']
            .tolist()[:top_rated]
        )

        # Ask for 2*n candidates per seed so that some remain after removing
        # the movies the user has already rated.
        candidate_frames = [
            self.get_recs(seed_id, n=n * 2)
            for seed_id in seed_ids
            if seed_id in self.similarity_df
        ]
        if not candidate_frames:
            # No seed is known to the similarity table: return an empty result
            # instead of failing on a frame that has no 'movieId' column.
            return pd.DataFrame(columns=self._REC_COLUMNS)

        candidates = pd.concat(candidate_frames, ignore_index=True)

        # Sort before de-duplicating so a movie suggested by several seeds keeps
        # its highest similarity score rather than the first one encountered.
        candidates = (
            candidates
            .sort_values(by='similarity_score', ascending=False, kind='stable')
            .drop_duplicates(subset='movieId', keep='first')
        )
        candidates = candidates[~candidates['movieId'].isin(user_rated_ids)]

        return candidates.head(n).reset_index(drop=True)

    def get_cleaned_dataframe(self):
        """Return a copy of the merged metadata/genre table."""
        return self.merged_df.copy()


In [None]:
# Build the recommender; all CSV loading and the similarity computation happen here.
recommender = ContentBasedRecommender(
    ratings_path='ratings.csv',
    genres_path='Genres_encoded.csv',
    metadata_path='Films_metadata.csv',
)

# Item-to-item recommendations for a single movie.
movie_id = 1
print(f"\nTop similar movies to Movie ID {movie_id}:\n")
similar_movies = recommender.get_recs(movie_id=movie_id, n=5)
print(similar_movies)

# Recommendations derived from one user's highest-rated movies.
user_id = 10  # example user ID
print(f"\nTop recommendations for User ID {user_id}:\n")
user_recommendations = recommender.get_user_recs(user_id=user_id, n=5, top_rated=3)
print(user_recommendations)




Top similar movies to Movie ID 1:

   movieId                             title  similarity_score
0     4886             Monsters, Inc. (2001)          0.999985
1     3114                Toy Story 2 (1999)          0.999978
2     4016  Emperor's New Groove, The (2000)          0.999727
3   166461                      Moana (2016)          0.998389
4     2294                       Antz (1998)          0.996377

Top recommendations for User ID 10:

   movieId                                        title  similarity_score
0    91630  Mission: Impossible - Ghost Protocol (2011)          0.999859
1    97950          Man with the Iron Fists, The (2012)          0.995893
2    45442                              Poseidon (2006)          0.992271
3     2947                            Goldfinger (1964)          0.972618
4     2948                 From Russia with Love (1963)          0.972604
