In [73]:
# Import necessary libraries
import pandas as pd  # For data manipulation
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to numerical features using TF-IDF
from sklearn.metrics.pairwise import linear_kernel  # For calculating cosine similarity between movie descriptions

In [74]:
# Load the movie catalogue (movie.csv) and the user-supplied tags (tag.csv).
# The directory is kept in a single constant so the notebook can be pointed at
# another copy of the data by editing one line. The default is still the
# original author's local folder.
DATA_DIR = "/Users/dalyameharzi/Library/CloudStorage/OneDrive-GroupeESAIP/ING3/S6_Erasmus/project_block2/block2_project/archive"
movies = pd.read_csv(f"{DATA_DIR}/movie.csv")
tags = pd.read_csv(f"{DATA_DIR}/tag.csv")

In [75]:
# Collapse the tag table to one row per movie: every non-null tag of a movie is
# converted to text and the pieces are joined with single spaces.
def _join_tags(movie_tags):
    """Return the non-null tags of one movie as one space-separated string."""
    return " ".join(movie_tags.dropna().astype(str))


tags_grouped = tags.groupby("movieId")["tag"].apply(_join_tags).reset_index()

In [76]:
# Attach the per-movie tag string to the movie table. The left join keeps movies
# that nobody tagged (their "tag" is NaN at this point).
# A "tag" column left over from an earlier run of this cell is dropped first:
# without that, re-running the cell would produce "tag_x"/"tag_y" columns and
# every later access to movies["tag"] would fail.
movies = movies.drop(columns=["tag"], errors="ignore").merge(
    tags_grouped, on="movieId", how="left"
)

In [77]:
# Replace missing tags and genres with "" so that the string concatenation and
# the vectoriser further down never receive NaN.
for text_column in ("tag", "genres"):
    movies[text_column] = movies[text_column].fillna("")

In [78]:
# Build the text that represents each movie: its genres, with the "|" separators
# turned into spaces, followed by its tags.
# regex=False is passed explicitly because "|" is a regex metacharacter and the
# default of `regex` differs between pandas versions; this forces a literal
# replacement everywhere.
movies["description"] = (
    movies["genres"].str.replace("|", " ", regex=False) + " " + movies["tag"]
)

In [79]:
# Turn every movie description into a TF-IDF weighted term vector.
# English stop words are removed so that filler words do not count as shared
# vocabulary between two movies.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["description"])  # One row per movie (same order as `movies`), one column per vocabulary term

In [80]:
# Pairwise similarity between all movie descriptions.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity. With a single argument
# the matrix is compared against itself.
# NOTE: the result is a dense (num_movies x num_movies) array.
cosine_sim = linear_kernel(tfidf_matrix)

In [81]:
# Map each movie title to the row position of that movie in `movies`, which is
# also its row in the TF-IDF and similarity matrices built from `movies`.
# Series.drop_duplicates() compares the *values*; the row positions are all
# distinct, so it never removed anything and repeated titles stayed in the index
# (a lookup for such a title then returned several positions at once).
# Duplicates are therefore detected on the index itself, keeping the first movie
# for every title, so that a lookup always yields exactly one position.
positions_by_title = pd.Series(range(len(movies)), index=movies["title"])
indices = positions_by_title[~positions_by_title.index.duplicated(keep="first")]

In [82]:
# Define the recommendation function
def recommend(title, n=10):
    """Recommend the movies whose descriptions are most similar to `title`.

    Relies on the notebook-level objects `indices` (title -> row position),
    `cosine_sim` (pairwise similarity matrix) and `movies` (movie table).

    Parameters
    ----------
    title : str
        Exact title of the reference movie, as it appears in `movies["title"]`.
    n : int, default 10
        Maximum number of recommendations. Values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame or str
        The "title" and "genres" columns of the up-to-`n` most similar movies,
        best match first. If the title is unknown, a message string is returned
        instead (kept for backward compatibility with existing callers).
    """
    idx = indices.get(title)
    if idx is None:
        return f"Film '{title}' not found in the dataset."

    # If the same title occurs several times in `indices`, the lookup yields a
    # Series of positions; use the first occurrence so that the row selection
    # below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every other movie by similarity. The reference movie is removed by
    # position rather than by assuming it sorts first: movies with an identical
    # description tie with it at the top score, and dropping "the first result"
    # could then drop the wrong movie and recommend the reference to itself.
    # sorted() is stable, so tied movies keep their original row order.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(cosine_sim[idx])
            if position != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Clamp n so that a negative value cannot turn into a "from the end" slice.
    top_positions = [position for position, _ in ranked[:max(n, 0)]]

    # Return the titles and genres of the recommended movies
    return movies[["title", "genres"]].iloc[top_positions]

In [83]:
# Example: ask for the five movies closest to "Seven".
film_recommande = recommend(title="Seven (a.k.a. Se7en) (1995)", n=5)

In [84]:
# Show the heading and the recommendation table, one after the other.
print("You may like:", film_recommande, sep="\n")

You may like:
                            title                         genres
8274                   Saw (2004)        Horror|Mystery|Thriller
49     Usual Suspects, The (1995)         Crime|Mystery|Thriller
2873            Fight Club (1999)    Action|Crime|Drama|Thriller
6224              Identity (2003)  Crime|Horror|Mystery|Thriller
10928  Lucky Number Slevin (2006)            Crime|Drama|Mystery
