# Setting up

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
from IPython.display import clear_output

## Note:

There were two datasets used in this project (the links are provided at the end of this cell).
Initially, two DataFrames were created using each dataset, then merged to create merged_df.
However, one of the datasets (485 MB) is too large to be pushed to GitHub. 
As a result, we instead saved the merged DataFrame to a new CSV in the directory named TMDB_merged_df.csv.
The merged_df seen in this program is loaded from that file.
The original code for loading and merging the two datasets is kept, commented out, in the next three cells.

Links to the datasets used in the project:

https://www.kaggle.com/datasets/sankha1998/tmdb-top-10000-popular-movies-dataset (TMDb_updated.csv)

https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies (TMDB_movie_dataset_v11.csv)



In [None]:
# df_1 = pd.read_csv("TMDb_updated.csv",index_col = 0)

In [None]:
# df_2 = pd.read_csv("TMDB_movie_dataset_v11.csv")
# df_2["overview"] = df_2["overview"].fillna("")
# df_2 = df_2.drop_duplicates(subset=["title", "release_date"], keep="first")

In [None]:
# merged_df = df_1.merge(df_2[["title", "release_date", "overview","genres"]], on = ["title","overview"], how = "left")

# Loading the Merged Dataset

In [None]:
# Load the pre-merged dataset, using the first CSV column as the DataFrame index.
merged_df = pd.read_csv("TMDB_merged_df.csv", index_col = 0)

# Filtering Movies
In this dataset, some movies have a vote_average but a vote_count of 0, which doesn't make sense. There are also duplicate entries for the same movie, and some movies have no genres listed. We filtered these rows out to get more accurate data.

We kept only the columns needed for the recommendations (title, release_date, overview, genres, vote_average, vote_count).

In [None]:
# Show full overview text instead of truncating long columns when frames are displayed.
pd.set_option('display.max_colwidth', None)

# Build the cleaned frame in a single chain so every step works on a fresh
# object (no column assignment onto a filtered slice) and the cell is idempotent.
filtered_df = (
    merged_df.loc[
        ~(merged_df["vote_count"] == 0),  # Removes movies with no available data for vote count
        ["title", "release_date", "overview", "genres", "vote_average", "vote_count"],  # Only the details needed
    ]
    .drop_duplicates(subset=["title", "overview"], keep="first")  # Removes duplicates
    .assign(
        overview=lambda df: df["overview"].fillna(""),  # For overviews with NaN values
        genres=lambda df: df["genres"].fillna(""),  # For genres with NaN values
    )
    .loc[lambda df: df["genres"] != ""]  # Movies without any genre cannot be compared by genre
    .assign(genres=lambda df: df["genres"].apply(lambda genre_text: genre_text.split(", ")))  # Separating the genres
    # Renumber rows 0..n-1 and discard the old index directly; this also works
    # when the loaded index carries a name (reset_index() + drop("index") would not).
    .reset_index(drop=True)
)

# Content-Based Filtering

## sklearn.preprocessing
The sklearn.preprocessing package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. 

https://scikit-learn.org/0.19/modules/preprocessing.html#:~:text=The%20sklearn.,standardization%20of%20the%20data%20set.

#### MultiLabelBinarizer
MultiLabelBinarizer transforms between iterable of iterables and a multilabel format. 
Although a list of sets or tuples is a very intuitive format for multilabel data, it is unwieldy to process. This transformer converts between this intuitive format and the supported multilabel format: a (samples x classes) binary matrix indicating the presence of a class label.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html



In [None]:
# One-hot encoding genres: fit the binarizer on each movie's list of genres and
# transform it into a (movies x genres) binary matrix, one row per movie.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(filtered_df['genres'])

In [None]:
# Work on a separate frame so filtered_df stays untouched; each row carries its
# one-hot genre vector as a plain Python list.
test_df = filtered_df.assign(genre_vector=genres_encoded.tolist())

In [None]:
# Dataset-wide averages used as the prior in the Bayesian average below.
mean_vote_count = test_df["vote_count"].mean()
mean_vote_average = test_df["vote_average"].mean()


def bayesian_avg(vote_average, vote_count):
    """Return the vote average shrunk toward the dataset mean, rounded to 1 decimal.

    The result is a weighted mean of the movie's own rating (weighted by its
    vote count) and the dataset-wide mean rating (weighted by the dataset-wide
    mean vote count), so movies with few votes move closer to the global mean.

    Reads the module-level ``mean_vote_count`` and ``mean_vote_average``.
    Works element-wise when given pandas Series.
    """
    prior_total = mean_vote_count * mean_vote_average
    observed_total = vote_average * vote_count
    total_weight = mean_vote_count + vote_count
    return round((prior_total + observed_total) / total_weight, 1)


test_df["bayesian_avg"] = bayesian_avg(test_df["vote_average"], test_df["vote_count"])

In [None]:
def rating_similarity(input_bayesian_avg, test_bayesian_avg):
    """Map the gap between two Bayesian averages to a similarity in (0, 1].

    Identical ratings give 1; the score falls toward 0 as the absolute
    difference grows. Works element-wise when either argument is a Series.
    """
    rating_gap = abs(test_bayesian_avg - input_bayesian_avg)
    return 1 / (rating_gap + 1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommendations(input_index, alpha=0.65, top_n=10):
    """Return the movies in ``test_df`` most similar to the one at ``input_index``.

    Two scores are computed against every row of ``test_df`` and blended:
    the cosine similarity between one-hot genre vectors, and
    ``rating_similarity`` between Bayesian-average ratings.

    Parameters
    ----------
    input_index
        Index label of the query movie in ``test_df``.
    alpha : float, default 0.65
        Weight in [0, 1] given to genre similarity; rating similarity gets
        ``1 - alpha``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``overview``, ``genres``, ``vote_average`` and
        ``vote_count`` columns of the ``top_n`` most similar movies, most
        similar first, with the query movie itself excluded.

    Raises
    ------
    ValueError
        If ``alpha`` is outside [0, 1] or ``top_n`` is negative.
    KeyError
        If ``input_index`` is not a label of ``test_df`` (raised by ``.loc``).
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1")
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Work on a copy so the similarity columns never leak into test_df.
    recommendations_df = test_df.copy()

    # Genre similarity: query vector against the full (movies x genres) matrix.
    input_vector = np.array(recommendations_df.loc[input_index, "genre_vector"])
    genre_vectors_array = np.array(recommendations_df["genre_vector"].tolist())
    cosine_similarities = cosine_similarity([input_vector], genre_vectors_array)
    recommendations_df["genre_similarity"] = cosine_similarities[0].tolist()

    # Rating similarity: closeness of each movie's Bayesian average to the query's.
    input_bayesian_avg = recommendations_df.loc[input_index, "bayesian_avg"]
    recommendations_df["rating_similarity"] = rating_similarity(input_bayesian_avg, recommendations_df["bayesian_avg"])

    recommendations_df["overall_similarity"] = (
        alpha * recommendations_df["genre_similarity"]
        + (1 - alpha) * recommendations_df["rating_similarity"]
    )

    # The query movie would otherwise rank first with a perfect score.
    recommendations_df = recommendations_df.drop(index = input_index)
    recommendations_df = recommendations_df.sort_values(by = "overall_similarity", ascending = False)

    return recommendations_df[["title", "overview","genres","vote_average","vote_count"]].head(top_n)

In [None]:
movie_list = test_df["title"].tolist()
# Choices handed to the fuzzy matcher: each distinct title once, in dataset
# order, and strings only. Repeated titles would otherwise use up several of the
# `limit` slots for the same name, and a missing (NaN) title is not a string.
# Every row sharing a matched title is still shown via the isin() lookup below.
title_choices = list(dict.fromkeys(title for title in movie_list if isinstance(title, str)))
found_movie_input = ""

# Search loop: repeats while found_movie_input is empty. It ends either with a
# `break` (the user gave up, so the `else` clause below is skipped) or by the
# condition turning false (the user confirmed a match, so the `else` clause runs).
while found_movie_input == "":
    input_movie = input("Hello! Please enter a movie you would like to receive recommendations for.")
    possible_matches_tuples = process.extractBests(input_movie, title_choices, score_cutoff = 80, limit = 10)
    possible_matches_list = [x[0] for x in possible_matches_tuples]
    if len(possible_matches_list) == 0:
        continue_input = input("It seems that movie does not exist in the database. Would you like to try again? Type N for No, and anything else for Yes.").strip()
        if "N" == continue_input.upper():
            print("Thank you, have a nice day!")
            break
        else:
            clear_output()
            print("Understood. Let's try again!")
            continue
    else:
        print("These were the top matches for your query.")
        possible_matches_df = test_df[test_df["title"].isin(possible_matches_list)]
        display(possible_matches_df[["title","release_date","overview","genres"]])
        found_movie_input = input("Does this show the movie you want? If so, type Y for Yes, and anything else for No.").strip()
        if "Y" != found_movie_input.upper():
            continue_input = input("Would you like to try another search? Type N for No, and anything else for Yes.").strip()
            if "N" == continue_input.upper():
                print("Thank you, have a nice day!")
                break
            else:
                clear_output()
                # Reset the sentinel so the loop asks for a new title.
                found_movie_input = ""
                print("Understood. Let's try again!")
else: 
    # Reached only when a match was confirmed. The displayed matches do not
    # change while the user picks a row, so the valid labels are computed once.
    valid_index_list = [str(x) for x in possible_matches_df.index]
    while True:
        input_index = input("Type the number on the left of the title to proceed.").strip()
        if input_index in valid_index_list:
            clear_output()
            print("Thank you! Below are 10 recommendations for that movie:")
            display(recommendations(int(input_index)))
            break
        else: 
            print("It seems you inputted the wrong index. Please try again.")