In [138]:
# Import useful tools
import ast
import string

import numpy as np
import pandas as pd

In [139]:
# Fetch the WordNet corpus and create the lemmatizer used by the recommender
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [140]:
# Read the sampled movie metadata from disk and preview the first rows
csv_path = "movies_metadata_sampled_for_test.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,genres,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,vote_average,vote_count
0,Small Crimes,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Small Crimes,"A disgraced former cop, fresh off a six-year p...",7.219022,"[{'name': 'Rooks Nest Entertainment', 'id': 34...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2017/4/28,0,95,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",5.8,55
1,Up the Sandbox,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Up the Sandbox,"A young wife and mother, bored with day-to-day...",0.13845,"[{'name': 'Barwood Films', 'id': 3645}, {'name...","[{'iso_3166_1': 'US', 'name': 'United States o...",1972/12/21,0,97,"[{'iso_639_1': 'en', 'name': 'English'}]",7.3,2
2,Bad Lieutenant,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",en,Bad Lieutenant,"While investigating a young nun's rape, a corr...",6.417037,"[{'name': 'Bad Lt. Productions', 'id': 11264}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1992/9/16,2019469,96,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6.9,162
3,Satan's Little Helper,"[{'id': 27, 'name': 'Horror'}, {'id': 10749, '...",en,Satan's Little Helper,A naïve young boy unknowingly becomes the pawn...,2.233189,"[{'name': 'Intrinsic Value Films', 'id': 2828}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2004/1/1,0,100,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",5.0,42
4,Sitcom,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",fr,Sitcom,The adventures of an upper-class suburban fami...,1.800582,"[{'name': 'Fidélité Productions', 'id': 147}]","[{'iso_3166_1': 'FR', 'name': 'France'}]",1998/5/27,0,80,"[{'iso_639_1': 'fr', 'name': 'Français'}]",6.4,27


In [141]:
# Count the missing (NaN) entries in each column
null_flags = df.isna()
missing_values = null_flags.sum()
missing_values

title                   0
genres                  0
original_language       0
original_title          0
overview                2
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
vote_average            0
vote_count              0
dtype: int64

In [142]:
# Discard rows that have no overview text
required_columns = ['overview']
df = df.dropna(subset=required_columns)

In [143]:
# Recount the missing (NaN) entries in each column
null_flags = df.isna()
missing_values = null_flags.sum()
missing_values

title                   0
genres                  0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
vote_average            0
vote_count              0
dtype: int64

In [144]:
# Store the genres column as plain strings
genres_text = df['genres'].astype(str)
df['genres'] = genres_text

def extract_genres(genre_str):
    """Turn a stringified list of genre dicts into a space-separated string of names.

    Parameters
    ----------
    genre_str : str
        Text holding a Python literal such as
        "[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}]".

    Returns
    -------
    str
        The 'name' values joined by single spaces, in input order. Entries that
        are not dicts, or whose 'name' is missing or not a string, are skipped.
        An empty string is returned when the text cannot be parsed as a literal
        or does not evaluate to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(genre_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input are
        # treated as "no genres"; anything else (e.g. KeyboardInterrupt)
        # propagates instead of being silently swallowed.
        return ""

    if not isinstance(parsed, (list, tuple)):
        return ""

    names = []
    for entry in parsed:
        if not isinstance(entry, dict):
            continue
        name = entry.get('name')
        # One malformed entry should not discard the remaining genres.
        if isinstance(name, str):
            names.append(name)
    return " ".join(names)

# Derive a plain-text genre column from the raw genres strings
raw_genres = df['genres']
df['genres_cleaned'] = raw_genres.apply(extract_genres)

In [145]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,title,genres,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,vote_average,vote_count,genres_cleaned
0,Small Crimes,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Small Crimes,"A disgraced former cop, fresh off a six-year p...",7.219022,"[{'name': 'Rooks Nest Entertainment', 'id': 34...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2017/4/28,0,95,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",5.8,55,Drama Comedy Thriller Crime
1,Up the Sandbox,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Up the Sandbox,"A young wife and mother, bored with day-to-day...",0.13845,"[{'name': 'Barwood Films', 'id': 3645}, {'name...","[{'iso_3166_1': 'US', 'name': 'United States o...",1972/12/21,0,97,"[{'iso_639_1': 'en', 'name': 'English'}]",7.3,2,Drama Comedy
2,Bad Lieutenant,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",en,Bad Lieutenant,"While investigating a young nun's rape, a corr...",6.417037,"[{'name': 'Bad Lt. Productions', 'id': 11264}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1992/9/16,2019469,96,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6.9,162,Crime Drama
3,Satan's Little Helper,"[{'id': 27, 'name': 'Horror'}, {'id': 10749, '...",en,Satan's Little Helper,A naïve young boy unknowingly becomes the pawn...,2.233189,"[{'name': 'Intrinsic Value Films', 'id': 2828}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2004/1/1,0,100,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",5.0,42,Horror Romance Comedy
4,Sitcom,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",fr,Sitcom,The adventures of an upper-class suburban fami...,1.800582,"[{'name': 'Fidélité Productions', 'id': 147}]","[{'iso_3166_1': 'FR', 'name': 'France'}]",1998/5/27,0,80,"[{'iso_639_1': 'fr', 'name': 'Français'}]",6.4,27,Comedy Drama Thriller


In [146]:
# Build the text used for matching: genre names, a space, then the overview
genre_part = df['genres_cleaned']
overview_part = df['overview']
df['content'] = genre_part + " " + overview_part
df.head()

Unnamed: 0,title,genres,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,vote_average,vote_count,genres_cleaned,content
0,Small Crimes,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Small Crimes,"A disgraced former cop, fresh off a six-year p...",7.219022,"[{'name': 'Rooks Nest Entertainment', 'id': 34...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2017/4/28,0,95,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",5.8,55,Drama Comedy Thriller Crime,Drama Comedy Thriller Crime A disgraced former...
1,Up the Sandbox,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",en,Up the Sandbox,"A young wife and mother, bored with day-to-day...",0.13845,"[{'name': 'Barwood Films', 'id': 3645}, {'name...","[{'iso_3166_1': 'US', 'name': 'United States o...",1972/12/21,0,97,"[{'iso_639_1': 'en', 'name': 'English'}]",7.3,2,Drama Comedy,"Drama Comedy A young wife and mother, bored wi..."
2,Bad Lieutenant,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",en,Bad Lieutenant,"While investigating a young nun's rape, a corr...",6.417037,"[{'name': 'Bad Lt. Productions', 'id': 11264}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1992/9/16,2019469,96,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6.9,162,Crime Drama,Crime Drama While investigating a young nun's ...
3,Satan's Little Helper,"[{'id': 27, 'name': 'Horror'}, {'id': 10749, '...",en,Satan's Little Helper,A naïve young boy unknowingly becomes the pawn...,2.233189,"[{'name': 'Intrinsic Value Films', 'id': 2828}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2004/1/1,0,100,"[{'iso_639_1': 'it', 'name': 'Italiano'}, {'is...",5.0,42,Horror Romance Comedy,Horror Romance Comedy A naïve young boy unknow...
4,Sitcom,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",fr,Sitcom,The adventures of an upper-class suburban fami...,1.800582,"[{'name': 'Fidélité Productions', 'id': 147}]","[{'iso_3166_1': 'FR', 'name': 'France'}]",1998/5/27,0,80,"[{'iso_639_1': 'fr', 'name': 'Français'}]",6.4,27,Comedy Drama Thriller,Comedy Drama Thriller The adventures of an upp...


In [147]:
# Fit a TF-IDF vectorizer (English stop words removed) on the content column
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
corpus = df['content']
tfidf_matrix = vectorizer.fit_transform(corpus)  # TF-IDF matrix for the content column

In [148]:
# Recommendation function
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(user_input, top_n=5):
    """Recommend the movies whose content text is most similar to a free-text request.

    Reads the module-level ``vectorizer``, ``tfidf_matrix``, ``df`` and
    ``lemmatizer`` objects.

    Parameters
    ----------
    user_input : str
        Free-text description of what the user wants to watch.
    top_n : int, default 5
        Maximum number of candidates to rank. Values of zero or below yield
        an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``vote_average``, ``genres_cleaned`` and
        ``similarity``, ordered from most to least similar. If the last word
        of ``user_input`` (punctuation stripped, lower-cased, lemmatized)
        occurs in ``genres_cleaned`` for any candidate, only those candidates
        are returned; otherwise all ranked candidates are returned.
    """
    user_tfidf = vectorizer.transform([user_input])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix)

    # Rank all movies best-first, then keep at most top_n. Slicing the reversed
    # ranking (instead of [-top_n:]) keeps top_n=0 from selecting every row.
    ranked_indices = cosine_sim.argsort()[0][::-1]
    top_indices = ranked_indices[:max(top_n, 0)]
    similarity_scores = cosine_sim[0, top_indices]

    # Create DataFrame of recommended movies
    recommended_movies = df.iloc[top_indices][['title', 'vote_average', 'genres_cleaned']].copy()
    recommended_movies["similarity"] = similarity_scores

    # The last word of the request is treated as a genre hint; with no words
    # there is nothing to filter on.
    words = user_input.split()
    if not words:
        return recommended_movies

    # Drop surrounding punctuation so "humor." is matched as "humor".
    input_genre = words[-1].strip(string.punctuation).lower()
    if not input_genre:
        return recommended_movies
    input_genre = lemmatizer.lemmatize(input_genre)

    # Prioritize recommendations matching the genre. regex=False makes this a
    # literal substring test, so user text is never interpreted as a pattern.
    genre_mask = recommended_movies["genres_cleaned"].str.contains(
        input_genre, case=False, na=False, regex=False
    )
    genre_filtered_movies = recommended_movies[genre_mask]

    # Return genre-matching movies if available; otherwise, return top similar movies
    return genre_filtered_movies if not genre_filtered_movies.empty else recommended_movies

In [163]:
# Interactive demo: ask for a preference and print the sufficiently similar movies
if __name__ == "__main__":
    user_input = input('Please enter a movie genre preference (e.g., "I love thrilling action movies set in space, with a comedic twist."): ')  # User input

    # Only keep candidates whose similarity is above 0.05
    candidates = recommend_movies(user_input)
    is_relevant = candidates["similarity"] > 0.05
    recommendations = candidates[is_relevant]

    # Display results
    if not recommendations.empty:
        print("\nTop recommended movies:")
        print(recommendations.to_string(index=False))
    else:
        print("\nNo relevant recommendations found. Try using a different description.")

Please enter a movie genre preference (e.g., "I love thrilling action movies set in space, with a comedic twist."):  Ilove romantic movies with humor.



Top recommended movies:
                              title  vote_average genres_cleaned  similarity
                          Dreamboat           7.0         Comedy    0.159225
                        Bon appétit           5.3        Romance    0.150184
      Walking the Streets of Moscow           7.0 Romance Comedy    0.135700
Tig Notaro: Boyish Girl Interrupted           6.3         Comedy    0.103615
        The Magnificent Seven Ride!           5.5 Action Western    0.081413
