In [19]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Locate the movie catalogue relative to the notebook's working directory.
data_path = os.path.join("data", "movie-data.csv")

# Read the CSV into the notebook-wide DataFrame used by the later cells.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,id,title,genres,averageRating,numVotes,releaseYear
0,tt34604827,Fire,Crime,9.5,10121,2025
1,tt0111161,The Shawshank Redemption,Drama,9.3,3009622,1994
2,tt33175825,Attack on Titan the Movie: The Last Attack,"Action, Animation, Drama",9.3,11990,2024
3,tt0068646,The Godfather,"Crime, Drama",9.2,2100110,1972
4,tt0252487,The Chaos Class,Comedy,9.2,44120,1975


In [20]:
# 1. Build one searchable string per movie: "<title> <genres>".
#    Missing titles or genres are replaced by empty strings first so the
#    concatenation never yields NaN.
df['combined_text'] = (
    df['title'].fillna('')
    .add(" ")
    .add(df['genres'].fillna(''))
)

In [21]:
# 2. Learn a TF-IDF vocabulary from the combined title/genre text
#    (English stop words removed) and encode every movie with it.
#    `vectorizer` is kept so queries can be encoded with the same vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['combined_text'])

In [22]:
def recommend_items(query, top_n=5):
    """Return the movies whose title/genre text best matches a free-text query.

    The query is encoded with the notebook-level ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity; the
    matching rows are then looked up in ``df``.

    Parameters
    ----------
    query : str
        Free-text description of what the user is looking for.
    top_n : int or None, default 5
        Maximum number of rows to return. ``None`` places no limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``genres`` and ``similarity``, ordered by
        descending similarity. Movies with a similarity of zero are left out,
        so fewer than ``top_n`` rows (possibly none) may be returned.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    # A negative value would otherwise flow into the slice below and silently
    # drop items from the tail of the ranking instead of limiting the result.
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be a non-negative integer or None")

    # 3A. Transform the query into a TF-IDF vector
    query_vector = vectorizer.transform([query])

    # 3B. Compute cosine similarity between query_vector and all item vectors
    sim_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # 3C. Rank all items from most to least similar
    ranked_indices = sim_scores.argsort()[::-1]

    # A score of zero means the movie shares no vocabulary term with the
    # query; such rows are not recommendations, and their relative order is
    # arbitrary, so drop them before applying the top_n limit.
    ranked_indices = ranked_indices[sim_scores[ranked_indices] > 0]
    top_indices = ranked_indices[:top_n]

    # 3D. Retrieve those items from the DataFrame
    top_items = df.iloc[top_indices].copy()
    top_items['similarity'] = sim_scores[top_indices]

    return top_items[['id', 'title', 'genres', 'similarity']]

In [23]:
# Example: ask for recommendations with a natural-language request.
# Matching is purely lexical (TF-IDF over title + genres), so words that
# appear in titles, such as "love", can dominate the ranking.
user_query = "I love thrilling action movies set in space, with a comedic twist."
results = recommend_items(query=user_query, top_n=5)
results

Unnamed: 0,id,title,genres,similarity
720,tt1128075,Love Exposure,"Action, Comedy, Drama",0.417728
184,tt0062622,2001: A Space Odyssey,"Adventure, Sci-Fi",0.386343
952,tt0113703,Love Letter,"Drama, Romance",0.365917
436,tt0118694,In the Mood for Love,"Drama, Romance",0.347219
723,tt22488728,Love Today,"Comedy, Drama, Romance",0.335965
