In [None]:
# NOTE(review): numpy is conventionally aliased as `np`; `bp` looks like a typo.
# The alias is not used in the visible cells — confirm no other cell relies on
# `bp` before renaming it.
import numpy as bp
import pandas as pd
# Pandas display options for printing wide DataFrames in the notebook:
# no cap on the number of columns shown, a 500-character display width, and
# no wrapping of a wide frame's repr across multiple blocks of lines.
pd.set_option('display.max_columns',None)
pd.set_option('display.width',500)
pd.set_option('display.expand_frame_repr',False)
# Text vectorization and similarity scoring used by the recommendation cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# Content-based movie recommendation: rank the plot summaries in a CSV file by
# TF-IDF cosine similarity to a free-text user query and print the best matches.

# 1. Load the dataset
file_path = '/content/MovieDataset.csv'  # Change this path to your actual CSV file path
df = pd.read_csv(file_path)

# 'title' is needed for display and 'plot' is the text that gets vectorized.
if 'title' not in df.columns or 'plot' not in df.columns:
    raise ValueError("CSV must contain 'title' and 'plot' columns")

# Replace missing plots with an empty string so the vectorizer is never handed NaN.
df['plot'] = df['plot'].fillna('')

# 2. User query (input description)
user_query = "I enjoy thrilling heist movies with unexpected twists"

# 3. Vectorization: TF-IDF over the plot summaries, using scikit-learn's
#    built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary/IDF weights from the plots and vectorize them (one row per movie).
tfidf_matrix = vectorizer.fit_transform(df['plot'])

# 4. Vectorize the query with the SAME fitted vectorizer so both share one vocabulary.
user_query_tfidf = vectorizer.transform([user_query])

# 5. Cosine similarity between the single query row and every plot row.
#    The result has one row (the query) and one column per movie.
cosine_similarities = cosine_similarity(user_query_tfidf, tfidf_matrix)

# 6. Pick the top matches
top_n = 5  # Maximum number of recommendations to print

# Similarity scores for the one query, indexed by dataset row position.
similarities = cosine_similarities[0]

# Row positions ordered from most to least similar. sorted() is stable, so ties
# keep their original dataset order.
indices = sorted(range(len(similarities)), key=lambda x: similarities[x], reverse=True)

# Keep at most top_n rows, and drop rows whose score is 0: a zero cosine
# similarity means the plot shares no weighted term with the query, so listing
# it as a "recommendation" would just echo arbitrary rows of the dataset.
top_indices = [idx for idx in indices[:top_n] if similarities[idx] > 0]

if not top_indices:
    print("No movies matched the query.")

# Output the recommended movies, best match first.
# The leading number is the movie's 1-based row position in the dataset
# (idx + 1), not its rank within this list.
for idx in top_indices:
    title = df['title'].iloc[idx]
    similarity_score = similarities[idx]
    print(f"{idx+1}. {title}\n   Similarity: {similarity_score:.4f}")

8. Pulp Fiction
   Similarity: 0.2202
14. The Prestige
   Similarity: 0.1333
63. The Grand Budapest Hotel
   Similarity: 0.1323
23. The Usual Suspects
   Similarity: 0.1320
64. Lost in Translation
   Similarity: 0.1046
