In [177]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
import re

In [178]:
# Load the movie catalogue (path is relative to the current working directory).
# Later cells read the 'Plot', 'Genre', 'Directors', 'Stars' and 'Movie Name' columns.
df = pd.read_csv(filepath_or_buffer='Top_Movies.csv')

In [179]:
def preprocess_text(text):
    """Normalise raw text into a lowercase, punctuation-free, lemmatised string.

    Parameters:
    text: Value to clean. Anything that is not a ``str`` is treated as empty.

    Returns:
    str: The lemmatised tokens joined by single spaces, or '' when ``text``
        is not a string.
    """
    wnl = WordNetLemmatizer()
    if not isinstance(text, str):
        return ''

    # Lowercase first, then delete every character that is neither a word
    # character nor whitespace. Punctuation is removed, not replaced by a
    # space, so a hyphenated word such as "sci-fi" becomes "scifi".
    stripped = re.sub(r'[^\w\s]', '', text.lower())

    # split() with no argument also collapses runs of whitespace
    lemmas = [wnl.lemmatize(token) for token in stripped.split()]
    return ' '.join(lemmas)

# Combine relevant features for content-based filtering.
# Each column is filled with '' before concatenation: adding a NaN cell to a
# string yields NaN for the whole row, and preprocess_text maps any non-string
# to '', which would silently discard that movie's remaining features.
df['combined_features'] = (
    df['Plot'].fillna('') + ' '
    + df['Genre'].fillna('') + ' '
    + df['Directors'].fillna('') + ' '
    + df['Stars'].fillna('')
).apply(preprocess_text)

In [180]:
def create_tfidf_matrix(text_data):
    """Vectorise a collection of documents with TF-IDF.

    Parameters:
    text_data: Iterable of document strings, one entry per document.

    Returns:
    The document-term matrix returned by ``TfidfVectorizer.fit_transform``
    (English stop words removed). The fitted vectorizer is not returned, so
    new text cannot later be projected into the same feature space.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    document_term_matrix = vectorizer.fit_transform(text_data)
    return document_term_matrix

# Vectorise the cleaned per-movie feature strings (one row per movie)
tfidf_matrix = create_tfidf_matrix(text_data=df['combined_features'])

In [181]:
def compute_similarity(tfidf_matrix):
    """Compute pairwise cosine similarity between all rows of a matrix.

    Parameters:
    tfidf_matrix: Feature matrix with one row per item.

    Returns:
    The result of ``cosine_similarity(tfidf_matrix)``, i.e. every row
    compared against every row of the same matrix.
    """
    pairwise_scores = cosine_similarity(tfidf_matrix)
    return pairwise_scores

# Movie-to-movie similarity derived from the TF-IDF features
similarity_matrix = compute_similarity(tfidf_matrix=tfidf_matrix)

In [182]:
def extract_genre_from_query(query):
    """Extract genre preference from user query

    A genre is detected when one of the known keywords occurs at the start of
    a word in the query (case-insensitive). Anchoring at a word start keeps
    plurals and run-together forms working ("thrillers", "actionpacked") while
    rejecting keywords buried inside unrelated words ("reaction",
    "attraction", "necromancer").

    Parameters:
    query (str): Free-text user query.

    Returns:
    str or None: The canonical genre name ('Action', 'Comedy', 'Drama',
        'Thriller', 'Horror' or 'Romance'), or None when no keyword is found
        or ``query`` is not a string. If several genres are mentioned, the one
        listed first in ``genre_keywords`` wins, regardless of its position in
        the query.
    """
    genre_keywords = {
        'action': 'Action',
        'comedy': 'Comedy',
        'drama': 'Drama',
        'thriller': 'Thriller',
        'horror': 'Horror',
        'romance': 'Romance'
    }

    # Tolerate missing / non-text input instead of failing on .lower()
    if not isinstance(query, str):
        return None

    query = query.lower()
    for keyword, genre in genre_keywords.items():
        # \b anchors the keyword at a word start; the word's tail is left free
        if re.search(r'\b' + re.escape(keyword), query):
            return genre
    return None

In [183]:
def combine_features(row):
    """
    Build the weighted feature string for a single movie row.

    Parameters:
    row: Mapping-like object (e.g. a DataFrame row) with 'Plot' and 'Genre' entries.

    Returns:
    str: The plot followed by the genre twice, separated by single spaces.
        An entry that is not a string contributes an empty string.
    """
    def _text_or_blank(value):
        # Non-string cells (e.g. missing values) are treated as empty text
        return value if isinstance(value, str) else ''

    plot_text = _text_or_blank(row['Plot'])
    genre_text = _text_or_blank(row['Genre'])

    # The genre appears twice so that it carries more weight than the plot
    return ' '.join((plot_text, genre_text, genre_text))

In [186]:
def get_recommendations(user_query, df, n_recommendations=5, similarity_matrix=None):
    """
    Generate movie recommendations based on user input

    The query is cleaned with preprocess_text, a TF-IDF model is fitted on
    df['combined_features'] on every call, and movies are ranked by cosine
    similarity between the query and each movie. When a genre is detected in
    the query, the score of every movie whose 'Genre' contains it is
    multiplied by 1.5, so a reported similarity_score may exceed 1.

    Parameters:
    user_query (str): User's text description of preferences
    df (DataFrame): Movie dataset with 'Movie Name', 'Genre' and 'Plot'
        columns. If it has no 'combined_features' column, one is built from
        'Plot' and 'Genre' and added to this DataFrame in place.
    n_recommendations (int): Maximum number of recommendations to return;
        negative values are treated as 0
    similarity_matrix: Pre-computed similarity matrix (optional). Accepted
        for interface compatibility; the function does not read it.

    Returns:
    list: Up to N dicts with keys 'title', 'genre', 'plot' and
        'similarity_score' (rounded to 3 decimals), best match first. Movies
        scoring 0 are dropped, so fewer than N entries may be returned.
    """
    # Preprocess user query
    user_query = preprocess_text(user_query)

    # Extract genre preference
    preferred_genre = extract_genre_from_query(user_query)

    # Create or use pre-computed features
    if 'combined_features' not in df.columns:
        df['combined_features'] = df.apply(combine_features, axis=1)
        df['combined_features'] = df['combined_features'].apply(preprocess_text)

    # Create TF-IDF vectorizer with improved parameters
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),  # Include bigrams
        max_features=5000,    # Limit features to most important ones
        min_df=2,            # Ignore terms that appear in less than 2 documents
        max_df=0.95          # Ignore terms that appear in more than 95% of documents
    )

    # Fit on the movie features, then project the query into the same space
    tfidf_matrix = tfidf.fit_transform(df['combined_features'])
    query_vector = tfidf.transform([user_query])

    # One similarity score per movie (row 0 of the 1 x n_movies result)
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Apply genre filtering with weighted boost
    if preferred_genre:
        # na=False makes a missing Genre count as "no match". Without it the
        # mask contains NaN, which propagates into the scores and corrupts
        # the ranking below.
        genre_mask = (
            df['Genre']
            .str.contains(preferred_genre, case=False, na=False)
            .to_numpy(dtype=bool)
        )
        # Boost similarity scores for preferred genre (x1.5 where the mask is True)
        similarity_scores = similarity_scores * (1 + 0.5 * genre_mask)

    # A negative count would slice "all but the last k"; clamp it to zero.
    # None is passed through unchanged (slice without an upper bound).
    top_n = n_recommendations
    if top_n is not None and top_n < 0:
        top_n = 0

    # Get top recommendations (highest score first)
    movie_indices = np.argsort(similarity_scores)[::-1][:top_n]

    recommendations = []
    for idx in movie_indices:
        score = similarity_scores[idx]
        if score > 0:
            movie = df.iloc[idx]  # positional lookup, done once per movie
            recommendations.append({
                'title': movie['Movie Name'],
                'genre': movie['Genre'],
                'plot': movie['Plot'],
                'similarity_score': round(float(score), 3)
            })

    return recommendations

# Example usage: rank the loaded movies against a free-text preference
user_query = "I love animation and action movies"
recommendations = get_recommendations(user_query=user_query, df=df)


In [187]:
# Display recommendations: echo the query, then one numbered entry per movie
print("User Query:", user_query)
print("\nRecommended Movies:")
for i, rec in enumerate(recommendations, start=1):
    # A single print per movie; sep="\n" puts each field on its own line
    print(
        f"\n{i}. {rec['title']}",
        f"Genre: {rec['genre']}",
        f"Similarity Score: {rec['similarity_score']:.2f}",
        sep="\n",
    )

User Query: I love animation and action movies

Recommended Movies:

1. The Incredibles
Genre: Animation, Action, Adventure
Similarity Score: 0.30

2. Mononoke-hime
Genre: Animation, Action, Adventure
Similarity Score: 0.30

3. Beauty and the Beast
Genre: Animation, Family, Fantasy
Similarity Score: 0.21

4. The Little Mermaid
Genre: Animation, Adventure, Family
Similarity Score: 0.21

5. Being John Malkovich
Genre: Comedy, Drama, Fantasy
Similarity Score: 0.11
