# **Task 1: Build the IMDb Top 100 Movie Dataset**

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import json

In [2]:
class IMDbTop100Scraper:
    def __init__(self):
        # Setup base URL and HTTP headers for session
        self.base_url = "https://www.imdb.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    # Fetch Top 250 movies page and extract up to 100 unique movie URLs
    def get_movie_links_from_top250(self):
        url = "https://www.imdb.com/chart/top/"
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            movie_links = []
            movie_ids = set()

            all_title_links = soup.find_all('a', href=re.compile(r'/title/tt\d+/'))

            for link in all_title_links:
                href = link.get('href', '')
                movie_id_match = re.search(r'/title/(tt\d+)/', href)
                if movie_id_match:
                    movie_id = movie_id_match.group(1)
                    if movie_id not in movie_ids:
                        movie_ids.add(movie_id)
                        clean_url = f"{self.base_url}/title/{movie_id}/"
                        movie_links.append(clean_url)

            if len(movie_links) < 100:
                backup_movies = self.get_backup_top100_list()
                for movie_id in backup_movies:
                    if movie_id not in movie_ids:
                        movie_ids.add(movie_id)
                        movie_links.append(f"{self.base_url}/title/{movie_id}/")
                        if len(movie_links) >= 100:
                            break

            return movie_links[:100]

        except Exception:
            backup_movies = self.get_backup_top100_list()
            return [f"{self.base_url}/title/{movie_id}/" for movie_id in backup_movies[:100]]

    # Return a hardcoded list of top 100 movie IDs
    def get_backup_top100_list(self):
        return [
            'tt0111161', 'tt0068646', 'tt0468569', 'tt0071562', 'tt0050083', 'tt0167260',
            'tt0108052', 'tt0110912', 'tt0120737', 'tt0060196', 'tt0109830', 'tt0167261',
            'tt0137523', 'tt1375666', 'tt0080684', 'tt0133093', 'tt0099685', 'tt0816692',
            'tt0073486', 'tt0114369', 'tt0038650', 'tt0102926', 'tt0047478', 'tt0120815',
            'tt0120689', 'tt0317248', 'tt0118799', 'tt0076759', 'tt0103064', 'tt0088763',
            'tt0245429', 'tt0054215', 'tt0110413', 'tt0172495', 'tt0021749', 'tt0253474',
            'tt0027977', 'tt0407887', 'tt0082971', 'tt0047396', 'tt0064116', 'tt0033467',
            'tt0056172', 'tt0095765', 'tt0078748', 'tt0053125', 'tt0034583', 'tt0078788',
            'tt0052357', 'tt0070735', 'tt0022100', 'tt0114814', 'tt0043014', 'tt0036775',
            'tt0482571', 'tt0050825', 'tt0075314', 'tt0032553', 'tt0045152', 'tt0082096',
            'tt0081505', 'tt0057012', 'tt0119698', 'tt0095327', 'tt0210945', 'tt0086190',
            'tt0084787', 'tt0031679', 'tt0042876', 'tt0051201', 'tt0105236', 'tt0040522',
            'tt0025316', 'tt0112573', 'tt0056592', 'tt0093058', 'tt0180093', 'tt0066921',
            'tt0086879', 'tt0044741', 'tt0338013', 'tt0072684', 'tt0046438', 'tt0062622',
            'tt0055630', 'tt0071853', 'tt0049406', 'tt0053604', 'tt0041959', 'tt0090605',
            'tt0208092', 'tt0032976', 'tt0113277', 'tt0077416', 'tt0050212', 'tt0046912',
            'tt0079944', 'tt0091251', 'tt0061512', 'tt0045152'
        ]

    # Scrape details from a single movie page
    def extract_movie_details(self, movie_url):
        try:
            response = self.session.get(movie_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            movie_data = {
                'Title': 'N/A',
                'Duration': 'N/A',
                'YearofRelease': 'N/A',
                'IMDbRating': 'N/A',
                'Description': 'N/A'
            }

            json_scripts = soup.find_all('script', type='application/ld+json')
            for script in json_scripts:
                try:
                    data = json.loads(script.string)
                    if isinstance(data, dict) and data.get('@type') == 'Movie':
                        if data.get('name'):
                            movie_data['Title'] = data.get('name')
                        if data.get('datePublished'):
                            year_match = re.search(r'(\d{4})', data.get('datePublished'))
                            if year_match:
                                movie_data['YearofRelease'] = year_match.group(1)
                        if data.get('description'):
                            desc = data.get('description')
                            movie_data['Description'] = desc[:500] + ('...' if len(desc) > 500 else '')
                        if data.get('aggregateRating', {}).get('ratingValue'):
                            movie_data['IMDbRating'] = str(data['aggregateRating']['ratingValue'])
                        if data.get('duration', '').startswith('PT'):
                            match = re.search(r'PT(?:(\d+)H)?(?:(\d+)M)?', data['duration'])
                            if match:
                                hours = int(match.group(1) or 0)
                                minutes = int(match.group(2) or 0)
                                movie_data['Duration'] = f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
                        return movie_data
                except json.JSONDecodeError:
                    continue

            return movie_data

        except Exception:
            return None

    # Loop through movie URLs and scrape each movie
    def scrape_top_100_movies(self, delay=1.5):
        movie_links = self.get_movie_links_from_top250()
        movies_data = []

        for i, url in enumerate(movie_links, 1):
            movie_data = self.extract_movie_details(url)
            if movie_data and movie_data['Title'] != 'N/A':
                movies_data.append(movie_data)
            time.sleep(delay)

        return movies_data

    # Save the final data to a CSV file
    def save_to_csv(self, movies_data, filename='top_100_movies.csv'):
        if not movies_data:
            return False

        df = pd.DataFrame(movies_data)
        df['YearofRelease'] = df['YearofRelease'].apply(lambda x: self.clean_year(str(x)))
        df['IMDbRating'] = df['IMDbRating'].apply(lambda x: self.clean_rating(str(x)))
        df = df.drop_duplicates(subset=['Title'], keep='first')
        df['rating_numeric'] = pd.to_numeric(df['IMDbRating'], errors='coerce')
        df = df.sort_values('rating_numeric', ascending=False).drop('rating_numeric', axis=1)

        try:
            df.to_csv(filename, index=False, encoding='utf-8')
            return True
        except Exception:
            return False

    # Clean year value
    def clean_year(self, year_text):
        year_match = re.search(r'(\d{4})', str(year_text))
        if year_match:
            year = int(year_match.group(1))
            if 1900 <= year <= 2025:
                return str(year)
        return 'N/A'

    # Clean IMDb rating value
    def clean_rating(self, rating_text):
        rating_match = re.search(r'(\d+\.?\d*)', str(rating_text))
        if rating_match:
            rating = float(rating_match.group(1))
            if 0 <= rating <= 10:
                return str(rating)
        return 'N/A'

In [3]:
# Main script runner

def main():
    scraper = IMDbTop100Scraper()
    movies_data = scraper.scrape_top_100_movies(delay=1.5)

    if movies_data and len(movies_data) > 0:
        scraper.save_to_csv(movies_data, 'top_100_movies.csv')

if __name__ == "__main__":
    main()


In [4]:
df = pd.read_csv("/content/top_100_movies.csv")

In [5]:
df

Unnamed: 0,Title,Duration,YearofRelease,IMDbRating,Description
0,The Shawshank Redemption,2h 22m,1994,9.3,A banker convicted of uxoricide forms a friend...
1,The Godfather,2h 55m,1972,9.2,The aging patriarch of an organized crime dyna...
2,The Dark Knight,2h 32m,2008,9.0,When a menace known as the Joker wreaks havoc ...
3,The Godfather Part II,3h 22m,1974,9.0,The early life and career of Vito Corleone in ...
4,12 Angry Men,1h 36m,1957,9.0,The jury in a New York City murder trial is fr...
...,...,...,...,...,...
94,Rebecca,2h 10m,1940,8.1,A self-conscious woman juggles adjusting to he...
95,Cool Hand Luke,2h 7m,1967,8.0,A laid-back Southern man is sentenced to two y...
96,Stalker,2h 42m,1980,8.0,A guide leads two men through an area known as...
97,The Killing,1h 24m,1956,7.9,Crook Johnny Clay assembles a five-man team to...


# **Task 2: Query-Based Movie Lookup**

In [20]:
# Complete Movie Lookup System for Google Colab
# Run this entire cell first!

# Install required packages
print(" Installing required packages...")
!pip install fuzzywuzzy[speedup] -q
print("Packages installed successfully!")

# Import all required libraries
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
from typing import Dict, List, Optional, Tuple
import os

print("Libraries imported successfully!")

class MovieLookupSystem:
    def __init__(self, csv_file_path: str = 'top_100_movies.csv'):
        """
        Initialize the movie lookup system with the CSV dataset

        Args:
            csv_file_path (str): Path to the CSV file containing movie data
        """
        self.csv_file_path = csv_file_path
        self.movies_df = None
        self.movie_titles = []
        self.load_dataset()

    def load_dataset(self) -> bool:
        """
        Load the movie dataset from CSV file

        Returns:
            bool: True if loaded successfully, False otherwise
        """
        try:
            # Check if file exists
            if not os.path.exists(self.csv_file_path):
                print(f" CSV file '{self.csv_file_path}' not found in current directory.")
                print(" Current directory files:")
                files = [f for f in os.listdir('.') if f.endswith('.csv')]
                if files:
                    print("   CSV files found:", files)
                    # Try to use the first CSV file found
                    if len(files) == 1:
                        print(f" Trying to use: {files[0]}")
                        self.csv_file_path = files[0]
                    else:
                        return False
                else:
                    print("   No CSV files found!")
                    print(" Please make sure you have run the scraper first to generate the CSV file.")
                    return False

            self.movies_df = pd.read_csv(self.csv_file_path, encoding='utf-8')

            # Clean and prepare movie titles for fuzzy matching
            self.movies_df['Title'] = self.movies_df['Title'].astype(str)
            self.movie_titles = self.movies_df['Title'].tolist()

            print(f" Dataset loaded successfully! Found {len(self.movies_df)} movies.")
            return True

        except Exception as e:
            print(f"Error loading dataset: {str(e)}")
            return False

    def clean_title(self, title: str) -> str:
        """
        Clean movie title for better matching

        Args:
            title (str): Raw movie title

        Returns:
            str: Cleaned title
        """
        # Remove common prefixes and suffixes that might interfere with matching
        title = str(title).strip()

        # Remove year in parentheses if present
        title = re.sub(r'\s*\(\d{4}\)\s*', '', title)

        # Remove articles at the beginning for better matching
        title = re.sub(r'^(The|A|An)\s+', '', title, flags=re.IGNORECASE)

        return title.strip()

    def find_best_matches(self, user_input: str, threshold: int = 60, max_results: int = 5) -> List[Tuple[str, int]]:
        """
        Find best matching movies using fuzzy string matching

        Args:
            user_input (str): User's movie title input
            threshold (int): Minimum similarity score (0-100)
            max_results (int): Maximum number of results to return

        Returns:
            List[Tuple[str, int]]: List of (movie_title, similarity_score) tuples
        """
        if not user_input.strip():
            return []

        # Clean the user input
        cleaned_input = self.clean_title(user_input)

        # Create a list of cleaned titles for matching
        cleaned_titles = [(self.clean_title(title), title) for title in self.movie_titles]
        cleaned_title_list = [cleaned for cleaned, _ in cleaned_titles]

        # Use process.extract for multiple matches
        matches = process.extract(cleaned_input, cleaned_title_list,
                                scorer=fuzz.WRatio, limit=max_results)

        # Filter by threshold and map back to original titles
        result = []
        for match_text, score in matches:
            if score >= threshold:
                # Find the original title
                original_title = next((original for cleaned, original in cleaned_titles
                                     if cleaned == match_text), match_text)
                result.append((original_title, score))

        return result

    def get_movie_details(self, movie_title: str) -> Optional[Dict]:
        """
        Get full details of a specific movie

        Args:
            movie_title (str): Exact movie title from dataset

        Returns:
            Dict: Movie details or None if not found
        """
        try:
            movie_row = self.movies_df[self.movies_df['Title'] == movie_title]

            if movie_row.empty:
                return None

            # Convert to dictionary and clean up
            movie_data = movie_row.iloc[0].to_dict()

            # Replace NaN values with 'N/A'
            for key, value in movie_data.items():
                if pd.isna(value) or value in ['nan', 'NaN', None]:
                    movie_data[key] = 'N/A'

            return movie_data

        except Exception as e:
            print(f" Error retrieving movie details: {str(e)}")
            return None

    def display_movie_info(self, movie_data: Dict) -> None:
        """
        Display movie information in a formatted way

        Args:
            movie_data (Dict): Movie details dictionary
        """
        print("\n" + "="*60)
        print(" MOVIE DETAILS")
        print("="*60)

        # Define the order and display names for fields
        field_mapping = {
            'Title': ' Title',
            'YearofRelease': 'Year of Release',
            'Duration': ' Duration',
            'IMDbRating': ' IMDb Rating',
            'Description': ' Description'
        }

        for field, display_name in field_mapping.items():
            value = movie_data.get(field, 'N/A')

            if field == 'Description' and value != 'N/A':
                # Format description with proper wrapping
                print(f"\n{display_name}:")
                print("-" * 40)
                # Word wrap for description
                words = value.split()
                line_length = 0
                formatted_desc = ""
                for word in words:
                    if line_length + len(word) + 1 > 70:  # 70 chars per line
                        formatted_desc += "\n" + word + " "
                        line_length = len(word) + 1
                    else:
                        formatted_desc += word + " "
                        line_length += len(word) + 1
                print(formatted_desc.strip())
            else:
                print(f"{display_name}: {value}")

        print("="*60)

    def search_movie(self, user_input: str, show_multiple: bool = True) -> bool:
        """
        Main search function for Colab (simplified)

        Args:
            user_input (str): User's movie title input
            show_multiple (bool): Whether to show multiple matches

        Returns:
            bool: True if movie found and displayed, False otherwise
        """
        if self.movies_df is None:
            print(" Dataset not loaded. Please check if the CSV file exists.")
            return False

        print(f"\n Searching for: '{user_input}'")
        print("-" * 50)

        # Find best matches
        matches = self.find_best_matches(user_input, threshold=50, max_results=5)

        if not matches:
            print(" No movies found matching your search.")
            print(" Try checking spelling or using fewer words.")
            return False

        # Show the best match
        best_title, best_score = matches[0]
        print(f" Best match: '{best_title}' (Similarity: {best_score}%)")

        # Show other matches if available and requested
        if len(matches) > 1 and show_multiple:
            print(f"\n Other similar matches:")
            for title, score in matches[1:]:
                print(f"   • {title} (Similarity: {score}%)")

        # Display details for best match
        movie_details = self.get_movie_details(best_title)
        if movie_details:
            self.display_movie_info(movie_details)
            return True

        return False

    def get_random_movies(self, n: int = 5) -> List[str]:
        """
        Get random movie titles from the dataset
        """
        if self.movies_df is None or len(self.movies_df) == 0:
            return []

        sample_size = min(n, len(self.movies_df))
        random_movies = self.movies_df.sample(sample_size)['Title'].tolist()
        return random_movies

    def display_stats(self) -> None:
        """
        Display dataset statistics
        """
        if self.movies_df is None:
            print(" Dataset not loaded.")
            return

        print("\n DATASET STATISTICS")
        print("=" * 40)
        print(f" File: {self.csv_file_path}")
        print(f" Total movies: {len(self.movies_df)}")

        # Year statistics
        valid_years = pd.to_numeric(self.movies_df['YearofRelease'], errors='coerce').dropna()
        if not valid_years.empty:
            print(f" Year range: {int(valid_years.min())} - {int(valid_years.max())}")

        # Rating statistics
        valid_ratings = pd.to_numeric(self.movies_df['IMDbRating'], errors='coerce').dropna()
        if not valid_ratings.empty:
            print(f" Rating range: {valid_ratings.min():.1f} - {valid_ratings.max():.1f}")
            print(f" Average rating: {valid_ratings.mean():.1f}")

        # Show columns
        print(f" Columns: {', '.join(self.movies_df.columns.tolist())}")


# Initialize the system
print("\n Initializing Movie Lookup System...")
print("=" * 50)

# Create the lookup system instance
lookup_system = MovieLookupSystem('top_100_movies.csv')

# Check if dataset loaded successfully
if lookup_system.movies_df is not None:
    # Display statistics
    lookup_system.display_stats()

    print("\n Random movies for testing:")
    print("-" * 30)
    random_movies = lookup_system.get_random_movies(5)
    for i, movie in enumerate(random_movies, 1):
        print(f"{i}. {movie}")

    print("\n" + "="*60)
    print(" FUZZY SEARCH DEMO")
    print("="*60)

    # Demo searches with common typos
    test_searches = [
        "Shawsank Redmption",  # Typos in "The Shawshank Redemption"
        "Dark Night",          # Common mistake for "The Dark Knight"
        "Godfather",          # Partial name
        "Matrix",             # Partial name
        "Pulp Fiction",       # Correct name
    ]

    for search_term in test_searches:
        lookup_system.search_movie(search_term, show_multiple=False)
        print("\n" + "-"*50)

    print("\n USAGE EXAMPLES:")
    print("="*40)
    print("# Search for a movie:")
    print("lookup_system.search_movie('your movie title here')")
    print("\n# Get movie details directly:")
    print("details = lookup_system.get_movie_details('The Shawshank Redemption')")
    print("\n# Find similar movies:")
    print("matches = lookup_system.find_best_matches('shawshank')")

else:
    print("\n Could not load dataset. Please make sure your CSV file exists!")
    print(" Make sure you've run Task 1 (the scraper) first to generate the CSV file.")

print("\n Movie Lookup System is ready to use!")
print(" Try: lookup_system.search_movie('your movie name')")

 Installing required packages...
Packages installed successfully!
Libraries imported successfully!

 Initializing Movie Lookup System...
 Dataset loaded successfully! Found 99 movies.

 DATASET STATISTICS
 File: top_100_movies.csv
 Total movies: 99
 Year range: 1931 - 2014
 Rating range: 7.8 - 9.3
 Average rating: 8.4
 Columns: Title, Duration, YearofRelease, IMDbRating, Description

 Random movies for testing:
------------------------------
1. The Killing
2. Cool Hand Luke
3. La vita è bella
4. Terminator 2: Judgment Day
5. Cidade de Deus

 FUZZY SEARCH DEMO

 Searching for: 'Shawsank Redmption'
--------------------------------------------------
 Best match: 'The Shawshank Redemption' (Similarity: 95%)

 MOVIE DETAILS
 Title: The Shawshank Redemption
Year of Release: 1994
 Duration: 2h 22m
 IMDb Rating: 9.3

 Description:
----------------------------------------
A banker convicted of uxoricide forms a friendship over a quarter 
century with a hardened convict, while maintaining his in

In [23]:
lookup_system.search_movie('It&apos;s a Wonderful Life')


 Searching for: 'It&apos;s a Wonderful Life'
--------------------------------------------------
 Best match: 'It&apos;s a Wonderful Life' (Similarity: 100%)

 Other similar matches:
   • Schindler&apos;s List (Similarity: 61%)
   • Alien (Similarity: 60%)
   • Aliens (Similarity: 57%)
   • C&apos;era una volta il West (Similarity: 52%)

 MOVIE DETAILS
 Title: It&apos;s a Wonderful Life
Year of Release: 1947
 Duration: 2h 10m
 IMDb Rating: 8.6

 Description:
----------------------------------------
An angel is sent from Heaven to help a desperately frustrated 
businessman by showing him what life would have been like if he had 
never existed.


True

# **Task 3: Movie Recommendation Using Description Similarity**

In [28]:
# This includes Task 2 (Search) + Task 3 (Recommendation)

# Import all required libraries
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process
import re
from typing import Dict, List, Optional, Tuple
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.decomposition import TruncatedSVD
import nltk
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
try:
    import ssl
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
except:
    pass # Added pass to fix indentation error

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


class MovieRecommendationSystem:
    def __init__(self, csv_file_path: str = 'top_100_movies.csv'):
        """
        Initialize the complete movie system with search and recommendation
        """
        self.csv_file_path = csv_file_path
        self.movies_df = None
        self.movie_titles = []
        self.tfidf_matrix = None
        self.tfidf_vectorizer = None
        self.similarity_matrix = None
        self.processed_descriptions = []

        # Initialize NLP components
        try:
            self.stop_words = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
            self.nlp_available = True
        except:
            self.stop_words = set(['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'shall', 'this', 'that', 'these', 'those'])
            self.lemmatizer = None
            self.nlp_available = False

        self.load_dataset()
        if self.movies_df is not None:
            self.prepare_recommendation_engine()

    def load_dataset(self) -> bool:
        """Load the movie dataset and prepare for processing"""
        try:
            if not os.path.exists(self.csv_file_path):
                print(f"CSV file '{self.csv_file_path}' not found.")
                files = [f for f in os.listdir('.') if f.endswith('.csv')]
                if files:
                    self.csv_file_path = files[0]
                    print(f" Using: {files[0]}")
                else:
                    print(" Please run the scraper first to generate the CSV file.")
                    return False

            self.movies_df = pd.read_csv(self.csv_file_path, encoding='utf-8')
            self.movies_df['Title'] = self.movies_df['Title'].astype(str)
            self.movies_df['Description'] = self.movies_df['Description'].fillna('').astype(str)

            # Filter out movies without descriptions
            self.movies_df = self.movies_df[
                (self.movies_df['Description'] != '') &
                (self.movies_df['Description'] != 'N/A') &
                (self.movies_df['Description'] != 'nan')
            ].reset_index(drop=True)

            self.movie_titles = self.movies_df['Title'].tolist()

            print(f" Dataset loaded! {len(self.movies_df)} movies with descriptions.")
            return True

        except Exception as e:
            print(f" Error loading dataset: {str(e)}")
            return False

    def preprocess_text(self, text: str) -> str:
        """
        Advanced text preprocessing for better similarity matching
        """
        if not text or text in ['N/A', 'nan', '']:
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        if self.nlp_available:
            try:
                # Tokenize
                tokens = word_tokenize(text)

                # Remove stopwords and lemmatize
                tokens = [
                    self.lemmatizer.lemmatize(token)
                    for token in tokens
                    if token not in self.stop_words and len(token) > 2
                ]

                return ' '.join(tokens)
            except:
                pass

        # Basic preprocessing if NLTK fails
        words = text.split()
        words = [word for word in words if word not in self.stop_words and len(word) > 2]
        return ' '.join(words)

    def prepare_recommendation_engine(self):
        """
        Prepare the recommendation engine using TF-IDF and cosine similarity
        """

        # Preprocess all descriptions
        descriptions = self.movies_df['Description'].tolist()
        self.processed_descriptions = [self.preprocess_text(desc) for desc in descriptions]

        # Filter out empty descriptions
        valid_indices = [i for i, desc in enumerate(self.processed_descriptions) if desc.strip()]

        if len(valid_indices) < 5:
            return

        # Create TF-IDF matrix
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,  # Limit features for efficiency
            ngram_range=(1, 2),  # Use unigrams and bigrams
            max_df=0.8,  # Ignore terms that appear in more than 80% of documents
            min_df=2,  # Ignore terms that appear in less than 2 documents
            stop_words='english'
        )

        valid_descriptions = [self.processed_descriptions[i] for i in valid_indices]

        try:
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(valid_descriptions)

            # Compute cosine similarity matrix (memory efficient)
            self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

            # Store valid indices mapping
            self.valid_indices = valid_indices

            print(f" TF-IDF matrix shape: {self.tfidf_matrix.shape}")

        except Exception as e:
            self.similarity_matrix = None

    # [Previous methods from Task 2 - kept unchanged]
    def clean_title(self, title: str) -> str:
        """Clean movie title for better matching"""
        title = str(title).strip()
        title = re.sub(r'\s*\(\d{4}\)\s*', '', title)
        title = re.sub(r'^(The|A|An)\s+', '', title, flags=re.IGNORECASE)
        return title.strip()

    def find_best_matches(self, user_input: str, threshold: int = 60, max_results: int = 5) -> List[Tuple[str, int]]:
        """Find best matching movies using fuzzy string matching"""
        if not user_input.strip():
            return []

        cleaned_input = self.clean_title(user_input)
        cleaned_titles = [(self.clean_title(title), title) for title in self.movie_titles]
        cleaned_title_list = [cleaned for cleaned, _ in cleaned_titles]

        matches = process.extract(cleaned_input, cleaned_title_list,
                                scorer=fuzz.WRatio, limit=max_results)

        result = []
        for match_text, score in matches:
            if score >= threshold:
                original_title = next((original for cleaned, original in cleaned_titles
                                     if cleaned == match_text), match_text)
                result.append((original_title, score))

        return result

    def get_movie_details(self, movie_title: str) -> Optional[Dict]:
        """Get full details of a specific movie"""
        try:
            movie_row = self.movies_df[self.movies_df['Title'] == movie_title]
            if movie_row.empty:
                return None

            movie_data = movie_row.iloc[0].to_dict()
            for key, value in movie_data.items():
                if pd.isna(value) or value in ['nan', 'NaN', None]:
                    movie_data[key] = 'N/A'

            return movie_data
        except Exception as e:
            return None

    def get_movie_index(self, movie_title: str) -> Optional[int]:
        """Get the index of a movie in the dataset"""
        try:
            movie_row = self.movies_df[self.movies_df['Title'] == movie_title]
            if movie_row.empty:
                return None
            return movie_row.index[0]
        except:
            return None

    def recommend_similar_movies(self, movie_title: str, n_recommendations: int = 5) -> List[Tuple[str, float, Dict]]:
        """
        Recommend similar movies based on description similarity

        Args:
            movie_title (str): Title of the movie to base recommendations on
            n_recommendations (int): Number of recommendations to return

        Returns:
            List[Tuple[str, float, Dict]]: List of (title, similarity_score, movie_details)
        """
        if self.similarity_matrix is None:
            return []

        # Get movie index
        movie_idx = self.get_movie_index(movie_title)
        if movie_idx is None:
            print(f" Movie '{movie_title}' not found in dataset.")
            return []

        # Check if this movie has a valid description
        if movie_idx not in self.valid_indices:
            print(f" Movie '{movie_title}' doesn't have a valid description for recommendations.")
            return []

        # Get the index in the similarity matrix
        similarity_idx = self.valid_indices.index(movie_idx)

        # Get similarity scores
        sim_scores = list(enumerate(self.similarity_matrix[similarity_idx]))

        # Sort by similarity (exclude the movie itself)
        sim_scores = [(i, score) for i, score in sim_scores if i != similarity_idx]
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get top N recommendations
        recommendations = []
        for i, (sim_idx, sim_score) in enumerate(sim_scores[:n_recommendations]):
            original_idx = self.valid_indices[sim_idx]
            recommended_title = self.movies_df.iloc[original_idx]['Title']
            movie_details = self.get_movie_details(recommended_title)

            if movie_details:
                recommendations.append((recommended_title, float(sim_score), movie_details))

        return recommendations

    def display_movie_info(self, movie_data: Dict, show_description: bool = True) -> None:
        """Display movie information in a formatted way"""
        print("\n" + "="*60)
        print("MOVIE DETAILS")
        print("="*60)

        field_mapping = {
            'Title': ' Title',
            'YearofRelease': ' Year of Release',
            'Duration': ' Duration',
            'IMDbRating': ' IMDb Rating',
        }

        for field, display_name in field_mapping.items():
            value = movie_data.get(field, 'N/A')
            print(f"{display_name}: {value}")

        if show_description:
            description = movie_data.get('Description', 'N/A')
            if description != 'N/A' and description:
                print(f"\n Description:")
                print("-" * 40)
                # Word wrap for description
                words = description.split()
                line_length = 0
                formatted_desc = ""
                for word in words:
                    if line_length + len(word) + 1 > 70:
                        formatted_desc += "\n" + word + " "
                        line_length = len(word) + 1
                    else:
                        formatted_desc += word + " "
                        line_length += len(word) + 1
                print(formatted_desc.strip())

        print("="*60)

    def display_recommendations(self, movie_title: str, recommendations: List[Tuple[str, float, Dict]]) -> None:
        """Display recommendations in a formatted way"""
        if not recommendations:
            print(" No recommendations found.")
            return

        print(f"\n  RECOMMENDATIONS BASED ON: '{movie_title}'")
        print("="*70)
        print("Movies with similar plot/themes:")
        print("="*70)

        for i, (rec_title, similarity, details) in enumerate(recommendations, 1):
            similarity_percentage = similarity * 100

            print(f"\n{i}.  {rec_title}")
            print(f"    Similarity: {similarity_percentage:.1f}%")
            print(f"    Year: {details.get('YearofRelease', 'N/A')}")
            print(f"    Rating: {details.get('IMDbRating', 'N/A')}")
            print(f"    Duration: {details.get('Duration', 'N/A')}")

            # Show brief description
            description = details.get('Description', '')
            if description and description != 'N/A':
                # Truncate description for recommendations view
                brief_desc = description[:150] + "..." if len(description) > 150 else description
                print(f"    Plot: {brief_desc}")

            print("-" * 60)

    def search_and_recommend(self, user_input: str, show_recommendations: bool = True) -> bool:
        """
        Combined search and recommendation function
        """
        if self.movies_df is None:
            return False

        print(f"\n Searching for: '{user_input}'")
        print("-" * 50)

        # Find best matches using fuzzy search
        matches = self.find_best_matches(user_input, threshold=50, max_results=5)

        if not matches:
            print(" No movies found matching your search.")
            return False

        # Get the best match
        best_title, best_score = matches[0]
        print(f" Found: '{best_title}' (Match: {best_score}%)")

        # Show movie details
        movie_details = self.get_movie_details(best_title)
        if movie_details:
            self.display_movie_info(movie_details)

        # Get and show recommendations
        if show_recommendations:
            print(f"\n  Finding similar movies to '{best_title}'...")
            recommendations = self.recommend_similar_movies(best_title, n_recommendations=5)

            if recommendations:
                self.display_recommendations(best_title, recommendations)
            else:
                print("  Could not generate recommendations for this movie.")

        return True

    def get_content_based_recommendations(self, description: str, n_recommendations: int = 5) -> List[Tuple[str, float, Dict]]:
        """
        Get recommendations based on a custom description
        """
        if self.similarity_matrix is None or self.tfidf_vectorizer is None:
            return []

        try:
            # Preprocess the input description
            processed_desc = self.preprocess_text(description)

            # Transform the description using the fitted vectorizer
            desc_tfidf = self.tfidf_vectorizer.transform([processed_desc])

            # Compute similarity with all movies
            similarities = cosine_similarity(desc_tfidf, self.tfidf_matrix).flatten()

            # Get top recommendations
            top_indices = similarities.argsort()[-n_recommendations:][::-1]

            recommendations = []
            for idx in top_indices:
                if similarities[idx] > 0.1:  # Minimum similarity threshold
                    original_idx = self.valid_indices[idx]
                    movie_title = self.movies_df.iloc[original_idx]['Title']
                    movie_details = self.get_movie_details(movie_title)
                    if movie_details:
                        recommendations.append((movie_title, float(similarities[idx]), movie_details))

            return recommendations

        except Exception as e:
            return []

    def display_stats(self):
        """Display dataset statistics"""
        if self.movies_df is None:
            return

        print("\n DATASET STATISTICS")
        print("=" * 50)
        print(f"File: {self.csv_file_path}")
        print(f" Total movies: {len(self.movies_df)}")
        print(f" Movies with descriptions: {len([d for d in self.processed_descriptions if d.strip()])}")

        if hasattr(self, 'valid_indices'):
            print(f" Movies available for recommendations: {len(self.valid_indices)}")

        if self.tfidf_matrix is not None:
            print(f" TF-IDF features: {self.tfidf_matrix.shape[1]}")

        # Rating statistics
        valid_ratings = pd.to_numeric(self.movies_df['IMDbRating'], errors='coerce').dropna()
        if not valid_ratings.empty:
            print(f" Average rating: {valid_ratings.mean():.1f}")


# Initialize the complete system
print("=" * 60)

movie_system = MovieRecommendationSystem('top_100_movies.csv')

if movie_system.movies_df is not None:
    # Display statistics
    movie_system.display_stats()

    print("\n SYSTEM READY!")
    print("=" * 40)
    print("Available functions:")
    print("1.  Search: movie_system.search_and_recommend('movie name')")
    print("2.  Recommend: movie_system.recommend_similar_movies('exact title')")
    print("3. Content-based: movie_system.get_content_based_recommendations('description')")

    # Demo with examples
    print("\n DEMO: Search + Recommendations")
    print("=" * 60)

    demo_searches = ["Matrix", "Godfather", "Shawshank"]

    for search_term in demo_searches:
        print(f"\n Demo search: '{search_term}'")
        movie_system.search_and_recommend(search_term, show_recommendations=True)
        break  # Show only one example to save space

    print("\n Complete Movie System with Recommendations is Ready!")
    print(" Try: movie_system.search_and_recommend('your movie name')")

else:
    print(" System initialization failed. Please check your CSV file.")

 Dataset loaded! 99 movies with descriptions.
 TF-IDF matrix shape: (99, 193)

 DATASET STATISTICS
File: top_100_movies.csv
 Total movies: 99
 Movies with descriptions: 99
 Movies available for recommendations: 99
 TF-IDF features: 193
 Average rating: 8.4

 SYSTEM READY!
Available functions:
1.  Search: movie_system.search_and_recommend('movie name')
2.  Recommend: movie_system.recommend_similar_movies('exact title')
3. Content-based: movie_system.get_content_based_recommendations('description')

 DEMO: Search + Recommendations

 Demo search: 'Matrix'

 Searching for: 'Matrix'
--------------------------------------------------
 Found: 'The Matrix' (Match: 100%)

MOVIE DETAILS
 Title: The Matrix
 Year of Release: 1999
 Duration: 2h 16m
 IMDb Rating: 8.7

 Description:
----------------------------------------
When a beautiful stranger leads computer hacker Neo to a forbidding 
underworld, he discovers the shocking truth--the life he knows is the 
elaborate deception of an evil cyber-int

In [29]:
movie_system.search_and_recommend('The Godfather')


 Searching for: 'The Godfather'
--------------------------------------------------
 Found: 'The Godfather' (Match: 100%)

MOVIE DETAILS
 Title: The Godfather
 Year of Release: 1972
 Duration: 2h 55m
 IMDb Rating: 9.2

 Description:
----------------------------------------
The aging patriarch of an organized crime dynasty transfers control 
of his clandestine empire to his reluctant son.

  Finding similar movies to 'The Godfather'...

  RECOMMENDATIONS BASED ON: 'The Godfather'
Movies with similar plot/themes:

1.  The Godfather Part II
    Similarity: 35.1%
    Year: 1974
    Rating: 9.0
    Duration: 3h 22m
    Plot: The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family cr...
------------------------------------------------------------

2.  One Flew Over the Cuckoo&apos;s Nest
    Similarity: 25.0%
    Year: 1975
    Rating: 8.7
    Duration: 2h 13m
    Plot: A rebellious convict is sent to

True