In [16]:
# 1️⃣ Install dependencies
!pip install -q nltk scikit-learn pandas

# 2️⃣ Imports & NLTK setup
import os, re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK 'stopwords' corpus is available locally before
# stopwords.words('english') is called further down in this script.
nltk.download('stopwords', quiet=True)

# 3️⃣ Load dataset
# Absolute path of the CSV that holds the movie titles and storylines.
dataset_filename = '/content/sample_data/25k IMDb movie Dataset.csv'
# Fail fast with an actionable message. isfile() also rejects a directory
# that happens to sit at this path, which exists() would let through.
if not os.path.isfile(dataset_filename):
    raise FileNotFoundError(
        f"'{dataset_filename}' not found. Upload the CSV to that path or "
        "change dataset_filename to point at its actual location."
    )
df = pd.read_csv(dataset_filename)

# 4️⃣ Normalize column names
# Alternative headers (compared case-insensitively) and the canonical column
# names the rest of the script relies on.
column_aliases = {'movie title': 'Movie Name', 'overview': 'Storyline'}
for alias, canonical in column_aliases.items():
    # Only the first matching column is renamed.
    old = next((col for col in df.columns if col.lower() == alias), None)
    if old is not None:
        df.rename(columns={old: canonical}, inplace=True)

# Raise explicitly rather than assert: assertions are removed under
# `python -O`, which would let a malformed dataset slip through.
missing = [name for name in ('Movie Name', 'Storyline') if name not in df.columns]
if missing:
    raise ValueError(
        "Dataset must contain 'Movie Name' and 'Storyline'. "
        f"Missing: {missing}; found columns: {list(df.columns)}"
    )

# 5️⃣ Preprocessing function
stop_words = set(stopwords.words('english'))


def preprocess(text: str) -> str:
    """Normalize raw text into a space-separated string of content words.

    The text is lower-cased, every character other than a-z or whitespace is
    replaced by a space, and tokens found in ``stop_words`` are removed.
    Anything that is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return ' '.join(
        word for word in letters_only.split() if word not in stop_words
    )

# Keep the cleaned text alongside the raw storyline so it is computed once.
df['clean_story'] = df['Storyline'].map(preprocess)

# 6️⃣ TF-IDF Vectorization
# Learn the vocabulary and IDF weights from the cleaned corpus and keep the
# resulting document-term matrix for similarity look-ups.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_story'])

# 7️⃣ Recommendation function
def get_recommendations(storyline: str, top_n: int = 5):
    """Return the movies whose storylines are most similar to *storyline*.

    The query is cleaned with ``preprocess``, transformed by the fitted
    ``vectorizer`` and compared with every row of ``tfidf_matrix`` using
    cosine similarity. Rows are not filtered by score, so entries with zero
    similarity can appear when there are few real matches.

    Args:
        storyline: Free-text plot description to match against the dataset.
        top_n: Maximum number of movies to return; must be >= 0.

    Returns:
        A DataFrame with the 'Movie Name' and 'Storyline' columns, best match
        first, indexed from 0. It holds ``top_n`` rows, or fewer if the
        dataset itself is smaller.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}.")
    vec = vectorizer.transform([preprocess(storyline)])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    # Reverse first and then take a prefix: a trailing slice ``[-top_n:]``
    # turns into ``[-0:]`` (i.e. every row) when top_n is 0.
    idxs = sims.argsort()[::-1][:top_n]
    return df[['Movie Name', 'Storyline']].iloc[idxs].reset_index(drop=True)

# 8️⃣ Prompt & display recommendations
# Read the query from the user; surrounding whitespace is not significant.
story_input = input("Enter a movie storyline: ").strip()
if story_input == "":
    raise ValueError("Please enter a non-empty storyline.")

recs = get_recommendations(story_input)
print("\nTop 5 Movie Recommendations:\n")
# recs is indexed from 0, so a 1-based enumerate gives the displayed rank.
ranked = enumerate(zip(recs['Movie Name'], recs['Storyline']), start=1)
for rank, (title, plot) in ranked:
    print(f"{rank}. {title}")
    print(f"   {plot}\n")


Enter a movie storyline: the hero fights in a tough boxing match and wins it

Top 5 Movie Recommendations:

1. Jungleland
   Two brothers try to escape their circumstances by travelling across the country for a no holds barred boxing match that becomes a fight for their lives.

2. Judgment Night
   Four young friends, while taking a shortcut en route to a local boxing match, witness a brutal murder which leaves them running for their lives.

3. Grudge Match
   A pair of aging boxing rivals are coaxed out of retirement to fight one final bout, thirty years after their last match.

4. The Sting II
   A team of cons is planning a new scam involving betting on a boxing match but one of its past victims aims to exact revenge by eliminating the whole group.

5. Heart of a Lion
   The life and boxing career of George Foreman.

