In [1]:
pip install selenium pandas webdriver-manager


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Download NLTK's "punkt_tab" tokenizer package into the local nltk_data directory
# (the preprocessing cell tokenizes storylines with nltk.word_tokenize).
import nltk
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt_tab to C:\Users\Heram
[nltk_data]     Ramesh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
# data scraping

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Element lookups raise these when a node is missing or the DOM was re-rendered;
# they are the only errors that should degrade a field to "N/A".
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

# Setup WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# IMDb URL
base_url = "https://www.imdb.com/search/title/?genres=drama&start={}&explore=title_type,genres"

# Scraper configuration
PAGE_SIZE = 50               # step between consecutive `start` offsets
MAX_PAGES = 40               # safety cap; set to None to continue until a page is empty
PAGE_LOAD_WAIT_SECONDS = 3   # fixed pause after each navigation so the list can render
# NOTE(review): the preprocessing cell reads "imdb_movies_storylines.csv", not this
# file — confirm which name is intended before relying on the pipeline end to end.
OUTPUT_CSV = "imdb_all_movies_storylines.csv"

ITEM_XPATH = '//li[@class="ipc-metadata-list-summary-item"]'
TITLE_XPATH = './/h3[contains(@class,"ipc-title__text")]'
STORYLINE_XPATH = './/div[contains(@class,"ipc-html-content-inner-div")]'

movie_names = []
storylines = []
seen_records = set()  # (title, storyline) pairs already collected


def _element_text(parent, xpath, default="N/A"):
    """Return the text of the first descendant of `parent` matching `xpath`, else `default`."""
    try:
        return parent.find_element(By.XPATH, xpath).text
    except (NoSuchElementException, StaleElementReferenceException):
        return default


# Start from page 1
start = 1
pages_scraped = 0

try:
    while MAX_PAGES is None or pages_scraped < MAX_PAGES:
        url = base_url.format(start)
        print(f"Scraping: {url}")
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        # Find all movie containers on current page
        movies = driver.find_elements(By.XPATH, ITEM_XPATH)

        # If no movies found, we reached the last page → break loop
        if not movies:
            print("Reached the end of pages.")
            break

        new_on_page = 0
        for movie in movies:
            # Remove number prefix (e.g., "1. The Shawshank Redemption" → "The Shawshank Redemption")
            title = re.sub(r"^\d+\.\s*", "", _element_text(movie, TITLE_XPATH))
            storyline = _element_text(movie, STORYLINE_XPATH)

            record = (title, storyline)
            if record in seen_records:
                continue
            seen_records.add(record)
            movie_names.append(title)
            storylines.append(storyline)
            new_on_page += 1

        # If a page contributes nothing new (e.g. the site served the same items
        # again), further requests would only loop forever collecting duplicates.
        if new_on_page == 0:
            print("No new movies on this page; stopping.")
            break

        # Go to next page
        start += PAGE_SIZE
        pages_scraped += 1
finally:
    # Runs on normal completion, on errors and on a manual interrupt: always release
    # the browser and persist whatever was collected so far.
    driver.quit()
    df = pd.DataFrame({"Movie Name": movie_names, "Storyline": storylines})
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ Scraped {len(df)} movies successfully!")


Scraping: https://www.imdb.com/search/title/?genres=drama&start=1&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=51&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=101&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=151&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=201&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=251&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=301&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=351&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=401&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=451&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start

KeyboardInterrupt: 

In [None]:
#Preprocessing

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below, in the same order as before
# (stop-word list, WordNet data, and both punkt tokenizer packages).
for resource_name in ("stopwords", "wordnet", "punkt", "punkt_tab"):
    nltk.download(resource_name)

# Raw scraped storylines.
# NOTE(review): the scraping cell writes "imdb_all_movies_storylines.csv", not this
# file name — confirm which file is the intended input.
df = pd.read_csv("imdb_movies_storylines.csv")

# Shared helpers for clean_text: English stop-word lookup set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one storyline into space-separated, lowercase, lemmatized tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    tokenize with ``nltk.word_tokenize``, drop English stop words, lemmatize
    the remaining tokens and join them with single spaces.

    Args:
        text: The raw storyline. Non-string values are converted with ``str``;
            missing values (``None`` / NaN / ``pd.NA``) are treated as empty.

    Returns:
        The cleaned storyline as a single string ("" when nothing remains).
    """
    # A missing storyline must not be stringified: str(float("nan")) is "nan" and
    # str(None) is "None", which would otherwise survive as bogus tokens.
    if pd.isna(text):
        return ""

    # Remove non-alphabetic characters, then lowercase
    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(text)).lower()
    # Tokenization
    tokens = nltk.word_tokenize(letters_only)
    # Remove stopwords & lemmatize
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(kept)

# Clean every storyline element-wise and store the result in a new column.
cleaned_storylines = df["Storyline"].map(clean_text)
df["Cleaned_Storyline"] = cleaned_storylines

# Persist the frame (raw + cleaned columns) for the vectorization step.
df.to_csv("imdb_movies_storylines_cleaned.csv", index=False)
print("✅ Preprocessing completed successfully!")

# Preview the first rows as the cell's displayed result.
df.head()


[nltk_data] Downloading package stopwords to C:\Users\Heram
[nltk_data]     Ramesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Heram
[nltk_data]     Ramesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Heram
[nltk_data]     Ramesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Heram
[nltk_data]     Ramesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


✅ Preprocessing completed successfully!


Unnamed: 0,Movie Name,Storyline,Cleaned_Storyline
0,Dexter: Resurrection,Dexter Morgan awakens from a coma and sets out...,dexter morgan awakens coma set new york city d...
1,Hostage,When the PM's husband is kidnapped and the vis...,pm husband kidnapped visiting french president...
2,F1: The Movie,A Formula One driver comes out of retirement t...,formula one driver come retirement mentor team...
3,Eenie Meanie,A reformed teenage getaway driver is dragged b...,reformed teenage getaway driver dragged back u...
4,Fallout,"In a future, post-apocalyptic Los Angeles brou...",future post apocalyptic los angeles brought nu...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the cleaned dataset
df = pd.read_csv("imdb_movies_storylines_cleaned.csv")

# A storyline that cleaned down to "" is written as an empty CSV field, which
# read_csv parses back as NaN. The vectorizer only accepts strings, so map missing
# entries back to the empty document instead of letting fit_transform raise.
documents = df["Cleaned_Storyline"].fillna("")

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words="english")

# Convert cleaned storylines into TF-IDF matrix (one row per movie, one column per term)
tfidf_matrix = tfidf.fit_transform(documents)

print("✅ TF-IDF Matrix Shape:", tfidf_matrix.shape)

✅ TF-IDF Matrix Shape: (2000, 572)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With a single argument, cosine_similarity compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

similarity_shape = cosine_sim.shape
print("✅ Cosine Similarity Matrix Shape:", similarity_shape)

✅ Cosine Similarity Matrix Shape: (2000, 2000)


In [2]:
def recommend_movies(title, df, cosine_sim, top_n=5):
    """Recommend the movies whose storylines are most similar to `title`.

    Args:
        title: Movie name to look up; matched case-insensitively against the
            "Movie Name" column. The first matching row is used.
        df: DataFrame with a "Movie Name" column whose row order matches the
            rows/columns of `cosine_sim`.
        cosine_sim: Square, row-indexable similarity matrix (row i holds the
            similarity of movie i to every movie).
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of up to `top_n` movie names ordered by decreasing similarity
        (ties keep dataset order), or an error-message string when `title` is
        not present in the dataset.
    """
    names = df["Movie Name"]
    is_match = (names.str.lower() == title.lower()).to_numpy()

    # Check if the movie exists
    if not is_match.any():
        return f"❌ Movie '{title}' not found in the dataset."

    # Positional row of the first match. Using the position (not the index label)
    # keeps the lookup into cosine_sim correct for any DataFrame index.
    position = int(is_match.argmax())

    # Drop the queried movie by position instead of assuming it sorts first: with
    # an all-zero vector or duplicate rows it can tie with other movies and would
    # otherwise end up inside its own recommendations.
    candidates = [
        (other, score)
        for other, score in enumerate(cosine_sim[position])
        if other != position
    ]

    # Sort movies by similarity score (descending); the sort is stable for ties.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [other for other, _ in candidates[:max(int(top_n), 0)]]
    return names.iloc[top_positions].tolist()
