In [5]:
"""
IMDb Review Scraper
-------------------
This script uses Selenium to scrape user reviews from an IMDb movie page.
It automatically expands all "See more" and "Spoiler" buttons to ensure
the full review text is captured. The script then cleans the extracted
text (removing rating numbers, engagement counts, and extra whitespace)
before saving it into a CSV file.

Steps performed:
1. Open the IMDb review page.
2. (Optional) Load all available reviews by clicking "Load More" repeatedly.
3. Click all "See more" buttons to expand truncated reviews.
4. Reveal hidden spoilers by clicking spoiler buttons.
5. Extract all review cards and clean the text content.
6. Save cleaned reviews into a Pandas DataFrame and export as CSV.

Dependencies:
- selenium (third-party)
- pandas (third-party)
- re (Python standard library)
- time (Python standard library)

Author: (Your Name)
Date: (Today’s Date)
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import re

# === Setup Chrome WebDriver Options ===
# Start the Chrome session that every later step in this cell drives through `driver`.
options = Options()
options.add_experimental_option("detach", True)  # Keeps the browser open after script ends
driver = webdriver.Chrome(options=options)

# === IMDb Reviews URL ===
# Review listing to scrape; the title is selected by the ID in the URL path.
url = "https://www.imdb.com/title/tt5950044/reviews/?ref_=tt_ov_ururv"
driver.get(url)
driver.maximize_window()
time.sleep(3)  # Fixed pause after navigation, before the page is queried

# === Step 1 (Optional): Load all reviews by clicking "Load More" ===
# Keep clicking the button until it can no longer be located or clicked.
while True:
    try:
        load_more = driver.find_element(By.XPATH, "//button[contains(., 'Load More')]")
        # Scroll the button into view, then click it through JavaScript.
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2)  # Pause before looking for the button again
    except Exception:
        # Catch `Exception`, not a bare `except:`. A bare clause also traps
        # KeyboardInterrupt/SystemExit, so interrupting the kernel would be
        # reported as "button not found" and the rest of the cell would keep
        # running instead of stopping.
        print("No more 'Load More' button found.")
        break

# === Step 2: Click all "See more" buttons ===
# Expand every truncated review that is on the page at this point.
see_more_buttons = driver.find_elements(By.CLASS_NAME, "ipc-see-more__button")
for btn in see_more_buttons:
    try:
        # Bring the button into view first, then fire its click from JavaScript.
        for script in ("arguments[0].scrollIntoView(true);", "arguments[0].click();"):
            driver.execute_script(script, btn)
    except Exception as err:
        print("❌ Failed to click 'See more' button:", err)
    else:
        # Only pause after a click that went through.
        time.sleep(1)

# Wait for spoilers to appear after expanding text
time.sleep(2)

# === Step 3: Reveal all spoiler sections (re-scans the page after every pass) ===
def click_all_spoilers(max_rounds=None):
    """Click every spoiler button on the page, re-scanning until none are left.

    Each pass looks up all ``<button>`` elements whose text contains
    'Spoiler', scrolls each one into view and clicks it through JavaScript.
    The page is scanned again after every pass, so buttons that only show up
    later are picked up too.

    Uses the module-level ``driver``.

    Args:
        max_rounds: Optional upper bound on the number of passes. ``None``
            (the default) keeps the original behaviour: loop until a scan
            finds no buttons or a pass manages to click nothing. Pass an
            integer to guarantee termination even if clicked buttons stay
            on the page.

    Returns:
        None.
    """
    rounds_done = 0
    while max_rounds is None or rounds_done < max_rounds:
        spoiler_buttons = driver.find_elements(By.XPATH, "//button[contains(., 'Spoiler')]")
        if not spoiler_buttons:
            break

        print(f"Found {len(spoiler_buttons)} spoiler buttons...")
        clicked = 0
        failed = 0
        for btn in spoiler_buttons:
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", btn)
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(0.3)
                clicked += 1
            except Exception:
                # Best-effort: a button that cannot be clicked must not abort
                # the pass, but count it so the failure is visible.
                failed += 1

        if failed:
            # One summary line per pass instead of one line per button.
            print(f"Could not click {failed} spoiler button(s) in this pass.")

        if not clicked:
            # Nothing was clickable; another scan would not change that.
            break

        rounds_done += 1

        # Wait to let new spoilers appear
        time.sleep(1)
        
click_all_spoilers()

# Wait a bit in case new spoilers load dynamically
time.sleep(2)

# === Step 4: Extract all review cards ===
cards = driver.find_elements(By.XPATH, "//div[contains(@class, 'ipc-list-card--border-speech')]")

clean_reviews = []
for card in cards:
    text = card.text.strip()

    # === Clean the extracted text ===
    # 1) 'N/M' score fragments such as '6 /10'. The pattern is not anchored,
    #    so it matches anywhere in the card text.
    # 2) 'Helpful • ...' up to the end of that line ('.' stops at a newline).
    for pattern in (r"\b\d+\s*/\s*\d+\b", r"Helpful\s*•.*"):
        text = re.sub(pattern, "", text)

    # A 1-4 digit number at the very end of the whole text (no MULTILINE flag).
    text = re.sub(r"\b\d{1,4}\b$", "", text.strip())

    # Flatten newlines to spaces, trim, then collapse runs of whitespace.
    text = re.sub(r"\s{2,}", " ", text.replace("\n", " ").strip())

    # Skip empty or too-short reviews (20 characters or fewer).
    if len(text) <= 20:
        continue
    clean_reviews.append(text)

# === Step 5: Create a DataFrame ===
df = pd.DataFrame(clean_reviews, columns=["review"])

# === Step 6: Print all reviews for verification ===
# for review in df["review"]:
    # print(review)
    # print("--------------------------------------------------------------------------------")

No more 'Load More' button found.
Found 32 spoiler buttons...
Found 98 spoiler buttons...
Found 514 spoiler buttons...
Found 285 spoiler buttons...


PermissionError: [Errno 13] Permission denied: 'Superman_reviews.csv'

In [6]:
# Write the review table to disk, then report how many rows it holds.
output_csv = "Superman_reviews.csv"
df.to_csv(output_csv, encoding="utf-8", index=False)

print(f"\n✅ Done! Successfully saved {len(df)} reviews.")


✅ Done! Successfully saved 4433 reviews.


In [20]:
# End the WebDriver session explicitly; the "detach" option set at startup keeps the browser open otherwise.
driver.quit()

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import re

# === Setup Chrome WebDriver Options ===
# Start the Chrome session that every later step in this cell drives through `driver`.
options = Options()
options.add_experimental_option("detach", True)  # Keeps the browser open after script ends
driver = webdriver.Chrome(options=options)

# === IMDb Reviews URL ===
# Review listing to scrape; the title is selected by the ID in the URL path.
url = "https://www.imdb.com/title/tt31036941/reviews/?ref_=tt_ov_ururv"
driver.get(url)
driver.maximize_window()
time.sleep(3)  # Fixed pause after navigation, before the page is queried

# === Step 1 (Optional): Load all reviews by clicking "Load More" ===
# Keep clicking the button until it can no longer be located or clicked.
while True:
    try:
        load_more = driver.find_element(By.XPATH, "//button[contains(., 'Load More')]")
        # Scroll the button into view, then click it through JavaScript.
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2)  # Pause before looking for the button again
    except Exception:
        # Catch `Exception`, not a bare `except:`. A bare clause also traps
        # KeyboardInterrupt/SystemExit, so interrupting the kernel would be
        # reported as "button not found" and the rest of the cell would keep
        # running instead of stopping.
        print("No more 'Load More' button found.")
        break

# === Step 2: Click all "See more" buttons ===
# Expand every truncated review that is on the page at this point.
see_more_buttons = driver.find_elements(By.CLASS_NAME, "ipc-see-more__button")
for btn in see_more_buttons:
    try:
        # Bring the button into view first, then fire its click from JavaScript.
        for script in ("arguments[0].scrollIntoView(true);", "arguments[0].click();"):
            driver.execute_script(script, btn)
    except Exception as err:
        print("❌ Failed to click 'See more' button:", err)
    else:
        # Only pause after a click that went through.
        time.sleep(1)

# Wait for spoilers to appear after expanding text
time.sleep(2)

# === Step 3: Reveal all spoiler sections (re-scans the page after every pass) ===
def click_all_spoilers(max_rounds=None):
    """Click every spoiler button on the page, re-scanning until none are left.

    Each pass looks up all ``<button>`` elements whose text contains
    'Spoiler', scrolls each one into view and clicks it through JavaScript.
    The page is scanned again after every pass, so buttons that only show up
    later are picked up too.

    Uses the module-level ``driver``.

    Args:
        max_rounds: Optional upper bound on the number of passes. ``None``
            (the default) keeps the original behaviour: loop until a scan
            finds no buttons or a pass manages to click nothing. Pass an
            integer to guarantee termination even if clicked buttons stay
            on the page.

    Returns:
        None.
    """
    rounds_done = 0
    while max_rounds is None or rounds_done < max_rounds:
        spoiler_buttons = driver.find_elements(By.XPATH, "//button[contains(., 'Spoiler')]")
        if not spoiler_buttons:
            break

        print(f"Found {len(spoiler_buttons)} spoiler buttons...")
        clicked = 0
        failed = 0
        for btn in spoiler_buttons:
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", btn)
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(0.3)
                clicked += 1
            except Exception:
                # Best-effort: a button that cannot be clicked must not abort
                # the pass, but count it so the failure is visible.
                failed += 1

        if failed:
            # One summary line per pass instead of one line per button.
            print(f"Could not click {failed} spoiler button(s) in this pass.")

        if not clicked:
            # Nothing was clickable; another scan would not change that.
            break

        rounds_done += 1

        # Wait to let new spoilers appear
        time.sleep(1)
        
click_all_spoilers()

# Wait a bit in case new spoilers load dynamically
time.sleep(2)

# === Step 4: Extract all review cards ===
cards = driver.find_elements(By.XPATH, "//div[contains(@class, 'ipc-list-card--border-speech')]")

clean_reviews = []
for card in cards:
    text = card.text.strip()

    # === Clean the extracted text ===
    # 1) 'N/M' score fragments such as '6 /10'. The pattern is not anchored,
    #    so it matches anywhere in the card text.
    # 2) 'Helpful • ...' up to the end of that line ('.' stops at a newline).
    for pattern in (r"\b\d+\s*/\s*\d+\b", r"Helpful\s*•.*"):
        text = re.sub(pattern, "", text)

    # A 1-4 digit number at the very end of the whole text (no MULTILINE flag).
    text = re.sub(r"\b\d{1,4}\b$", "", text.strip())

    # Flatten newlines to spaces, trim, then collapse runs of whitespace.
    text = re.sub(r"\s{2,}", " ", text.replace("\n", " ").strip())

    # Skip empty or too-short reviews (20 characters or fewer).
    if len(text) <= 20:
        continue
    clean_reviews.append(text)

# === Step 5: Create a DataFrame ===
df = pd.DataFrame(clean_reviews, columns=["review"])

# === Step 6: Print all reviews for verification ===
# for review in df["review"]:
    # print(review)
    # print("--------------------------------------------------------------------------------")

# === Step 7: Save to CSV ===
# Write the review table to disk, then report how many rows it holds.
output_csv = "Jurassic_World_Rebirth_reviews.csv"
df.to_csv(output_csv, encoding="utf-8", index=False)

print(f"\n✅ Done! Successfully saved {len(df)} reviews.")

No more 'Load More' button found.
Found 42 spoiler buttons...
Found 211 spoiler buttons...
Found 451 spoiler buttons...

✅ Done! Successfully saved 2268 reviews.


In [15]:
# Calls a helper with a review-page URL and an output CSV filename.
# NOTE(review): `scrape_imdb_reviews` is not defined in any visible cell. Presumably it wraps the
# scraping steps used in the cells above and is defined in a cell or module not shown; confirm it
# exists before this cell runs, otherwise this raises NameError on a fresh kernel.
scrape_imdb_reviews(
    url="https://www.imdb.com/title/tt0903747/reviews/?ref_=tt_ov_ururv",
    output_filename="Breaking_Bad_reviews.csv"
)

In [None]:
# End the WebDriver session held in the global `driver`.
driver.quit()