# BALANCED REVIEWS FROM IMDB

In [None]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


# ====== PROJECT SETTING ======
# Number of reviews to collect for each sentiment label.
TARGET_PER_CLASS = 700
# Overall dataset size across the three labels (positive/neutral/negative).
TARGET_TOTAL = 3 * TARGET_PER_CLASS

# Upper bound on "load more" clicks performed on a single movie page.
LOAD_MORE_PER_MOVIE = 150
# Pause, in seconds, after each "load more" click.
CLICK_SLEEP = 1.2
# Reviews with fewer whitespace-separated words than this are discarded.
MIN_WORDS = 12

# IMDb title ids whose review pages are visited, in this order.
# NOTE(review): the titles in the trailing comments were not checked against
# IMDb; several ids in the second group look like they may belong to other
# films -- verify before relying on that group as a source of low ratings.
movie_ids = [
    # --- first group ---
    "tt0137523",  # Fight Club
    "tt0111161",  # Shawshank
    "tt0068646",  # The Godfather
    "tt0468569",  # The Dark Knight
    "tt1375666",  # Inception
    "tt0816692",  # Interstellar
    "tt0102926",  # Silence of the Lambs
    "tt0114369",  # Se7en
    "tt0172495",  # Gladiator
    "tt0133093",  # The Matrix
    "tt0167261",  # LOTR: Return of the King
    "tt0080684",  # The Empire Strikes Back
    "tt0076759",  # Star Wars
    "tt0120737",  # LOTR: Fellowship of the Ring
    "tt0120586",  # American History X
    "tt0167404",  # The Sixth Sense
    "tt0209144",  # Memento
    "tt0372784",  # Batman Begins
    "tt0118799",  # Life Is Beautiful
    "tt0108052",  # Schindler's List
    "tt0407887",  # The Departed
    "tt0993846",  # The Wolf of Wall Street
    "tt1130884",  # Shutter Island
    "tt0264464",  # Catch Me If You Can
    "tt1675434",  # The Intouchables
    "tt4154756",  # Avengers: Infinity War
    "tt4633694",  # Spider-Man: Into the Spider-Verse
    "tt2380307",  # Coco
    # --- second group ---
    "tt0232500",  # The Room
    "tt12538646",  # 365 Days
    "tt3778644",  # Ghostbusters
    "tt0293662",  # The Cat in the Hat
    "tt2450186",  # The Happening
    "tt0818157",  # The Incredible Hulk
    "tt0099785",  # Highlander 2
    "tt0419706",  # Transformers
    "tt2109248",  # Transformers: Age of Extinction
    "tt9243804",  # Cats (2019)
    "tt1571249",  # RoboCop (2014)
    "tt1045658",  # The Love Guru
    "tt0298203",  # Catwoman
    "tt1600195",  # Grown Ups 2
]


def label_from_rating(r):
    """Map a numeric rating to a sentiment label.

    Ratings of 4 or less are "negative", ratings of 7 or more are
    "positive", and anything in between (5-6 for whole-number ratings)
    is "neutral".
    """
    if r <= 4:
        return "negative"
    if r >= 7:
        return "positive"
    return "neutral"


# ====== SELENIUM SETTING ======
# Chrome command-line switches passed at start-up.
opts = Options()
for chrome_flag in (
    "--lang=en-US",
    "--window-size=1400,900",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    opts.add_argument(chrome_flag)

# NOTE(review): this preference presumably turns image loading off (value 2)
# to speed up page loads -- confirm against the Chrome preference docs.
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# webdriver_manager resolves the chromedriver binary handed to the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=opts)

# Shared explicit wait with a 25-second timeout.
wait = WebDriverWait(driver, 25)


# ====== GLOBAL VARIABLES ======
# Accepted reviews, one dict per row, accumulated across all movies.
all_rows = []

# Running tally of accepted reviews for each sentiment label.
class_counts = {label: 0 for label in ("positive", "neutral", "negative")}


# ====== AUXILIARY FUNCTIONS ======
def accept_consent_if_present():
    """Click a consent button on the current page if one is clickable.

    Looks for a ``<button>`` whose text contains one of a few accept/agree
    phrases. All candidates share a single wait of at most 3 seconds and
    are checked in the listed order on every poll.

    Returns:
        True if a button was found and clicked, False otherwise.
    """
    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    candidates = [EC.element_to_be_clickable((By.XPATH, xp)) for xp in xpaths]

    try:
        # One shared wait: when no banner exists this costs 3 s in total
        # rather than 3 s per XPath.
        btn = WebDriverWait(driver, 3).until(EC.any_of(*candidates))
    except TimeoutException:
        return False

    try:
        btn.click()
    except WebDriverException:
        # Best effort: a button that cannot be clicked is treated as absent.
        return False

    # Give the page a moment to dismiss the dialog.
    time.sleep(1)
    return True


def click_load_more(max_clicks):
    """Repeatedly click the "see more" button on the current page.

    Stops early as soon as the button cannot be found or clicked.

    Args:
        max_clicks: Upper bound on the number of clicks attempted.

    Returns:
        The number of clicks actually performed.
    """
    clicks = 0
    for _ in range(max_clicks):
        # Scroll to the bottom of the page before looking for the button.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.ipc-see-more__button"))
            )
            # Clicked through JavaScript instead of WebElement.click().
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # Covers TimeoutException (no clickable button within 3 s) and
            # click failures; Ctrl-C and SystemExit are no longer swallowed.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks


# "N/10" rating pattern. Whitespace around the slash is tolerated because
# get_text(" ") inserts a space between text coming from separate tags.
_RATING_RE = re.compile(r"\b(\d{1,2})\s*/\s*10\b")

# Tried in order to locate the review body inside a new-layout <article>.
_REVIEW_BODY_SELECTORS = (
    '[data-testid="review-content"]',
    "div.ipc-html-content-inner-div",
    "div.text.show-more__control",
)


def _is_usable(text, rating):
    """Return True if *text* has at least MIN_WORDS words and *rating* is 1-10."""
    # Ratings outside 1-10 (e.g. "15/10" written inside a review) are
    # treated as parse noise rather than real scores.
    return len(text.split()) >= MIN_WORDS and 1 <= rating <= 10


def _parse_old_layout(cards, movie_id):
    """Extract (text, rating, movie_id) tuples from ``div.review-container`` cards."""
    rows = []
    for card in cards:
        text_tag = card.select_one("div.text.show-more__control")
        rating_tag = card.select_one("span.rating-other-user-rating span")
        if not text_tag or not rating_tag:
            continue

        # Space separator so words split only by tags (e.g. <br>) do not
        # fuse together; matches what the new-layout path does.
        review_text = text_tag.get_text(" ", strip=True)

        try:
            rating = int(rating_tag.get_text(strip=True))
        except ValueError:
            continue

        if _is_usable(review_text, rating):
            rows.append((review_text, rating, movie_id))
    return rows


def _parse_new_layout(articles, movie_id):
    """Extract (text, rating, movie_id) tuples from ``<article>`` elements."""
    rows = []
    for article in articles:
        block_text = article.get_text(" ", strip=True)
        match = _RATING_RE.search(block_text)
        if not match:
            continue
        rating = int(match.group(1))

        text = None
        for selector in _REVIEW_BODY_SELECTORS:
            node = article.select_one(selector)
            if node:
                text = node.get_text(" ", strip=True)
                break

        # No dedicated body element (or it was empty): fall back to the
        # whole article text.
        if not text:
            text = block_text

        if _is_usable(text, rating):
            rows.append((text, rating, movie_id))
    return rows


def parse_reviews_from_html(html, movie_id):
    """Parse rated reviews out of a review page's HTML.

    Two page layouts are handled: if any ``div.review-container`` element is
    present the page is parsed as the old layout only; otherwise every
    ``<article>`` element is scanned as the new layout.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied into every returned tuple.

    Returns:
        A list of ``(review_text, rating, movie_id)`` tuples. Reviews without
        a parseable rating, with a rating outside 1-10, or with fewer than
        MIN_WORDS words are skipped.
    """
    soup = BeautifulSoup(html, "lxml")

    cards = soup.select("div.review-container")
    if cards:
        return _parse_old_layout(cards, movie_id)

    return _parse_new_layout(soup.select("article"), movie_id)


# ====== MAIN SCRAPING ======
def _targets_reached():
    """Return True once every label has at least TARGET_PER_CLASS reviews."""
    return all(count >= TARGET_PER_CLASS for count in class_counts.values())


# Review texts already accepted. Duplicates are skipped *before* they are
# counted, so class_counts only ever reflects unique reviews and the
# per-class targets are not eroded by a later de-duplication step.
seen_texts = {row["review_text"] for row in all_rows}

try:
    for mid in movie_ids:
        if _targets_reached():
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2.0)
        accept_consent_if_present()

        try:
            # Proceed once the page source shows either the "see more"
            # button class or a "/10" rating fragment.
            wait.until(lambda d: "ipc-see-more__button" in d.page_source or "/10" in d.page_source)
        except WebDriverException:
            # Includes TimeoutException; skip this movie and keep going.
            print(mid, "Page did not load reviews properly (blocked/consent).")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)

        html = driver.page_source
        parsed = parse_reviews_from_html(html, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            if review_text in seen_texts:
                continue

            label = label_from_rating(rating)

            # This class is already full; other classes may still need rows.
            if class_counts[label] >= TARGET_PER_CLASS:
                continue

            seen_texts.add(review_text)
            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })

            class_counts[label] += 1
            added += 1

            if _targets_reached():
                break

        print(mid, f"LoadMoreClickes={clicks}", f"Added={added}", f"Counts={class_counts}")

finally:
    # Always shut the browser down, even if scraping failed part-way.
    driver.quit()


# ====== DATAFRAME & CSV ======
# Explicit columns keep the frame well-formed even when nothing was scraped:
# an empty all_rows would otherwise produce a column-less frame and the
# "review_text" / "label" lookups below would raise KeyError.
df = (
    pd.DataFrame(all_rows, columns=["review_text", "label", "rating", "movie_id"])
    .drop_duplicates(subset=["review_text"])
    .reset_index(drop=True)
)

print("\nFINAL CLASS DISTRIBUTION:")
print(df["label"].value_counts())

# "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
df.to_csv("imdb_reviews_balanced_2100.csv", index=False, encoding="utf-8-sig")
print("\nSaved: imdb_reviews_balanced_2100.csv")