In [None]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ====== Project settings ======
TARGET_TOTAL = 2200            # collect more than 2000 as a safety margin
LOAD_MORE_PER_MOVIE = 120      # increase this if too few reviews are collected
CLICK_SLEEP = 1.2              # be polite to the site (do not set this to 0)
MIN_WORDS = 12                 # text quality (very short reviews are excluded)

# Movie list (no Romance) — add/change entries as needed
movie_ids = [
    "tt0137523",  # Fight Club
    "tt0111161",  # Shawshank
    "tt0068646",  # The Godfather
    "tt0468569",  # The Dark Knight
    "tt1375666",  # Inception
    "tt0816692",  # Interstellar
    "tt0102926",  # Silence of the Lambs
    "tt0114369",  # Se7en
    "tt0172495",  # Gladiator
    "tt0133093",  # The Matrix
    "tt0167261",  # LOTR: Return of the King
    "tt0080684",  # The Empire Strikes Back
    "tt0076759",  # Star Wars
]

def label_from_rating(r):
    """Translate a numeric rating into one of three sentiment classes.

    Three classes are used (this makes it easier to reach the 2000-row goal):
    4 or lower is "negative", 7 or higher is "positive", and everything in
    between (5-6) is "neutral".
    """
    if r <= 4:
        return "negative"
    if r >= 7:
        return "positive"
    return "neutral"

# ====== Selenium setup (faster + fewer ads) ======
opts = Options()
# Once everything is confirmed to work, enable headless mode for speed:
# opts.add_argument("--headless=new")

opts.add_argument("--lang=en-US")
opts.add_argument("--window-size=1400,900")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")

# Reduce slowness: turn off image loading as far as possible
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# One Chrome session and one 25-second explicit wait are shared by the
# helper functions and the scraping loop below.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
wait = WebDriverWait(driver, 25)

# Accumulates one dict per collected review; turned into a DataFrame at the end.
all_rows = []

def accept_consent_if_present():
    """Click a consent/cookie button on the current page if one shows up.

    Uses the module-level ``driver``. Each caption pattern is tried in turn
    with its own 3-second wait; the first button that becomes clickable is
    clicked.

    Returns:
        True if a button was clicked, False if no pattern produced a
        clickable button.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    # Consent buttons differ between variants, so several patterns are tried.
    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, xp)))
            btn.click()
        except WebDriverException:
            # Timeout or failed click: best effort, move on to the next
            # pattern. Unlike a bare ``except``, this lets Ctrl+C through.
            continue
        time.sleep(1)
        return True
    return False

def click_load_more(max_clicks):
    """Repeatedly click the ``button.ipc-see-more__button`` element.

    Uses the module-level ``driver``. Before each attempt the page is
    scrolled to the bottom; the loop ends early as soon as the button cannot
    be found/clicked within 3 seconds.

    Args:
        max_clicks: Upper bound on the number of clicks.

    Returns:
        The number of clicks actually performed.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.ipc-see-more__button"))
            )
            # JavaScript click, as in the original implementation.
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # No clickable button (any more): stop. A bare ``except`` here
            # would also have swallowed KeyboardInterrupt.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks

def parse_reviews_from_html(html, movie_id):
    """Extract ``(review_text, rating, movie_id)`` tuples from a reviews page.

    Two page layouts are handled:

    * (A) Old layout: ``div.review-container`` cards with dedicated text and
      rating elements. If any such card is present, only this layout is
      parsed.
    * (B) New layout: ``<article>`` elements whose text contains a rating
      written like ``8/10``.

    Reviews with fewer than ``MIN_WORDS`` whitespace-separated words, and
    cards without a usable rating, are skipped.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied unchanged into every returned tuple.

    Returns:
        List of ``(review_text, rating, movie_id)`` tuples; ``rating`` is an
        int.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []

    # (A) Old layout: div.review-container
    cards = soup.select("div.review-container")
    if cards:
        for c in cards:
            text_tag = c.select_one("div.text.show-more__control")
            rating_tag = c.select_one("span.rating-other-user-rating span")
            if not text_tag or not rating_tag:
                continue
            # Join text nodes with a space (as branch (B) does) so words on
            # either side of <br>/inline tags are not glued together, which
            # would also distort the MIN_WORDS count.
            review_text = text_tag.get_text(" ", strip=True)
            if len(review_text.split()) < MIN_WORDS:
                continue
            try:
                rating = int(rating_tag.get_text(strip=True))
            except ValueError:
                # Rating text is not a plain integer; skip this card.
                continue
            rows.append((review_text, rating, movie_id))
        return rows

    # (B) New layout: article cards + a rating such as "8/10".
    # Only 1-10 is accepted, so text like "0/10" or "20/10" is not mistaken
    # for the score.
    rating_re = re.compile(r"\b(10|[1-9])/10\b")
    for a in soup.select("article"):
        block_text = a.get_text(" ", strip=True)
        m = rating_re.search(block_text)
        if not m:
            continue
        rating = int(m.group(1))

        # Try common selectors for the review body first.
        text = None
        for sel in [
            '[data-testid="review-content"]',
            "div.ipc-html-content-inner-div",
            "div.text.show-more__control",
        ]:
            t = a.select_one(sel)
            if t:
                text = t.get_text(" ", strip=True)
                break
        if not text:
            # Fallback: take the whole block; it is still filtered by length.
            text = block_text

        if len(text.split()) < MIN_WORDS:
            continue

        rows.append((text, rating, movie_id))

    return rows

# Script-level import kept next to its only use: it narrows the page-load
# error handling below (a bare ``except`` also swallowed Ctrl+C).
from selenium.common.exceptions import WebDriverException

try:
    for mid in movie_ids:
        # Stop visiting further movies once enough rows are collected.
        if len(all_rows) >= TARGET_TOTAL:
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2.0)
        accept_consent_if_present()

        # Wait for a sign that the page has arrived: either the load-more
        # button class or a "/10" somewhere in the page source.
        try:
            wait.until(lambda d: ("ipc-see-more__button" in d.page_source) or ("/10" in d.page_source))
        except WebDriverException:
            print(mid, "Page did not load reviews properly (blocked/consent).")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)

        html = driver.page_source
        parsed = parse_reviews_from_html(html, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            label = label_from_rating(rating)
            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })
            added += 1
            if len(all_rows) >= TARGET_TOTAL:
                break

        print(mid, f"LoadMoreClicks={clicks} ParsedRows={len(parsed)} Added={added} TotalSoFar={len(all_rows)}")

finally:
    # Always close the browser, even when scraping fails or is interrupted.
    driver.quit()

# Explicit columns: if nothing was scraped (all_rows == []), the frame still
# has a "review_text" column, so drop_duplicates does not raise KeyError and
# an empty CSV with a header row is written instead.
df = pd.DataFrame(all_rows, columns=["review_text", "label", "rating", "movie_id"])
df = df.drop_duplicates(subset=["review_text"]).reset_index(drop=True)
print("Final total after drop_duplicates:", len(df))

df.to_csv("imdb_reviews_scraped_2000plus.csv", index=False, encoding="utf-8-sig")
print("Saved: imdb_reviews_scraped_2000plus.csv")

In [1]:
from bs4 import BeautifulSoup
# Smoke test: building a document with the "lxml" parser confirms that both
# bs4 and the lxml parser are usable in this environment.
soup = BeautifulSoup("<html></html>", "lxml")
print("OK")

OK


In [4]:
import sys
# Print the path of the Python interpreter running this kernel.
print(sys.executable)


C:\Users\Monster Huma H5 v4.1\.conda\envs\lfd_project\python.exe


# BALANCED REVIEWS FROM IMDB

In [None]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ====== PROJECT SETTINGS ======
# Per-class quota enforced by the scraping loop below (via class_counts).
TARGET_PER_CLASS = 700
# Derived overall size (3 classes); not referenced again in this cell.
TARGET_TOTAL = TARGET_PER_CLASS * 3

LOAD_MORE_PER_MOVIE = 150  # max clicks passed to click_load_more per movie
CLICK_SLEEP = 1.2          # seconds slept after each load-more click
MIN_WORDS = 12             # reviews with fewer words are discarded by the parser

# IMDb title ids whose review pages are visited, in this order.
movie_ids = [
    "tt0137523",  # Fight Club
    "tt0111161",  # Shawshank
    "tt0068646",  # The Godfather
    "tt0468569",  # The Dark Knight
    "tt1375666",  # Inception
    "tt0816692",  # Interstellar
    "tt0102926",  # Silence of the Lambs
    "tt0114369",  # Se7en
    "tt0172495",  # Gladiator
    "tt0133093",  # The Matrix
    "tt0167261",  # LOTR: Return of the King
    "tt0080684",  # The Empire Strikes Back
    "tt0076759",  # Star Wars
    "tt0120737",  # LOTR: Fellowship of the Ring
    "tt0120586",  # American History X
    "tt0167404",  # The Sixth Sense
    "tt0209144",  # Memento
    "tt0372784",  # Batman Begins
    "tt0118799",  # Life Is Beautiful
    "tt0108052",  # Schindler's List
    "tt0407887",  # The Departed
    "tt0993846",  # The Wolf of Wall Street
    "tt1130884",  # Shutter Island
    "tt0264464",  # Catch Me If You Can
    "tt1675434",  # The Intouchables
    "tt4154756",  # Avengers: Infinity War
    "tt4633694",  # Spider-Man: Into the Spider-Verse
    "tt2380307",  # Coco
    "tt0232500",  # The Room
    "tt12538646", # 365 Days
    "tt3778644",  # Ghostbusters
    "tt0293662",  # The Cat in the Hat
    "tt2450186",  # The Happening
    "tt0818157",  # The Incredible Hulk
    "tt0099785",  # Highlander 2
    "tt0419706",  # Transformers
    "tt2109248",  # Transformers: Age of Extinction
    "tt9243804",  # Cats (2019)
    "tt1571249",  # RoboCop (2014)
    "tt1045658",  # The Love Guru
    "tt0298203",  # Catwoman
    "tt1600195",  # Grown Ups 2
]


def label_from_rating(r):
    """Return the sentiment class for a rating.

    7 and above -> "positive"; 4 and below -> "negative"; otherwise (5-6)
    -> "neutral".
    """
    if r >= 7:
        return "positive"
    return "negative" if r <= 4 else "neutral"


# ====== SELENIUM SETTINGS ======
opts = Options()
opts.add_argument("--lang=en-US")
opts.add_argument("--window-size=1400,900")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")

# Image loading is switched off to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# One Chrome session and one 25-second explicit wait are shared by the
# helper functions and the scraping loop below.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
wait = WebDriverWait(driver, 25)


# ====== GLOBAL VARIABLES ======
# One dict per accepted review; turned into a DataFrame at the end.
all_rows = []

# Number of accepted reviews per label; compared against TARGET_PER_CLASS.
class_counts = {
    "positive": 0,
    "neutral": 0,
    "negative": 0
}


# ====== AUXILIARY FUNCTIONS ======
def accept_consent_if_present():
    """Click a consent/cookie button on the current page if one shows up.

    Uses the module-level ``driver``. Each caption pattern is tried in turn
    with its own 3-second wait; the first button that becomes clickable is
    clicked.

    Returns:
        True if a button was clicked, False if no pattern produced a
        clickable button.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, xp)))
            btn.click()
        except WebDriverException:
            # Timeout or failed click: best effort, move on to the next
            # pattern. Unlike a bare ``except``, this lets Ctrl+C through.
            continue
        time.sleep(1)
        return True
    return False


def click_load_more(max_clicks):
    """Repeatedly click the ``button.ipc-see-more__button`` element.

    Uses the module-level ``driver``. Before each attempt the page is
    scrolled to the bottom; the loop ends early as soon as the button cannot
    be found/clicked within 3 seconds.

    Args:
        max_clicks: Upper bound on the number of clicks.

    Returns:
        The number of clicks actually performed.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.ipc-see-more__button"))
            )
            # JavaScript click, as in the original implementation.
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # No clickable button (any more): stop. A bare ``except`` here
            # would also have swallowed KeyboardInterrupt.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks


def parse_reviews_from_html(html, movie_id):
    """Extract ``(review_text, rating, movie_id)`` tuples from a reviews page.

    Two page layouts are handled:

    * Old layout: ``div.review-container`` cards with dedicated text and
      rating elements. If any such card is present, only this layout is
      parsed.
    * New layout: ``<article>`` elements whose text contains a rating
      written like ``8/10``.

    Reviews with fewer than ``MIN_WORDS`` whitespace-separated words, and
    cards without a usable rating, are skipped.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied unchanged into every returned tuple.

    Returns:
        List of ``(review_text, rating, movie_id)`` tuples; ``rating`` is an
        int.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []

    # ---- Old layout ----
    cards = soup.select("div.review-container")
    if cards:
        for c in cards:
            text_tag = c.select_one("div.text.show-more__control")
            rating_tag = c.select_one("span.rating-other-user-rating span")
            if not text_tag or not rating_tag:
                continue
            # Join text nodes with a space (as the new-layout branch does) so
            # words on either side of <br>/inline tags are not glued
            # together, which would also distort the MIN_WORDS count.
            review_text = text_tag.get_text(" ", strip=True)
            if len(review_text.split()) < MIN_WORDS:
                continue

            try:
                rating = int(rating_tag.get_text(strip=True))
            except ValueError:
                # Rating text is not a plain integer; skip this card.
                continue

            rows.append((review_text, rating, movie_id))
        return rows

    # ---- New layout ----
    # Only 1-10 is accepted, so text like "0/10" or "20/10" is not mistaken
    # for the score.
    rating_re = re.compile(r"\b(10|[1-9])/10\b")
    for a in soup.select("article"):
        block_text = a.get_text(" ", strip=True)
        m = rating_re.search(block_text)
        if not m:
            continue
        rating = int(m.group(1))

        # Try common selectors for the review body first.
        text = None
        for sel in [
            '[data-testid="review-content"]',
            "div.ipc-html-content-inner-div",
            "div.text.show-more__control",
        ]:
            t = a.select_one(sel)
            if t:
                text = t.get_text(" ", strip=True)
                break

        if not text:
            # Fallback: take the whole block; it is still filtered by length.
            text = block_text

        if len(text.split()) < MIN_WORDS:
            continue

        rows.append((text, rating, movie_id))

    return rows


# ====== MAIN SCRAPING ======
# Script-level import kept next to its only use: it narrows the page-load
# error handling below (a bare ``except`` also swallowed Ctrl+C).
from selenium.common.exceptions import WebDriverException

# Review texts already accepted. Duplicates are skipped while collecting so
# that class_counts reflects the rows that survive drop_duplicates later;
# otherwise a class could end up below TARGET_PER_CLASS in the final CSV.
seen_texts = {row["review_text"] for row in all_rows}

try:
    for mid in movie_ids:

        # Stop visiting further movies once every class quota is filled.
        if all(class_counts[c] >= TARGET_PER_CLASS for c in class_counts):
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2.0)
        accept_consent_if_present()

        # Wait for a sign that the page has arrived: either the load-more
        # button class or a "/10" somewhere in the page source.
        try:
            wait.until(lambda d: "ipc-see-more__button" in d.page_source or "/10" in d.page_source)
        except WebDriverException:
            print(mid, "Page did not load reviews properly (blocked/consent).")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)

        html = driver.page_source
        parsed = parse_reviews_from_html(html, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            label = label_from_rating(rating)

            # Skip classes that are already full and texts already taken.
            if class_counts[label] >= TARGET_PER_CLASS:
                continue
            if review_text in seen_texts:
                continue
            seen_texts.add(review_text)

            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })

            class_counts[label] += 1
            added += 1

            if all(class_counts[c] >= TARGET_PER_CLASS for c in class_counts):
                break

        print(mid, f"LoadMoreClickes={clicks}", f"Added={added}", f"Counts={class_counts}")

finally:
    # Always close the browser, even when scraping fails or is interrupted.
    driver.quit()


# ====== DATAFRAME & CSV ======
# Explicit columns: if nothing was scraped (all_rows == []), the frame still
# has "review_text" and "label" columns, so neither drop_duplicates nor the
# value_counts below raises KeyError.
df = pd.DataFrame(all_rows, columns=["review_text", "label", "rating", "movie_id"])
df = df.drop_duplicates(subset=["review_text"]).reset_index(drop=True)

print("\nFINAL CLASS DISTRIBUTION:")
print(df["label"].value_counts())

df.to_csv("imdb_reviews_balanced_2100.csv", index=False, encoding="utf-8-sig")
print("\nSaved: imdb_reviews_balanced_2100.csv")

In [3]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ====== PROJECT SETTINGS ======
# NOTE: NEEDED_NEUTRAL / NEEDED_NEGATIVE are not read anywhere in this cell;
# the scraping loop below balances on TARGET_PER_CLASS instead.
NEEDED_NEUTRAL = 430
NEEDED_NEGATIVE = 515

# TARGET_PER_CLASS was used here without being defined in this cell, which
# raises NameError on a fresh kernel. 700 is the value set in the earlier
# balanced-scrape cell and matches the cap visible in this cell's logged
# output (positive stops at 700).
TARGET_PER_CLASS = 700
TARGET_TOTAL = TARGET_PER_CLASS * 3

LOAD_MORE_PER_MOVIE = 150  # max clicks passed to click_load_more per movie
CLICK_SLEEP = 1.2          # seconds slept after each load-more click
MIN_WORDS = 12             # reviews with fewer words are discarded by the parser

# IMDb title ids whose review pages are visited, in this order.
movie_ids = [
    "tt0137523",  # Fight Club
    "tt0111161",  # Shawshank
    "tt0068646",  # The Godfather
    "tt0468569",  # The Dark Knight
    "tt1375666",  # Inception
    "tt0816692",  # Interstellar
    "tt0102926",  # Silence of the Lambs
    "tt0114369",  # Se7en
    "tt0172495",  # Gladiator
    "tt0133093",  # The Matrix
    "tt0167261",  # LOTR: Return of the King
    "tt0080684",  # The Empire Strikes Back
    "tt0076759",  # Star Wars
    "tt0120737",  # LOTR: Fellowship of the Ring
    "tt0120586",  # American History X
    "tt0167404",  # The Sixth Sense
    "tt0209144",  # Memento
    "tt0372784",  # Batman Begins
    "tt0118799",  # Life Is Beautiful
    "tt0108052",  # Schindler's List
    "tt0407887",  # The Departed
    "tt0993846",  # The Wolf of Wall Street
    "tt1130884",  # Shutter Island
    "tt0264464",  # Catch Me If You Can
    "tt1675434",  # The Intouchables
    "tt4154756",  # Avengers: Infinity War
    "tt4633694",  # Spider-Man: Into the Spider-Verse
    "tt2380307",  # Coco
]


def label_from_rating(r):
    """Classify a rating as "positive" (>= 7), "negative" (<= 4) or "neutral" (5-6)."""
    is_high = r >= 7
    is_low = r <= 4
    if is_high:
        return "positive"
    if is_low:
        return "negative"
    return "neutral"


# ====== SELENIUM SETTINGS ======
opts = Options()
# opts.add_argument("--headless=new")  # can be enabled once everything works
opts.add_argument("--lang=en-US")
opts.add_argument("--window-size=1400,900")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")

# Image loading is switched off to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# One Chrome session shared by the helper functions and the scraping loop.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=opts
)

# Shared explicit wait with a 25-second timeout.
wait = WebDriverWait(driver, 25)


# ====== GLOBAL VARIABLES ======
# One dict per accepted review; turned into a DataFrame at the end.
all_rows = []

# Number of accepted reviews per label; compared against TARGET_PER_CLASS.
class_counts = {
    "positive": 0,
    "neutral": 0,
    "negative": 0
}


# ====== HELPER FUNCTIONS ======
def accept_consent_if_present():
    """Click a consent/cookie button on the current page if one shows up.

    Uses the module-level ``driver``. Each caption pattern is tried in turn
    with its own 3-second wait; the first button that becomes clickable is
    clicked.

    Returns:
        True if a button was clicked, False if no pattern produced a
        clickable button.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, xp))
            )
            btn.click()
        except WebDriverException:
            # Timeout or failed click: best effort, move on to the next
            # pattern. Unlike a bare ``except``, this lets Ctrl+C through.
            continue
        time.sleep(1)
        return True
    return False


def click_load_more(max_clicks):
    """Repeatedly click the ``button.ipc-see-more__button`` element.

    Uses the module-level ``driver``. Before each attempt the page is
    scrolled to the bottom; the loop ends early as soon as the button cannot
    be found/clicked within 3 seconds.

    Args:
        max_clicks: Upper bound on the number of clicks.

    Returns:
        The number of clicks actually performed.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);"
        )
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.ipc-see-more__button")
                )
            )
            # JavaScript click, as in the original implementation.
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # No clickable button (any more): stop. A bare ``except`` here
            # would also have swallowed KeyboardInterrupt.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks


def parse_reviews_from_html(html, movie_id):
    """Extract ``(review_text, rating, movie_id)`` tuples from a reviews page.

    Two page layouts are handled:

    * Old layout: ``div.review-container`` cards with dedicated text and
      rating elements. If any such card is present, only this layout is
      parsed.
    * New layout: ``<article>`` elements whose text contains a rating
      written like ``8/10``.

    Reviews with fewer than ``MIN_WORDS`` whitespace-separated words, and
    cards without a usable rating, are skipped.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied unchanged into every returned tuple.

    Returns:
        List of ``(review_text, rating, movie_id)`` tuples; ``rating`` is an
        int.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []

    # ---- Old layout ----
    cards = soup.select("div.review-container")
    if cards:
        for c in cards:
            text_tag = c.select_one("div.text.show-more__control")
            rating_tag = c.select_one(
                "span.rating-other-user-rating span"
            )
            if not text_tag or not rating_tag:
                continue

            # Join text nodes with a space (as the new-layout branch does) so
            # words on either side of <br>/inline tags are not glued
            # together, which would also distort the MIN_WORDS count.
            text = text_tag.get_text(" ", strip=True)
            if len(text.split()) < MIN_WORDS:
                continue

            try:
                rating = int(rating_tag.get_text(strip=True))
            except ValueError:
                # Rating text is not a plain integer; skip this card.
                continue

            rows.append((text, rating, movie_id))
        return rows

    # ---- New layout ----
    # Only 1-10 is accepted, so text like "0/10" or "20/10" is not mistaken
    # for the score.
    rating_re = re.compile(r"\b(10|[1-9])/10\b")
    for a in soup.select("article"):
        block_text = a.get_text(" ", strip=True)
        m = rating_re.search(block_text)
        if not m:
            continue

        rating = int(m.group(1))

        # Try common selectors for the review body first.
        text = None
        for sel in [
            '[data-testid="review-content"]',
            "div.ipc-html-content-inner-div",
            "div.text.show-more__control",
        ]:
            t = a.select_one(sel)
            if t:
                text = t.get_text(" ", strip=True)
                break

        if not text:
            # Fallback: take the whole block; it is still filtered by length.
            text = block_text

        if len(text.split()) < MIN_WORDS:
            continue

        rows.append((text, rating, movie_id))

    return rows


# ====== MAIN SCRAPING ======
# Script-level import kept next to its only use: it narrows the page-load
# error handling below (a bare ``except`` also swallowed Ctrl+C).
from selenium.common.exceptions import WebDriverException

# Review texts already accepted. Duplicates are skipped while collecting so
# that class_counts reflects the rows that survive drop_duplicates later;
# otherwise a class could end up below TARGET_PER_CLASS in the final CSV.
seen_texts = {row["review_text"] for row in all_rows}

try:
    for mid in movie_ids:

        # Stop visiting further movies once every class quota is filled.
        if all(class_counts[c] >= TARGET_PER_CLASS for c in class_counts):
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2)
        accept_consent_if_present()

        # Wait for a sign that the page has arrived: either the load-more
        # button class or a "/10" somewhere in the page source.
        try:
            wait.until(
                lambda d: "ipc-see-more__button" in d.page_source
                or "/10" in d.page_source
            )
        except WebDriverException:
            print(mid, "-> Reviews yüklenemedi")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)
        parsed = parse_reviews_from_html(driver.page_source, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            label = label_from_rating(rating)

            # Skip classes that are already full and texts already taken.
            if class_counts[label] >= TARGET_PER_CLASS:
                continue
            if review_text in seen_texts:
                continue
            seen_texts.add(review_text)

            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })

            class_counts[label] += 1
            added += 1

            if all(class_counts[c] >= TARGET_PER_CLASS for c in class_counts):
                break

        print(
            mid,
            f"LoadMore={clicks}",
            f"Added={added}",
            f"Counts={class_counts}"
        )

finally:
    # Always close the browser, even when scraping fails or is interrupted.
    driver.quit()


# ====== DATAFRAME & CSV ======
# Explicit columns: if nothing was scraped (all_rows == []), the frame still
# has "review_text" and "label" columns, so neither drop_duplicates nor the
# value_counts below raises KeyError.
df = (
    pd.DataFrame(
        all_rows,
        columns=["review_text", "label", "rating", "movie_id"],
    )
    .drop_duplicates(subset=["review_text"])
    .reset_index(drop=True)
)

print("\nFINAL CLASS DISTRIBUTION:")
print(df["label"].value_counts())

df.to_csv(
    "imdb_reviews_balanced_2100.csv",
    index=False,
    encoding="utf-8-sig"
)

print("\nSaved: imdb_reviews_balanced_2100.csv")


tt0137523 LoadMore=150 Added=257 Counts={'positive': 206, 'neutral': 30, 'negative': 21}
tt0111161 LoadMore=150 Added=188 Counts={'positive': 365, 'neutral': 47, 'negative': 33}
tt0068646 LoadMore=150 Added=205 Counts={'positive': 532, 'neutral': 64, 'negative': 54}
tt0468569 LoadMore=150 Added=212 Counts={'positive': 700, 'neutral': 90, 'negative': 72}
tt1375666 LoadMore=150 Added=35 Counts={'positive': 700, 'neutral': 111, 'negative': 86}
tt0816692 LoadMore=150 Added=38 Counts={'positive': 700, 'neutral': 129, 'negative': 106}
tt0102926 LoadMore=76 Added=23 Counts={'positive': 700, 'neutral': 146, 'negative': 112}
tt0114369 LoadMore=91 Added=36 Counts={'positive': 700, 'neutral': 170, 'negative': 124}
tt0172495 LoadMore=132 Added=41 Counts={'positive': 700, 'neutral': 195, 'negative': 140}
tt0133093 LoadMore=150 Added=48 Counts={'positive': 700, 'neutral': 221, 'negative': 162}
tt0167261 LoadMore=115 Added=46 Counts={'positive': 700, 'neutral': 251, 'negative': 178}
tt0080684 LoadMor

In [5]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ====== PROJECT SETTINGS ======
# Quotas for this top-up run: the scraping loop below stops once this many
# neutral and negative reviews have been collected (positives are skipped).
NEEDED_NEUTRAL = 430
NEEDED_NEGATIVE = 515

LOAD_MORE_PER_MOVIE = 150  # max clicks passed to click_load_more per movie
CLICK_SLEEP = 1.2          # seconds slept after each load-more click
MIN_WORDS = 12             # reviews with fewer words are discarded by the parser

# IMDb title ids whose review pages are visited, in this order.
movie_ids = [
    "tt0120737",  # LOTR: Fellowship of the Ring
    "tt0120586",  # American History X
    "tt0167404",  # The Sixth Sense
    "tt0209144",  # Memento
    "tt0372784",  # Batman Begins
    "tt0118799",  # Life Is Beautiful
    "tt0108052",  # Schindler's List
    "tt0407887",  # The Departed
    "tt0993846",  # The Wolf of Wall Street
    "tt1130884",  # Shutter Island
    "tt0264464",  # Catch Me If You Can
    "tt1675434",  # The Intouchables
    "tt4154756",  # Avengers: Infinity War
    "tt4633694",  # Spider-Man: Into the Spider-Verse
    "tt2380307",  # Coco
    "tt0232500",  # The Room
    "tt12538646", # 365 Days
    "tt3778644",  # Ghostbusters
    "tt0293662",  # The Cat in the Hat
    "tt2450186",  # The Happening
    "tt0818157",  # The Incredible Hulk
    "tt0099785",  # Highlander 2
    "tt0419706",  # Transformers
    "tt2109248",  # Transformers: Age of Extinction
    "tt9243804",  # Cats (2019)
]


def label_from_rating(r):
    """Return "negative" for ratings <= 4, "neutral" for 5-6, else "positive"."""
    if r <= 4:
        return "negative"
    return "neutral" if 5 <= r <= 6 else "positive"


# ====== SELENIUM SETTINGS ======
opts = Options()
# opts.add_argument("--headless=new")
opts.add_argument("--lang=en-US")
opts.add_argument("--window-size=1400,900")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")

# Image loading is switched off to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# One Chrome session shared by the helper functions and the scraping loop.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=opts
)

# Shared explicit wait with a 25-second timeout.
wait = WebDriverWait(driver, 25)


# ====== GLOBAL ======
# One dict per accepted review; turned into a DataFrame at the end.
all_rows = []
# Only the two classes collected by this run are tracked.
class_counts = {
    "neutral": 0,
    "negative": 0
}


# ====== HELPER FUNCTIONS ======
def accept_consent_if_present():
    """Click a consent/cookie button on the current page if one shows up.

    Uses the module-level ``driver``. Each caption pattern is tried in turn
    with its own 3-second wait; the first button that becomes clickable is
    clicked.

    Returns:
        True if a button was clicked, False if no pattern produced a
        clickable button.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, xp))
            )
            btn.click()
        except WebDriverException:
            # Timeout or failed click: best effort, move on to the next
            # pattern. Unlike a bare ``except``, this lets Ctrl+C through.
            continue
        time.sleep(1)
        return True
    return False


def click_load_more(max_clicks):
    """Repeatedly click the ``button.ipc-see-more__button`` element.

    Uses the module-level ``driver``. Before each attempt the page is
    scrolled to the bottom; the loop ends early as soon as the button cannot
    be found/clicked within 3 seconds.

    Args:
        max_clicks: Upper bound on the number of clicks.

    Returns:
        The number of clicks actually performed.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.ipc-see-more__button")
                )
            )
            # JavaScript click, as in the original implementation.
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # No clickable button (any more): stop. A bare ``except`` here
            # would also have swallowed KeyboardInterrupt.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks


def parse_reviews_from_html(html, movie_id):
    """Extract ``(review_text, rating, movie_id)`` tuples from a reviews page.

    Two page layouts are handled:

    * Old layout: ``div.review-container`` cards with dedicated text and
      rating elements. If any such card is present, only this layout is
      parsed.
    * New layout: ``<article>`` elements whose text contains a rating
      written like ``8/10``.

    Reviews with fewer than ``MIN_WORDS`` whitespace-separated words, and
    cards without a usable rating, are skipped.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied unchanged into every returned tuple.

    Returns:
        List of ``(review_text, rating, movie_id)`` tuples; ``rating`` is an
        int.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []

    # ---- Old layout ----
    cards = soup.select("div.review-container")
    if cards:
        for c in cards:
            text_tag = c.select_one("div.text.show-more__control")
            rating_tag = c.select_one("span.rating-other-user-rating span")
            if not text_tag or not rating_tag:
                continue

            # Join text nodes with a space (as the new-layout branch does) so
            # words on either side of <br>/inline tags are not glued
            # together, which would also distort the MIN_WORDS count.
            text = text_tag.get_text(" ", strip=True)
            if len(text.split()) < MIN_WORDS:
                continue

            try:
                rating = int(rating_tag.get_text(strip=True))
            except ValueError:
                # Rating text is not a plain integer; skip this card.
                continue

            rows.append((text, rating, movie_id))
        return rows

    # ---- New layout ----
    # Only 1-10 is accepted, so text like "0/10" or "20/10" is not mistaken
    # for the score.
    rating_re = re.compile(r"\b(10|[1-9])/10\b")
    for a in soup.select("article"):
        block_text = a.get_text(" ", strip=True)
        m = rating_re.search(block_text)
        if not m:
            continue
        rating = int(m.group(1))

        # Try common selectors for the review body first.
        text = None
        for sel in [
            '[data-testid="review-content"]',
            "div.ipc-html-content-inner-div",
            "div.text.show-more__control",
        ]:
            t = a.select_one(sel)
            if t:
                text = t.get_text(" ", strip=True)
                break

        if not text:
            # Fallback: take the whole block; it is still filtered by length.
            text = block_text

        if len(text.split()) < MIN_WORDS:
            continue

        rows.append((text, rating, movie_id))

    return rows


# ====== MAIN SCRAPING ======
# Script-level import kept next to its only use: it narrows the page-load
# error handling below (a bare ``except`` also swallowed Ctrl+C).
from selenium.common.exceptions import WebDriverException

# Quota per collected class; labels missing from this mapping (positives)
# are never collected.
needed = {"neutral": NEEDED_NEUTRAL, "negative": NEEDED_NEGATIVE}

# Review texts already accepted. Duplicates are skipped while collecting so
# that class_counts reflects the rows that survive drop_duplicates later.
seen_texts = {row["review_text"] for row in all_rows}

try:
    for mid in movie_ids:

        # Leave the loop once both quotas have been reached.
        if all(class_counts[k] >= needed[k] for k in needed):
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2)
        accept_consent_if_present()

        # Wait for a sign that the page has arrived: either the load-more
        # button class or a "/10" somewhere in the page source.
        try:
            wait.until(lambda d: "ipc-see-more__button" in d.page_source or "/10" in d.page_source)
        except WebDriverException:
            print(mid, "-> Reviews yüklenemedi")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)
        parsed = parse_reviews_from_html(driver.page_source, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            label = label_from_rating(rating)

            # Do not take positives or reviews beyond a class quota.
            if label not in needed or class_counts[label] >= needed[label]:
                continue
            if review_text in seen_texts:
                continue
            seen_texts.add(review_text)

            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })
            class_counts[label] += 1
            added += 1

            if all(class_counts[k] >= needed[k] for k in needed):
                break

        print(
            mid,
            f"LoadMore={clicks}",
            f"Added={added}",
            f"Counts={class_counts}"
        )

finally:
    # Always close the browser, even when scraping fails or is interrupted.
    driver.quit()


# ====== DATAFRAME & CSV ======
# Explicit columns: if nothing was scraped (all_rows == []), the frame still
# has "review_text" and "label" columns, so neither drop_duplicates nor the
# value_counts below raises KeyError.
df = (
    pd.DataFrame(
        all_rows,
        columns=["review_text", "label", "rating", "movie_id"],
    )
    .drop_duplicates(subset=["review_text"])
    .reset_index(drop=True)
)

print("\nFINAL CLASS DISTRIBUTION:")
print(df["label"].value_counts())

df.to_csv(
    "imdb_extra_neutral_negative.csv",
    index=False,
    encoding="utf-8-sig"
)

print("\nSaved: imdb_extra_neutral_negative.csv")


tt0120737 LoadMore=150 Added=51 Counts={'neutral': 35, 'negative': 16}
tt0120586 LoadMore=72 Added=21 Counts={'neutral': 51, 'negative': 21}
tt0167404 LoadMore=100 Added=26 Counts={'neutral': 66, 'negative': 32}
tt0209144 LoadMore=107 Added=44 Counts={'neutral': 93, 'negative': 49}
tt0372784 LoadMore=134 Added=61 Counts={'neutral': 131, 'negative': 72}
tt0118799 LoadMore=65 Added=10 Counts={'neutral': 137, 'negative': 76}
tt0108052 LoadMore=96 Added=15 Counts={'neutral': 145, 'negative': 83}
tt0407887 LoadMore=109 Added=32 Counts={'neutral': 164, 'negative': 96}
tt0993846 LoadMore=80 Added=30 Counts={'neutral': 182, 'negative': 108}
tt1130884 LoadMore=73 Added=21 Counts={'neutral': 195, 'negative': 116}
tt0264464 LoadMore=46 Added=19 Counts={'neutral': 207, 'negative': 123}
tt1675434 LoadMore=37 Added=7 Counts={'neutral': 211, 'negative': 126}
tt4154756 LoadMore=150 Added=50 Counts={'neutral': 234, 'negative': 153}
tt4633694 LoadMore=94 Added=23 Counts={'neutral': 251, 'negative': 159}

In [15]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ====== PROJECT SETTINGS ======
# Quotas for this run: no neutral reviews and a single negative review.
NEEDED_NEUTRAL = 0
NEEDED_NEGATIVE = 1

LOAD_MORE_PER_MOVIE = 150  # max clicks passed to click_load_more per movie
CLICK_SLEEP = 1.2          # seconds slept after each load-more click
MIN_WORDS = 12             # reviews with fewer words are discarded by the parser

# IMDb title ids whose review pages are visited, in this order.
movie_ids = [
    "tt1571249",   # RoboCop (2014)
    "tt1045658",   # The Love Guru
    "tt0298203",   # Catwoman
    "tt1600195",   # Grown Ups 2
]


def label_from_rating(r):
    """Map a rating to a class: 5-6 "neutral", <= 4 "negative", otherwise "positive"."""
    in_neutral_band = 5 <= r <= 6
    if in_neutral_band:
        return "neutral"
    return "negative" if r <= 4 else "positive"


# ====== SELENIUM SETTINGS ======
opts = Options()
# opts.add_argument("--headless=new")
opts.add_argument("--lang=en-US")
opts.add_argument("--window-size=1400,900")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")

# Image loading is switched off to make pages load faster.
prefs = {"profile.managed_default_content_settings.images": 2}
opts.add_experimental_option("prefs", prefs)

# One Chrome session shared by the helper functions and the scraping loop.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=opts
)

# Shared explicit wait with a 25-second timeout.
wait = WebDriverWait(driver, 25)


# ====== GLOBAL ======
# One dict per accepted review.
all_rows = []
# Only the two classes collected by this run are tracked.
class_counts = {
    "neutral": 0,
    "negative": 0
}


# ====== HELPER FUNCTIONS ======
def accept_consent_if_present():
    """Click a consent/cookie button on the current page if one shows up.

    Uses the module-level ``driver``. Each caption pattern is tried in turn
    with its own 3-second wait; the first button that becomes clickable is
    clicked.

    Returns:
        True if a button was clicked, False if no pattern produced a
        clickable button.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    xpaths = [
        "//button[contains(., 'Accept')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'I agree')]",
        "//button[contains(., 'Agree')]",
    ]
    for xp in xpaths:
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, xp))
            )
            btn.click()
        except WebDriverException:
            # Timeout or failed click: best effort, move on to the next
            # pattern. Unlike a bare ``except``, this lets Ctrl+C through.
            continue
        time.sleep(1)
        return True
    return False


def click_load_more(max_clicks):
    """Repeatedly click the ``button.ipc-see-more__button`` element.

    Uses the module-level ``driver``. Before each attempt the page is
    scrolled to the bottom; the loop ends early as soon as the button cannot
    be found/clicked within 3 seconds.

    Args:
        max_clicks: Upper bound on the number of clicks.

    Returns:
        The number of clicks actually performed.
    """
    # Imported locally so this helper brings its own dependency with it.
    from selenium.common.exceptions import WebDriverException

    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.6)
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.ipc-see-more__button")
                )
            )
            # JavaScript click, as in the original implementation.
            driver.execute_script("arguments[0].click();", btn)
        except WebDriverException:
            # No clickable button (any more): stop. A bare ``except`` here
            # would also have swallowed KeyboardInterrupt.
            break
        clicks += 1
        time.sleep(CLICK_SLEEP)
    return clicks


def parse_reviews_from_html(html, movie_id):
    """Extract review tuples from the HTML of a reviews page.

    Two page layouts are handled. If any ``div.review-container`` cards are
    found (old layout) only those are parsed; otherwise every ``<article>``
    element is inspected (new layout). A review is skipped when its rating
    is missing or unusable, or when its text has fewer than ``MIN_WORDS``
    whitespace-separated words.

    Args:
        html: Page source to parse.
        movie_id: Identifier copied into every returned tuple.

    Returns:
        A list of ``(text, rating, movie_id)`` tuples, with ``rating`` an int.
    """
    soup = BeautifulSoup(html, "lxml")
    rows = []

    # ---- Old layout ----
    cards = soup.select("div.review-container")
    if cards:
        for c in cards:
            text_tag = c.select_one("div.text.show-more__control")
            rating_tag = c.select_one("span.rating-other-user-rating span")
            if not text_tag or not rating_tag:
                continue

            text = text_tag.get_text(strip=True)
            if len(text.split()) < MIN_WORDS:
                continue

            try:
                rating = int(rating_tag.get_text(strip=True))
            except ValueError:
                # Rating text is not a plain integer; skip this card.
                continue

            rows.append((text, rating, movie_id))
        return rows

    # ---- New layout ----
    rating_pattern = re.compile(r"\b(\d{1,2})/10\b")
    # Candidate containers for the review body, tried in order.
    text_selectors = (
        '[data-testid="review-content"]',
        "div.ipc-html-content-inner-div",
        "div.text.show-more__control",
    )
    for a in soup.select("article"):
        block_text = a.get_text(" ", strip=True)
        m = rating_pattern.search(block_text)
        if not m:
            continue
        rating = int(m.group(1))
        # The pattern is matched against the whole article text, so it can
        # pick up strings such as "0/10" or "11/10" written by the reviewer.
        # Only values on the 1-10 scale are accepted as a rating.
        if not 1 <= rating <= 10:
            continue

        text = None
        for sel in text_selectors:
            t = a.select_one(sel)
            if t:
                text = t.get_text(" ", strip=True)
                break

        # No dedicated body element found: fall back to the article's text.
        if not text:
            text = block_text

        if len(text.split()) < MIN_WORDS:
            continue

        rows.append((text, rating, movie_id))

    return rows


# ====== MAIN SCRAPING ======
# Per-label collection targets. Labels missing from this mapping
# (i.e. "positive") are never collected.
_label_targets = {"neutral": NEEDED_NEUTRAL, "negative": NEEDED_NEGATIVE}

# Review texts already accepted. Skipping repeats while collecting keeps
# class_counts equal to the number of unique rows, so the targets are not
# declared met on the strength of duplicates that are dropped later.
_seen_texts = {row["review_text"] for row in all_rows}


def _targets_reached():
    """Return True once every collected label has hit its target count."""
    return all(
        class_counts[lbl] >= needed for lbl, needed in _label_targets.items()
    )


try:
    for mid in movie_ids:

        # Stop as soon as the targets are met.
        if _targets_reached():
            break

        url = f"https://www.imdb.com/title/{mid}/reviews"
        driver.get(url)
        time.sleep(2)
        accept_consent_if_present()

        try:
            # Ready when the "see more" button or a "/10" rating is present.
            wait.until(lambda d: "ipc-see-more__button" in d.page_source or "/10" in d.page_source)
        except Exception:
            # Page never became ready within the wait timeout; try the next
            # title. ``Exception`` keeps Ctrl+C able to abort the run.
            print(mid, "-> Reviews yüklenemedi")
            continue

        clicks = click_load_more(LOAD_MORE_PER_MOVIE)
        parsed = parse_reviews_from_html(driver.page_source, mid)

        added = 0
        for review_text, rating, movie_id in parsed:
            label = label_from_rating(rating)

            # Skip positives and any class that has already hit its target.
            needed = _label_targets.get(label)
            if needed is None or class_counts[label] >= needed:
                continue

            if review_text in _seen_texts:
                continue
            _seen_texts.add(review_text)

            all_rows.append({
                "review_text": review_text,
                "label": label,
                "rating": rating,
                "movie_id": movie_id
            })
            class_counts[label] += 1
            added += 1

            if _targets_reached():
                break

        print(
            mid,
            f"LoadMore={clicks}",
            f"Added={added}",
            f"Counts={class_counts}"
        )

finally:
    # Always shut the browser down, even if scraping failed midway.
    driver.quit()


# ====== DATAFRAME & CSV ======
# The columns are named explicitly so the frame is well-formed even when
# nothing was scraped: an empty all_rows would otherwise produce a frame
# with no columns, and drop_duplicates(subset=["review_text"]) would raise
# KeyError.
df = (
    pd.DataFrame(all_rows, columns=["review_text", "label", "rating", "movie_id"])
    .drop_duplicates(subset=["review_text"])
    .reset_index(drop=True)
)

print("\nFINAL CLASS DISTRIBUTION:")
print(df["label"].value_counts())

df.to_csv(
    "imdb_extra_neutral_negative.csv",
    index=False,
    encoding="utf-8-sig"
)

print("\nSaved: imdb_extra_neutral_negative.csv")

tt1571249 LoadMore=4 Added=1 Counts={'neutral': 0, 'negative': 1}

FINAL CLASS DISTRIBUTION:
label
negative    1
Name: count, dtype: int64

Saved: imdb_extra_neutral_negative.csv


In [16]:
import pandas as pd

# Fold the newly scraped rows into the main dataset and overwrite the main
# CSV in place. df1 is the existing dataset, df2 the extra rows.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("balanced_imdb_comments.csv", "imdb_extra_neutral_negative.csv")
)

# Stack both frames, then drop rows whose review text repeats.
merged = pd.concat([df1, df2], ignore_index=True)
merged = merged.drop_duplicates(subset=["review_text"])

merged.to_csv("balanced_imdb_comments.csv", index=False, encoding="utf-8-sig")

In [17]:
import pandas as pd

# Shuffle the merged dataset and write it back in place.
df = pd.read_csv("balanced_imdb_comments.csv")

# frac=1 samples every row, i.e. a full shuffle; the fixed seed makes the
# resulting order reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Same encoding as the merge step that last wrote this file, so the CSV's
# encoding does not depend on which cell ran last.
df.to_csv("balanced_imdb_comments.csv", index=False, encoding="utf-8-sig")