In [None]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# IMDb URL
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31"

# XPath locators used on the search-results page.
# NOTE(review): the "sc-995e3276-1 jziSZL" class names look build-generated;
# presumably they change when the site is redeployed -- confirm before reuse.
MOVIE_CARD_XPATH = '//div[@class="sc-995e3276-1 jziSZL dli-parent"]'
TITLE_XPATH = './/h3[@class="ipc-title__text"]'
SUMMARY_XPATH = './/div[@class="ipc-html-content-inner-div"]'
LOAD_MORE_XPATH = "//span[@class='ipc-btn__text']//span[text()='50 more']/ancestor::button"

# Matches a leading list rank such as "12. " at the very start of a heading.
RANK_PREFIX = re.compile(r"^\d+\.\s+")

# Storage
all_movies = set()   # titles already recorded, used for de-duplication
movie_data = []      # one {"Title": ..., "Summary": ...} dict per movie

max_clicks = 40


def strip_rank_prefix(full_title):
    """Return *full_title* without a leading rank like ``"12. "``.

    Only a numeric prefix anchored at the start is removed, so a title that
    merely contains ``". "`` (e.g. ``"Mr. Nobody"``) is returned unchanged.
    """
    return RANK_PREFIX.sub("", full_title, count=1)


def element_text(parent, xpath):
    """Return the text of the first descendant of *parent* matching *xpath*.

    Returns ``""`` when the element is missing or has gone stale, so a card
    with an absent field still yields a record instead of aborting the run.
    """
    try:
        return parent.find_element(By.XPATH, xpath).text
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def add_movie(title, summary, seen_titles, records):
    """Record a movie unless *title* is empty or already in *seen_titles*.

    Appends ``{"Title": title, "Summary": summary}`` to *records* and adds
    *title* to *seen_titles*. Returns ``True`` when a record was added.
    Note that de-duplication is by title only, so two different movies that
    share a title are stored once.
    """
    if not title or title in seen_titles:
        return False
    seen_titles.add(title)
    records.append({"Title": title, "Summary": summary})
    return True


def scrape_listing(driver, max_clicks, seen_titles, records):
    """Scrape up to *max_clicks* batches of movie cards from the loaded page.

    Each pass waits for new cards to appear, extracts title and summary from
    the cards that were not processed yet, then clicks the "50 more" button.
    The loop stops early when no new cards show up within 10 seconds or the
    button can no longer be found or clicked. Results are accumulated into
    *seen_titles* and *records* in place.
    """
    processed = 0  # number of cards already extracted from the DOM

    for i in range(max_clicks):
        print(f"Scraping page {i+1}...")

        # Wait until the page holds more cards than we have already handled;
        # this replaces a fixed sleep that could miss a slow load.
        try:
            WebDriverWait(driver, 10).until(
                lambda d, done=processed: len(
                    d.find_elements(By.XPATH, MOVIE_CARD_XPATH)
                ) > done
            )
        except TimeoutException:
            print("Failed to load movie list.")
            break

        movies = driver.find_elements(By.XPATH, MOVIE_CARD_XPATH)

        # Only visit the newly added cards. This assumes "50 more" appends
        # cards after the existing ones -- TODO confirm if the page changes.
        for movie in movies[processed:]:
            title = strip_rank_prefix(element_text(movie, TITLE_XPATH))
            summary = element_text(movie, SUMMARY_XPATH)
            add_movie(title, summary, seen_titles, records)
        processed = len(movies)

        # A click on the final pass would load cards that are never read.
        if i == max_clicks - 1:
            break

        # Try clicking "Load more" button
        try:
            load_more_btn = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, LOAD_MORE_XPATH))
            )
            # Click through JavaScript rather than a native WebDriver click.
            driver.execute_script("arguments[0].click();", load_more_btn)
        except WebDriverException:
            print("No more 'Load more' button. Scraping complete.")
            break


# The guard is true for script and notebook execution, so the run below still
# happens there, while the helpers above can be imported without side effects.
if __name__ == "__main__":
    # Headless Chrome setup
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Always shut the browser down, even if scraping raises or is interrupted.
    try:
        driver.get(url)
        scrape_listing(driver, max_clicks, all_movies, movie_data)
    finally:
        driver.quit()

    # Save to CSV
    # Explicit columns keep the header row even when nothing was scraped.
    df = pd.DataFrame(movie_data, columns=["Title", "Summary"])
    df.to_csv("imdb_2024_movies_full.csv", index=False)
    print(f"Saved {len(df)} unique movies.")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Saved 1942 unique movies.
