In [1]:
import re
import time
import json
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# ================= Configuration =================
# Years to scrape, newest first: 2024 down to 2000 (both inclusive).
TARGET_YEARS = list(reversed(range(2000, 2025)))
# Lower bound passed to the search URL's `num_votes` filter.
MIN_VOTES = 10_000
# Lower bound passed to the search URL's `gross` filter, in USD (50 million).
MIN_BOX_OFFICE = 50_000_000

# HTTP header values. The User-Agent literal is split across lines purely for
# readability; adjacent string literals are concatenated into one string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
# =================================================


In [3]:
def get_high_res_poster(url):
    """Collapse the segment between ``V1_`` and ``.jpg`` in a poster URL.

    Args:
        url: Poster image URL, or a falsy value (``None`` / ``""``).

    Returns:
        ``""`` when ``url`` is falsy; otherwise ``url`` with everything between
        the ``V1_`` marker and the next literal ``.jpg`` removed. URLs that do
        not contain that pattern are returned unchanged.
    """
    if not url:
        return ""
    # The dot is escaped so that only a literal ".jpg" terminates the match.
    # Unescaped, "." matches any character and the lazy match could stop early
    # at e.g. "ajpg", leaving part of the suffix behind in the URL.
    # NOTE(review): the removed segment presumably holds size/crop directives.
    return re.sub(r'V1_.*?\.jpg', 'V1_.jpg', url)

def format_date_output(date_obj):
    """Format a date as abbreviated month, unpadded day and year.

    Example: ``datetime(2024, 6, 5)`` -> ``"Jun 5, 2024"``.

    Args:
        date_obj: A ``datetime`` (or ``date``) instance.

    Returns:
        The formatted string.
    """
    # "%-d" (unpadded day) is a platform-specific strftime extension and is
    # rejected with ValueError on Windows. Taking the day from the attribute
    # yields the same text on every platform.
    return f"{date_obj.strftime('%b')} {date_obj.day}, {date_obj.year}"

def get_movie_detail_date(driver, imdb_id):
    """Look up a title's release date on its IMDb page.

    The page ``https://www.imdb.com/title/<imdb_id>/`` is loaded in ``driver``
    and its source parsed. Two sources are tried in order:

    1. The ``datePublished`` field of the first ``application/ld+json`` script,
       expected in ``YYYY-MM-DD`` form.
    2. The text of anchors whose ``href`` contains ``releaseinfo``, expected in
       ``"<Month name> <day>, <year>"`` form once any trailing parenthesised
       part has been cut off.

    Args:
        driver: Selenium WebDriver used for navigation (it is left on the
            title page afterwards).
        imdb_id: Title identifier such as ``"tt1234567"``; falsy values skip
            the lookup.

    Returns:
        The date formatted by ``format_date_output``, or ``None`` when the id
        is falsy, no parseable date is found, or any error occurs.
    """
    if not imdb_id:
        return None
    url = f"https://www.imdb.com/title/{imdb_id}/"
    try:
        driver.get(url)
        time.sleep(1.2)  # fixed pause after navigation before reading the page source

        soup = BeautifulSoup(driver.page_source, "lxml")

        json_ld_script = soup.find('script', type='application/ld+json')
        if json_ld_script:
            try:
                data = json.loads(json_ld_script.string)
                # JSON-LD may legally be a list or scalar; only a mapping can
                # carry the field we are after.
                if isinstance(data, dict) and 'datePublished' in data:
                    dt = datetime.strptime(data['datePublished'], "%Y-%m-%d")
                    return format_date_output(dt)
            except (TypeError, ValueError):
                # TypeError: empty script body or non-string date value.
                # ValueError: malformed JSON (JSONDecodeError) or unexpected
                # date format. Narrow on purpose so KeyboardInterrupt is not
                # swallowed during a long scrape.
                pass

        # Try every matching link rather than only the first one: the first
        # "releaseinfo" anchor on the page is not guaranteed to hold a full date.
        for release_link in soup.find_all("a", {"href": re.compile(r"releaseinfo")}):
            clean_text = re.split(r'\s*\(', release_link.get_text(strip=True))[0].strip()
            try:
                dt = datetime.strptime(clean_text, "%B %d, %Y")
                return format_date_output(dt)
            except ValueError:
                continue
        return None
    except Exception:
        # Best effort: navigation/parsing failures simply mean "date unknown".
        return None

def sort_movies_by_date(movies_list, reverse=False):
    """Order ``movies_list`` in place by each entry's ``'date'`` and return it.

    Dates are read with the ``"%b %d, %Y"`` format. Entries whose date string
    does not match that format are keyed as year 1, January 1st, so they sort
    before every real date (or after, when ``reverse`` is true).

    Args:
        movies_list: List of dicts, each holding a ``'date'`` string.
        reverse: When true, sort from newest to oldest.

    Returns:
        The same list object, now sorted.
    """
    unparseable_key = datetime(1, 1, 1)

    def _release_key(entry):
        try:
            parsed = datetime.strptime(entry['date'], "%b %d, %Y")
        except ValueError:
            parsed = unparseable_key
        return parsed

    movies_list.sort(key=_release_key, reverse=reverse)
    return movies_list

def load_all_results(driver):
    """Expand the current results page until no "see more" button is found.

    Each round scrolls to the bottom of the page, pauses, then waits up to
    4 seconds for an element with class ``ipc-see-more__button`` and clicks it
    through JavaScript. The first round in which that fails for any reason
    (typically the wait timing out) ends the loop.

    Args:
        driver: Selenium WebDriver already showing the results page.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    js_click = "arguments[0].click();"
    button_locator = (By.CLASS_NAME, "ipc-see-more__button")

    print("   ‚îî‚îÄ‚îÄ Loading full list (clicking '50 more')...")
    keep_loading = True
    while keep_loading:
        driver.execute_script(scroll_to_bottom)
        time.sleep(2)
        try:
            button_wait = WebDriverWait(driver, 4)
            more_button = button_wait.until(
                EC.presence_of_element_located(button_locator)
            )
            driver.execute_script(js_click, more_button)
            print("      [+] Clicked '50 more' button...")
            time.sleep(3)
        except Exception:
            # Any failure here is treated as "nothing left to load".
            print("      [‚úì] List fully loaded (No more buttons).")
            keep_loading = False

def scrape_year(driver, year):
    """Scrape one year's IMDb title search and collect its movies.

    The search URL is filtered to feature titles released within ``year`` and
    bounded below by ``MIN_VOTES`` and ``MIN_BOX_OFFICE``. After the result
    list is fully expanded, each list item yields a title, a poster URL and an
    IMDb id; the id is then used to look up the release date on the title's
    own page. Items without a title element are skipped.

    Args:
        driver: Selenium WebDriver used for all navigation.
        year: Release year to scan.

    Returns:
        A dict with keys ``year``, ``total_count`` (list items found),
        ``saved_count`` (movies kept), ``favorite`` (title of the first movie
        after sorting, or ``""``) and ``movies`` (list of
        ``{"title", "date", "poster"}`` dicts sorted by date, oldest first).
    """
    print(f"\nüé¨ Scanning Year: {year}...")

    url = (
        f"https://www.imdb.com/search/title/?"
        f"title_type=feature"
        f"&release_date={year}-01-01,{year}-12-31"
        f"&num_votes={MIN_VOTES},"
        f"&gross={MIN_BOX_OFFICE},"
        f"&sort=boxoffice_gross_us,desc"
    )

    driver.get(url)
    time.sleep(3)

    load_all_results(driver)

    # Snapshot the HTML now: the per-movie lookups below navigate the same
    # driver away from this results page.
    print("   ‚è≥ Extracting HTML source code...")
    html_source = driver.page_source

    print("   ‚è≥ Parsing HTML with lxml...")
    soup = BeautifulSoup(html_source, "lxml")

    movie_items = soup.select("li.ipc-metadata-list-summary-item")
    total_found = len(movie_items)

    print(f"   üìä Statistics for {year}:")
    print(f"      - Total Qualified Movies Found: {total_found}")

    movies_list = []

    for position, item in enumerate(movie_items, start=1):
        title_tag = item.select_one("h3.ipc-title__text")
        if not title_tag:
            continue
        # Remove only a leading "<rank>. " prefix. Splitting on the first ". "
        # would also cut into titles such as "Mr. Nobody" when no rank is shown.
        clean_title = re.sub(r'^\d+\.\s+', '', title_tag.get_text(strip=True))

        img_tag = item.select_one("img.ipc-image")
        poster_url = get_high_res_poster(img_tag.get("src")) if img_tag else ""

        # A missing link, a missing href or an href without a title id all
        # yield "" instead of raising on a None match.
        link_tag = item.select_one("a.ipc-title-link-wrapper")
        href = link_tag.get("href") if link_tag else None
        id_match = re.search(r'/title/(tt\d+)/', href) if href else None
        imdb_id = id_match.group(1) if id_match else ""

        print(f"      [{position}/{total_found}] Processing: {clean_title} ...")

        specific_date = get_movie_detail_date(driver, imdb_id)
        # Without a detail-page date, fall back to January 1st of the scanned year.
        final_date = specific_date if specific_date else f"Jan 1, {year}"

        movies_list.append({
            "title": clean_title,
            "date": final_date,
            "poster": poster_url
        })

    sorted_movies = sort_movies_by_date(movies_list, reverse=False)

    saved_count = len(sorted_movies)
    print(f"   ‚úÖ Year {year} finished. Saved {saved_count} movies to JSON.")

    return {
        "year": year,
        "total_count": total_found,
        "saved_count": saved_count,
        # NOTE(review): after the ascending date sort this is the earliest
        # dated movie, not the top-grossing one -- confirm that is intended.
        "favorite": sorted_movies[0]['title'] if sorted_movies else "",
        "movies": sorted_movies
    }

def main():
    """Scrape every year in ``TARGET_YEARS`` and dump the results to JSON.

    A Chrome driver is started, each year is scraped in order, and the
    non-empty results are written as one list to ``movies_tmp.json``. The
    browser is closed whether or not scraping succeeds.
    """
    chrome_options = Options()
    for flag in ("--disable-gpu", "--no-sandbox", "--start-maximized", "--log-level=3"):
        chrome_options.add_argument(flag)

    print("üöÄ Starting Browser...")
    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        # Years are scraped lazily, one at a time, in TARGET_YEARS order;
        # falsy results are dropped.
        per_year_results = (scrape_year(driver, year) for year in TARGET_YEARS)
        all_years_data = [year_data for year_data in per_year_results if year_data]

        output_file = "movies_tmp.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_years_data, f, indent=2, ensure_ascii=False)

        print(f"\nüéâ All Done! Saved to '{output_file}'.")

    finally:
        # Always release the browser, even when a scrape step raised.
        driver.quit()

# Start the scrape only when this code runs as the top-level module, not when
# it is imported from elsewhere.
if __name__ == "__main__":
    main()


üöÄ Starting Browser...

üé¨ Scanning Year: 2024...
   ‚îî‚îÄ‚îÄ Loading full list (clicking '50 more')...
      [+] Clicked '50 more' button...
      [+] Clicked '50 more' button...
      [+] Clicked '50 more' button...
      [+] Clicked '50 more' button...
      [+] Clicked '50 more' button...
      [+] Clicked '50 more' button...
      [‚úì] List fully loaded (No more buttons).
   ‚è≥ Extracting HTML source code...
   ‚è≥ Parsing HTML with lxml...
   üìä Statistics for 2024:
      - Total Qualified Movies Found: 303
      [1/303] Processing: Inside Out 2 ...
      [2/303] Processing: Deadpool & Wolverine ...
      [3/303] Processing: Wicked ...
      [4/303] Processing: Moana 2 ...
      [5/303] Processing: Despicable Me 4 ...
      [6/303] Processing: Beetlejuice Beetlejuice ...
      [7/303] Processing: Dune: Part Two ...
      [8/303] Processing: Twisters ...
      [9/303] Processing: Mufasa: The Lion King ...
      [10/303] Processing: Sonic the Hedgehog 3 ...
      [11/303] 