In [8]:
import time 
import pandas as pd 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [9]:
'''prepare chrome'''

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# driver


# webdriver.Chrome → launches a Chrome browser controlled by Selenium.
# Service(ChromeDriverManager().install()) → automatically downloads and manages the correct version of ChromeDriver (via the webdriver_manager package), instead of you having to manually download it.
# options=options → lets you pass customized Chrome settings, like headless mode, disabling popups, or setting user agents.

In [None]:
'''variables initiation'''

YEARS = [2020, 2021, 2022, 2023]
all_movies = []
all_movies_details = []

In [35]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver


for YEAR in YEARS:
    url = f"https://www.boxofficemojo.com/year/{YEAR}/"
    print(f"Scraping{url}...")
    driver.get(url)
    time.sleep(5)

    #By --> It tells Selenium how to find elements on a web page (e.g., by id, name, xpath, etc.).
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    time.sleep(10) 
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) < 7:
            continue



        rank = cols[0].text.strip()
        name = cols[1].text.strip()
        link = cols[1].find_element(By.TAG_NAME, "a").get_attribute("href")
        gross = cols[3].text.strip()
        theaters = cols[4].text.strip()
        total_gross = cols[5].text.strip()
        release_date = cols[6].text.strip()

        all_movies.append({
            "Year":YEAR,
            "rank":rank,
            "Movie Name":name,
            "MovieURL":link,
            "Gross":gross,
            "Theaters": theaters,
            "Total Gross": total_gross, 
            "Release Date": release_date
        })


driver.quit()

df = pd.DataFrame(all_movies)
df



Scrapinghttps://www.boxofficemojo.com/year/2023/...


Unnamed: 0,Year,rank,Movie Name,MovieURL,Gross,Theaters,Total Gross,Release Date
0,2023,1,Barbie,https://www.boxofficemojo.com/release/rl107790...,,,"$636,225,983",4337
1,2023,2,The Super Mario Bros. Movie,https://www.boxofficemojo.com/release/rl193059...,,,"$574,934,330",4371
2,2023,3,Spider-Man: Across the Spider-Verse,https://www.boxofficemojo.com/release/rl281218...,,,"$381,311,319",4332
3,2023,4,Guardians of the Galaxy Vol. 3,https://www.boxofficemojo.com/release/rl297720...,,,"$358,995,815",4450
4,2023,5,Oppenheimer,https://www.boxofficemojo.com/release/rl372588...,,,"$326,101,370",3761
...,...,...,...,...,...,...,...,...
195,2023,196,Emily,https://www.boxofficemojo.com/release/rl266669...,,,"$1,097,067",579
196,2023,197,Knights of the Zodiac,https://www.boxofficemojo.com/release/rl420318...,,,"$1,090,155",588
197,2023,198,Broker,https://www.boxofficemojo.com/release/rl586646...,,,"$1,001,892",271
198,2023,199,Showing Up,https://www.boxofficemojo.com/release/rl956203...,,,"$972,074",115


In [12]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver




def scrape_movie_details(url):
    details = {}

    try:
        driver.get(url)

        #summary block
        try:
            summary_items = driver.find_elements(By.CSS_SELECTOR, "div.mojo-summary-values div")
            for item in summary_items:
                text = item.text.strip()
                for lab in [
                "Distributor","Release Date","MPAA","Running Time","Genres",
                "Budget","Opening","Widest Release","Production Budget",
                "Production Company","Estimated"
                ]:
                    if text.startswith(lab):
                        details[lab] = text.replace(lab, "").strip()
        except:
            pass
        

        #Gross block 
        try:
            gross_blocks = driver.find_elements(By.CSS_SELECTOR, "div.mojo-performance-summary div.a-section")
            for block in gross_blocks:
                try:
                    label = block.find_element(By.CSS_SELECTOR, "span.a-size-small").text.strip()
                    value = block.find_element(By.CSS_SELECTOR, "span.a-size-medium").text.strip()
                    clean_label = "GROSS_" + label.split("(")[0].strip()
                    details[clean_label] = value
                except:
                    continue
        except:
            pass


        #Release Summary table (extra info)
        # try:
        #     rows = driver.find_elements(By.CSS_SELECTOR, "div#release-summary table tbody tr")
        # except:
        #     pass



    except Exception as e:
        print(f"Error while scraping {url} --> {e}")

    return details



for YEAR in YEARS:
    
    movie_links = []
    url = f"https://www.boxofficemojo.com/year/{YEAR}/"
    print(f"Scraping{url}...")
    driver.get(url)


    #By --> It tells Selenium how to find elements on a web page (e.g., by id, name, xpath, etc.).
    # rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

    wait = WebDriverWait(driver, 10)
    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr")))
    # time.sleep(10) 
    for row in rows:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) < 7:
            continue

        name = cols[1].text.strip()
        link = cols[1].find_element(By.TAG_NAME, "a").get_attribute("href")
        movie_links.append((YEAR, name, link))

    for year, name, link in movie_links:
        movie = {
            "Year":year,
            "Movie Name":name,
            "MovieURL":link
        }

        if link:
            details = scrape_movie_details(link)
            movie.update(details)
            all_movies_details.append(movie)


driver.quit()
# print(f"final movies details list : {all_movies_details}")

df2 = pd.DataFrame(all_movies_details)
df2



Scrapinghttps://www.boxofficemojo.com/year/2023/...


Unnamed: 0,Year,Movie Name,MovieURL,Distributor,Opening,Release Date,MPAA,Running Time,Genres,Widest Release,GROSS_DOMESTIC,GROSS_INTERNATIONAL,GROSS_WORLDWIDE,Budget
0,2023,Barbie,https://www.boxofficemojo.com/release/rl107790...,Warner Bros.\nSee full company information,"$162,022,044\n4,243 theaters","Jul 21, 2023",PG-13,1 hr 54 min,Adventure Comedy Fantasy,"4,337 theaters","$636,238,421","$810,800,000","$1,447,038,421",
1,2023,The Super Mario Bros. Movie,https://www.boxofficemojo.com/release/rl193059...,Universal Pictures\nSee full company information,"$146,361,865\n4,343 theaters","Apr 5, 2023 - Sep 7, 2023",PG,1 hr 32 min,Adventure Animation Comedy Family Fantasy,"4,371 theaters","$574,934,330","$785,913,335","$1,360,847,665",
2,2023,Spider-Man: Across the Spider-Verse,https://www.boxofficemojo.com/release/rl281218...,Columbia Pictures\nSee full company information,"$120,663,589\n4,313 theaters","Jun 2, 2023 - Sep 14, 2023",PG,2 hr 20 min,Action Adventure Animation Family Fantasy Sci-Fi,"4,332 theaters","$381,311,319","$309,230,984","$690,542,303",
3,2023,Guardians of the Galaxy Vol. 3,https://www.boxofficemojo.com/release/rl297720...,Walt Disney Studios Motion Pictures\nSee full ...,"$118,414,021\n4,450 theaters","May 5, 2023",PG-13,2 hr 30 min,Action Adventure Comedy Sci-Fi,"4,450 theaters","$358,995,815","$486,559,962","$845,555,777",
4,2023,Oppenheimer,https://www.boxofficemojo.com/release/rl372588...,Universal Pictures\nSee full company information,"$82,455,420\n3,610 theaters","Jul 21, 2023",R,3 hr,Biography Drama History,"3,761 theaters","$330,078,895","$645,732,438","$975,811,333",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,2023,Emily,https://www.boxofficemojo.com/release/rl266669...,Bleecker Street Media\nSee full company inform...,"$40,287\n5 theaters","Feb 17, 2023",R,2 hr 10 min,Biography Drama History Romance,579 theaters,"$1,097,067","$3,207,521","$4,304,588",
396,2023,Knights of the Zodiac,https://www.boxofficemojo.com/release/rl420318...,,"$557,533\n588 theaters","May 12, 2023",PG-13,1 hr 52 min,Action Adventure Drama Fantasy Sci-Fi,588 theaters,"$1,090,155","$5,896,022","$6,986,177",
397,2023,Broker,https://www.boxofficemojo.com/release/rl586646...,Neon\nSee full company information,"$3,265\n3 theaters","Dec 23, 2022",R,2 hr 9 min,Comedy Crime Drama,271 theaters,"$1,046,899","$17,804,517","$18,851,416",
398,2023,Showing Up,https://www.boxofficemojo.com/release/rl956203...,A24\nSee full company information,"$63,418\n4 theaters","Apr 7, 2023",R,1 hr 47 min,Comedy Drama,115 theaters,"$754,483","$516,066","$1,270,549",
