In [51]:
# Packages
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import requests
from bs4 import BeautifulSoup

import json
import time
from datetime import datetime

In [107]:
# Collect game-page URLs from the Metacritic browse listing.
#
# Opens a Chrome session, reads the pagination widget to learn how many
# listing pages exist, walks the first MAX_PAGES of them and gathers every
# anchor whose href contains "/game/" into `url_list`. The browser is always
# closed, even if a navigation or wait fails part-way through.

# Chrome options
CHROME_FLAGS = (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-extensions",
    "--blink-settings=imagesEnabled=false",
    "--disable-gpu",
    "--lang=en-US",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-renderer-backgrounding",
)

# Upper bound on listing pages crawled in this run; raise it (or set it to a
# very large number) to crawl everything the pagination widget reports.
MAX_PAGES = 5

options = uc.ChromeOptions()
options.page_load_strategy = "eager"
for flag in CHROME_FLAGS:
    options.add_argument(flag)

# CHANGED: lock in the params you’re actually using
base_url = "https://www.metacritic.com/browse/game/"
params = "releaseYearMin=1958&releaseYearMax=2025"

# Defined before the browser work so it exists even if the crawl aborts.
url_list = []

# Driver
driver = uc.Chrome(options=options)

try:
    driver.set_page_load_timeout(30)
    wait = WebDriverWait(driver, 20)

    print("WebDriver initialized.")

    driver.get(f"{base_url}?{params}&page=1")
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-productListings_grid")))

    # How many pages to scrape?
    # The second-to-last pagination item is read as the last page number
    # (presumably the final item is a "next" control -- TODO confirm).
    pages = driver.find_elements(By.CLASS_NAME, "c-navigationPagination_item")
    try:
        total_pages = int(pages[-2].text)
    except (IndexError, ValueError):
        # Pagination widget missing or not numeric: treat as a single page
        # instead of crashing with the browser still open.
        total_pages = 1
    print(f"Total pages to scrape: {total_pages}")

    # Never request pages beyond what the site says exists.
    last_page = min(MAX_PAGES, total_pages)

    for i in range(1, last_page + 1):

        # Small pause between page loads.
        time.sleep(1)

        driver.get(f"{base_url}?{params}&page={i}")

        # Wait for listings
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-productListings")))

        block = driver.find_element(By.CLASS_NAME, "c-productListings")
        items = block.find_elements(By.TAG_NAME, "a")

        for a in items:
            link = a.get_attribute("href")
            # Keep only anchors that point at a game page.
            if link and "/game/" in link:
                url_list.append(link)

        print(f"Page {i}/{total_pages} | Total links scraped: {len(url_list)}")

    print(f'Scraping urls is over, total games found: {len(url_list)}')

    # Smoke test: make sure the first few collected URLs actually load.
    for url in url_list[:5]:
        driver.get(url)
        print(f'Scraping: {url}')
finally:
    # Always release the Chrome process, including on timeouts/exceptions.
    driver.quit()

print("Webdriver closed. Finished full scraping process succesfully!")

WebDriver initialized.
Total pages to scrape: 584
Page 1/584 | Total links scraped: 24
Page 2/584 | Total links scraped: 48
Page 3/584 | Total links scraped: 72
Page 4/584 | Total links scraped: 96
Page 5/584 | Total links scraped: 120
Scraping urls is over, total games found: 120
Scraping: https://www.metacritic.com/game/the-legend-of-zelda-ocarina-of-time/
Scraping: https://www.metacritic.com/game/soulcalibur/
Scraping: https://www.metacritic.com/game/grand-theft-auto-iv/
Scraping: https://www.metacritic.com/game/super-mario-galaxy/
Scraping: https://www.metacritic.com/game/super-mario-galaxy-2/
Webdriver closed. Finished full scraping process succesfully!


In [111]:
# Sanity check: display how many game URLs are currently held in `url_list`.
len(url_list)

120

In [None]:
# Time the per-game scrape on the first 15 URLs of `url_list` and extrapolate
# to the full crawl.
#
# For each game page the cell reads title, platforms, release date,
# developer, publisher, genre and both scores through Selenium, follows the
# "view more" stats link, then downloads the statistics page with `requests`
# and converts the five percentage tables (rating, difficulty, playtime,
# ownership, completion) into absolute vote counts.
#
# Requires `url_list` from the URL-collection cell.


# --- Pure parsing helpers (operate on BeautifulSoup nodes only) -------------

def _parse_vote_total(block):
    """Return the integer vote total shown in a stats block.

    Reads the first whitespace-separated token of the block's ``.larger``
    element. Commas are stripped first so a total formatted like ``1,234``
    (if the site ever renders one) does not break ``int()``.
    """
    return int(block.find(class_='larger').text.split()[0].replace(',', ''))


def _parse_percent_rows(table):
    """Yield ``(label, percent)`` for every non-empty ``<tr>`` in ``table``.

    The last whitespace-separated token of the row text is parsed as the
    percentage (a ``%`` sign is removed); the preceding tokens, joined by a
    single space, form the label.
    """
    for row in table.find_all('tr'):
        tokens = row.get_text(" ", strip=True).split()
        if not tokens:
            continue  # nothing to parse in an empty row
        yield " ".join(tokens[:-1]), float(tokens[-1].replace('%', ''))


def _stars_from_label(label):
    """Convert a rating-row label such as ``½``, ``3½`` or ``4`` to a float."""
    if label.startswith('½'):
        return 0.5
    if '½' in label:
        return float(label.split('½', 1)[0].strip()) + 0.5
    return float(label.split()[0])


def _normalize_playtime_label(label):
    """Insert the space the playtime keys use after ``<`` and ``>=``."""
    return label.replace("<1", "< 1").replace(">=80", ">= 80")


def _fill_distribution(expected_keys, table, relabel=None):
    """Build ``{key: percent}`` from ``table``, defaulting every expected key to 0.0.

    ``relabel`` optionally maps a raw row label to the dictionary key.
    Keys found in the table but absent from ``expected_keys`` are added as-is.
    """
    distribution = dict.fromkeys(expected_keys, 0.0)
    for label, percent in _parse_percent_rows(table):
        distribution[relabel(label) if relabel else label] = percent
    return distribution


def _to_counts(distribution, total_votes):
    """Turn a percentage distribution into rounded absolute vote counts."""
    return {key: round(pct * 0.01 * total_votes) for key, pct in distribution.items()}


# --- Configuration -----------------------------------------------------------

CHROME_FLAGS = (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-extensions",
    "--blink-settings=imagesEnabled=false",
    "--disable-gpu",
    "--lang=en-US",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-renderer-backgrounding",
)

# Number of URLs the time estimate is extrapolated to
# (presumably the size of the full game list -- TODO confirm).
TOTAL_GAME_URLS = 14011

# The statistics page is expected to expose this many pie blocks / tables.
EXPECTED_STAT_SECTIONS = 5

options = uc.ChromeOptions()
options.page_load_strategy = "eager"
for flag in CHROME_FLAGS:
    options.add_argument(flag)

# Driver
driver = uc.Chrome(options=options)

# requests session ONCE (faster)
session = requests.Session()

times = []

try:
    driver.set_page_load_timeout(30)
    wait = WebDriverWait(driver, 20)

    # The browser's user agent does not change between pages, so the session
    # headers are set once instead of on every iteration.
    session.headers.update({
        "User-Agent": driver.execute_script("return navigator.userAgent;"),
        "Accept-Language": "en-US,en;q=0.9",
    })

    print('Scraping 15 pages to calculate total scraping time')
    for url in url_list[:15]:
        start = time.perf_counter()
        print(f'Scraping link: {url}')

        driver.get(url)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "c-productHero_score-container")))

        score_container = driver.find_element(By.CLASS_NAME, 'c-productHero_score-container')
        details_container = driver.find_element(By.CLASS_NAME, 'c-pageProductionDetails')

        game_title = score_container.find_element(By.CLASS_NAME, 'c-productHero_title').text

        platforms_block = details_container.find_element(By.CLASS_NAME, 'c-gameDetails_Platforms')
        platforms = platforms_block.find_elements(By.CLASS_NAME, 'c-gameDetails_listItem')
        platform_list = [platform.text for platform in platforms]

        # NOTE(review): strptime raises ValueError if the page shows anything
        # other than a "Mon DD, YYYY" date here -- confirm all pages comply.
        release_date_str = score_container.find_element(By.CLASS_NAME, 'u-text-uppercase').text
        release_date = datetime.strptime(release_date_str, "%b %d, %Y").date()

        developer_block = details_container.find_element(By.CLASS_NAME, 'c-gameDetails_Developer')
        developer = developer_block.find_element(By.CLASS_NAME, 'c-gameDetails_listItem').text

        # Publisher is whatever follows the last ':' in the block text.
        publisher_block = details_container.find_element(By.CLASS_NAME, 'c-gameDetails_Distributor')
        publisher = publisher_block.text.split(':')[-1].strip()

        genre = details_container.find_element(By.CLASS_NAME, 'c-genreList').text

        metascore = score_container.find_element(By.CLASS_NAME, 'c-productScoreInfo_scoreNumber').text

        user_score = score_container.find_element(By.CLASS_NAME, 'c-siteReviewScore_background-user').text

        # Open the stats modal via JS click (bypasses overlay/visibility issues).
        gamestat_row = driver.find_element(By.CLASS_NAME, 'gameStats_row')
        view_more_btn = gamestat_row.find_element(By.CLASS_NAME, 'c-globalButton_container')
        driver.execute_script("arguments[0].click();", view_more_btn)

        # Wait for the modal instead of looking it up immediately: the click
        # returns before the modal is necessarily in the DOM.
        button_block = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'c-gameStatsWidgetModal_button'))
        )
        stat_link = button_block.find_element(By.TAG_NAME, 'a').get_attribute('href')

        # statistics page using requests for optimized speed
        resp = session.get(stat_link, timeout=20)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        stat_blocks = soup.find_all(class_='pie')
        body = soup.find('body')
        tables = body.find_all('table') if body is not None else []
        # Both the pie blocks and the tables are indexed 0..4 below, so both
        # must be complete before continuing.
        if len(stat_blocks) < EXPECTED_STAT_SECTIONS or len(tables) < EXPECTED_STAT_SECTIONS:
            print("Stats page missing blocks (requests failed?):", stat_link)
            continue

        rating_block, diff_block, playtime_block, ownership_block, completion_block = (
            stat_blocks[:EXPECTED_STAT_SECTIONS]
        )
        rating_table, diff_table, playtime_table, ownership_table, completion_table = (
            tables[:EXPECTED_STAT_SECTIONS]
        )

        # Rating statistics
        total_user_ratings = _parse_vote_total(rating_block)
        rating_distribution = _fill_distribution(
            (0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0),
            rating_table,
            relabel=_stars_from_label,
        )
        user_counts = _to_counts(rating_distribution, total_user_ratings)

        user_05 = user_counts[0.5]
        user_10 = user_counts[1.0]
        user_15 = user_counts[1.5]
        user_20 = user_counts[2.0]
        user_25 = user_counts[2.5]
        user_30 = user_counts[3.0]
        user_35 = user_counts[3.5]
        user_40 = user_counts[4.0]
        user_45 = user_counts[4.5]
        user_50 = user_counts[5.0]

        # Difficulty statistics
        total_diff_votes = _parse_vote_total(diff_block)
        difficulty_distribution = _fill_distribution(
            ("Simple", "Easy", "Just Right", "Tough", "Unforgiving"),
            diff_table,
        )
        difficulty_counts = _to_counts(difficulty_distribution, total_diff_votes)

        diff_simple = difficulty_counts["Simple"]
        diff_easy = difficulty_counts["Easy"]
        diff_justright = difficulty_counts["Just Right"]
        diff_tough = difficulty_counts["Tough"]
        diff_unforgiving = difficulty_counts["Unforgiving"]

        # Playtime statistics
        total_playtime_votes = _parse_vote_total(playtime_block)
        playtime_distribution = _fill_distribution(
            (
                "< 1 Hour", "~1 Hour", "~2 Hours", "~4 Hours", "~8 Hours",
                "~12 Hours", "~20 Hours", "~40 Hours", "~60 Hours", ">= 80 Hours",
            ),
            playtime_table,
            relabel=_normalize_playtime_label,
        )
        playtime_counts = _to_counts(playtime_distribution, total_playtime_votes)

        time_under1 = playtime_counts["< 1 Hour"]
        time_1 = playtime_counts["~1 Hour"]
        time_2 = playtime_counts["~2 Hours"]
        time_4 = playtime_counts["~4 Hours"]
        time_8 = playtime_counts["~8 Hours"]
        time_12 = playtime_counts["~12 Hours"]
        time_20 = playtime_counts["~20 Hours"]
        time_40 = playtime_counts["~40 Hours"]
        time_60 = playtime_counts["~60 Hours"]
        time_over80 = playtime_counts[">= 80 Hours"]

        # Ownership statistics
        total_ownership_votes = _parse_vote_total(ownership_block)
        ownership_distribution = _fill_distribution(
            ("Played It", "Owned", "Own It"),
            ownership_table,
        )
        ownership_counts = _to_counts(ownership_distribution, total_ownership_votes)

        own_played = ownership_counts["Played It"]
        own_owned = ownership_counts["Owned"]
        own_ownit = ownership_counts["Own It"]

        # Completion statistics
        total_completion_votes = _parse_vote_total(completion_block)
        completion_distribution = _fill_distribution(
            ("Tried It", "Played It", "Halfway", "Beat It", "Conquered It"),
            completion_table,
        )
        completion_counts = _to_counts(completion_distribution, total_completion_votes)

        comp_triedit = completion_counts["Tried It"]
        comp_playedit = completion_counts["Played It"]
        comp_halfway = completion_counts["Halfway"]
        comp_beatit = completion_counts["Beat It"]
        comp_conqueredit = completion_counts["Conquered It"]

        # Only fully processed pages contribute to the average.
        end = time.perf_counter()
        times.append(end - start)
finally:
    # Release the browser and HTTP connections even when a page fails.
    driver.quit()
    session.close()

if times:
    average_time = sum(times) / len(times)

    print(
        f"Average time per URL: {average_time:.2f} seconds. "
        f"Total expected scraping time = approximately {(average_time * TOTAL_GAME_URLS) / 60:.0f} minutes"
    )
else:
    # Avoids ZeroDivisionError when url_list is empty or every page was skipped.
    print("No pages were scraped successfully; cannot estimate total scraping time.")


Scraping 15 pages to calculate total scraping time
Scraping link: https://www.metacritic.com/game/the-legend-of-zelda-ocarina-of-time/
