## Imports

In [41]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import pandas as pd
import json
import os
import random

## Load and Clean FPS Benchmark json file

In [None]:
# Load the raw benchmark JSON. The encoding is pinned so the read does not
# depend on the platform's default text encoding.
with open("gpus.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten nested dicts into dotted column names, then pick the columns whose
# name contains "Settings." and ends with ".Games"; each cell in those columns
# is expected to hold a list of per-game records.
df = pd.json_normalize(data)
nested_columns = [col for col in df.columns if "Settings." in col and col.endswith(".Games")]

# Build one long-format frame per nested column, tagging every row with the
# setting and resolution taken from the column name.
all_games = []
for col in nested_columns:
    # One row per (GPU, game record); rows with no record for this column are
    # dropped, and the index is reset so it lines up with `games_expanded`.
    temp = df[["Name", col]].explode(col).dropna().reset_index(drop=True)
    games_expanded = pd.json_normalize(temp[col])

    # NOTE(review): assumes the dotted name has the setting at position 1 and
    # the resolution at position 3 - confirm against the gpus.json schema.
    parts = col.split(".")
    setting = parts[1]        # ultra / high / medium / low
    resolution = parts[3]     # resolution

    temp_df = pd.concat([temp[["Name"]], games_expanded], axis=1)
    temp_df["Setting"] = setting
    temp_df["Resolution"] = resolution

    all_games.append(temp_df)

# Combine into one clean DataFrame
fps_df = pd.concat(all_games, ignore_index=True)

# Keep only desired columns
fps_df = fps_df[["Name", "Game_Name", "Min_FPS", "Avg_FPS", "Setting", "Resolution"]]
fps_df = fps_df.sort_values(by="Name").set_index("Name")

# Save to CSV. Create the output directory first so a fresh checkout without
# a "data" folder does not fail on the write.
os.makedirs("data", exist_ok=True)
fps_df.to_csv("data/gpu_fps_only.csv", index=True)


## Proxy Server Setup

In [82]:
# Proxy connection settings. Each value can be overridden through an
# environment variable; the literals are fallbacks that keep existing runs
# working.
# NOTE(review): the fallback user/password are credentials committed to source.
# Rotate them and supply the real values via BRD_PROXY_USER / BRD_PROXY_PASS.
proxy_host = os.environ.get("BRD_PROXY_HOST", "brd.superproxy.io")
proxy_port = int(os.environ.get("BRD_PROXY_PORT", "33335"))
proxy_user = os.environ.get("BRD_PROXY_USER", "brd-customer-hl_02785ee1-zone-residential_proxy1")
proxy_pass = os.environ.get("BRD_PROXY_PASS", "r2i2po7qyi5s")

# Country codes appended to the proxy username, one pool entry per code.
country_codes = [
    "ph",  # Philippines
    "vn",  # Vietnam
    "eg",  # Egypt
    "ng",  # Nigeria
    "ke",  # Kenya
    "cz",  # Czech Republic
    "gr",  # Greece
    "pt",  # Portugal
    "ro"   # Romania
]


def _proxy_url(country_code):
    """Return the proxy URL whose username carries ``-country-<country_code>``."""
    # Percent-encode the userinfo so characters such as "@", ":" or "/" in an
    # overridden username/password cannot corrupt the URL.
    user = urllib.parse.quote(f"{proxy_user}-country-{country_code}", safe="")
    password = urllib.parse.quote(proxy_pass, safe="")
    return f"http://{user}:{password}@{proxy_host}:{proxy_port}"


# One requests-style proxies mapping per country; both schemes are routed
# through the same http:// proxy endpoint.
PROXY_POOL = [
    {
        "http": _proxy_url(cc),
        "https": _proxy_url(cc),
    }
    for cc in country_codes
]

def get_random_proxy():
    """
    Pick one proxy configuration at random from PROXY_POOL.

    Returns
    -------
    dict
        One entry of PROXY_POOL, i.e. a mapping with "http" and "https" keys.
    """
    chosen = random.choice(PROXY_POOL)
    return chosen

## Scraping Functions

In [None]:
def get_gpu_page_url(gpu_name, proxy=None):
    """
    Retrieve the TechPowerUp GPU specification page URL for a given GPU name.

    This function sends a request to the TechPowerUp GPU database search page
    with the provided GPU name, parses the results, and returns the URL of
    the first matching GPU's detail page.

    Parameters
    ----------
    gpu_name : str
        The name of the GPU to search for (e.g., "RTX 3080"). The name is
        percent-encoded before being placed in the query string; "+" is left
        untouched so names that are already "+"-joined keep working.
    proxy : dict, optional
        An optional dictionary of proxy settings to route the request through.
        Example: {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}

    Returns
    -------
    str or None
        A fully-qualified URL to the GPU's detail page on TechPowerUp
        (e.g., "https://www.techpowerup.com/gpu-specs/nvidia-rtx-3080.c3621"),
        or None if the results table or its first result link is missing.
    """
    base_url = "https://www.techpowerup.com"

    # Encode the search term so characters like "&" or "#" cannot break the
    # query string.
    query = urllib.parse.quote(gpu_name, safe="+")
    url = f"{base_url}/gpu-specs/?q={query}"

    request_kwargs = {"timeout": 15}
    if proxy:
        # NOTE(review): certificate verification is disabled for proxied
        # requests only (as before) - presumably the proxy intercepts TLS;
        # confirm, or pass the proxy's CA bundle via `verify=` instead.
        request_kwargs.update(proxies=proxy, verify=False)
    resp = requests.get(url, **request_kwargs)
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find("table", class_="items-desktop-table")
    if table is None:
        return None

    # Walk table -> first cell -> name div -> link, bailing out with None as
    # soon as a level is missing instead of raising AttributeError on None.
    cell = table.find("td")
    name_div = cell.find("div", class_="item-name") if cell is not None else None
    first_link = name_div.find("a") if name_div is not None else None
    href = first_link.get("href") if first_link is not None else None
    if not href:
        return None

    # urljoin handles both site-relative and already-absolute hrefs.
    return urllib.parse.urljoin(base_url, href)

In [None]:
def scrape_gpu_specs(url, gpu_name, proxy=None):
    """
    Scrape GPU specification details from a TechPowerUp GPU page.

    This function retrieves and parses the HTML from a given GPU detail page
    on TechPowerUp, extracts specification data from the "details" sections,
    and returns it as a structured dictionary.

    Parameters
    ----------
    url : str
        The full TechPowerUp GPU specification page URL to scrape.
        (e.g., "https://www.techpowerup.com/gpu-specs/nvidia-rtx-3080.c3621")
    gpu_name : str
        The name of the GPU being scraped, stored under the 'name' key.
    proxy : dict, optional
        An optional dictionary of proxy settings for the HTTP request.
        Example: {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}

    Returns
    -------
    dict
        A dictionary containing GPU specifications, with:
          - "name": the GPU name provided in `gpu_name`
          - additional key-value pairs for each scraped specification
            (keys are normalized: lowercase with spaces replaced by underscores;
            each value is the first non-empty text fragment of the entry).

    Raises
    ------
    ValueError
        If the page has no ``div.sectioncontainer`` element, which means the
        response is not a regular specification page.
    """
    request_kwargs = {"timeout": 15}
    if proxy:
        # NOTE(review): certificate verification is disabled for proxied
        # requests only (as before) - presumably the proxy intercepts TLS;
        # confirm, or pass the proxy's CA bundle via `verify=` instead.
        request_kwargs.update(proxies=proxy, verify=False)
    resp = requests.get(url, **request_kwargs)
    soup = BeautifulSoup(resp.text, "html.parser")

    container = soup.find("div", class_="sectioncontainer")
    if container is None:
        # Fail with the URL and HTTP status rather than an AttributeError on
        # None, so the reason for an unusable response is visible to the caller.
        raise ValueError(
            f"No specification sections found at {url} (HTTP {resp.status_code})"
        )

    specs = {"name": gpu_name}
    for section in container.find_all("section", class_="details"):
        for dl in section.find_all("dl"):
            terms = dl.find_all("dt")
            values = dl.find_all("dd")

            # Skip lists where labels and values cannot be paired one-to-one.
            if len(terms) != len(values):
                continue

            for dt, dd in zip(terms, values):
                key = dt.text.strip().lower().replace(" ", "_")
                # stripped_strings yields already-stripped text; "" when the
                # entry has no text at all.
                specs[key] = next(dd.stripped_strings, "")

    return specs

## Initialize Variables for Scraping

In [None]:
# gpu_specs_build = {}
# index = 0

## Scraping Loop

In [2]:
# for name in gpu_names[index:]:
#     # if run_count >= max_runs:
#     #     print("Reached maximum runs. Exiting loop.")
#     #     break

#     proxy = get_random_proxy()   # rotate proxy for each GPU

#     url = get_gpu_page_url(name, proxy=proxy)
#     if url:
#         print(f"Scraping {name} from {url} via {proxy['http']}")
#         specs = scrape_gpu_specs(url, name, proxy=proxy)
#         index += 1

#         name_clean = name.replace("+", " ")
#         if name_clean in gpu_specs_build:
#             print(f"Warning: Duplicate entry for {name_clean}")
#         else:
#             gpu_specs_build[name_clean] = specs

#         # Randomized delay between 5-15 seconds
#         sleep_time = random.uniform(5, 15)
#         print(f"Sleeping for {sleep_time:.2f} seconds...\n")
#         time.sleep(sleep_time)

#     else:
#         raise Exception(f"No page found for {name}")

## Save data as CSV

In [None]:
#Turn original data into DataFrame 
#gpu_specs_df_original = pd.DataFrame.from_dict(gpu_specs_build, orient='index')

#Save original gpu spec data after scraping
#gpu_specs_df_original.to_csv("data/gpu_specs_original.csv", index=True)