In [41]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import pandas as pd
import json
import os
import random

In [None]:
with open("gpus.json", "r") as f:
    data = json.load(f)

df = pd.json_normalize(data)

nested_columns = [col for col in df.columns if "Settings." in col and col.endswith(".Games")]

all_games = []
for col in nested_columns:
    temp = df[["Name", col]].explode(col).dropna().reset_index(drop=True)
    games_expanded = pd.json_normalize(temp[col])
    
    parts = col.split(".")
    setting = parts[1]        # ultra / high / medium / low
    resolution = parts[3]     # resolution
    
    temp_df = pd.concat([temp[["Name"]], games_expanded], axis=1)
    temp_df["Setting"] = setting
    temp_df["Resolution"] = resolution
    
    all_games.append(temp_df)

# Combine into one clean DataFrame
fps_df = pd.concat(all_games, ignore_index=True)

# Keep only desired columns
fps_df = fps_df[["Name", "Game_Name", "Min_FPS", "Avg_FPS", "Setting", "Resolution"]]
fps_df = fps_df.sort_values(by="Name").set_index("Name")

# Save to CSV
fps_df.to_csv("data/gpu_fps_only.csv", index=True)


In [82]:
proxy_host = "brd.superproxy.io"
proxy_port = 33335
proxy_user = "brd-customer-hl_02785ee1-zone-residential_proxy1"
proxy_pass = "r2i2po7qyi5s"

country_codes = [
    "ph",  # Philippines
    "vn",  # Vietnam
    "eg",  # Egypt
    "ng",  # Nigeria
    "ke",  # Kenya
    "cz",  # Czech Republic
    "gr",  # Greece
    "pt",  # Portugal
    "ro"   # Romania
]

PROXY_POOL = [
    {
        "http": f"http://{proxy_user}-country-{cc}:{proxy_pass}@{proxy_host}:{proxy_port}",
        "https": f"http://{proxy_user}-country-{cc}:{proxy_pass}@{proxy_host}:{proxy_port}"
    }
    for cc in country_codes
]

def get_random_proxy():
    return random.choice(PROXY_POOL)

In [63]:
def test_proxy_pool(pool):
    results = []
    for i, proxy in enumerate(pool, 1):
        try:
            resp = requests.get("https://ipinfo.io/json", proxies=proxy, timeout=10, verify=False)
            results.append((proxy["http"], resp.json()))
        except Exception as e:
            results.append((proxy["http"], f"FAILED: {e}"))
    return results

for proxy, result in test_proxy_pool(PROXY_POOL):
    print(proxy, "→", result)



KeyboardInterrupt: 

In [None]:
def get_gpu_page_url(gpu_name, proxy=None):
    url = f"https://www.techpowerup.com/gpu-specs/?q={gpu_name}"

    resp = requests.get(url, proxies=proxy, timeout=15, verify=False) if proxy else requests.get(url, timeout=15)
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find("table", class_="items-desktop-table")
    if not table:
        return None
    first_link = table.find("td").find("div", class_="item-name").find("a")
    if first_link:
        return "https://www.techpowerup.com" + first_link["href"]
    return None

In [None]:
def scrape_gpu_specs(url, gpu_name, proxy=None):
    resp = requests.get(url, proxies=proxy, timeout=15, verify=False) if proxy else requests.get(url, timeout=15)
    soup = BeautifulSoup(resp.text, "html.parser")

    specs = {"name": gpu_name}
    sections = soup.find("div", class_="sectioncontainer").find_all("section", class_="details")
    
    for table in sections:
        dl_lists = table.find_all("dl")
        for dl in dl_lists:
            dt_elements = dl.find_all("dt")
            dd_elements = dl.find_all("dd")
            
            if len(dt_elements) != len(dd_elements):
                continue
            
            for dt, dd in zip(dt_elements, dd_elements):
                first_value = next(dd.stripped_strings, "").strip()
                key = dt.text.strip().lower().replace(" ", "_")
                specs[key] = first_value
                
    return specs

In [None]:
print(fps_df)

                                                  Game_Name Min_FPS Avg_FPS  \
Name                                                                          
AMD Radeon 530 Mobile              Assassin's Creed Origins       5     7.0   
AMD Radeon 530 Mobile                                Hitman       8    10.7   
AMD Radeon 530 Mobile                               F1 2016       8    11.2   
AMD Radeon 530 Mobile                                  Doom      11    14.5   
AMD Radeon 530 Mobile  Ashes of the Singularity: Escalation       6     8.1   
...                                                     ...     ...     ...   
NVIDIA TITAN Xp                   Call of Duty: Black Ops 4      87   115.2   
NVIDIA TITAN Xp                        Need For Speed: Heat      48    63.9   
NVIDIA TITAN Xp                       Red Dead Redemption 2      32    42.4   
NVIDIA TITAN Xp                            Final Fantasy XV      62    82.0   
NVIDIA TITAN Xp                   Assassin's Creed V

In [None]:
gpu_names = fps_df.index.unique().tolist()
gpu_names = [name.replace(" ", "+") for name in gpu_names]
print(gpu_names)

['AMD+Radeon+530+Mobile', 'AMD+Radeon+540+Mobile', 'AMD+Radeon+HD+6850', 'AMD+Radeon+HD+6870', 'AMD+Radeon+HD+6950', 'AMD+Radeon+HD+6970', 'AMD+Radeon+HD+6990', 'AMD+Radeon+HD+7750', 'AMD+Radeon+HD+7750M', 'AMD+Radeon+HD+7790', 'AMD+Radeon+HD+7850', 'AMD+Radeon+HD+7850M', 'AMD+Radeon+HD+7950', 'AMD+Radeon+HD+7950M', 'AMD+Radeon+HD+7970', 'AMD+Radeon+HD+7970+GHz+Edition', 'AMD+Radeon+HD+7970M', 'AMD+Radeon+HD+7990', 'AMD+Radeon+Pro+WX+7100+Mobile', 'AMD+Radeon+R5', 'AMD+Radeon+R7+250', 'AMD+Radeon+R7+265', 'AMD+Radeon+R7+370', 'AMD+Radeon+R9+270', 'AMD+Radeon+R9+280', 'AMD+Radeon+R9+280X', 'AMD+Radeon+R9+285', 'AMD+Radeon+R9+290', 'AMD+Radeon+R9+290X', 'AMD+Radeon+R9+295X2', 'AMD+Radeon+R9+380', 'AMD+Radeon+R9+380X', 'AMD+Radeon+R9+390', 'AMD+Radeon+R9+390X', 'AMD+Radeon+R9+FURY', 'AMD+Radeon+R9+FURY+X', 'AMD+Radeon+R9+M270X', 'AMD+Radeon+R9+M280X', 'AMD+Radeon+R9+M280X+2GB', 'AMD+Radeon+R9+M290X', 'AMD+Radeon+R9+M380', 'AMD+Radeon+R9+Nano', 'AMD+Radeon+RX+460', 'AMD+Radeon+RX+470', 'AM

In [None]:
# gpu_specs_build = {}
# index = 0

In [None]:
# for name in gpu_names[index:]:
#     # if run_count >= max_runs:
#     #     print("Reached maximum runs. Exiting loop.")
#     #     break

#     proxy = get_random_proxy()   # 🌍 rotate proxy for each GPU

#     url = get_gpu_page_url(name, proxy=proxy)
#     if url:
#         print(f"Scraping {name} from {url} via {proxy['http']}")
#         specs = scrape_gpu_specs(url, name, proxy=proxy)
#         index += 1

#         name_clean = name.replace("+", " ")
#         if name_clean in gpu_specs_build:
#             print(f"Warning: Duplicate entry for {name_clean}")
#         else:
#             gpu_specs_build[name_clean] = specs

#         # Randomized delay between 5-15 seconds
#         sleep_time = random.uniform(5, 15)
#         print(f"Sleeping for {sleep_time:.2f} seconds...\n")
#         time.sleep(sleep_time)

#     else:
#         raise Exception(f"No page found for {name}")



Scraping NVIDIA+GeForce+RTX+4060+Ti from https://www.techpowerup.com/gpu-specs/geforce-rtx-4060-ti-16-gb.c4155 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-ro:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 11.46 seconds...





Scraping NVIDIA+GeForce+RTX+4070 from https://www.techpowerup.com/gpu-specs/geforce-rtx-4070.c3924 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-ph:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 6.52 seconds...





Scraping NVIDIA+GeForce+RTX+4080 from https://www.techpowerup.com/gpu-specs/geforce-rtx-4080.c3888 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-gr:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 8.60 seconds...





Scraping NVIDIA+GeForce+RTX+4080+Ti from https://www.techpowerup.com/gpu-specs/geforce-rtx-4080-ti.c3887 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-cz:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 13.26 seconds...





Scraping NVIDIA+GeForce+RTX+4090 from https://www.techpowerup.com/gpu-specs/geforce-rtx-4090.c3889 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-eg:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 13.68 seconds...





Scraping NVIDIA+TITAN+RTX from https://www.techpowerup.com/gpu-specs/titan-rtx.c3311 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-ng:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 6.79 seconds...





Scraping NVIDIA+TITAN+V from https://www.techpowerup.com/gpu-specs/titan-v.c3051 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-pt:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 13.48 seconds...





Scraping NVIDIA+TITAN+Xp from https://www.techpowerup.com/gpu-specs/titan-xp.c2948 via http://brd-customer-hl_02785ee1-zone-residential_proxy1-country-ke:r2i2po7qyi5s@brd.superproxy.io:33335




Sleeping for 7.25 seconds...



In [84]:
print(gpu_specs_build)
print(f"Scraped {len(gpu_specs_build)} GPUs")

{'AMD Radeon 530 Mobile': {'name': 'AMD+Radeon+530+Mobile', 'gpu_name': 'Weston', 'gpu_variant': 'Weston PRO', 'architecture': 'GCN 3.0', 'foundry': 'TSMC', 'process_size': '28 nm', 'transistors': '1,550 million', 'density': '12.4M / mm²', 'die_size': '125 mm²', 'release_date': 'Apr 18th, 2017', 'generation': 'Polaris Mobile', 'predecessor': 'Gem System', 'successor': 'Navi Mobile', 'production': 'End-of-life', 'bus_interface': 'PCIe 3.0 x8', 'base_clock': '730 MHz', 'boost_clock': '1021 MHz', 'memory_clock': '900 MHz', 'memory_size': '2 GB', 'memory_type': 'DDR3', 'memory_bus': '64 bit', 'bandwidth': '14.40 GB/s', 'shading_units': '384', 'tmus': '24', 'rops': '8', 'compute_units': '6', 'l1_cache': '16 KB (per CU)', 'l2_cache': '128 KB', 'pixel_rate': '8.168 GPixel/s', 'texture_rate': '24.50 GTexel/s', 'fp16_(half)': '784.1 GFLOPS', 'fp32_(float)': '784.1 GFLOPS', 'fp64_(double)': '49.01 GFLOPS', 'slot_width': 'IGP', 'tdp': '50 W', 'outputs': 'Portable Device Dependent', 'power_connect

In [None]:
#Turn original data into DataFrame 
#gpu_specs_df_original = pd.DataFrame.from_dict(gpu_specs_build, orient='index')

In [None]:
#Save orginal gpu spec data after scraping
#gpu_specs_df_original.to_csv("data/gpu_specs_original.csv", index=True)