In [5]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import random

In [6]:

BASE_URL = "https://www.techpowerup.com/gpu-specs/"

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/117.0"
]

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

brands = [
    "3dfx", "AMD", "ATI", "ARM", "Broadcom", "Chrome", "Creative", "Intel", "Matrox",
    "Moore+Threads", "NVIDIA", "Sony", "XGI"
]

years = [
    "2025", "2024", "2023", "2022", "2021", "2020", "2019", "2018", "2017", "2016",
    "2015", "2014", "2013", "2012", "2011", "2010", "2009", "2008", "2007", "2006",
    "2005", "2004", "2003", "2002", "2001", "2000", "1999", "1998", "1997", "1996",
    "1995", "1994", "1993", "1992", "1991", "1990", "1989", "1988", "1987", "1986"
]

igp_options = ["Yes", "No"]


In [7]:

# Get a random user-agent header
def get_headers():
    return {
        "User-Agent": random.choice(USER_AGENTS)
    }

# Make a request with retry logic
def safe_request(url, retries=3):
    delay = 2
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=get_headers(), timeout=10)
            if response.status_code == 429:
                print("Rate limited. Sleeping longer...")
                time.sleep(delay * 5)
                continue
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request failed ({e}), retrying in {delay} seconds...")
        time.sleep(delay)
        delay *= 2
    return None

def parse_page(url):
    response = safe_request(url)
    if not response:
        print(f"Failed to fetch: {url}")
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    rows = soup.select("table.gputable tbody tr")
    results = []
    for row in rows:
        cols = row.find_all("td")
        if not cols or len(cols) < 2:
            continue
        gpu_data = [col.get_text(strip=True) for col in cols]
        results.append(gpu_data)
    return results

# Begin scraping
all_data = []
count = 0

for brand in brands:
    for year in years:
        for igp in igp_options:
            page = 1
            while True:
                query = f"?manufacturer={brand}&released={year}&igp={igp}&sort=name&page={page}"
                url = BASE_URL + query
                print(f"Scraping {url}")
                data = parse_page(url)
                if not data:
                    break
                all_data.extend(data)
                page += 1
                count += len(data)

                # Sleep between requests to avoid rate-limiting
                sleep_time = random.uniform(3, 7)
                print(f"Sleeping for {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)

print(f"Total GPUs scraped: {count}")

# Save to CSV
with open("gpu_specs_all.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(all_data)

print("Data saved to gpu_specs_all.csv")


Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2025&igp=Yes&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2025&igp=No&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2024&igp=Yes&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2024&igp=No&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2023&igp=Yes&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2023&igp=No&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2022&igp=Yes&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2022&igp=No&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacturer=3dfx&released=2021&igp=Yes&sort=name&page=1
Scraping https://www.techpowerup.com/gpu-specs/?manufacture

KeyboardInterrupt: 