In [None]:
import argparse, time, requests, pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm.auto import tqdm   

BASE = "http://ufcstats.com"              
HEAD = {"User-Agent": "Mozilla/5.0"}       
DEFAULT_PAUSE = 0.5                        

In [26]:

def event_links():
    """Return list of all completed UFC event URLs."""
    url  = f"{BASE}/statistics/events/completed?page=all"
    html = requests.get(url, headers=HEAD, timeout=30).text
    soup = BeautifulSoup(html, "lxml")
    return [a["href"] for a in soup.select("tr.b-statistics__table-row a")]


def fight_links(event_url: str):
    """Return list of fight-details URLs for a single event page."""
    html = requests.get(event_url, headers=HEAD, timeout=30).text
    soup = BeautifulSoup(html, "lxml")
    return [a["href"] for a in soup.select("a.b-link.b-link_style_black")
            if "/fight-details/" in a["href"]]


def parse_fight(url: str) -> dict:
    """Scrape one fight page → dict of columns we need."""
    s = BeautifulSoup(requests.get(url, headers=HEAD, timeout=30).text, "lxml")

    title = s.select_one("h2.b-content__title").text.strip().split("  ")[0]
    fighter_1, fighter_2 = [x.strip() for x in title.split(" vs ")]

    meta = {li.text.split(":")[0].strip(): li.text.split(":")[1].strip()
            for li in s.select("li.b-list__box-list-item")}

    win_tag = s.select_one(
        "div.b-fight-details__person:nth-child(1) i.b-fight-details__person-status"
    )
    winner = win_tag.text.strip() if win_tag else None

    return {
        "fight_id"    : url.split("/")[-1],
        "event"       : meta.get("Event"),
        "date"        : meta.get("Date"),
        "weight_class": meta.get("Weight class"),
        "fighter_1"   : fighter_1,
        "fighter_2"   : fighter_2,
        "winner"      : winner,
        "method"      : meta.get("Method"),
        "round"       : meta.get("Round"),
        "time"        : meta.get("Time"),
    }


In [27]:
def crawl(max_events=None, pause=DEFAULT_PAUSE) -> pd.DataFrame:
    events = event_links()[:max_events] if max_events else event_links()
    rows   = []
    for ev in tqdm(events, desc="Events", unit="event"):
        for lk in fight_links(ev):
            rows.append(parse_fight(lk))
            time.sleep(pause)                    
        pd.DataFrame(rows).to_csv("data/fights_partial.csv", index=False)
    return pd.DataFrame(rows)

In [None]:
if __name__ == "__main__":
    df = crawl()                                    
    Path("data").mkdir(exist_ok=True)
    df.to_csv("data/fights.csv", index=False)
    print(f"\n✅ fights.csv saved with {len(df):,} rows")

Events:   0%|          | 0/731 [00:00<?, ?event/s]


✅ fights.csv saved with 0 rows
