In [None]:
import argparse
import time
from pathlib import Path

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm 

# Root of the site being scraped; endpoint paths are appended to it.
BASE_URL = "http://ufcstats.com"

# HTTP headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

# Seconds to sleep between fight-page requests.
DEFAULT_PAUSE_TIME = 0.5

# Short aliases so code that refers to these settings as BASE / HEAD keeps working.
BASE = BASE_URL
HEAD = HEADERS

In [None]:
def get_all_event_links():
    """Return the URL of every event on the completed-events listing.

    Requests the ``/statistics/events/completed?page=all`` page and collects
    the ``href`` of each anchor found inside the statistics table rows.

    Returns:
        list[str]: Event page URLs in document order, or an empty list when
        the page cannot be downloaded.
    """
    full_url = f"{BASE_URL}/statistics/events/completed?page=all"
    try:
        resp = requests.get(full_url, headers=HEADERS, timeout=30)
        # Treat 4xx/5xx responses as failures instead of parsing an error page.
        resp.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP problems are handled here; programming errors
        # (e.g. a misspelled name) must surface instead of yielding [].
        print("Failed to fetch event links:", e)
        return []

    soup = BeautifulSoup(resp.text, "lxml")

    # ``a[href]`` skips anchors without a target, which would otherwise
    # raise KeyError on ``a_tag["href"]``.
    return [
        a_tag["href"]
        for a_tag in soup.select("tr.b-statistics__table-row a[href]")
    ]

In [None]:
def get_fight_links_from_event(event_page_url):
    """Return the fight-detail URLs linked from one event page.

    Args:
        event_page_url: URL of an event page.

    Returns:
        list[str]: Unique hrefs containing ``/fight-details/``, in the order
        they first appear on the page; an empty list when the page cannot be
        downloaded.
    """
    try:
        response = requests.get(event_page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Report the failure rather than silently returning nothing, so an
        # empty crawl result can be traced back to its cause.
        print("Failed to fetch event page:", event_page_url, "-", err)
        return []

    soup = BeautifulSoup(response.text, "lxml")

    # NOTE(review): assumes fight links carry these two CSS classes —
    # confirm against the live page markup.
    anchors = soup.select("a.b-link.b-link_style_black")
    hrefs = (anchor.get("href", "") for anchor in anchors)

    # dict.fromkeys de-duplicates while preserving first-seen order, so a
    # fight linked more than once on the page is only scraped once.
    return list(dict.fromkeys(href for href in hrefs if "/fight-details/" in href))

In [26]:
def scrape_fight_details(fight_url):
    """Download one fight page and extract its summary fields.

    Args:
        fight_url: URL of a fight-details page.

    Returns:
        dict: Keys ``fight_id``, ``event``, ``date``, ``weight_class``,
        ``fighter_1``, ``fighter_2``, ``winner``, ``method``, ``round`` and
        ``time``. Metadata entries missing from the page are ``None``.
        An empty dict is returned when the page cannot be downloaded.
    """
    try:
        response = requests.get(fight_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Only network/HTTP failures are treated as "could not fetch";
        # anything else is a bug and should propagate to the caller.
        print("Could not fetch fight:", fight_url, "-", err)
        return {}

    soup = BeautifulSoup(response.text, "lxml")

    # Expected title shape: "Fighter A vs Fighter B"; anything after a
    # double space in the heading text is discarded.
    headline = soup.select_one("h2.b-content__title")
    if headline:
        fight_title = headline.text.strip().split("  ")[0]
    else:
        fight_title = "Unknown vs Unknown"

    # Exactly two names are required; any other shape falls back to
    # placeholders.
    names = [name.strip() for name in fight_title.split(" vs ")]
    if len(names) == 2:
        p1, p2 = names
    else:
        p1, p2 = "Unknown", "Unknown"

    # Each metadata list item is read as "Label: value"; items without a
    # colon are ignored.
    fight_meta = {}
    for li in soup.select("li.b-list__box-list-item"):
        key, sep, val = li.text.partition(":")
        if sep:
            fight_meta[key.strip()] = val.strip()

    # This is the status text shown for the FIRST listed fighter (it is not
    # a fighter name); None when the tag is absent.
    winner_tag = soup.select_one(
        "div.b-fight-details__person:nth-child(1) i.b-fight-details__person-status"
    )
    who_won = winner_tag.text.strip() if winner_tag else None

    return {
        # Strip a trailing slash first, otherwise the last path segment
        # (and therefore the id) would be the empty string.
        "fight_id": fight_url.rstrip("/").split("/")[-1],
        "event": fight_meta.get("Event"),
        "date": fight_meta.get("Date"),
        "weight_class": fight_meta.get("Weight class"),
        "fighter_1": p1,
        "fighter_2": p2,
        "winner": who_won,
        "method": fight_meta.get("Method"),
        "round": fight_meta.get("Round"),
        "time": fight_meta.get("Time"),
    }

In [27]:
def crawl(max_events=None, pause=DEFAULT_PAUSE_TIME) -> pd.DataFrame:
    """Scrape every fight of every completed event into a DataFrame.

    After each event the rows gathered so far are written to
    ``data/fights_partial.csv`` so an interrupted run keeps its progress.

    Args:
        max_events: Only process the first ``max_events`` event links;
            ``None`` processes all of them.
        pause: Seconds to sleep after each fight-page request.

    Returns:
        pd.DataFrame: One row per successfully scraped fight.
    """
    event_list = get_all_event_links()
    # ``is not None`` so that max_events=0 means "no events", not "all".
    if max_events is not None:
        event_list = event_list[:max_events]

    # The checkpoint directory must exist before the first partial dump.
    partial_path = Path("data") / "fights_partial.csv"
    partial_path.parent.mkdir(parents=True, exist_ok=True)

    fight_data = []

    for event_url in tqdm(event_list, desc="Events", unit="event"):
        for fight_url in get_fight_links_from_event(event_url):
            try:
                fight_info = scrape_fight_details(fight_url)
                # An empty dict signals a failed download; skip it rather
                # than adding an all-empty row.
                if fight_info:
                    fight_data.append(fight_info)
            except Exception as err:
                print(f"Error parsing {fight_url}: {err}")
            finally:
                # Pause after failures too, so errors do not turn into
                # back-to-back requests.
                time.sleep(pause)

        # Dump intermediate data in case the run crashes midway.
        pd.DataFrame(fight_data).to_csv(partial_path, index=False)

    return pd.DataFrame(fight_data)

In [None]:
if __name__ == "__main__":
    print("Starting UFC data crawl...\n")

    # Create the output directory BEFORE crawling: crawl() writes its
    # partial dump into data/ while it runs.
    output_dir = Path("data")
    output_dir.mkdir(parents=True, exist_ok=True)

    data_frame = crawl()

    output_path = output_dir / "fights.csv"
    data_frame.to_csv(output_path, index=False)

    print(f"\n Done! Saved {len(data_frame):,} rows to {output_path}")

Events:   0%|          | 0/731 [00:00<?, ?event/s]


✅ fights.csv saved with 0 rows


In [None]:
#Initial setup for models

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [None]:
# Placeholder: '..' stands in for the engineered dataset's CSV path
# (TODO: point this at the real file once it exists).
df = pd.read_csv('..')

# Target column first, then every remaining column as features.
y = df['Winner_label']
X = df.drop(columns='Winner_label')

In [None]:
# Train/test split: 20% of the rows are held out for testing, and the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Shared fit-and-report helper used for every candidate model
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Prints the accuracy, then the classification report, then the confusion
    matrix, all computed from predictions on ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        The same ``model`` object, after fitting.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score}")

    # The remaining reports share one call signature; print them in order.
    for make_report in (classification_report, confusion_matrix):
        print(make_report(y_test, y_pred))

    return model

In [None]:
#Will run it when it's ready

# Instantiate the three candidate classifiers up front...
logistic_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# ...then fit and report each one in turn on the same split.

#logistic regression
print("Logistic Regression Results")
train_evaluate_model(logistic_model, X_train, y_train, X_test, y_test)

#random forest
print("Random Forest Results")
train_evaluate_model(rf_model, X_train, y_train, X_test, y_test)

#XGboost
print("XGBoost Results")
train_evaluate_model(xgb_model, X_train, y_train, X_test, y_test)

In [None]:
#For front-end use (Streamlit)
