In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm


In [None]:
def get_fighter_links():
    """Collect the profile URL of every fighter listed on ufcstats.com.

    Requests the fighter index once per letter a-z and keeps every anchor
    inside the ``b-statistics__table`` table whose ``href`` contains
    ``fighter-details``.  A failure on one letter (network error, HTTP error
    status, parsing problem) is printed and that letter is skipped, so the
    crawl is best-effort.

    Returns:
        list[str]: De-duplicated profile URLs in first-seen order, which
        keeps the result reproducible from run to run.
    """
    base_url = "http://www.ufcstats.com/statistics/fighters?char={}&page=all"
    letters = 'abcdefghijklmnopqrstuvwxyz'
    fighter_links = []

    for letter in letters:
        url = base_url.format(letter)
        try:
            response = requests.get(url, timeout=10)
            # Without this check an HTTP error page simply parses as
            # "no table" and the whole letter is dropped without a message.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', class_='b-statistics__table')
            if table:
                for link in table.find_all('a'):
                    href = link.get('href')
                    if href and 'fighter-details' in href:
                        fighter_links.append(href)
        except Exception as e:
            print(f"Ran into an error while checking letter {letter}: {e}")
        finally:
            # Throttle after failures too, so a struggling server is not
            # hit with back-to-back retries for the remaining letters.
            time.sleep(0.5)

    # dict.fromkeys removes duplicates while preserving insertion order;
    # a plain set() would return the links in arbitrary order.
    return list(dict.fromkeys(fighter_links))


In [None]:
def get_fighter_data(fighter_url):
    """Scrape a single fighter profile page into a flat dictionary.

    Args:
        fighter_url (str): URL of the fighter's details page.

    Returns:
        dict: Always contains ``'Name'``, ``'Record'`` (either may be ``None``
        when the element is missing) and ``'Fighter URL'``, plus one entry
        per ``label: value`` list item found on the page.

    Raises:
        Exception: Whatever ``requests.get`` raises on network failure, and
        the HTTP error raised by ``raise_for_status`` for a 4xx/5xx response
        (previously an error page produced a row full of ``None``).
    """
    response = requests.get(fighter_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    name = soup.find('span', class_='b-content__title-highlight')
    name = name.text.strip() if name else None

    record = soup.find('span', class_='b-content__title-record')
    record = record.text.strip().replace('Record: ', '') if record else None

    data = {'Name': name, 'Record': record, 'Fighter URL': fighter_url}

    stats = soup.find_all('li', class_='b-list__box-list-item b-list__box-list-item_type_block')
    for stat in stats:
        # Split on the FIRST colon only, so a value that itself contains a
        # colon (e.g. a clock time) is kept instead of being discarded.
        key, separator, value = stat.text.strip().partition(':')
        key = key.strip()
        if separator and key:
            data[key] = value.strip()

    return data


In [None]:
def main():
    """Scrape every fighter profile and write the results to CSV.

    Writes ``ufc_fighters_dataset.csv`` with one row per successfully scraped
    fighter.  Links that fail are reported as they happen and, if there are
    any, saved to ``ufc_fighters_errors.csv`` so the closing message points
    at a log that actually exists.
    """
    print("Grabbing all the fighter profile links...")
    fighter_links = get_fighter_links()
    print(f"Found {len(fighter_links)} fighters.")

    fighters_data = []
    errors = []

    for link in tqdm(fighter_links):
        try:
            fighters_data.append(get_fighter_data(link))
        except Exception as e:
            # tqdm.write prints above the progress bar instead of breaking it.
            tqdm.write(f"Couldn’t get data from {link}: {e}")
            errors.append((link, str(e)))
        time.sleep(0.5)  # stay polite to the server between profile requests

    df = pd.DataFrame(fighters_data)
    df.to_csv("ufc_fighters_dataset.csv", index=False)
    print("Done! Saved everything to ufc_fighters_dataset.csv")

    if errors:
        error_log = "ufc_fighters_errors.csv"
        pd.DataFrame(errors, columns=["Fighter URL", "Error"]).to_csv(error_log, index=False)
        print(f"Had trouble with {len(errors)} links. You might want to check the log: {error_log}")

# Run the fighter scrape defined in this cell. Later cells define their own
# `main`, so re-running this line after them would call a different function.
main()


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_ufc_event_links_wikipedia():
    """Collect event-page URLs from Wikipedia's "List of UFC events".

    For every ``wikitable`` on the page, each row after the first is
    inspected and the first anchor in its second ``<td>`` is taken as the
    event link.

    Returns:
        list[str]: Absolute URLs in page order (duplicates are kept).

    Raises:
        Exception: Whatever ``requests.get`` raises on network failure or
        timeout, and the HTTP error raised by ``raise_for_status`` for a
        4xx/5xx response.
    """
    from urllib.parse import urljoin

    url = "https://en.wikipedia.org/wiki/List_of_UFC_events"
    # Same 10 s timeout as the other scrapers so a stalled connection cannot
    # hang the notebook indefinitely.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    links = []

    for table in soup.find_all("table", {"class": "wikitable"}):
        for row in table.find_all("tr")[1:]:  # skip header
            cols = row.find_all("td")
            if len(cols) < 2:
                continue
            link_tag = cols[1].find("a")
            href = link_tag.get("href") if link_tag else None
            # Fragment-only hrefs (e.g. footnote anchors) are not pages.
            if not href or href.startswith("#"):
                continue
            # urljoin resolves site-relative paths and leaves absolute or
            # protocol-relative hrefs intact instead of blindly prefixing.
            links.append(urljoin(url, href))
    print(f" {len(links)} UFC event links found.")
    return links

def main():
    """Fetch the Wikipedia UFC event links and store them in a one-column CSV.

    The output file is ``wikipedia_ufc_event_links.csv`` with the single
    column ``Wikipedia Event Links``.
    """
    print(" Scraping UFC event links from Wikipedia...")

    # Build the single-column frame straight from the scraped list and
    # persist it without the pandas index column.
    pd.DataFrame(
        {"Wikipedia Event Links": get_ufc_event_links_wikipedia()}
    ).to_csv("wikipedia_ufc_event_links.csv", index=False)

    print("\n Wikipedia event links saved to wikipedia_ufc_event_links.csv.")

# Script-style entry point: the guard is true when this code runs as the
# top-level module (including when the cell is executed in a notebook kernel),
# so the event-link scrape starts as soon as the cell runs.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time


In [None]:
def clean_url(url):
    """Return an absolute English-Wikipedia URL for ``url``.

    Args:
        url (str): An absolute URL, a protocol-relative URL (``//host/...``)
            or a site-relative path (``/wiki/...`` or ``wiki/...``).

    Returns:
        str: ``url`` unchanged (apart from surrounding whitespace) when it is
        already absolute; otherwise the URL resolved against
        ``https://en.wikipedia.org``.
    """
    url = url.strip()
    if url.startswith(("http://", "https://")):
        return url
    if url.startswith("//"):
        # Protocol-relative link: only the scheme is missing.
        return "https:" + url
    # Normalise the leading slash so "wiki/X" and "/wiki/X" both resolve to
    # ".org/wiki/X" rather than the malformed ".orgwiki/X".
    return "https://en.wikipedia.org/" + url.lstrip("/")


In [None]:
def extract_fight_cards_v2pro(event_url, event_name):
    """Extract fight rows from the ``wikitable`` tables of one event page.

    A table counts as a fight card when any of its header cells mentions
    ``fighter``, ``method`` or ``weight class``.  Matching tables are
    labelled by position: 1st "Main Card", 2nd "Prelims", 3rd
    "Early Prelims", any further ones "Other".

    Args:
        event_url (str): URL of the event page to download.
        event_name (str): Label stored in each row's ``"Event Name"`` field.

    Returns:
        list[dict]: One dict per table row that has at least two cells, with
        keys ``Event Name``, ``Event URL``, ``Card Type``, ``Fighter 1``,
        ``Fighter 2``, ``Method``, ``Round``, ``Time`` and ``Notes``; missing
        trailing cells become ``""``.  An empty list is returned when the
        page cannot be downloaded (network error or non-200 status).

    NOTE(review): cells are mapped to fields purely by position, starting
    with "Fighter 1" in the first cell.  The table filter also accepts a
    "weight class" header, so if a table's first column is the weight class
    every field will be shifted by one -- confirm against a live page.
    """
    try:
        # Same 10 s timeout as the other scrapers; without one a stalled
        # connection blocks the whole crawl.
        response = requests.get(event_url, timeout=10)
    except requests.RequestException as e:
        # Treat a network failure like a bad status: report it and move on
        # instead of aborting the caller's loop over all events.
        print(f"Couldn’t load page: {event_url} ({e})")
        return []
    if response.status_code != 200:
        print(f"Couldn’t load page: {event_url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    header_keywords = ('fighter', 'method', 'weight class')
    card_types = {1: "Main Card", 2: "Prelims", 3: "Early Prelims"}
    field_names = ("Fighter 1", "Fighter 2", "Method", "Round", "Time", "Notes")

    fight_data = []
    table_number = 0

    for table in soup.find_all("table", class_="wikitable"):
        headers = [th.text.strip().lower() for th in table.find_all("th")]
        if not any(keyword in header for header in headers for keyword in header_keywords):
            continue

        table_number += 1
        card_type = card_types.get(table_number, "Other")

        for row in table.find_all("tr")[1:]:  # first row is the header
            cols = row.find_all(["td", "th"])
            if len(cols) < 2:
                continue

            # Take up to six cells and pad the rest with empty strings.
            texts = [col.text.strip() for col in cols[:len(field_names)]]
            texts.extend([""] * (len(field_names) - len(texts)))

            fight = {
                "Event Name": event_name,
                "Event URL": event_url,
                "Card Type": card_type,
            }
            fight.update(zip(field_names, texts))
            fight_data.append(fight)

    return fight_data


In [None]:
def main():
    """Scrape fight cards for every event listed in the links CSV.

    Reads ``wikipedia_ufc_event_links.csv`` (column
    ``Wikipedia Event Links``), scrapes each event page and writes all
    collected rows to ``wikipedia_ufc_fight_card_dataset_v2pro.csv``.
    An event that raises while being scraped is reported and skipped so the
    rows gathered so far are still saved.
    """
    from urllib.parse import unquote

    print("Loading all Wikipedia UFC event links...")
    links_df = pd.read_csv("wikipedia_ufc_event_links.csv")
    # Blank cells load as NaN (a float), which clean_url cannot process.
    event_links = links_df["Wikipedia Event Links"].dropna().tolist()
    event_links = [clean_url(url) for url in event_links]

    all_fights = []
    print(f"Scraping fight card data from {len(event_links)} events...")

    for i, link in enumerate(event_links):
        print(f"[{i+1}/{len(event_links)}] Getting data from: {link}")
        # Decode percent-escapes so non-ASCII page titles give readable names.
        event_name = unquote(link.split("/")[-1]).replace("_", " ")
        try:
            all_fights.extend(extract_fight_cards_v2pro(link, event_name))
        except Exception as e:
            print(f"Couldn’t get data from {link}: {e}")
        time.sleep(1)  # pause between page requests

    df = pd.DataFrame(all_fights)
    df.to_csv("wikipedia_ufc_fight_card_dataset_v2pro.csv", index=False)
    print("Done. Everything saved to wikipedia_ufc_fight_card_dataset_v2pro.csv.")


In [None]:
# Run the fight-card scrape. `main` is defined in several cells of this
# notebook, so this calls whichever definition was executed most recently --
# run the cell directly above first.
main()
