In [2]:
from unittest.mock import inplace

from IPython.display import display, HTML
from werkzeug.routing.rules import Weighting

display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [4]:
import requests                 # How Python gets the webpages
from bs4 import BeautifulSoup   # Creates structured, searchable object
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from datetime import datetime

In [5]:
from pylab import rcParams

rcParams['figure.dpi'] = 150
rcParams['lines.linewidth'] = 1.2
rcParams['axes.facecolor'] = 'white'
rcParams['patch.edgecolor'] = 'white'
rcParams['font.family'] = 'DejaVu Sans'
rcParams['figure.figsize'] = 5,3
rcParams['font.size'] = 10

In [6]:
rcParams['axes.labelsize'] = 'medium'
rcParams['xtick.labelsize'] = 8
rcParams['ytick.labelsize'] = 8

In [16]:
# I don't think we will need this table, but we can keep it to show the process of getting the data.
url_ranking = "http://ufcstats.com/statistics/fighters?page=all"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response_ranking = requests.get(url_ranking, headers=headers)

# Always check if the request was successful
if response_ranking.status_code != 200:
    print(f"Error: Received status code {response_ranking.status_code}")
    exit()

print(f"Response status: {response_ranking.status_code}")

Response status: 200


In [17]:
# I don't think we will need this table, but we can keep it to show the process of getting the data.
fighters_page = response_ranking.content
scraping = BeautifulSoup(fighters_page, "lxml")
tables_fighters = scraping.find_all('table')
tables_fighters_df = pd.read_html(str(tables_fighters))
fighters_table = tables_fighters_df[0]
fighters_table

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,,,,,,,,,,,
1,Tom,Aaron,,--,155 lbs.,--,,5.0,3.0,0.0,
2,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4.0,6.0,0.0,
3,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28.0,4.0,0.0,
4,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10.0,15.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
222,Abu,Azaitar,Captain Morocco,"5' 9""",185 lbs.,"76.0""",Orthodox,14.0,4.0,1.0,
223,Ottman,Azaitar,Bulldozer,"5' 8""",155 lbs.,"71.0""",Switch,13.0,3.0,0.0,
224,Luiz,Azeredo,,"5' 9""",154 lbs.,--,Orthodox,15.0,10.0,0.0,
225,Luciano,Azevedo,,"6' 3""",161 lbs.,--,Orthodox,17.0,9.0,1.0,


In [18]:
fighters_table.to_csv('./csv/fighters_table.csv', index=False)

In [12]:
url = "https://api.sportradar.com/mma/trial/v2/en/rankings.json"
headers = {
    "accept": "application/json",
    "x-api-key": #PLEASE CREATE YOUR ACCOUNT TO OBTAIN YOUR API KEY
}
response = requests.get(url, headers=headers)

In [13]:
res = json.loads(response.content)
rankings = res.get('rankings',[])

records = []
for cat_ranking in rankings:
    category = cat_ranking.get('name')
    year = cat_ranking.get('year')
    week = cat_ranking.get('week')
    competitor_rankings = cat_ranking.get('competitor_rankings', [])
    for competitor in competitor_rankings:
        rank = competitor.get('rank')
        comp = competitor.get('competitor')
        comp_id = comp.get('id')
        name = comp.get('name')
        gender = comp.get('gender')
        records.append({
            'Name': name,
            'Id': comp_id,
            'Ranking': rank,
            "Gender": gender,
            'Category': category,
            'Year': year,
            "Week": week
        })
df_rankings_fighters = pd.DataFrame(records)
df_rankings_fighters.sort_values(by='Id', ascending=False)
df_rankings_fighters

Unnamed: 0,Name,Id,Ranking,Gender,Category,Year,Week
0,"Makhachev, Islam",sr:competitor:251835,1,male,pound_for_pound,2025,19
1,"Jones, Jon",sr:competitor:253371,2,male,pound_for_pound,2025,19
2,"Topuria, Ilia",sr:competitor:750503,3,male,pound_for_pound,2025,19
3,"Dvalishvili, Merab",sr:competitor:399183,4,male,pound_for_pound,2025,19
4,"Du Plessis, Dricus",sr:competitor:400461,5,male,pound_for_pound,2025,19
...,...,...,...,...,...,...,...
186,"Cavalcanti, Jacqueline",sr:competitor:1049265,11,female,womens_bantamweight,2025,19
187,"Cornolle, Nora",sr:competitor:1027333,12,female,womens_bantamweight,2025,19
188,"Tate, Miesha",sr:competitor:246049,13,female,womens_bantamweight,2025,19
189,"Edwards, Joselyne",sr:competitor:768194,14,female,womens_bantamweight,2025,19


In [14]:
df_rankings_fighters.to_csv('./csv/df_ranking_fighters.csv', index=False)

In [41]:
profiles = []
for index, fighter in df_rankings_fighters.iterrows():
    fighterId = fighter['Id'].replace(":", "%3A")
    profile_url = "https://api.sportradar.com/mma/trial/v2/en/competitors/" + fighterId + "/profile.json"
    response_fighter = requests.get(profile_url, headers=headers)
    res_fighter = json.loads(response_fighter.content)
    competitor_profile = res_fighter.get('competitor')
    if competitor_profile is not None:
        profile_id = competitor_profile.get('id')
        info = res_fighter.get('info')
        country = info.get('birth_country')
        Birth_country_code = info.get('birth_country_code')
        DOB = info.get('birth_date')
        reach = info.get('reach')
        Height = info.get('height')
        Weight = info.get('weight')
        Nickname = info.get('nickname')
        record = res_fighter.get('record')
        Wins = record.get('wins')
        Draws = record.get('draws')
        Losses = record.get('losses')
        profiles.append({
            'Id': profile_id,
            'Country': country,
            'Brith_Code': Birth_country_code,
            'DOB': DOB,
            'Reach': reach,
            'Height': Height,
            'Weight': Weight,
            'Nickname': Nickname,
            'Wins': Wins,
            'Draws': Draws,
            'Losses': Losses
        })
df_profiles_fighters = pd.DataFrame(profiles)
df_profiles_fighters

Unnamed: 0,Id,Country,Brith_Code,DOB,Reach,Height,Weight,Nickname,Wins,Draws,Losses
0,sr:competitor:251835,RUSSIAN FEDERATION,RUS,1991-09-27,179,178,70.3,,27,0,2
1,sr:competitor:750503,GERMANY,DEU,1997-01-21,175,170,65.5,El Matador,16,0,0
2,sr:competitor:399183,GEORGIA,GEO,1991-01-10,173,168,60.8,The Machine,18,0,4
3,sr:competitor:400461,,,1994-01-14,193,185,83.9,Stillknocks,23,0,2
4,sr:competitor:419867,RUSSIAN FEDERATION,RUS,1992-06-02,191,191,93.0,,21,1,1
...,...,...,...,...,...,...,...,...,...,...,...
184,sr:competitor:1049265,BRAZIL,BRA,1997-08-29,178,175,61.2,,9,0,1
185,sr:competitor:1027333,,,1989-06-12,170,169,62.6,,9,0,2
186,sr:competitor:246049,UNITED STATES,USA,1986-08-18,165,168,61.5,Cupcake,20,0,10
187,sr:competitor:768194,PANAMA,PAN,1995-09-29,178,173,61.7,La Pantera,15,0,6


In [15]:
df_profiles_fighters.to_csv('./csv/df_profiles_fighters.csv', index=False)

NameError: name 'df_profiles_fighters' is not defined

In [66]:
# In the Dataframe fighter_df_info we have the best 15 rankings per category
fighters_df = pd.merge(df_rankings_fighters, df_profiles_fighters, on='Id', how='left')
fighters_df_info = fighters_df.drop_duplicates(subset=['Id', 'Category'])
fighters_df_info = fighters_df_info.rename(columns={'Year': 'Year_Ranking', "Week": 'Week_ranking'})
fighters_df_info

Unnamed: 0,Name,Id,Ranking,Gender,Category,Year_Ranking,Week_ranking,Country,Brith_Code,DOB,Reach,Height,Weight,Nickname,Wins,Draws,Losses
0,"Makhachev, Islam",sr:competitor:251835,1,male,pound_for_pound,2025,19,RUSSIAN FEDERATION,RUS,1991-09-27,179.0,178.0,70.3,,27.0,0.0,2.0
2,"Jones, Jon",sr:competitor:253371,2,male,pound_for_pound,2025,19,UNITED STATES,USA,1987-07-19,215.0,194.0,112.5,Bones,28.0,0.0,1.0
3,"Topuria, Ilia",sr:competitor:750503,3,male,pound_for_pound,2025,19,GERMANY,DEU,1997-01-21,175.0,170.0,65.5,El Matador,16.0,0.0,0.0
5,"Dvalishvili, Merab",sr:competitor:399183,4,male,pound_for_pound,2025,19,GEORGIA,GEO,1991-01-10,173.0,168.0,60.8,The Machine,18.0,0.0,4.0
7,"Du Plessis, Dricus",sr:competitor:400461,5,male,pound_for_pound,2025,19,,,1994-01-14,193.0,185.0,83.9,Stillknocks,23.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,"Cavalcanti, Jacqueline",sr:competitor:1049265,11,female,womens_bantamweight,2025,19,BRAZIL,BRA,1997-08-29,178.0,175.0,61.2,,9.0,0.0,1.0
221,"Cornolle, Nora",sr:competitor:1027333,12,female,womens_bantamweight,2025,19,,,1989-06-12,170.0,169.0,62.6,,9.0,0.0,2.0
222,"Tate, Miesha",sr:competitor:246049,13,female,womens_bantamweight,2025,19,UNITED STATES,USA,1986-08-18,165.0,168.0,61.5,Cupcake,20.0,0.0,10.0
223,"Edwards, Joselyne",sr:competitor:768194,14,female,womens_bantamweight,2025,19,PANAMA,PAN,1995-09-29,178.0,173.0,61.7,La Pantera,15.0,0.0,6.0


In [None]:
fighters_df_info.to_csv('./csv/Fighters_info.csv', index=False)

In [22]:
#This is another URL that contains information about the fighters
url_info = "https://www.ufc.com/search?query=athlete&tabOrder=.%2Findex.html%2Clinks%2Cathletes%2Cevents%2Cfaqs%2Chelp_articles%2Cvideos%2Cpromotions&referrerPageUrl=https%3A%2F%2Fwww.ufc.com%2F&facetFilters=%7B%22gender%22%3A%5B%5D%2C%22c_weightClass%22%3A%5B%5D%2C%22c_homeCountry%22%3A%5B%5D%2C%22c_accolades%22%3A%5B%5D%7D&filters=%7B%7D&search-offset=4305&verticalUrl=Athletes.html?page=all"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response_ranking = requests.get(url_info, headers=headers)

# Always check if the request was successful
if response_ranking.status_code != 200:
    print(f"Error: Received status code {response_ranking.status_code}")
    exit()

print(f"Response status: {response_ranking.status_code}")

Response status: 200


In [23]:
page_fighters = response_ranking.content
scraping_fighters = BeautifulSoup(page_fighters, "lxml")
scraping_fighters

<!DOCTYPE html>
<html dir="ltr" lang="en-nz" prefix="og: https://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<link href="https://www.ufc.com/search" rel="canonical"/>
<meta content="origin" name="referrer"/>
<meta content="Search results | UFC" property="og:title"/>
<meta content="2022-08-31T20:44:09+1200" property="article:published_time"/>
<meta content="2022-08-31T20:57:50+1200" property="article:modified_time"/>
<meta content="Search results | UFC" name="twitter:title"/>
<meta content="Drupal 10 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<script>
  function initAnswers() {
    ANSWERS.init({
      apiKey: "850a88aeb3c29599ce2db46832aa229f",
      experienceKey: "answers-en",
      experienceVersion: "PRODUCTION",
      locale: "en",
      businessId: "3668711",
      templateBundle: TemplateBundle.default,
      onRe

In [None]:
# I am going to start scraping from this URL to find the data about the events (Event, Winner, Losser, Round, Time, Method)
url_events= 'http://www.ufcstats.com/statistics/events/completed?page=all'