In [1]:
import pandas as pd

In [2]:
ufc_stats = pd.read_csv('data/ufc_stats.csv')

In [3]:
ufc_stats.iloc[500]

Unnamed: 0                                                       501
fighter                                              Jessica Andrade
knockdowns                                                         0
significant_strikes_landed                                         0
significant_strikes_attempted                                      2
significant_strikes_rate                                         0.0
total_strikes_landed                                               0
total_strikes_attempted                                            2
takedown_successful                                                0
takedown_attempted                                                 0
takedown_rate                                                    0.0
submission_attempt                                                 0
reversals                                                          0
head_landed                                                        0
head_attempted                    

In [4]:
def get_fights(fighter_name):
    all_fights = ufc_stats[ufc_stats['fighter'] == fighter_name]
    return all_fights

def clean_columns(data):
    # remove columns named "fighter", "Unnamed: 0", "event", "fight_date", "location", "attendance"
    cleaned_data = data.drop(columns=['fighter', 'Unnamed: 0', 'event', 'location', 'attendance', 'time', 'scheduled_rounds', 'weight_class'])
    # Convert the "fight_date" column to a datetime data type
    cleaned_data['fight_date'] = pd.to_datetime(cleaned_data['fight_date'])
    return cleaned_data

def get_last_fights(data, nb_fights):
    df = data.sort_values(by=['fight_date'], ascending=False)
    # Group the DataFrame by the unique fight ID
    grouped = df.groupby('id', sort=False)
    # Get the first nb_fights groups (corresponding to the first nb_fights fights)
    first_nb_fights = [group for _, group in grouped][:nb_fights]
    # Concatenate the DataFrames for the first nb_fights fights
    result_df = pd.concat(first_nb_fights)
    # Reset the index of the result DataFrame if needed
    result_df.reset_index(drop=True, inplace=True)
    return result_df

In [5]:
fights = get_fights('Conor McGregor')
fights = clean_columns(fights)
fights = get_last_fights(fights, nb_fights=5)
fights

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,clinch_landed,clinch_attempted,ground_landed,ground_attempted,round,result,last_round,winner,fight_date,id
0,0,27,38,0.71,43,54,0,0,0.0,1,...,1,1,9,11,round 1,TKO - Doctor's Stoppage,1,L,2021-07-10,6140
1,0,14,32,0.43,14,32,0,1,0.0,0,...,0,0,0,0,round 2,KO/TKO,2,L,2021-01-23,5906
2,0,15,34,0.44,38,60,0,0,0.0,0,...,2,3,0,0,round 1,KO/TKO,2,L,2021-01-23,5906
3,1,19,26,0.73,20,27,0,0,0.0,0,...,3,3,13,16,round 1,KO/TKO,1,W,2020-01-18,5426
4,0,6,7,0.85,39,40,0,0,0.0,0,...,0,0,5,5,round 1,Submission,4,L,2018-10-06,4785
5,0,5,13,0.38,7,17,0,0,0.0,0,...,1,1,1,1,round 2,Submission,4,L,2018-10-06,4785
6,0,34,52,0.65,44,62,0,0,0.0,0,...,9,9,0,0,round 3,Submission,4,L,2018-10-06,4785
7,0,6,9,0.66,6,9,0,0,0.0,0,...,0,0,0,0,round 4,Submission,4,L,2018-10-06,4785
8,2,18,57,0.31,20,60,0,0,0.0,0,...,1,2,4,10,round 1,KO/TKO,2,W,2016-11-12,3871
9,1,14,27,0.51,20,33,0,0,0.0,0,...,2,3,1,2,round 2,KO/TKO,2,W,2016-11-12,3871


In [6]:
fights.dtypes

knockdowns                                int64
significant_strikes_landed                int64
significant_strikes_attempted             int64
significant_strikes_rate                float64
total_strikes_landed                      int64
total_strikes_attempted                   int64
takedown_successful                       int64
takedown_attempted                        int64
takedown_rate                           float64
submission_attempt                        int64
reversals                                 int64
head_landed                               int64
head_attempted                            int64
body_landed                               int64
body_attempted                            int64
leg_landed                                int64
leg_attempted                             int64
distance_landed                           int64
distance_attempted                        int64
clinch_landed                             int64
clinch_attempted                        

In [7]:
# Remove columns
fights = fights.drop(columns=['round', 'last_round', 'id', 'result', 'winner', 'fight_date'])

In [8]:
fights.dtypes

knockdowns                         int64
significant_strikes_landed         int64
significant_strikes_attempted      int64
significant_strikes_rate         float64
total_strikes_landed               int64
total_strikes_attempted            int64
takedown_successful                int64
takedown_attempted                 int64
takedown_rate                    float64
submission_attempt                 int64
reversals                          int64
head_landed                        int64
head_attempted                     int64
body_landed                        int64
body_attempted                     int64
leg_landed                         int64
leg_attempted                      int64
distance_landed                    int64
distance_attempted                 int64
clinch_landed                      int64
clinch_attempted                   int64
ground_landed                      int64
ground_attempted                   int64
dtype: object

In [9]:
fights

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,0,27,38,0.71,43,54,0,0,0.0,1,...,6,9,8,8,17,26,1,1,9,11
1,0,14,32,0.43,14,32,0,1,0.0,0,...,1,1,2,2,14,32,0,0,0,0
2,0,15,34,0.44,38,60,0,0,0.0,0,...,3,3,0,0,13,31,2,3,0,0
3,1,19,26,0.73,20,27,0,0,0.0,0,...,0,1,0,0,3,7,3,3,13,16
4,0,6,7,0.85,39,40,0,0,0.0,0,...,0,1,0,0,1,2,0,0,5,5
5,0,5,13,0.38,7,17,0,0,0.0,0,...,2,3,0,0,3,11,1,1,1,1
6,0,34,52,0.65,44,62,0,0,0.0,0,...,12,13,0,0,25,43,9,9,0,0
7,0,6,9,0.66,6,9,0,0,0.0,0,...,2,2,1,2,6,9,0,0,0,0
8,2,18,57,0.31,20,60,0,0,0.0,0,...,3,6,0,0,13,45,1,2,4,10
9,1,14,27,0.51,20,33,0,0,0.0,0,...,4,5,1,2,11,22,2,3,1,2


In [10]:
# merge rows into one row by meaning each value
fights = fights.mean().to_frame().transpose()
fights

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,0.4,15.8,29.5,0.567,25.1,39.4,0.0,0.1,0.0,0.1,...,3.3,4.4,1.2,1.4,10.6,22.8,1.9,2.2,3.3,4.5


In [11]:
def get_fighter_data(fighter_name):
    all_fights = get_fights(fighter_name)
    all_fights = clean_columns(all_fights)
    last_fights = get_last_fights(all_fights, nb_fights=5)
    last_fights = last_fights.drop(columns=['round', 'last_round', 'id', 'result', 'winner', 'fight_date'])
    last_fights = last_fights.mean().to_frame().transpose()
    return last_fights

In [12]:
fighter_data_1 = get_fighter_data('Conor McGregor')
fighter_data_1

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,0.4,15.8,29.5,0.567,25.1,39.4,0.0,0.1,0.0,0.1,...,3.3,4.4,1.2,1.4,10.6,22.8,1.9,2.2,3.3,4.5


In [13]:
def get_all_oponents(fighter_name):
    all_fights = get_fights(fighter_name)
    fights_ids = all_fights['id'].unique()

    filtered_fights = ufc_stats[(ufc_stats['id'].isin(fights_ids)) & (ufc_stats['fighter'] != fighter_name)]['fighter'].unique()
    return filtered_fights


oponents = get_all_oponents('Conor McGregor')
oponents

array(['Dustin Poirier', 'Donald Cerrone', 'Khabib Nurmagomedov',
       'Eddie Alvarez', 'Nate Diaz', 'Jose Aldo', 'Chad Mendes',
       'Dennis Siver', 'Diego Brandao', 'Max Holloway', 'Marcus Brimage'],
      dtype=object)

In [15]:
fighter_data_2 = get_fighter_data(oponents[0])
fighter_data_2

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,0.181818,18.818182,34.454545,0.495455,22.545455,38.727273,0.181818,0.272727,0.136364,0.181818,...,0.727273,1.0,2.090909,2.545455,14.545455,27.636364,0.909091,1.272727,3.363636,5.545455


In [23]:
def get_winner(fighter_name_1, fighter_name_2):
    all_fights = get_fights(fighter_name_1)
    fights_ids = all_fights['id'].unique()

    filtered_fights = ufc_stats[(ufc_stats['id'].isin(fights_ids)) & (ufc_stats['fighter'] != fighter_name_1) & (ufc_stats['fighter'] == fighter_name_2)]
    
    res = filtered_fights['winner'].value_counts().index[0]
    return 0 if res == 'L' else 1

get_winner('Conor McGregor', 'Dustin Poirier')

1

In [31]:
def get_all_fighters_name():
    return ufc_stats['fighter'].unique()

## Prepare input for xgboost

In [68]:
columns = fighter_data_1.columns.to_list()
for index, col in enumerate(fighter_data_1.columns):
    new_col = col + '_1'
    columns.append(new_col)

columns.append('winner')

In [73]:
from tqdm import tqdm

results = []
data = []

fighters_name = get_all_fighters_name()
for fighter in tqdm(fighters_name):
    all_oponents = get_all_oponents(fighter)
    fighter_data_1 = get_fighter_data(fighter)
    for oponent in all_oponents:
        fighter_data_2 = get_fighter_data(oponent)
        winner = get_winner(fighter, oponent)
        data.append(fighter_data_1.values.tolist()[0] + fighter_data_2.values.tolist()[0] + [winner])

df = pd.DataFrame(data=data, columns=columns)



  0%|          | 0/2448 [00:00<?, ?it/s]

100%|██████████| 2448/2448 [03:32<00:00, 11.54it/s]


In [77]:
df.to_csv('fighter_stats.csv', index=False)