In [11]:
import pandas as pd
from IPython.display import display

In [2]:
ufc_stats = pd.read_csv('data/ufc_stats.csv')

In [6]:
ufc_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34436 entries, 0 to 34435
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     34436 non-null  int64  
 1   fighter                        34436 non-null  object 
 2   knockdowns                     34436 non-null  int64  
 3   significant_strikes_landed     34436 non-null  int64  
 4   significant_strikes_attempted  34436 non-null  int64  
 5   significant_strikes_rate       34436 non-null  float64
 6   total_strikes_landed           34436 non-null  int64  
 7   total_strikes_attempted        34436 non-null  int64  
 8   takedown_successful            34436 non-null  int64  
 9   takedown_attempted             34436 non-null  int64  
 10  takedown_rate                  34436 non-null  float64
 11  submission_attempt             34436 non-null  int64  
 12  reversals                      34436 non-null 

In [72]:
def get_fights(fighter_name):
    all_fights = ufc_stats[ufc_stats['fighter'] == fighter_name]
    return all_fights

def clean_columns(data):
    cleaned_data = data.drop(columns=['fighter', 'Unnamed: 0', 'event', 'location', 'attendance', 'time', 'scheduled_rounds', 'weight_class', 'round', 'last_round', 'id', 'result', 'winner', 'fight_date'])
    # remove columns named "fighter", "Unnamed: 0", "event", "fight_date", "location", "attendance"
    # Convert the "fight_date" column to a datetime data type
    #cleaned_data['fight_date'] = pd.to_datetime(cleaned_data['fight_date'])
    return cleaned_data

def get_last_fights(data, nb_fights):
    df = data.sort_values(by=['fight_date'], ascending=False)
    # Group the DataFrame by the unique fight ID
    grouped = df.groupby('id', sort=False)
    # Get the first nb_fights groups (corresponding to the first nb_fights fights)
    first_nb_fights = [group for _, group in grouped]
    # Concatenate the DataFrames for the first nb_fights fights
    result_df = pd.concat(first_nb_fights)
    # Reset the index of the result DataFrame if needed
    result_df.reset_index(drop=True, inplace=True)
    return result_df

def get_all_oponents(fighter_name):
    all_fights = get_fights(fighter_name)
    fights_ids = all_fights['id'].unique()
    filtered_fights = ufc_stats[(ufc_stats['id'].isin(fights_ids)) & (ufc_stats['fighter'] != fighter_name)]['fighter'].unique()
    return filtered_fights

def get_winner(fighter_name_1, fighter_name_2):
    all_fights = get_fights(fighter_name_1)
    fights_ids = all_fights['id'].unique()
    filtered_fights = ufc_stats[(ufc_stats['id'].isin(fights_ids)) & (ufc_stats['fighter'] != fighter_name_1) & (ufc_stats['fighter'] == fighter_name_2)]
    grouped = filtered_fights.groupby('id', sort=False)
    fights = [group.iloc[0] for _, group in grouped]
    fights = pd.DataFrame(fights)
    res = fights['winner'].value_counts()
    
    # return -1 if no fight found
    if len(res) == 0:
         return -1

    res = res.index[0]
    return 0 if res == 'L' else 1

def get_all_fighters_name():
    return ufc_stats['fighter'].unique()

def get_fighter_data(fighter_name):
    all_fights = get_fights(fighter_name)
    all_fights = clean_columns(all_fights)
    #last_fights = get_last_fights(all_fights)
    #all_fights = all_fights.drop(columns=['round', 'last_round', 'id', 'result', 'winner', 'fight_date'])
    all_fights = all_fights.mean().to_frame().transpose()
    return all_fights

In [None]:
print(get_all_fighters_name()[:500])

In [45]:
get_all_oponents('Conor McGregor')

array(['Dustin Poirier', 'Donald Cerrone', 'Khabib Nurmagomedov',
       'Eddie Alvarez', 'Nate Diaz', 'Jose Aldo', 'Chad Mendes',
       'Dennis Siver', 'Diego Brandao', 'Max Holloway', 'Marcus Brimage'],
      dtype=object)

In [73]:
fighter_data_1 = get_fighter_data('Conor McGregor')
fighter_data_2 = get_fighter_data('Dustin Poirier')
winner = get_winner('Conor McGregor', 'Dustin Poirier')
df = pd.concat([fighter_data_1, fighter_data_2])
display(df)
print("Winner ", winner)

Unnamed: 0,knockdowns,significant_strikes_landed,significant_strikes_attempted,significant_strikes_rate,total_strikes_landed,total_strikes_attempted,takedown_successful,takedown_attempted,takedown_rate,submission_attempt,...,body_landed,body_attempted,leg_landed,leg_attempted,distance_landed,distance_attempted,clinch_landed,clinch_attempted,ground_landed,ground_attempted
0,0.464286,21.392857,43.0,0.530714,28.464286,50.464286,0.178571,0.321429,0.107143,0.035714,...,3.642857,5.642857,2.785714,3.535714,16.464286,36.321429,2.178571,2.642857,2.75,4.035714
0,0.188406,23.913043,47.101449,0.506667,28.695652,52.42029,0.405797,1.101449,0.222609,0.289855,...,2.246377,3.130435,2.550725,3.101449,17.956522,37.956522,3.623188,5.492754,2.333333,3.652174


Winner  1


## Prepare input for xgboost

In [75]:
fighter_data_1 = get_fighter_data('Conor McGregor')
columns = fighter_data_1.columns.to_list()
for index, col in enumerate(fighter_data_1.columns):
    new_col = col + '_1'
    columns.append(new_col)

columns.append('winner')

print(columns)

['knockdowns', 'significant_strikes_landed', 'significant_strikes_attempted', 'significant_strikes_rate', 'total_strikes_landed', 'total_strikes_attempted', 'takedown_successful', 'takedown_attempted', 'takedown_rate', 'submission_attempt', 'reversals', 'head_landed', 'head_attempted', 'body_landed', 'body_attempted', 'leg_landed', 'leg_attempted', 'distance_landed', 'distance_attempted', 'clinch_landed', 'clinch_attempted', 'ground_landed', 'ground_attempted', 'knockdowns_1', 'significant_strikes_landed_1', 'significant_strikes_attempted_1', 'significant_strikes_rate_1', 'total_strikes_landed_1', 'total_strikes_attempted_1', 'takedown_successful_1', 'takedown_attempted_1', 'takedown_rate_1', 'submission_attempt_1', 'reversals_1', 'head_landed_1', 'head_attempted_1', 'body_landed_1', 'body_attempted_1', 'leg_landed_1', 'leg_attempted_1', 'distance_landed_1', 'distance_attempted_1', 'clinch_landed_1', 'clinch_attempted_1', 'ground_landed_1', 'ground_attempted_1', 'winner']


In [77]:
from tqdm import tqdm

results = []
data = []

fighters_name = get_all_fighters_name()
for fighter in tqdm(fighters_name):
    all_oponents = get_all_oponents(fighter)
    fighter_data_1 = get_fighter_data(fighter)
    for oponent in all_oponents:
        fighter_data_2 = get_fighter_data(oponent)
        winner = get_winner(fighter, oponent)
        data.append(fighter_data_1.values.tolist()[0] + fighter_data_2.values.tolist()[0] + [winner])

df = pd.DataFrame(data=data, columns=columns)


  0%|          | 0/2448 [00:00<?, ?it/s]

100%|██████████| 2448/2448 [02:11<00:00, 18.59it/s]


In [78]:
df.to_csv('fighter_stats.csv', index=False)