This script creates a preprocessed dataset containing **only information known before each fight**. 

In [19]:
import pandas as pd
import pandas as pd
import numpy as np
import os
import datetime
from typing import Optional



In [20]:
# Load data
# Read the fights CSV into `df`, the DataFrame the following cells operate on.
# The path is relative, so it resolves against the notebook's working directory.
csv_path = '../../fights_data.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file loaded.
print(df.head())

   Unnamed: 0   fighter_a_name     fighter_b_name fighter_a_record  \
0           0  Khamzat Chimaev  Dricus Du Plessis           15-0-0   
1           1       Aaron Pico      Lerone Murphy           13-5-0   
2           2       Geoff Neal      Carlos Prates           16-7-0   
3           3  Jared Cannonier       Michael Page           18-9-0   
4           4      Kai Asakura        Tim Elliott           21-6-0   

   fighter_a_height  fighter_a_weight  fighter_a_reach fighter_a_stance  \
0              74.0               185             75.0         Orthodox   
1              68.0               145             70.0         Orthodox   
2              71.0               170             75.0         Southpaw   
3              71.0               185             77.0           Switch   
4              68.0               125             69.0         Orthodox   

  fighter_a_dob fighter_b_record  ...  fighter_b_distance_landed_05  \
0    01/05/1994           23-3-0  ...                    

In [21]:
def calculate_fight_length_seconds(final_round:int, finish_time:str, round_length_seconds:int = 300) -> Optional[int]:
    """
    Convert final round and finish time to total fight length in seconds.

    Every round before the final one is counted as a full round of
    ``round_length_seconds``; the time elapsed in the final round is added on top.

    Args:
        final_round: Round number where fight ended (1, 2, 3, 4, 5). Integral
            floats and numeric strings (e.g. 3.0 or "3") are accepted as well.
        finish_time: Time in round when fight ended (format "M:SS" like "2:45").
            A value without a colon is interpreted as plain seconds.
        round_length_seconds: Length of one completed round in seconds
            (default 300, i.e. 5 minutes).

    Returns:
        Total fight length in seconds as an int, or None when either input is
        missing (None/NaN) or cannot be parsed.
    """
    try:
        # Really coerce the round to int. A column containing missing values is
        # read as float, so without this a NaN round would propagate as a NaN
        # result (instead of None) and a valid round would yield a float.
        # int(NaN) raises ValueError and int(inf) raises OverflowError; both are
        # mapped to None below.
        round_num = int(final_round)

        # Parse finish time (M:SS format)
        finish_text = str(finish_time)
        if ':' in finish_text:
            time_parts = finish_text.split(':')
            minutes = int(time_parts[0])
            seconds = int(time_parts[1])
            finish_seconds = (minutes * 60) + seconds
        else:
            # If no colon, assume it's just seconds
            finish_seconds = int(finish_time)

        # Rounds before the final one were fought to completion.
        completed_seconds = (round_num - 1) * round_length_seconds

        # Add time from final round
        return completed_seconds + finish_seconds

    except (ValueError, TypeError, OverflowError):
        return None

In [22]:
# Apply to your dataframe:
# Compute the fight duration row by row from the 'final_round' and 'finish_time'
# columns and store it in a new 'fight_length_seconds' column.
df['fight_length_seconds'] = df.apply(
    lambda row: calculate_fight_length_seconds(row['final_round'], row['finish_time']), 
    axis=1
)

# Parse the 'date' column as day/month/year so it can be compared against datetime values.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')


In [23]:
# Summary statistics for the fight-length column
print("Fight Length Statistics:")

# Hold the column and its mean once instead of re-deriving them per line.
fight_lengths = df['fight_length_seconds']
mean_fight_length = fight_lengths.mean()

print(f"Average fight length: {mean_fight_length:.0f} seconds ({mean_fight_length/60:.1f} minutes)")
print(f"Shortest fight: {fight_lengths.min()} seconds")
print(f"Longest fight: {fight_lengths.max()} seconds")
print(f"Median fight length: {fight_lengths.median():.0f} seconds")

Fight Length Statistics:
Average fight length: 645 seconds (10.7 minutes)
Shortest fight: 5 seconds
Longest fight: 1500 seconds
Median fight length: 865 seconds


In [24]:
# Keep only the bouts whose scheduled length is 5 rounds
five_round_fights = df.loc[df['number_of_rounds'] == 5]
five_round_count = len(five_round_fights)

print(f"Fights scheduled for 5 rounds: {five_round_count}")
# Only report an average when at least one such fight exists.
if five_round_count > 0:
    avg_five_round = five_round_fights['fight_length_seconds'].mean()
    print(f"Average length: {avg_five_round:.0f} seconds ({avg_five_round/60:.1f} minutes)")

Fights scheduled for 5 rounds: 711
Average length: 912 seconds (15.2 minutes)


In [34]:
def fights_before_current(fighter_name:str, fight_date:datetime.datetime, fights_df:pd.DataFrame) -> pd.DataFrame:
    """
    Return the fights involving ``fighter_name`` that are dated strictly before ``fight_date``.

    Args:
        fighter_name: Name matched exactly against 'fighter_a_name' and 'fighter_b_name'.
        fight_date: Cutoff; rows whose 'date' equals or follows it are excluded.
        fights_df: Fight records with 'date', 'fighter_a_name' and 'fighter_b_name' columns.

    Returns:
        An independent copy of the matching rows, keeping their original index labels.
    """
    # CRITICAL: strictly earlier only, so the current fight itself is never included.
    happened_earlier = fights_df['date'] < fight_date

    # The fighter may be listed on either side of the bout.
    listed_as_a = fights_df['fighter_a_name'] == fighter_name
    listed_as_b = fights_df['fighter_b_name'] == fighter_name

    return fights_df.loc[happened_earlier & (listed_as_a | listed_as_b)].copy()

In [36]:
# Spot check: list every recorded fight for one fighter dated before today.
today = datetime.date.today()
print(today)

# Midnight at the start of `today`. Built from the date printed above rather
# than a second date.today() call, so the printed date and the cutoff cannot
# disagree if the cell happens to run across midnight.
today_datetime = datetime.datetime.combine(today, datetime.time.min)
print(today_datetime)

previous_fights = fights_before_current("Johnny Walker",today_datetime,df)

print(previous_fights)

2025-08-23
2025-08-23 00:00:00
      Unnamed: 0    fighter_a_name       fighter_b_name fighter_a_record  \
598          598     Johnny Walker      Volkan Oezdemir           22-9-0   
844          844     Johnny Walker     Magomed Ankalaev           22-9-0   
938          938  Magomed Ankalaev        Johnny Walker           21-1-1   
1182        1182     Johnny Walker        Anthony Smith           22-9-0   
1353        1353     Johnny Walker           Paul Craig           22-9-0   
1512        1512     Johnny Walker         Ion Cutelaba           22-9-0   
1826        1826     Johnny Walker         Jamahal Hill           22-9-0   
2009        2009     Thiago Santos        Johnny Walker          22-11-0   
2537        2537        Ryan Spann        Johnny Walker          23-11-0   
2759        2759     Johnny Walker        Nikita Krylov           22-9-0   
2919        2919    Corey Anderson        Johnny Walker           14-5-0   
3299        3299     Johnny Walker       Misha Cirkunov  

In [38]:
# Show every column name available on the fighter's history frame.
print(previous_fights.columns.tolist())

['Unnamed: 0', 'fighter_a_name', 'fighter_b_name', 'fighter_a_record', 'fighter_a_height', 'fighter_a_weight', 'fighter_a_reach', 'fighter_a_stance', 'fighter_a_dob', 'fighter_b_record', 'fighter_b_height', 'fighter_b_weight', 'fighter_b_reach', 'fighter_b_stance', 'fighter_b_dob', 'date', 'location', 'weight_class', 'victory_method', 'final_round', 'finish_time', 'number_of_rounds', 'winner', 'fighter_a_kd', 'fighter_b_kd', 'fighter_a_sig_str_landed', 'fighter_a_sig_str_attempted', 'fighter_b_sig_str_landed', 'fighter_b_sig_str_attempted', 'fighter_a_sig_str_pct', 'fighter_b_sig_str_pct', 'fighter_a_total_str_landed', 'fighter_a_total_str_attempted', 'fighter_b_total_str_landed', 'fighter_b_total_str_attempted', 'fighter_a_td_landed', 'fighter_a_td_attempted', 'fighter_b_td_landed', 'fighter_b_td_attempted', 'fighter_a_td_pct', 'fighter_b_td_pct', 'fighter_a_sub_att', 'fighter_b_sub_att', 'fighter_a_rev', 'fighter_b_rev', 'fighter_a_ctrl_seconds', 'fighter_b_ctrl_seconds', 'fighter_a_