In [1]:
import pandas as pd
import numpy as np
import os

# Config
# Both paths are relative, so they resolve against the current working directory.
# INPUT_PATH: raw fight CSV that is loaded with pd.read_csv.
INPUT_PATH = '../data/raw/raw_fights_detailed.csv'
# OUTPUT_PATH: destination CSV for the cleaned, stacked per-fighter rows.
OUTPUT_PATH = '../data/processed/ufc_fight_stats_cleaned.csv'

def convert_time_to_seconds(time_str):
    """Convert an 'MM:SS' string to total seconds.

    Args:
        time_str: Value to parse. It is coerced with ``str()`` before
            parsing, so non-string inputs are accepted.

    Returns:
        ``minutes * 60 + seconds`` as an int. Returns 0 when the value is
        missing (``pd.isna``), equals the placeholder ``'---'``, or does not
        consist of exactly two integer fields separated by ``':'``.
    """
    if pd.isna(time_str) or str(time_str) == '---':
        return 0
    try:
        # ValueError covers both a non-integer field and a wrong number of
        # ':'-separated fields (tuple-unpacking failure).
        minutes, seconds = map(int, str(time_str).split(':'))
    except ValueError:
        # Malformed values are treated as "no time" rather than aborting the
        # whole run; unlike a bare ``except`` this does not swallow
        # KeyboardInterrupt / SystemExit.
        return 0
    return minutes * 60 + seconds

def split_landed_attempted(stat_str):
    """Split an 'X of Y' stat string into ``(landed, attempted)``.

    Args:
        stat_str: Value to parse, e.g. ``'15 of 30'``. It is coerced with
            ``str()`` before parsing.

    Returns:
        A tuple ``(landed, attempted)`` of ints. Returns ``(0, 0)`` when the
        value is missing (``pd.isna``), does not contain ``'of'``, or cannot
        be parsed as exactly two integers separated by ``' of '``.
    """
    if pd.isna(stat_str) or 'of' not in str(stat_str):
        return 0, 0
    try:
        # ValueError covers a wrong number of pieces (unpacking failure) as
        # well as non-integer pieces.
        landed, attempted = str(stat_str).split(' of ')
        return int(landed), int(attempted)
    except ValueError:
        # Malformed values count as "no data"; narrowing from a bare
        # ``except`` keeps KeyboardInterrupt / SystemExit propagating.
        return 0, 0

# Load data
# Reads the raw fight CSV, reshapes it so each input row yields two output
# rows (one per fighter), and writes the result to OUTPUT_PATH. When the input
# file is missing, only an error message is printed and nothing is written.
print("Loading raw data.")
if not os.path.exists(INPUT_PATH):
    print(f"Error: Input file not found at {INPUT_PATH}")
else:
    df = pd.read_csv(INPUT_PATH)

    # Process fight time and total duration
    # Note: Assuming 5 minute rounds for simplicity
    # Real duration = (Round-1) * 300 + Last_Round_Time
    # NOTE(review): assumes 'Round' is numeric after read_csv -- confirm.
    df['last_round_seconds'] = df['Fight_Time'].apply(convert_time_to_seconds)
    df['total_fight_seconds'] = ((df['Round'] - 1) * 300) + df['last_round_seconds']

    # Define the columns we want to extract for each fighter
    # Create two temp dataframes (one for Red/F1, one for Blue/F2) and stack them
    # Maps raw column stem -> cleaned column stem. 'Sub. Att' and 'Ctrl' are
    # listed here but are NOT split into Landed/Att pairs; they are handled
    # separately below (Sub_Attempts / Control_Seconds).
    stats_map = {
        'Sig. Str.': 'Sig_Str',
        'Total Str.': 'Total_Str',
        'Td': 'Takedowns',
        'Sub. Att': 'Sub_Attempts',
        'Ctrl': 'Control_Time',
        'Head': 'Head_Strikes',
        'Body': 'Body_Strikes',
        'Leg': 'Leg_Strikes',
        'Distance': 'Distance_Strikes',
        'Clinch': 'Clinch_Strikes',
        'Ground': 'Ground_Strikes'
    }

    processed_rows = []

    print("Processing fights.")
    # Iterate through the mapping to build the stack
    for i in [1, 2]: # Fighter 1 and Fighter 2
        suffix = f"_{i}"

        # Select columns for this fighter
        temp_df = df[['Fight_Id', 'Event_Id_x', 'total_fight_seconds', 'Method', 'Weight_Class']].copy()

        # Add Fighter Name
        temp_df['Fighter'] = df[f'Fighter{suffix}']
        temp_df['Opponent'] = df[f'Fighter_{3-i}'] # If i=1, opp=2. Else if i=2, opp=1

        # Determine Win/Loss
        # Won is True only when the fight is marked 'win' AND this fighter's
        # result is 'W'; any other outcome value yields False for both sides.
        temp_df['Won'] = np.where(df['Win/No Contest/Draw'] == 'win',
                                  (df[f'Result{suffix}'] == 'W'),
                                  False)

        # Process Stats
        # Time Columns (Ctrl)
        ctrl_col = f'Ctrl{suffix}'
        if ctrl_col in df.columns:
            temp_df['Control_Seconds'] = df[ctrl_col].apply(convert_time_to_seconds)
        else:
            temp_df['Control_Seconds'] = 0

        # Split "X of Y" columns
        for original_col, clean_name in stats_map.items():
            # These might not be split format depending on version. Check this
            # BEFORE parsing so the column is not parsed just to be discarded.
            if original_col in ['Sub. Att', 'Ctrl']:
                continue

            col_name = f"{original_col}{suffix}"
            if col_name not in df.columns:
                continue

            # Get the 'Landed' and 'Attempted' tuple
            landed_attempted = df[col_name].apply(split_landed_attempted)
            # Create two new columns
            temp_df[f'{clean_name}_Landed'] = [x[0] for x in landed_attempted]
            temp_df[f'{clean_name}_Att'] = [x[1] for x in landed_attempted]

        # Handle simple columns that didn't need splitting (like Sub Attempts which are usually just ints)
        sub_col = f'Sub. Att{suffix}'
        if sub_col in df.columns:
            temp_df['Sub_Attempts'] = df[sub_col]
        kd_col = f'Kd{suffix}'
        if kd_col in df.columns:
            temp_df['Knockdowns'] = df[kd_col]

        processed_rows.append(temp_df)

    # Concatenate both sides
    final_df = pd.concat(processed_rows, ignore_index=True)

    # Filter out empty/dummy rows
    final_df = final_df[final_df['Fighter'].notna()]

    # Save and ensure directory exists
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    final_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Success. Saved {len(final_df)} fighter-rows to {OUTPUT_PATH}")
    print(final_df.head())


Loading raw data...
Processing fights... (This may take a moment)
Success! Saved 16964 fighter-rows to ../data/processed/ufc_fight_stats_cleaned.csv
           Fight_Id        Event_Id_x  total_fight_seconds          Method  \
0  4a0db214d9721d6e  bd92cf5da5413d2a                 1500           U-DEC   
1  dfa692db6d39330c  bd92cf5da5413d2a                   26          KO/TKO   
2  fbbb9e72900b71f5  bd92cf5da5413d2a                  444  KO/TKO Punches   
3  1dc29f4c6fcdd356  bd92cf5da5413d2a                  900           U-DEC   
4  6d6ab10cbaa45e8c  bd92cf5da5413d2a                  900           M-DEC   

        Weight_Class            Fighter        Opponent    Won  \
0       Bantamweight  Merab Dvalishvili        Petr Yan  False   
1          Flyweight  Alexandre Pantoja      Joshua Van  False   
2          Flyweight     Brandon Moreno   Tatsuro Taira  False   
3       Bantamweight       Henry Cejudo  Payton Talbott  False   
4  Light Heavyweight     Jan Blachowicz   Bogdan Gus