In [1]:
import pandas as pd
import numpy as np
import os

# Config
# Fighter style-feature table; loaded below as the base DataFrame (must contain 'Fighter' and 'Total_Fights').
INPUT_PATH = '../data/processed/ufc_fighter_style_features.csv'
# Cleaned fight stats; read here only for the 'Fighter' -> 'Weight_Class' mapping.
CLEANED_FIGHTS_PATH = '../data/processed/ufc_fight_stats_cleaned.csv' # Needed for weight class context
# Destination CSV for the filtered (and optionally normalized) table; written without the index.
OUTPUT_PATH = '../data/processed/ufc_modeling_data_final.csv'

# DECISIONS (Based on EDA results)
# Fighters whose 'Total_Fights' is below this threshold are dropped before normalization.
MIN_FIGHTS = 5 # Removes fighters with < 5 fights to reduce noise
# When True, a '<metric>_Z' column is added for each selected metric, z-scored within 'Weight_Class'.
NORMALIZE_WEIGHT = True # Z-scores stats relative to division average

def z_score_by_group(df, value_col, group_col):
    """Z-score ``value_col`` relative to each row's ``group_col`` group.

    Formula: (value - group_mean) / group_std, where the mean and the sample
    standard deviation (pandas default, ddof=1) are computed per group.

    Args:
        df: DataFrame containing both columns.
        value_col: Name of the numeric column to normalize.
        group_col: Name of the column defining the groups (e.g. weight class).

    Returns:
        A Series aligned to ``df.index`` holding the within-group z-scores.
        Rows in groups with no measurable spread (constant values, or a
        single member) get 0.0. Rows whose value is missing stay NaN.
    """
    grouped = df.groupby(group_col)[value_col]
    mean = grouped.transform('mean')
    std = grouped.transform('std')
    # The sample std is 0 for a constant group and NaN for a single-member
    # group; both would poison the division, so fall back to a unit scale.
    # `std > 0` is False for NaN as well as 0, which covers both cases.
    std = std.where(std > 0, 1.0)
    return (df[value_col] - mean) / std

# Load data, attach each fighter's primary weight class, filter by experience,
# optionally z-score pace metrics per division, then write the final CSV.
def _primary_division(classes):
    # Most frequent weight class for one fighter; "Unknown" when there is no mode.
    modes = classes.mode()
    return modes.iloc[0] if len(modes) > 0 else "Unknown"


inputs_present = all(os.path.exists(p) for p in (INPUT_PATH, CLEANED_FIGHTS_PATH))

if not inputs_present:
    print(f"Error: Missing input files at {INPUT_PATH} or {CLEANED_FIGHTS_PATH}")
else:
    df = pd.read_csv(INPUT_PATH)
    df_raw = pd.read_csv(CLEANED_FIGHTS_PATH)

    # Re-attach weight class: one row per fighter holding their modal division
    fighter_weights = (
        df_raw.groupby('Fighter')['Weight_Class']
        .agg(_primary_division)
        .reset_index()
    )
    df = pd.merge(df, fighter_weights, on='Fighter')

    initial_count = df.shape[0]

    # Experience filter: keep fighters with at least MIN_FIGHTS bouts
    experienced = df['Total_Fights'].ge(MIN_FIGHTS)
    df_clean = df.loc[experienced].copy()
    dropped_count = initial_count - df_clean.shape[0]

    print(f"1. Experience Filter (Min {MIN_FIGHTS} fights):")
    print(f"   - Dropped {dropped_count} fighters")
    print(f"   - Remaining: {len(df_clean)} fighters")

    # Weight class normalization (z-scoring within division)
    if NORMALIZE_WEIGHT:
        print("\n2. Applying Weight Class Normalization...")

        # Volume/pace metrics that vary by weight
        metrics_to_norm = ['Sig_Str_PM', 'Takedown_Att_PM', 'Sub_Att_PM', 'Control_Ratio']

        # Every z-score is computed from the pre-normalization frame, then all
        # new '<metric>_Z' columns are appended in the listed order.
        df_clean = df_clean.assign(**{
            f"{metric}_Z": z_score_by_group(df_clean, metric, 'Weight_Class')
            for metric in metrics_to_norm
        })

        print(f"   - Created normalized columns: {[c+'_Z' for c in metrics_to_norm]}")

    # Persist the final modeling table
    df_clean.to_csv(OUTPUT_PATH, index=False)
    print(f"\nSUCCESS! Final dataset saved to: {OUTPUT_PATH}")
    print("You are ready for Week 6: Clustering.")

1. Experience Filter (Min 5 fights):
   - Dropped 1410 fighters
   - Remaining: 1191 fighters

2. Applying Weight Class Normalization...
   - Created normalized columns: ['Sig_Str_PM_Z', 'Takedown_Att_PM_Z', 'Sub_Att_PM_Z', 'Control_Ratio_Z']

SUCCESS! Final dataset saved to: ../data/processed/ufc_modeling_data_final.csv
You are ready for Week 6: Clustering.
