In [11]:
# ! pip install tqdm

In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Loading the dataset
df = pd.read_csv("historical_raw_data.csv")

# Convert Event Date to datetime for proper sorting
df['Event Date'] = pd.to_datetime(df['Event Date'], format='%d-%m-%Y')

# Function to merge names
def merge_names(row, first_name_col, last_name_col):
    return f"{row[first_name_col]} {row[last_name_col]}"

# Applying the merge_names function to create a full name column for fighters and winners
df['Fighter Full Name'] = df.apply(lambda row: merge_names(row, 'Fighter First Name', 'Fighter Last Name'), axis=1)
df['Winner Names'] = df.apply(lambda row: merge_names(row, 'Winner First Name', 'Winner Last Name'), axis=1)

# Dropping the original name columns
df = df.drop(['Fighter First Name', 'Fighter Last Name', 'Winner First Name', 'Winner Last Name'], axis=1)

# Defining columns that are common to both fighters
common_columns = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 
                  'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names']

# Defining fighter-specific columns
fighter_columns = [col for col in df.columns if col not in common_columns]
fighter_columns.append('Fighter Full Name')

# Splitting the dataset into fighter_x and fighter_y based on alternating rows
fighter_x = df.iloc[::2].reset_index(drop=True)
fighter_y = df.iloc[1::2].reset_index(drop=True)

# Renaming fighter-specific columns
fighter_x.columns = [f"fighter_x_{col}" if col in fighter_columns else col for col in fighter_x.columns]
fighter_y.columns = [f"fighter_y_{col}" if col in fighter_columns else col for col in fighter_y.columns]

# Merging the two dataframes on common columns
merged_df = pd.merge(fighter_x, fighter_y, on=common_columns, how='inner')

# Defining the desired column order
ordered_columns = (common_columns + 
                   ['fighter_x_Fighter Full Name'] + 
                   [f"fighter_x_{col}" for col in fighter_columns if col != 'Fighter Full Name'] +
                   ['fighter_y_Fighter Full Name'] + 
                   [f"fighter_y_{col}" for col in fighter_columns if col != 'Fighter Full Name'])

# Reordering columns
merged_df = merged_df[ordered_columns]

# Saving the merged dataframe to a new CSV file
merged_df.to_csv('merged_fighters_with_attempted_stats.csv', index=False)

print("Merged dataset with stats saved as 'merged_fighters_with_attempted_stats.csv'")

  df = pd.read_csv("historical_raw_data.csv")


Merged dataset with stats saved as 'merged_fighters_with_attempted_stats.csv'


In [None]:
df = pd.read_csv("                                                    merged_fighters_with_attempted_stats.csv")

  df = pd.read_csv("merged_fighters_with_attempted_stats.csv")


In [14]:
print(df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Knockdown Total', 'fighter_x_Significant Strike Total Attempted', 'fighter_x_Significant Strike Total Landed', 'fighter_x_Takedown Total Attempted', 'fighter_x_Takedown Total Landed', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Sign

In [15]:
df["Event Date"] = pd.to_datetime(df["Event Date"])

In [16]:
# Load merged dataset
merged_df = pd.read_csv("merged_fighters_with_attempted_stats.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_attempted', 
    'highest_past_4_attempted', 
    'lowest_past_4_attempted', 
    'range_past_4_attempted',
    'std_dev_past_4_attempted', 
    'median_strikes_past_4_attempted', 
    'num_40_plus_strike_games_past_4_attempted',
    'strike_increase_trend_attempted', 
    'game_with_most_strikes_attempted', 
    'game_with_least_strikes_attempted',
    'strike_drop_from_peak_to_lowest_attempted', 
    'outlier_strike_above_1_5xIQR_attempted'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Significant Strike Total Attempted']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Significant Strike Total Attempted': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Significant Strike Total Attempted'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_attempted'] = round(np.mean(recent_4), 2)
        result['highest_past_4_attempted'] = np.max(recent_4)
        result['lowest_past_4_attempted'] = np.min(recent_4)
        result['range_past_4_attempted'] = int(result['highest_past_4_attempted'] - result['lowest_past_4_attempted'])
        result['std_dev_past_4_attempted'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_attempted'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_attempted'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_attempted'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_attempted'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_attempted'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_attempted'] = int(result['highest_past_4_attempted'] - result['lowest_past_4_attempted'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_attempted'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Significant Strike Total Attempted')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Significant Strike Total Attempted')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("merged_fighters_with_attempted_stats.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 6521.93it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:51<00:00, 120.24it/s]


In [17]:
# List of original strike columns to drop
original_strike_cols = ['Significant Strike Total Attempted']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [18]:
# # List of newly generated stat columns
# new_stat_columns = []
# for prefix in ['fighter_x_', 'fighter_y_']:
#     for stat in stat_columns:
#         new_stat_columns.append(prefix + stat)

# # Print them
# print("New Feature Engineering Columns:")
# for col in new_stat_columns:
#     print(f"- {col}")

In [19]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Knockdown Total', 'fighter_x_Significant Strike Total Landed', 'fighter_x_Takedown Total Attempted', 'fighter_x_Takedown Total Landed', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Sig

In [20]:
# Load merged dataset *******************************************************8888
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_landded', 
    'highest_past_4_landded', 
    'lowest_past_4_landded', 
    'range_past_4_landded',
    'std_dev_past_4_landded', 
    'median_strikes_past_4_landded', 
    'num_40_plus_strike_games_past_4_landded',
    'strike_increase_trend_landded', 
    'game_with_most_strikes_landded', 
    'game_with_least_strikes_landded',
    'strike_drop_from_peak_to_lowest_landded', 
    'outlier_strike_above_1_5xIQR_landded'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Significant Strike Total Landed']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Significant Strike Total Landed': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Significant Strike Total Landed'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_landded'] = round(np.mean(recent_4), 2)
        result['highest_past_4_landded'] = np.max(recent_4)
        result['lowest_past_4_landded'] = np.min(recent_4)
        result['range_past_4_landded'] = int(result['highest_past_4_landded'] - result['lowest_past_4_landded'])
        result['std_dev_past_4_landded'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_landded'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_landded'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_landded'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_landded'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_landded'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_landded'] = int(result['highest_past_4_landded'] - result['lowest_past_4_landded'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_landded'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Significant Strike Total Landed')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Significant Strike Total Landed')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value


  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 7342.64it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:51<00:00, 122.34it/s]


In [21]:
# List of original strike columns to drop  ************************
original_strike_cols = ['Significant Strike Total Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [22]:
print(merged_df.columns.to_list()) #****************************

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Knockdown Total', 'fighter_x_Takedown Total Attempted', 'fighter_x_Takedown Total Landed', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Significant Strike Ground Landed', 'fighter_x_Ro

In [23]:
# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_takedown_attempted', 
    'highest_past_4_takedown_attempted', 
    'lowest_past_4_takedown_attempted', 
    'range_past_4_takedown_attempted',
    'std_dev_past_4_takedown_attempted', 
    'median_strikes_past_4_takedown_attempted', 
    'num_40_plus_strike_games_past_4_takedown_attempted',
    'strike_increase_trend_takedown_attempted', 
    'game_with_most_takedown_attempted', 
    'game_with_least_strikes_takedown_attempted',
    'strike_drop_from_peak_to_lowest_takedown_attempted', 
    'outlier_strike_above_1_5xIQR_takedown_attempted'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Takedown Total Attempted']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Takedown Total Attempted': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Takedown Total Attempted'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_takedown_attempted'] = round(np.mean(recent_4), 2)
        result['highest_past_4_takedown_attempted'] = np.max(recent_4)
        result['lowest_past_4_takedown_attempted'] = np.min(recent_4)
        result['range_past_4_takedown_attempted'] = int(result['highest_past_4_takedown_attempted'] - result['lowest_past_4_takedown_attempted'])
        result['std_dev_past_4_takedown_attempted'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_takedown_attempted'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_takedown_attempted'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_takedown_attempted'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_takedown_attempted'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_takedown_attempted'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_takedown_attempted'] = int(result['highest_past_4_takedown_attempted'] - result['lowest_past_4_takedown_attempted'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_takedown_attempted'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Takedown Total Attempted')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Takedown Total Attempted')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 7547.05it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:32<00:00, 191.80it/s]


In [24]:
# List of original strike columns to drop
original_strike_cols = ['Takedown Total Attempted']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [25]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Knockdown Total', 'fighter_x_Takedown Total Landed', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Significant Strike Ground Landed', 'fighter_x_Round 1 Knockdown Total', 'fighter_x_Rou

In [26]:
# Load merged dataset #time,round 1
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_takedown_landed', 
    'highest_past_4_takedown_landed', 
    'lowest_past_4_takedown_landed', 
    'range_past_4_takedown_landed',
    'std_dev_past_4_takedown_landed', 
    'median_strikes_past_4_takedown_landed', 
    'num_40_plus_strike_games_past_4_takedown_landed',
    'strike_increase_trend_takedown_landed', 
    'game_with_most_takedown_landed', 
    'game_with_least_strikes_takedown_landed',
    'strike_drop_from_peak_to_lowest_takedown_landed', 
    'outlier_strike_above_1_5xIQR_takedown_landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Takedown Total Landed']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Takedown Total Landed': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Takedown Total Landed'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_takedown_landed'] = round(np.mean(recent_4), 2)
        result['highest_past_4_takedown_landed'] = np.max(recent_4)
        result['lowest_past_4_takedown_landed'] = np.min(recent_4)
        result['range_past_4_takedown_landed'] = int(result['highest_past_4_takedown_landed'] - result['lowest_past_4_takedown_landed'])
        result['std_dev_past_4_takedown_landed'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_takedown_landed'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_takedown_landed'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_takedown_landed'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_takedown_landed'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_takedown_landed'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_takedown_landed'] = int(result['highest_past_4_takedown_landed'] - result['lowest_past_4_takedown_landed'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_takedown_landed'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Takedown Total Landed')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Takedown Total Landed')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 19287.19it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:17<00:00, 352.39it/s]


In [27]:
# List of original strike columns to drop
original_strike_cols = ['Takedown Total Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [28]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Knockdown Total', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Significant Strike Ground Landed', 'fighter_x_Round 1 Knockdown Total', 'fighter_x_Round 1 Significant Strike Total Attem

In [29]:
# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_Knockdown Total', 
    'highest_past_4_Knockdown Total', 
    'lowest_past_4_Knockdown Total', 
    'range_past_4_Knockdown Total',
    'std_dev_past_4_Knockdown Total', 
    'median_strikes_past_4_Knockdown Total', 
    'num_40_plus_strike_games_past_4_Knockdown Total',
    'strike_increase_trend_Knockdown Total', 
    'game_with_most_Knockdown Total', 
    'game_with_least_strikes_Knockdown Total',
    'strike_drop_from_peak_to_lowest_Knockdown Total', 
    'outlier_strike_above_1_5xIQR_Knockdown Total'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Knockdown Total']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Knockdown Total': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Knockdown Total'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_Knockdown Total'] = round(np.mean(recent_4), 2)
        result['highest_past_4_Knockdown Total'] = np.max(recent_4)
        result['lowest_past_4_Knockdown Total'] = np.min(recent_4)
        result['range_past_4_Knockdown Total'] = int(result['highest_past_4_Knockdown Total'] - result['lowest_past_4_Knockdown Total'])
        result['std_dev_past_4_Knockdown Total'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_Knockdown Total'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_Knockdown Total'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_Knockdown Total'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_Knockdown Total'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_Knockdown Total'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_Knockdown Total'] = int(result['highest_past_4_Knockdown Total'] - result['lowest_past_4_Knockdown Total'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_Knockdown Total'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Knockdown Total')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Knockdown Total')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 19255.50it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:17<00:00, 352.23it/s]


In [30]:
# List of original strike columns to drop
original_strike_cols = ['Knockdown Total']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [31]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Submission Attempted', 'fighter_x_Reversal', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Significant Strike Ground Landed', 'fighter_x_Round 1 Knockdown Total', 'fighter_x_Round 1 Significant Strike Total Attempted', 'fighter_x_Round 1 Sig

In [32]:
# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_Submission Attempted', 
    'highest_past_4_Submission Attempted', 
    'lowest_past_4_Submission Attempted', 
    'range_past_4_Submission Attempted',
    'std_dev_past_4_Submission Attempted', 
    'median_strikes_past_4_Submission Attempted', 
    'num_40_plus_strike_games_past_4_Submission Attempted',
    'strike_increase_trend_Submission Attempted', 
    'game_with_most_Submission Attempted', 
    'game_with_least_strikes_Submission Attempted',
    'strike_drop_from_peak_to_lowest_Submission Attempted', 
    'outlier_strike_above_1_5xIQR_Submission Attempted'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        strike_attempts = row[prefix + 'Submission Attempted']

        # Append to fighter history list
        if pd.notna(strike_attempts):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Submission Attempted': float(strike_attempts)
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, strike_col='Submission Attempted'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    strikes = past_fights[strike_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(strikes) > 0:
        recent_4 = strikes[-4:]
        result['avg_past_4_Submission Attempted'] = round(np.mean(recent_4), 2)
        result['highest_past_4_Submission Attempted'] = np.max(recent_4)
        result['lowest_past_4_Submission Attempted'] = np.min(recent_4)
        result['range_past_4_Submission Attempted'] = int(result['highest_past_4_Submission Attempted'] - result['lowest_past_4_Submission Attempted'])
        result['std_dev_past_4_Submission Attempted'] = round(np.std(recent_4), 2)
        result['median_strikes_past_4_Submission Attempted'] = np.median(recent_4)
        result['num_40_plus_strike_games_past_4_Submission Attempted'] = int(np.sum(recent_4 >= 40))
        result['strike_increase_trend_Submission Attempted'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_most_strikes_Submission Attempted'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_least_strikes_Submission Attempted'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['strike_drop_from_peak_to_lowest_Submission Attempted'] = int(result['highest_past_4_Submission Attempted'] - result['lowest_past_4_Submission Attempted'])

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_strike_above_1_5xIQR_Submission Attempted'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):

    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, strike_col='Submission Attempted')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, strike_col='Submission Attempted')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 17584.47it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:18<00:00, 341.66it/s]


In [33]:
# List of original strike columns to drop
original_strike_cols = ['Submission Attempted']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [34]:

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns template
stat_columns_template = [
    'avg_past_4_{}', 
    'highest_past_4_{}', 
    'lowest_past_4_{}', 
    'range_past_4_{}',
    'std_dev_past_4_{}', 
    'median_strikes_past_4_{}', 
    'num_40_plus_strike_games_past_4_{}',
    'strike_increase_trend_{}', 
    'game_with_most_strikes_{}', 
    'game_with_least_strikes_{}',
    'strike_drop_from_peak_to_lowest_{}', 
    'outlier_strike_above_1_5xIQR_{}'
]

# List of metrics to process (ignoring 'fighter_x_' and 'fighter_y_')
metrics = [
    'Reversal',
    'Significant Strike Head Attempted',
    'Significant Strike Head Landed',
    'Significant Strike Body Attempted',
    'Significant Strike Body Landed',
    'Significant Strike Leg Attempted',
    'Significant Strike Leg Landed',
    'Significant Strike Clinch Attempted',
    'Significant Strike Clinch Landed',
    'Significant Strike Ground Attempted',
    'Significant Strike Ground Landed'
]

# Create new stat columns for both fighters for each metric
for prefix in ['fighter_x_', 'fighter_y_']:
    for metric in metrics:
        for stat in stat_columns_template:
            stat_col = stat.format(metric.replace(' ', '_'))  # Replace spaces with underscores for column names
            merged_df[prefix + stat_col] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        
        # Append to fighter history for each metric
        for metric in metrics:
            metric_value = row[prefix + metric]
            if pd.notna(metric_value):
                fighter_history.append({
                    'Fighter Full Name': fighter_name,
                    'Event Date': fight_date,
                    'Metric': metric,
                    'Value': float(metric_value)
                })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Metric', 'Event Date'], inplace=True)

# Function to calculate cumulative stats for a given metric
def calculate_cumulative_stats(fighter_name, fight_date, df, metric):
    """
    Calculate stats using only past fights before current one for a specific metric.
    Returns dictionary with safe defaults.
    """
    # Filter past fights for the specific metric
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Metric'] == metric) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    values = past_fights['Value'].astype(float).values
    metric_key = metric.replace(' ', '_')  # Replace spaces with underscores for column names
    result = {col.format(metric_key): 0 for col in stat_columns_template}
    
    if len(values) > 0:
        recent_4 = values[-4:]
        result[f'avg_past_4_{metric_key}'] = round(np.mean(recent_4), 2) if len(recent_4) > 0 else 0
        result[f'highest_past_4_{metric_key}'] = np.max(recent_4) if len(recent_4) > 0 else 0
        result[f'lowest_past_4_{metric_key}'] = np.min(recent_4) if len(recent_4) > 0 else 0
        result[f'range_past_4_{metric_key}'] = int(result[f'highest_past_4_{metric_key}'] - result[f'lowest_past_4_{metric_key}'])
        result[f'std_dev_past_4_{metric_key}'] = round(np.std(recent_4), 2) if len(recent_4) > 0 else 0
        result[f'median_strikes_past_4_{metric_key}'] = np.median(recent_4) if len(recent_4) > 0 else 0
        result[f'num_40_plus_strike_games_past_4_{metric_key}'] = int(np.sum(recent_4 >= 40)) if len(recent_4) > 0 else 0
        result[f'strike_increase_trend_{metric_key}'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result[f'game_with_most_strikes_{metric_key}'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result[f'game_with_least_strikes_{metric_key}'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result[f'strike_drop_from_peak_to_lowest_{metric_key}'] = int(result[f'highest_past_4_{metric_key}'] - result[f'lowest_past_4_{metric_key}'])
        
        q1, q3 = np.percentile(recent_4, [25, 75]) if len(recent_4) > 0 else (0, 0)
        iqr = q3 - q1
        result[f'outlier_strike_above_1_5xIQR_{metric_key}'] = 1 if len(recent_4) > 0 and any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters for each metric
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X for each metric
    for metric in metrics:
        x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, metric)
        for stat, value in x_stats.items():
            merged_df.at[idx, f"fighter_x_{stat}"] = value

        # Compute stats for Fighter Y for each metric
        y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, metric)
        for stat, value in y_stats.items():
            merged_df.at[idx, f"fighter_y_{stat}"] = value

# Save the updated DataFrame if need  
#merged_df.to_csv("ufc_fights_with_new_stats.csv", index=False)

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[prefix + stat_col] = np.nan
  merged_df[

In [35]:
# List of original metrics to drop (ignoring 'fighter_x_' and 'fighter_y_')
original_metrics = [
    'Reversal',
    'Significant Strike Head Attempted',
    'Significant Strike Head Landed',
    'Significant Strike Body Attempted',
    'Significant Strike Body Landed',
    'Significant Strike Leg Attempted',
    'Significant Strike Leg Landed',
    'Significant Strike Clinch Attempted',
    'Significant Strike Clinch Landed', 
    'Significant Strike Ground Attempted',
    'Significant Strike Ground Landed'
]

# Drop original columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for metric in original_metrics:
        col_name = prefix + metric
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)
        else:
            print(f"Warning: Column '{col_name}' not found in DataFrame.")

# Drop those columns
if columns_to_drop:
    merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
    print(f"Dropped columns: {columns_to_drop}")
else:
    print("No columns to drop.")

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Dropped columns: ['fighter_x_Reversal', 'fighter_x_Significant Strike Head Attempted', 'fighter_x_Significant Strike Head Landed', 'fighter_x_Significant Strike Body Attempted', 'fighter_x_Significant Strike Body Landed', 'fighter_x_Significant Strike Leg Attempted', 'fighter_x_Significant Strike Leg Landed', 'fighter_x_Significant Strike Clinch Attempted', 'fighter_x_Significant Strike Clinch Landed', 'fighter_x_Significant Strike Ground Attempted', 'fighter_x_Significant Strike Ground Landed', 'fighter_y_Reversal', 'fighter_y_Significant Strike Head Attempted', 'fighter_y_Significant Strike Head Landed', 'fighter_y_Significant Strike Body Attempted', 'fighter_y_Significant Strike Body Landed', 'fighter_y_Significant Strike Leg Attempted', 'fighter_y_Significant Strike Leg Landed', 'fighter_y_Significant Strike Clinch Attempted', 'fighter_y_Significant Strike Clinch Landed', 'fighter_y_Significant Strike Ground Attempted', 'fighter_y_Significant Strike Ground Landed']
Final dataset sa

In [36]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Ground and Cage Control Time', 'fighter_x_Round 1 Knockdown Total', 'fighter_x_Round 1 Significant Strike Total Attempted', 'fighter_x_Round 1 Significant Strike Total Landed', 'fighter_x_Round 1 Takedown Total Attempted', 'fighter_x_Round 1 Takedown Total Landed', 'fighter_x_Round 1 Submission Attempted', 'fighter_x_Round 1 Reversal', 'fighter_x_Round 1 Ground and Cage Control Time', 'fighter_x_Round 1 Significant Strike Head Attempted', 'fighter_x_Round 1 Significant Strike Head Landed', 'fighter_x_Round 1 Significant Strike Body Attempted', 'fighter_x_Round 1 Significant Strike Body Landed', 'fighter_x_Round 1 Significant Strike Leg Attempted', 'fighte

In [37]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'Past Average Ground and Cage Control Time per Fight', 
    'Past Total Ground and Cage Control Time', 
    'past Ground and Cage Control Time per Minute Fought', 
    'past Percentage of Fights with High Control Time',
    'Recent Ground and Cage Control Time (Last K Fights)', 
    'Past Ground and Cage Control Time Differential', 
    'Past Win-Weighted Control Time',
    'Past Control Time Trend'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for stat in stat_columns:
        merged_df[prefix + stat] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        control_time = row[prefix + 'Ground and Cage Control Time']
        fight_time = row.get('Fight Time', np.nan)
        opponent_control_time = row['fighter_y_Ground and Cage Control Time'] if prefix == 'fighter_x_' else row['fighter_x_Ground and Cage Control Time']
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0

        # Convert time strings to seconds
        control_time_sec = time_to_seconds(control_time)
        fight_time_sec = time_to_seconds(fight_time)
        opponent_control_time_sec = time_to_seconds(opponent_control_time)

        # Append to fighter history list
        if pd.notna(control_time_sec):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Ground and Cage Control Time': control_time_sec,
                'Fight Time': fight_time_sec,
                'Opponent Control Time': opponent_control_time_sec,
                'Is Winner': is_winner
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, control_col='Ground and Cage Control Time'):
    """
    Calculate stats using only past fights before current one.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    control_times = past_fights[control_col].astype(float).values
    fight_times = past_fights['Fight Time'].astype(float).values
    opponent_control_times = past_fights['Opponent Control Time'].astype(float).values
    wins = past_fights['Is Winner'].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(control_times) > 0:
        # Past Average Ground and Cage Control Time per Fight
        result['Past Average Ground and Cage Control Time per Fight'] = round(np.mean(control_times), 2)
        
        # Past Total Ground and Cage Control Time
        result['Past Total Ground and Cage Control Time'] = round(np.sum(control_times), 2)
        
        # Past Ground and Cage Control Time per Minute Fought
        if len(fight_times) > 0 and np.sum(fight_times) > 0:
            result['past Ground and Cage Control Time per Minute Fought'] = round(np.sum(control_times) / (np.sum(fight_times) / 60), 2)
        
        # Past Percentage of Fights with High Control Time (threshold: 120 seconds)
        high_control_threshold = 120
        result['past Percentage of Fights with High Control Time'] = round(np.mean(control_times >= high_control_threshold) * 100, 2)
        
        # Recent Ground and Cage Control Time (Last 4 Fights)
        recent_4 = control_times[-4:]
        if len(recent_4) > 0:
            result['Recent Ground and Cage Control Time (Last K Fights)'] = round(np.mean(recent_4), 2)
        
        # Past Ground and Cage Control Time Differential
        if len(opponent_control_times) > 0:
            control_differentials = control_times - opponent_control_times
            result['Past Ground and Cage Control Time Differential'] = round(np.mean(control_differentials), 2)
        
        # Past Win-Weighted Control Time
        if np.sum(wins) > 0:
            win_control_times = control_times[wins == 1]
            result['Past Win-Weighted Control Time'] = round(np.mean(win_control_times), 2) if len(win_control_times) > 0 else 0
        
        # Past Control Time Trend (difference between last 2 and prior 2 fights)
        if len(recent_4) >= 2:
            last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
            prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
            last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
            prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
            result['Past Control Time Trend'] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, control_col='Ground and Cage Control Time')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, control_col='Ground and Cage Control Time')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:00<00:00, 10232.83it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:11<00:00, 549.80it/s]


In [38]:
# List of original strike columns to drop
original_strike_cols = ['Ground and Cage Control Time']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [39]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Round 1 Knockdown Total', 'fighter_x_Round 1 Significant Strike Total Attempted', 'fighter_x_Round 1 Significant Strike Total Landed', 'fighter_x_Round 1 Takedown Total Attempted', 'fighter_x_Round 1 Takedown Total Landed', 'fighter_x_Round 1 Submission Attempted', 'fighter_x_Round 1 Reversal', 'fighter_x_Round 1 Ground and Cage Control Time', 'fighter_x_Round 1 Significant Strike Head Attempted', 'fighter_x_Round 1 Significant Strike Head Landed', 'fighter_x_Round 1 Significant Strike Body Attempted', 'fighter_x_Round 1 Significant Strike Body Landed', 'fighter_x_Round 1 Significant Strike Leg Attempted', 'fighter_x_Round 1 Significant Strike Leg Landed'

In [40]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_types = [
    'Past Average Round 1 per Fight', 
    'Past Total Round 1', 
    'Past Round 1 per Minute Fought', 
    'Past Percentage of Fights with High Round 1 Value',
    'Recent Round 1 (Last 4 Fights)', 
    'Past Round 1 Differential', 
    'Past Round 1 Win-Weighted',
    'Past Round 1 Trend'
]

# Define input columns for feature engineering (Round 1 only)
input_columns = [
    'Round 1 Knockdown Total',
    'Round 1 Significant Strike Total Attempted',
    'Round 1 Significant Strike Total Landed',
    'Round 1 Takedown Total Attempted',
    'Round 1 Takedown Total Landed',
    'Round 1 Submission Attempted',
    'Round 1 Reversal',
    'Round 1 Ground and Cage Control Time',
    'Round 1 Significant Strike Head Attempted',
    'Round 1 Significant Strike Head Landed',
    'Round 1 Significant Strike Body Attempted',
    'Round 1 Significant Strike Body Landed',
    'Round 1 Significant Strike Leg Attempted',
    'Round 1 Significant Strike Leg Landed',
    'Round 1 Significant Strike Clinch Attempted',
    'Round 1 Significant Strike Clinch Landed',
    'Round 1 Significant Strike Ground Attempted',
    'Round 1 Significant Strike Ground Landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_types:
            merged_df[f"{prefix}{col} {stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        fight_time = row.get('Fight Time', np.nan)
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0
        opponent_prefix = 'fighter_y_' if prefix == 'fighter_x_' else 'fighter_x_'

        # Collect Round 1 stats
        fight_stats = {}
        opponent_stats = {}
        for col in input_columns:
            col_name = f"{prefix}{col}"
            opponent_col_name = f"{opponent_prefix}{col}"
            if col == 'Round 1 Ground and Cage Control Time':
                fight_stats[col] = time_to_seconds(row.get(col_name, np.nan))
                opponent_stats[col] = time_to_seconds(row.get(opponent_col_name, np.nan))
            else:
                fight_stats[col] = float(row.get(col_name, 0)) if pd.notna(row.get(col_name)) else 0
                opponent_stats[col] = float(row.get(opponent_col_name, 0)) if pd.notna(row.get(opponent_col_name)) else 0

        # Convert fight time to seconds
        fight_time_sec = time_to_seconds(fight_time)

        # Append to fighter history list
        if any(pd.notna(fight_stats[col]) for col in input_columns):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Fight Time': fight_time_sec,
                'Is Winner': is_winner,
                **fight_stats,
                **{f"Opponent {col}": opponent_stats[col] for col in input_columns}
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, input_cols):
    """
    Calculate stats for each Round 1 column using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    result = {f"{col} {stat}": 0 for col in input_cols for stat in stat_types}
    
    if len(past_fights) > 0:
        for col in input_cols:
            values = past_fights[col].astype(float).values
            fight_times = past_fights['Fight Time'].astype(float).values
            opponent_values = past_fights[f"Opponent {col}"].astype(float).values
            wins = past_fights['Is Winner'].astype(float).values

            # Past Average Round 1 per Fight
            result[f"{col} Past Average Round 1 per Fight"] = round(np.mean(values), 2)
            
            # Past Total Round 1
            result[f"{col} Past Total Round 1"] = round(np.sum(values), 2)
            
            # Past Round 1 per Minute Fought
            if len(fight_times) > 0 and np.sum(fight_times) > 0:
                result[f"{col} Past Round 1 per Minute Fought"] = round(np.sum(values) / (np.sum(fight_times) / 60), 2)
            
            # Past Percentage of Fights with High Round 1 Value
            high_value_threshold = 120 if col == 'Round 1 Ground and Cage Control Time' else 20 if 'Strike' in col else 1
            result[f"{col} Past Percentage of Fights with High Round 1 Value"] = round(np.mean(values >= high_value_threshold) * 100, 2)
            
            # Recent Round 1 (Last 4 Fights)
            recent_4 = values[-4:]
            if len(recent_4) > 0:
                result[f"{col} Recent Round 1 (Last 4 Fights)"] = round(np.mean(recent_4), 2)
            
            # Past Round 1 Differential
            if len(opponent_values) > 0:
                differentials = values - opponent_values
                result[f"{col} Past Round 1 Differential"] = round(np.mean(differentials), 2)
            
            # Past Round 1 Win-Weighted
            if np.sum(wins) > 0:
                win_values = values[wins == 1]
                result[f"{col} Past Round 1 Win-Weighted"] = round(np.mean(win_values), 2) if len(win_values) > 0 else 0
            
            # Past Round 1 Trend
            if len(recent_4) >= 2:
                last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
                prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
                last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
                prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
                result[f"{col} Past Round 1 Trend"] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, input_columns)
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, input_columns)
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[

In [41]:
# List of original strike columns to drop
original_strike_cols = ['Round 1 Knockdown Total', 
    'Round 1 Significant Strike Total Attempted',
    'Round 1 Significant Strike Total Landed',
    'Round 1 Takedown Total Attempted',
    'Round 1 Takedown Total Landed',
    'Round 1 Submission Attempted',
    'Round 1 Reversal',
    'Round 1 Ground and Cage Control Time',
    'Round 1 Significant Strike Head Attempted',
    'Round 1 Significant Strike Head Landed',
    'Round 1 Significant Strike Body Attempted',
    'Round 1 Significant Strike Body Landed',
    'Round 1 Significant Strike Leg Attempted',
    'Round 1 Significant Strike Leg Landed',
    'Round 1 Significant Strike Clinch Attempted',
    'Round 1 Significant Strike Clinch Landed',
    'Round 1 Significant Strike Ground Attempted',
    'Round 1 Significant Strike Ground Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [42]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Round 2 Knockdown Total', 'fighter_x_Round 2 Significant Strike Total Attempted', 'fighter_x_Round 2 Significant Strike Total Landed', 'fighter_x_Round 2 Takedown Total Attempted', 'fighter_x_Round 2 Takedown Total Landed', 'fighter_x_Round 2 Submission Attempted', 'fighter_x_Round 2 Reversal', 'fighter_x_Round 2 Ground and Cage Control Time', 'fighter_x_Round 2 Significant Strike Head Attempted', 'fighter_x_Round 2 Significant Strike Head Landed', 'fighter_x_Round 2 Significant Strike Body Attempted', 'fighter_x_Round 2 Significant Strike Body Landed', 'fighter_x_Round 2 Significant Strike Leg Attempted', 'fighter_x_Round 2 Significant Strike Leg Landed'

In [43]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_types = [
    'Past Average Round 2 per Fight', 
    'Past Total Round 2', 
    'Past Round 2 per Minute Fought', 
    'Past Percentage of Fights with High Round 2 Value',
    'Recent Round 2 (Last 4 Fights)', 
    'Past Round 2 Differential', 
    'Past Round 2 Win-Weighted',
    'Past Round 2 Trend'
]

# Define input columns for feature engineering (Round 2 only)
input_columns = [
    'Round 2 Knockdown Total',
    'Round 2 Significant Strike Total Attempted',
    'Round 2 Significant Strike Total Landed',
    'Round 2 Takedown Total Attempted',
    'Round 2 Takedown Total Landed',
    'Round 2 Submission Attempted',
    'Round 2 Reversal',
    'Round 2 Ground and Cage Control Time',
    'Round 2 Significant Strike Head Attempted',
    'Round 2 Significant Strike Head Landed',
    'Round 2 Significant Strike Body Attempted',
    'Round 2 Significant Strike Body Landed',
    'Round 2 Significant Strike Leg Attempted',
    'Round 2 Significant Strike Leg Landed',
    'Round 2 Significant Strike Clinch Attempted',
    'Round 2 Significant Strike Clinch Landed',
    'Round 2 Significant Strike Ground Attempted',
    'Round 2 Significant Strike Ground Landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_types:
            merged_df[f"{prefix}{col} {stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        fight_time = row.get('Fight Time', np.nan)
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0
        opponent_prefix = 'fighter_y_' if prefix == 'fighter_x_' else 'fighter_x_'

        # Collect Round 2 stats
        fight_stats = {}
        opponent_stats = {}
        for col in input_columns:
            col_name = f"{prefix}{col}"
            opponent_col_name = f"{opponent_prefix}{col}"
            if col == 'Round 2 Ground and Cage Control Time':
                fight_stats[col] = time_to_seconds(row.get(col_name, np.nan))
                opponent_stats[col] = time_to_seconds(row.get(opponent_col_name, np.nan))
            else:
                fight_stats[col] = float(row.get(col_name, 0)) if pd.notna(row.get(col_name)) else 0
                opponent_stats[col] = float(row.get(opponent_col_name, 0)) if pd.notna(row.get(opponent_col_name)) else 0

        # Convert fight time to seconds
        fight_time_sec = time_to_seconds(fight_time)

        # Append to fighter history list
        if any(pd.notna(fight_stats[col]) for col in input_columns):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Fight Time': fight_time_sec,
                'Is Winner': is_winner,
                **fight_stats,
                **{f"Opponent {col}": opponent_stats[col] for col in input_columns}
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, input_cols):
    """
    Calculate stats for each Round 2 column using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    result = {f"{col} {stat}": 0 for col in input_cols for stat in stat_types}
    
    if len(past_fights) > 0:
        for col in input_cols:
            values = past_fights[col].astype(float).values
            fight_times = past_fights['Fight Time'].astype(float).values
            opponent_values = past_fights[f"Opponent {col}"].astype(float).values
            wins = past_fights['Is Winner'].astype(float).values

            # Past Average Round 2 per Fight
            result[f"{col} Past Average Round 2 per Fight"] = round(np.mean(values), 2)
            
            # Past Total Round 2
            result[f"{col} Past Total Round 2"] = round(np.sum(values), 2)
            
            # Past Round 2 per Minute Fought
            if len(fight_times) > 0 and np.sum(fight_times) > 0:
                result[f"{col} Past Round 2 per Minute Fought"] = round(np.sum(values) / (np.sum(fight_times) / 60), 2)
            
            # Past Percentage of Fights with High Round 2 Value
            high_value_threshold = 120 if col == 'Round 2 Ground and Cage Control Time' else 20 if 'Strike' in col else 1
            result[f"{col} Past Percentage of Fights with High Round 2 Value"] = round(np.mean(values >= high_value_threshold) * 100, 2)
            
            # Recent Round 2 (Last 4 Fights)
            recent_4 = values[-4:]
            if len(recent_4) > 0:
                result[f"{col} Recent Round 2 (Last 4 Fights)"] = round(np.mean(recent_4), 2)
            
            # Past Round 2 Differential
            if len(opponent_values) > 0:
                differentials = values - opponent_values
                result[f"{col} Past Round 2 Differential"] = round(np.mean(differentials), 2)
            
            # Past Round 2 Win-Weighted
            if np.sum(wins) > 0:
                win_values = values[wins == 1]
                result[f"{col} Past Round 2 Win-Weighted"] = round(np.mean(win_values), 2) if len(win_values) > 0 else 0
            
            # Past Round 2 Trend
            if len(recent_4) >= 2:
                last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
                prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
                last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
                prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
                result[f"{col} Past Round 2 Trend"] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, input_columns)
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, input_columns)
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value  

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[

In [44]:
# List of original strike columns to drop
original_strike_cols = ['Round 2 Knockdown Total', 
    'Round 2 Significant Strike Total Attempted',
    'Round 2 Significant Strike Total Landed',
    'Round 2 Takedown Total Attempted',
    'Round 2 Takedown Total Landed',
    'Round 2 Submission Attempted',
    'Round 2 Reversal',
    'Round 2 Ground and Cage Control Time',
    'Round 2 Significant Strike Head Attempted',
    'Round 2 Significant Strike Head Landed',
    'Round 2 Significant Strike Body Attempted',
    'Round 2 Significant Strike Body Landed',
    'Round 2 Significant Strike Leg Attempted',
    'Round 2 Significant Strike Leg Landed',
    'Round 2 Significant Strike Clinch Attempted',
    'Round 2 Significant Strike Clinch Landed',
    'Round 2 Significant Strike Ground Attempted',
    'Round 2 Significant Strike Ground Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv'")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv'


In [45]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Round 3 Knockdown Total', 'fighter_x_Round 3 Significant Strike Total Attempted', 'fighter_x_Round 3 Significant Strike Total Landed', 'fighter_x_Round 3 Takedown Total Attempted', 'fighter_x_Round 3 Takedown Total Landed', 'fighter_x_Round 3 Submission Attempted', 'fighter_x_Round 3 Reversal', 'fighter_x_Round 3 Ground and Cage Control Time', 'fighter_x_Round 3 Significant Strike Head Attempted', 'fighter_x_Round 3 Significant Strike Head Landed', 'fighter_x_Round 3 Significant Strike Body Attempted', 'fighter_x_Round 3 Significant Strike Body Landed', 'fighter_x_Round 3 Significant Strike Leg Attempted', 'fighter_x_Round 3 Significant Strike Leg Landed'

In [46]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_types = [
    'Past Average Round 3 per Fight', 
    'Past Total Round 3', 
    'Past Round 3 per Minute Fought', 
    'Past Percentage of Fights with High Round 3 Value',
    'Recent Round 3 (Last 4 Fights)', 
    'Past Round 3 Differential', 
    'Past Round 3 Win-Weighted',
    'Past Round 3 Trend'
]

# Define input columns for feature engineering (Round 3 only)
input_columns = [
    'Round 3 Knockdown Total',
    'Round 3 Significant Strike Total Attempted',
    'Round 3 Significant Strike Total Landed',
    'Round 3 Takedown Total Attempted',
    'Round 3 Takedown Total Landed',
    'Round 3 Submission Attempted',
    'Round 3 Reversal',
    'Round 3 Ground and Cage Control Time',
    'Round 3 Significant Strike Head Attempted',
    'Round 3 Significant Strike Head Landed',
    'Round 3 Significant Strike Body Attempted',
    'Round 3 Significant Strike Body Landed',
    'Round 3 Significant Strike Leg Attempted',
    'Round 3 Significant Strike Leg Landed',
    'Round 3 Significant Strike Clinch Attempted',
    'Round 3 Significant Strike Clinch Landed',
    'Round 3 Significant Strike Ground Attempted',
    'Round 3 Significant Strike Ground Landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_types:
            merged_df[f"{prefix}{col} {stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        fight_time = row.get('Fight Time', np.nan)
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0
        opponent_prefix = 'fighter_y_' if prefix == 'fighter_x_' else 'fighter_x_'

        # Collect Round 3 stats
        fight_stats = {}
        opponent_stats = {}
        for col in input_columns:
            col_name = f"{prefix}{col}"
            opponent_col_name = f"{opponent_prefix}{col}"
            if col == 'Round 3 Ground and Cage Control Time':
                fight_stats[col] = time_to_seconds(row.get(col_name, np.nan))
                opponent_stats[col] = time_to_seconds(row.get(opponent_col_name, np.nan))
            else:
                fight_stats[col] = float(row.get(col_name, 0)) if pd.notna(row.get(col_name)) else 0
                opponent_stats[col] = float(row.get(opponent_col_name, 0)) if pd.notna(row.get(opponent_col_name)) else 0

        # Convert fight time to seconds
        fight_time_sec = time_to_seconds(fight_time)

        # Append to fighter history list
        if any(pd.notna(fight_stats[col]) for col in input_columns):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Fight Time': fight_time_sec,
                'Is Winner': is_winner,
                **fight_stats,
                **{f"Opponent {col}": opponent_stats[col] for col in input_columns}
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, input_cols):
    """
    Calculate stats for each Round 3 column using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    result = {f"{col} {stat}": 0 for col in input_cols for stat in stat_types}
    
    if len(past_fights) > 0:
        for col in input_cols:
            values = past_fights[col].astype(float).values
            fight_times = past_fights['Fight Time'].astype(float).values
            opponent_values = past_fights[f"Opponent {col}"].astype(float).values
            wins = past_fights['Is Winner'].astype(float).values

            # Past Average Round 3 per Fight
            result[f"{col} Past Average Round 3 per Fight"] = round(np.mean(values), 2)
            
            # Past Total Round 3
            result[f"{col} Past Total Round 3"] = round(np.sum(values), 2)
            
            # Past Round 3 per Minute Fought
            if len(fight_times) > 0 and np.sum(fight_times) > 0:
                result[f"{col} Past Round 3 per Minute Fought"] = round(np.sum(values) / (np.sum(fight_times) / 60), 2)
            
            # Past Percentage of Fights with High Round 3 Value
            high_value_threshold = 120 if col == 'Round 3 Ground and Cage Control Time' else 20 if 'Strike' in col else 1
            result[f"{col} Past Percentage of Fights with High Round 3 Value"] = round(np.mean(values >= high_value_threshold) * 100, 2)
            
            # Recent Round 3 (Last 4 Fights)
            recent_4 = values[-4:]
            if len(recent_4) > 0:
                result[f"{col} Recent Round 3 (Last 4 Fights)"] = round(np.mean(recent_4), 2)
            
            # Past Round 3 Differential
            if len(opponent_values) > 0:
                differentials = values - opponent_values
                result[f"{col} Past Round 3 Differential"] = round(np.mean(differentials), 2)
            
            # Past Round 3 Win-Weighted
            if np.sum(wins) > 0:
                win_values = values[wins == 1]
                result[f"{col} Past Round 3 Win-Weighted"] = round(np.mean(win_values), 2) if len(win_values) > 0 else 0
            
            # Past Round 3 Trend
            if len(recent_4) >= 2:
                last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
                prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
                last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
                prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
                result[f"{col} Past Round 3 Trend"] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, input_columns)
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, input_columns)
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value  

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[

In [47]:
# List of original strike columns to drop
original_strike_cols = ['Round 3 Knockdown Total', 
    'Round 3 Significant Strike Total Attempted',
    'Round 3 Significant Strike Total Landed',
    'Round 3 Takedown Total Attempted',
    'Round 3 Takedown Total Landed',
    'Round 3 Submission Attempted',
    'Round 3 Reversal',
    'Round 3 Ground and Cage Control Time',
    'Round 3 Significant Strike Head Attempted',
    'Round 3 Significant Strike Head Landed',
    'Round 3 Significant Strike Body Attempted',
    'Round 3 Significant Strike Body Landed',
    'Round 3 Significant Strike Leg Attempted',
    'Round 3 Significant Strike Leg Landed',
    'Round 3 Significant Strike Clinch Attempted',
    'Round 3 Significant Strike Clinch Landed',
    'Round 3 Significant Strike Ground Attempted',
    'Round 3 Significant Strike Ground Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv


In [48]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Round 4 Knockdown Total', 'fighter_x_Round 4 Significant Strike Total Attempted', 'fighter_x_Round 4 Significant Strike Total Landed', 'fighter_x_Round 4 Takedown Total Attempted', 'fighter_x_Round 4 Takedown Total Landed', 'fighter_x_Round 4 Submission Attempted', 'fighter_x_Round 4 Reversal', 'fighter_x_Round 4 Ground and Cage Control Time', 'fighter_x_Round 4 Significant Strike Head Attempted', 'fighter_x_Round 4 Significant Strike Head Landed', 'fighter_x_Round 4 Significant Strike Body Attempted', 'fighter_x_Round 4 Significant Strike Body Landed', 'fighter_x_Round 4 Significant Strike Leg Attempted', 'fighter_x_Round 4 Significant Strike Leg Landed'

In [49]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_types = [
    'Past Average Round 4 per Fight', 
    'Past Total Round 4', 
    'Past Round 4 per Minute Fought', 
    'Past Percentage of Fights with High Round 4 Value',
    'Recent Round 4 (Last 4 Fights)', 
    'Past Round 4 Differential', 
    'Past Round 4 Win-Weighted',
    'Past Round 4 Trend'
]

# Define input columns for feature engineering (Round 3 only)
input_columns = [
    'Round 4 Knockdown Total',
    'Round 4 Significant Strike Total Attempted',
    'Round 4 Significant Strike Total Landed',
    'Round 4 Takedown Total Attempted',
    'Round 4 Takedown Total Landed',
    'Round 4 Submission Attempted',
    'Round 4 Reversal',
    'Round 4 Ground and Cage Control Time',
    'Round 4 Significant Strike Head Attempted',
    'Round 4 Significant Strike Head Landed',
    'Round 4 Significant Strike Body Attempted',
    'Round 4 Significant Strike Body Landed',
    'Round 4 Significant Strike Leg Attempted',
    'Round 4 Significant Strike Leg Landed',
    'Round 4 Significant Strike Clinch Attempted',
    'Round 4 Significant Strike Clinch Landed',
    'Round 4 Significant Strike Ground Attempted',
    'Round 4 Significant Strike Ground Landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_types:
            merged_df[f"{prefix}{col} {stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        fight_time = row.get('Fight Time', np.nan)
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0
        opponent_prefix = 'fighter_y_' if prefix == 'fighter_x_' else 'fighter_x_'

        # Collect Round 4 stats
        fight_stats = {}
        opponent_stats = {}
        for col in input_columns:
            col_name = f"{prefix}{col}"
            opponent_col_name = f"{opponent_prefix}{col}"
            if col == 'Round 4 Ground and Cage Control Time':
                fight_stats[col] = time_to_seconds(row.get(col_name, np.nan))
                opponent_stats[col] = time_to_seconds(row.get(opponent_col_name, np.nan))
            else:
                fight_stats[col] = float(row.get(col_name, 0)) if pd.notna(row.get(col_name)) else 0
                opponent_stats[col] = float(row.get(opponent_col_name, 0)) if pd.notna(row.get(opponent_col_name)) else 0

        # Convert fight time to seconds
        fight_time_sec = time_to_seconds(fight_time)

        # Append to fighter history list
        if any(pd.notna(fight_stats[col]) for col in input_columns):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Fight Time': fight_time_sec,
                'Is Winner': is_winner,
                **fight_stats,
                **{f"Opponent {col}": opponent_stats[col] for col in input_columns}
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, input_cols):
    """
    Calculate stats for each Round 4 column using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    result = {f"{col} {stat}": 0 for col in input_cols for stat in stat_types}
    
    if len(past_fights) > 0:
        for col in input_cols:
            values = past_fights[col].astype(float).values
            fight_times = past_fights['Fight Time'].astype(float).values
            opponent_values = past_fights[f"Opponent {col}"].astype(float).values
            wins = past_fights['Is Winner'].astype(float).values

            # Past Average Round 4 per Fight
            result[f"{col} Past Average Round 4 per Fight"] = round(np.mean(values), 2)
            
            # Past Total Round 4
            result[f"{col} Past Total Round 4"] = round(np.sum(values), 2)
            
            # Past Round 4 per Minute Fought
            if len(fight_times) > 0 and np.sum(fight_times) > 0:
                result[f"{col} Past Round 4 per Minute Fought"] = round(np.sum(values) / (np.sum(fight_times) / 60), 2)
            
            # Past Percentage of Fights with High Round 4 Value
            high_value_threshold = 120 if col == 'Round 4 Ground and Cage Control Time' else 20 if 'Strike' in col else 1
            result[f"{col} Past Percentage of Fights with High Round 4 Value"] = round(np.mean(values >= high_value_threshold) * 100, 2)
            
            # Recent Round 4 (Last 4 Fights)
            recent_4 = values[-4:]
            if len(recent_4) > 0:
                result[f"{col} Recent Round 4 (Last 4 Fights)"] = round(np.mean(recent_4), 2)
            
            # Past Round 4 Differential
            if len(opponent_values) > 0:
                differentials = values - opponent_values
                result[f"{col} Past Round 4 Differential"] = round(np.mean(differentials), 2)
            
            # Past Round 4 Win-Weighted
            if np.sum(wins) > 0:
                win_values = values[wins == 1]
                result[f"{col} Past Round 4 Win-Weighted"] = round(np.mean(win_values), 2) if len(win_values) > 0 else 0
            
            # Past Round 4 Trend
            if len(recent_4) >= 2:
                last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
                prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
                last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
                prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
                result[f"{col} Past Round 4 Trend"] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, input_columns)
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, input_columns)
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value  

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[

In [50]:
# List of original strike columns to drop
original_strike_cols = ['Round 4 Knockdown Total', 
    'Round 4 Significant Strike Total Attempted',
    'Round 4 Significant Strike Total Landed',
    'Round 4 Takedown Total Attempted',
    'Round 4 Takedown Total Landed',
    'Round 4 Submission Attempted',
    'Round 4 Reversal',
    'Round 4 Ground and Cage Control Time',
    'Round 4 Significant Strike Head Attempted',
    'Round 4 Significant Strike Head Landed',
    'Round 4 Significant Strike Body Attempted',
    'Round 4 Significant Strike Body Landed',
    'Round 4 Significant Strike Leg Attempted',
    'Round 4 Significant Strike Leg Landed',
    'Round 4 Significant Strike Clinch Attempted',
    'Round 4 Significant Strike Clinch Landed',
    'Round 4 Significant Strike Ground Attempted',
    'Round 4 Significant Strike Ground Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv


In [51]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_Round 5 Knockdown Total', 'fighter_x_Round 5 Significant Strike Total Attempted', 'fighter_x_Round 5 Significant Strike Total Landed', 'fighter_x_Round 5 Takedown Total Attempted', 'fighter_x_Round 5 Takedown Total Landed', 'fighter_x_Round 5 Submission Attempted', 'fighter_x_Round 5 Reversal', 'fighter_x_Round 5 Ground and Cage Control Time', 'fighter_x_Round 5 Significant Strike Head Attempted', 'fighter_x_Round 5 Significant Strike Head Landed', 'fighter_x_Round 5 Significant Strike Body Attempted', 'fighter_x_Round 5 Significant Strike Body Landed', 'fighter_x_Round 5 Significant Strike Leg Attempted', 'fighter_x_Round 5 Significant Strike Leg Landed'

In [52]:
# Helper function to convert MM:SS to seconds
def time_to_seconds(time_str):
    """
    Convert time string in MM:SS format to total seconds.
    Returns NaN if conversion fails or input is invalid.
    """
    if pd.isna(time_str) or not isinstance(time_str, str):
        return np.nan
    try:
        minutes, seconds = map(int, time_str.split(':'))
        return minutes * 60 + seconds
    except (ValueError, AttributeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_types = [
    'Past Average Round 5 per Fight', 
    'Past Total Round 5', 
    'Past Round 5 per Minute Fought', 
    'Past Percentage of Fights with High Round 5 Value',
    'Recent Round 5 (Last 4 Fights)', 
    'Past Round 5 Differential', 
    'Past Round 5 Win-Weighted',
    'Past Round 5 Trend'
]

# Define input columns for feature engineering (Round 3 only)
input_columns = [
    'Round 5 Knockdown Total',
    'Round 5 Significant Strike Total Attempted',
    'Round 5 Significant Strike Total Landed',
    'Round 5 Takedown Total Attempted',
    'Round 5 Takedown Total Landed',
    'Round 5 Submission Attempted',
    'Round 5 Reversal',
    'Round 5 Ground and Cage Control Time',
    'Round 5 Significant Strike Head Attempted',
    'Round 5 Significant Strike Head Landed',
    'Round 5 Significant Strike Body Attempted',
    'Round 5 Significant Strike Body Landed',
    'Round 5 Significant Strike Leg Attempted',
    'Round 5 Significant Strike Leg Landed',
    'Round 5 Significant Strike Clinch Attempted',
    'Round 5 Significant Strike Clinch Landed',
    'Round 5 Significant Strike Ground Attempted',
    'Round 5 Significant Strike Ground Landed'
]

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_types:
            merged_df[f"{prefix}{col} {stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        fight_time = row.get('Fight Time', np.nan)
        is_winner = 1 if row['Winner Names'] == row[prefix + 'Fighter Full Name'] else 0
        opponent_prefix = 'fighter_y_' if prefix == 'fighter_x_' else 'fighter_x_'

        # Collect Round 5 stats
        fight_stats = {}
        opponent_stats = {}
        for col in input_columns:
            col_name = f"{prefix}{col}"
            opponent_col_name = f"{opponent_prefix}{col}"
            if col == 'Round 5 Ground and Cage Control Time':
                fight_stats[col] = time_to_seconds(row.get(col_name, np.nan))
                opponent_stats[col] = time_to_seconds(row.get(opponent_col_name, np.nan))
            else:
                fight_stats[col] = float(row.get(col_name, 0)) if pd.notna(row.get(col_name)) else 0
                opponent_stats[col] = float(row.get(opponent_col_name, 0)) if pd.notna(row.get(opponent_col_name)) else 0

        # Convert fight time to seconds
        fight_time_sec = time_to_seconds(fight_time)

        # Append to fighter history list
        if any(pd.notna(fight_stats[col]) for col in input_columns):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'Fight Time': fight_time_sec,
                'Is Winner': is_winner,
                **fight_stats,
                **{f"Opponent {col}": opponent_stats[col] for col in input_columns}
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, input_cols):
    """
    Calculate stats for each Round 5 column using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    result = {f"{col} {stat}": 0 for col in input_cols for stat in stat_types}
    
    if len(past_fights) > 0:
        for col in input_cols:
            values = past_fights[col].astype(float).values
            fight_times = past_fights['Fight Time'].astype(float).values
            opponent_values = past_fights[f"Opponent {col}"].astype(float).values
            wins = past_fights['Is Winner'].astype(float).values

            # Past Average Round 5 per Fight
            result[f"{col} Past Average Round 5 per Fight"] = round(np.mean(values), 2)
            
            # Past Total Round 5
            result[f"{col} Past Total Round 5"] = round(np.sum(values), 2)
            
            # Past Round 5 per Minute Fought
            if len(fight_times) > 0 and np.sum(fight_times) > 0:
                result[f"{col} Past Round 5 per Minute Fought"] = round(np.sum(values) / (np.sum(fight_times) / 60), 2)
            
            # Past Percentage of Fights with High Round 5 Value
            high_value_threshold = 120 if col == 'Round 5 Ground and Cage Control Time' else 20 if 'Strike' in col else 1
            result[f"{col} Past Percentage of Fights with High Round 5 Value"] = round(np.mean(values >= high_value_threshold) * 100, 2)
            
            # Recent Round 5 (Last 4 Fights)
            recent_4 = values[-4:]
            if len(recent_4) > 0:
                result[f"{col} Recent Round 5 (Last 4 Fights)"] = round(np.mean(recent_4), 2)
            
            # Past Round 5 Differential
            if len(opponent_values) > 0:
                differentials = values - opponent_values
                result[f"{col} Past Round 5 Differential"] = round(np.mean(differentials), 2)
            
            # Past Round 5 Win-Weighted
            if np.sum(wins) > 0:
                win_values = values[wins == 1]
                result[f"{col} Past Round 5 Win-Weighted"] = round(np.mean(win_values), 2) if len(win_values) > 0 else 0
            
            # Past Round 5 Trend
            if len(recent_4) >= 2:
                last_2 = recent_4[-2:] if len(recent_4) >= 2 else recent_4
                prior_2 = recent_4[-4:-2] if len(recent_4) >= 4 else recent_4[:-2]
                last_2_avg = np.mean(last_2) if len(last_2) > 0 else 0
                prior_2_avg = np.mean(prior_2) if len(prior_2) > 0 else 0
                result[f"{col} Past Round 5 Trend"] = round(last_2_avg - prior_2_avg, 2)

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, input_columns)
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, input_columns)
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_{stat}"] = value  

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[f"{prefix}{col} {stat}"] = np.nan
  merged_df[

In [53]:
# List of original strike columns to drop
original_strike_cols = ['Round 5 Knockdown Total', 
    'Round 5 Significant Strike Total Attempted',
    'Round 5 Significant Strike Total Landed',
    'Round 5 Takedown Total Attempted',
    'Round 5 Takedown Total Landed',
    'Round 5 Submission Attempted',
    'Round 5 Reversal',
    'Round 5 Ground and Cage Control Time',
    'Round 5 Significant Strike Head Attempted',
    'Round 5 Significant Strike Head Landed',
    'Round 5 Significant Strike Body Attempted',
    'Round 5 Significant Strike Body Landed',
    'Round 5 Significant Strike Leg Attempted',
    'Round 5 Significant Strike Leg Landed',
    'Round 5 Significant Strike Clinch Attempted',
    'Round 5 Significant Strike Clinch Landed',
    'Round 5 Significant Strike Ground Attempted',
    'Round 5 Significant Strike Ground Landed']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv


In [54]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_x_odds', 'fighter_y_Fighter Full Name', 'fighter_y_Height Feet', 'fighter_y_Height Inches', 'fighter_y_Weight Pounds', 'fighter_y_Reach Inches', 'fighter_y_Stance', 'fighter_y_Date of Birth', 'fighter_y_odds', 'fighter_x_avg_past_4_attempted', 'fighter_x_highest_past_4_attempted', 'fighter_x_lowest_past_4_attempted', 'fighter_x_range_past_4_attempted', 'fighter_x_std_dev_past_4_attempted', 'fighter_x_median_strikes_past_4_attempted', 'fighter_x_num_40_plus_strike_games_past_4_attempted', 'fighter_x_strike_increase_trend_attempted', 'fighter_x_game_with_most_strikes_attempted', 'fighter_x_game_with_least_strikes_attempted', 'fighter_x_strike_drop_from_peak_t

In [55]:
# Helper function to convert American odds to implied probability
def odds_to_probability(odds):
    """
    Convert American odds to implied probability.
    Returns NaN if conversion fails or input is invalid.

    American odds are converted to implied probability as follows:

    Positive odds (e.g., +150): Probability = 100 / (odds + 100)
    Negative odds (e.g., -200): Probability = |odds| / (|odds| + 100) Examples:
    +150 → 100 / (150 + 100) = 0.4 (40% chance of winning)
    -200 → 200 / (200 + 100) = 0.667 (66.7% chance)
    """
    if pd.isna(odds):
        return np.nan
    try:
        odds = float(odds)
        if odds > 0:
            return 100 / (odds + 100)
        else:
            return abs(odds) / (abs(odds) + 100)
    except (ValueError, TypeError):
        return np.nan

# Load merged dataset
merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
merged_df['Event Date'] = pd.to_datetime(merged_df['Event Date'])

# Define stat columns to generate
stat_columns = [
    'avg_past_4_odds', 
    'highest_past_4_odds', 
    'lowest_past_4_odds', 
    'range_past_4_odds',
    'std_dev_past_4_odds', 
    'median_past_4_odds', 
    'num_favorite_past_4_odds',
    'odds_increase_trend', 
    'game_with_highest_odds', 
    'game_with_lowest_odds',
    'odds_drop_from_peak_to_lowest', 
    'outlier_odds_above_1_5xIQR'
]

# Define input columns for odds
input_columns = ['odds']

# Create new stat columns for both fighters
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in input_columns:
        for stat in stat_columns:
            merged_df[f"{prefix}{col}_{stat}"] = np.nan

# Build fighter-level history for cumulative calculations
fighter_history = []

for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Building Fighter History"):
    for prefix in ['fighter_x_', 'fighter_y_']:
        fighter_name = row[prefix + 'Fighter Full Name']
        fight_date = row['Event Date']
        odds = row.get(f"{prefix}odds", np.nan)

        # Convert odds to implied probability
        prob = odds_to_probability(odds)

        # Append to fighter history list
        if pd.notna(prob):
            fighter_history.append({
                'Fighter Full Name': fighter_name,
                'Event Date': fight_date,
                'odds': prob
            })

# Build history DataFrame
fighter_history_df = pd.DataFrame(fighter_history)
fighter_history_df.sort_values(by=['Fighter Full Name', 'Event Date'], inplace=True)

# Function to calculate cumulative stats
def calculate_cumulative_stats(fighter_name, fight_date, df, odds_col='odds'):
    """
    Calculate stats for odds (implied probability) using only past fights.
    Returns dictionary with safe defaults.
    """
    # Filter past fights
    past_fights = df[
        (df['Fighter Full Name'] == fighter_name) &
        (df['Event Date'] < fight_date)
    ].sort_values(by='Event Date', ascending=True)

    probs = past_fights[odds_col].astype(float).values

    result = {col: 0 for col in stat_columns}
    
    if len(probs) > 0:
        recent_4 = probs[-4:]
        result['avg_past_4_odds'] = round(np.mean(recent_4), 4)
        result['highest_past_4_odds'] = round(np.max(recent_4), 4)
        result['lowest_past_4_odds'] = round(np.min(recent_4), 4)
        result['range_past_4_odds'] = round(result['highest_past_4_odds'] - result['lowest_past_4_odds'], 4)
        result['std_dev_past_4_odds'] = round(np.std(recent_4), 4)
        result['median_past_4_odds'] = round(np.median(recent_4), 4)
        result['num_favorite_past_4_odds'] = int(np.sum(recent_4 > 0.5))  # Favorite if prob > 0.5
        result['odds_increase_trend'] = 1 if len(recent_4) > 1 and np.all(np.diff(recent_4) >= 0) else 0
        result['game_with_highest_odds'] = int(np.argmax(recent_4) + 1) if len(recent_4) > 0 else 0
        result['game_with_lowest_odds'] = int(np.argmin(recent_4) + 1) if len(recent_4) > 0 else 0
        result['odds_drop_from_peak_to_lowest'] = round(result['highest_past_4_odds'] - result['lowest_past_4_odds'], 4)

        q1, q3 = np.percentile(recent_4, [25, 75])
        iqr = q3 - q1
        result['outlier_odds_above_1_5xIQR'] = 1 if any(recent_4 > q3 + 1.5 * iqr) else 0

    return result

# Loop through each fight and compute stats for both fighters
for idx, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Generating Stats"):
    fight_date = row['Event Date']
    
    # Fighter names
    fighter_x_name = row['fighter_x_Fighter Full Name']
    fighter_y_name = row['fighter_y_Fighter Full Name']

    # Compute stats for Fighter X
    x_stats = calculate_cumulative_stats(fighter_x_name, fight_date, fighter_history_df, odds_col='odds')
    for stat, value in x_stats.items():
        merged_df.at[idx, f"fighter_x_odds_{stat}"] = value

    # Compute stats for Fighter Y
    y_stats = calculate_cumulative_stats(fighter_y_name, fight_date, fighter_history_df, odds_col='odds')
    for stat, value in y_stats.items():
        merged_df.at[idx, f"fighter_y_odds_{stat}"] = value

# Save the updated dataset
merged_df.to_csv("ufc_fights_with_odds_features.csv", index=False)

  merged_df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
Building Fighter History: 100%|██████████| 6250/6250 [00:01<00:00, 5030.34it/s]
Generating Stats: 100%|██████████| 6250/6250 [00:24<00:00, 256.61it/s]


In [56]:
# List of original strike columns to drop
original_strike_cols = ['odds']

# Drop original strike columns and keep only stats
columns_to_drop = []
for prefix in ['fighter_x_', 'fighter_y_']:
    for col in original_strike_cols:
        col_name = prefix + col
        if col_name in merged_df.columns:
            columns_to_drop.append(col_name)

# Drop those columns
merged_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save final clean dataset
merged_df.to_csv("ufc_fights_feature_engineered_full.csv", index=False)
print("Final dataset saved as 'ufc_fights_feature_engineered_full.csv")

Final dataset saved as 'ufc_fights_feature_engineered_full.csv


In [57]:
print(merged_df.columns.to_list())

['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_x_Height Feet', 'fighter_x_Height Inches', 'fighter_x_Weight Pounds', 'fighter_x_Reach Inches', 'fighter_x_Stance', 'fighter_x_Date of Birth', 'fighter_y_Fighter Full Name', 'fighter_y_Height Feet', 'fighter_y_Height Inches', 'fighter_y_Weight Pounds', 'fighter_y_Reach Inches', 'fighter_y_Stance', 'fighter_y_Date of Birth', 'fighter_x_avg_past_4_attempted', 'fighter_x_highest_past_4_attempted', 'fighter_x_lowest_past_4_attempted', 'fighter_x_range_past_4_attempted', 'fighter_x_std_dev_past_4_attempted', 'fighter_x_median_strikes_past_4_attempted', 'fighter_x_num_40_plus_strike_games_past_4_attempted', 'fighter_x_strike_increase_trend_attempted', 'fighter_x_game_with_most_strikes_attempted', 'fighter_x_game_with_least_strikes_attempted', 'fighter_x_strike_drop_from_peak_to_lowest_attempted', 'fighter_x_outl

## Shuffling data 

In [58]:
# Load your dataset (replace 'your_data.csv' with your file)
df = pd.read_csv('ufc_fights_feature_engineered_full.csv')

# Define common columns (not to be swapped)
common_cols = [
    'Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Weight Class',
    'Winning Method', 'Winning Round', 'Winning Time', 'Winner Names'
]

# Define fighter_x and fighter_y columns (to be swapped)
fighter_x_cols = [col for col in df.columns if col.startswith('fighter_x_')]
fighter_y_cols = [col for col in df.columns if col.startswith('fighter_y_')]

# Validate that fighter_x and fighter_y columns match in number
assert len(fighter_x_cols) == len(fighter_y_cols), "Mismatch in fighter_x and fighter_y columns"

# Randomly decide which rows to swap (50% chance)
np.random.seed(42)  # For reproducibility
swap_mask = np.random.choice([True, False], size=len(df), p=[0.5, 0.5])

# Create a copy of the dataframe
df_shuffled = df.copy()

# Perform the swap for selected rows
for idx in df_shuffled.index[swap_mask]:
    # Swap fighter_x and fighter_y columns
    for x_col, y_col in zip(fighter_x_cols, fighter_y_cols):
        df_shuffled.loc[idx, x_col], df_shuffled.loc[idx, y_col] = \
            df_shuffled.loc[idx, y_col], df_shuffled.loc[idx, x_col]
    
    # Update Winner Names to reflect the swap
    winner = df_shuffled.loc[idx, 'Winner Names']
    fighter_x_name = df_shuffled.loc[idx, 'fighter_x_Fighter Full Name']
    fighter_y_name = df_shuffled.loc[idx, 'fighter_y_Fighter Full Name']
    
    # If the original winner was fighter_x, it’s now fighter_y, and vice versa
    if winner == df.loc[idx, 'fighter_x_Fighter Full Name']:
        df_shuffled.loc[idx, 'Winner Names'] = fighter_y_name
    elif winner == df.loc[idx, 'fighter_y_Fighter Full Name']:
        df_shuffled.loc[idx, 'Winner Names'] = fighter_x_name

# Save the shuffled dataset
df_shuffled.to_csv('ufc_fights_feature_engineered_full_suffled.csv', index=False)
print("Shuffling complete! Saved to 'ufc_fights_feature_engineered_full.csv'")

  df = pd.read_csv('ufc_fights_feature_engineered_full.csv')


Shuffling complete! Saved to 'ufc_fights_feature_engineered_full.csv'


In [59]:
# Load the datasets (replace file paths as needed)
original_df = pd.read_csv('ufc_fights_feature_engineered_full.csv')
shuffled_df = pd.read_csv('ufc_fights_feature_engineered_full_suffled.csv')

# Verify that both datasets have the same columns
if set(original_df.columns) != set(shuffled_df.columns):
    raise ValueError("Datasets have different columns. Please ensure they match.")

# Verify that Fight IDs match
if not original_df['Fight ID'].sort_values().reset_index(drop=True).equals(
    shuffled_df['Fight ID'].sort_values().reset_index(drop=True)
):
    raise ValueError("Fight IDs do not match between datasets.")

# Sort both datasets by Fight ID to align rows
original_df = original_df.sort_values('Fight ID').reset_index(drop=True)
shuffled_df = shuffled_df.sort_values('Fight ID').reset_index(drop=True)

# Create a list to store interleaved rows
merged_rows = []

# Interleave rows: original row, then shuffled row for each Fight ID
for idx in range(len(original_df)):
    # Add row from original dataset
    merged_rows.append(original_df.iloc[idx])
    # Add row from shuffled dataset
    merged_rows.append(shuffled_df.iloc[idx])

# Convert the list of rows to a dataframe
merged_df = pd.DataFrame(merged_rows, columns=original_df.columns)

# Reset index for clean output
merged_df = merged_df.reset_index(drop=True)

# Save the merged dataset
merged_df.to_csv('ufc_fights_merged.csv', index=False)
print("Merging complete! Saved to 'ufc_fights_merged.csv'")

  original_df = pd.read_csv('ufc_fights_feature_engineered_full.csv')
  shuffled_df = pd.read_csv('ufc_fights_feature_engineered_full_suffled.csv')


Merging complete! Saved to 'ufc_fights_merged.csv'


## 1st Model Training with shuffled data set

In [60]:
! pip install pandas numpy scikit-learn xgboost

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Using cached xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.15.3-cp312-cp312-win_amd64.whl (41.0 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, xgboost, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.6.1 scipy-1.15.3 threadpoolctl-


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [61]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
from datetime import datetime

# Load the dataset
df = pd.read_csv("ufc_fights_merged.csv")

# Step 1: Handle missing data (replace null values with 0)
df.fillna(0, inplace=True)

# Step 2: Convert Event Date to datetime
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Step 3: Convert Date of Birth columns to datetime, handle errors
for col in ['fighter_x_Date of Birth', 'fighter_y_Date of Birth']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 4: Calculate age for Fighter X and Fighter Y
df['Fighter_X_Age'] = (df['Event Date'] - df['fighter_x_Date of Birth']).dt.days / 365.25
df['Fighter_Y_Age'] = (df['Event Date'] - df['fighter_y_Date of Birth']).dt.days / 365.25

# Replace NaN ages with 0 (from null dates)
df['Fighter_X_Age'].fillna(0, inplace=True)
df['Fighter_Y_Age'].fillna(0, inplace=True)

# Step 5: Split the data based on years
train_data = df[df['Event Date'].dt.year <= 2020]
validation_data = df[(df['Event Date'].dt.year >= 2021) & (df['Event Date'].dt.year <= 2022)]
test_data = df[df['Event Date'].dt.year >= 2023]

# Step 6: Filter validation and test sets to include only fighters seen in training
train_fighters = set(train_data['Winner Names'])
validation_data = validation_data[validation_data['Winner Names'].isin(train_fighters)]
test_data = test_data[test_data['Winner Names'].isin(train_fighters)]

# Print the number of rows filtered out
print(f"Validation set size after filtering: {len(validation_data)} rows")
print(f"Test set size after filtering: {len(test_data)} rows")

# Step 7: Define features and target
common_columns = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 
                  'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 
                  'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 
                  'fighter_x_Date of Birth', 'fighter_y_Date of Birth']
# Exclude stance columns from common_columns to encode them
feature_columns = [col for col in df.columns if col not in common_columns]

# Step 8: One-hot encode categorical columns (stance)
categorical_columns = ['fighter_x_Stance', 'fighter_y_Stance']
train_data_encoded = pd.get_dummies(train_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
validation_data_encoded = pd.get_dummies(validation_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
test_data_encoded = pd.get_dummies(test_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])

# Align columns across datasets (in case of missing categories)
train_data_encoded, validation_data_encoded = train_data_encoded.align(validation_data_encoded, join='left', axis=1, fill_value=0)
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Prepare features (X) and target (y)
X_train = train_data_encoded
y_train = train_data['Winner Names']
X_val = validation_data_encoded
y_val = validation_data['Winner Names']
X_test = test_data_encoded
y_test = test_data['Winner Names']

# Step 9: Encode the target variable (Winner Names)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Step 10: Train the model (XGBoost)
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train_encoded)

# Step 11: Predict on validation and test sets
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# Step 12: Calculate accuracy
val_accuracy = accuracy_score(y_val_encoded, val_predictions)
test_accuracy = accuracy_score(y_test_encoded, test_predictions)

print(f"Validation Accuracy (2021-2022): {val_accuracy:.4f}")
print(f"Test Accuracy (2023-2025): {test_accuracy:.4f}")

# Step 13: Calculate winning probabilities and predicted winner for test set
proba_predictions = model.predict_proba(X_test)

# Create a DataFrame for test set with predictions and probabilities
test_data = test_data.copy()  # Avoid modifying original test_data
test_data['Predicted_Winner'] = label_encoder.inverse_transform(test_predictions)

# Map probabilities to Fighter X and Fighter Y
proba_df = pd.DataFrame(proba_predictions, columns=label_encoder.classes_, index=X_test.index)
test_data['Prob_Fighter_X'] = test_data.apply(
    lambda row: proba_df.loc[row.name, row['fighter_x_Fighter Full Name']] if row['fighter_x_Fighter Full Name'] in proba_df.columns else 0, axis=1
)
test_data['Prob_Fighter_Y'] = test_data.apply(
    lambda row: proba_df.loc[row.name, row['fighter_y_Fighter Full Name']] if row['fighter_y_Fighter Full Name'] in proba_df.columns else 0, axis=1
)

# Step 14: Save the updated test set to a new CSV
test_data.to_csv("ufc_fights_merged_updated.csv", index=False)
print("\nUpdated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner")

# Step 15: Display sample of updated test set
print("\nSample of Updated Test Set:")
print(test_data[['Fight ID', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 'Prob_Fighter_X', 'Prob_Fighter_Y', 'Predicted_Winner']].head())

  df = pd.read_csv("ufc_fights_merged.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_X_Age'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_Y_Age'].fillna(0, inplace=True)


Validation set size after filtering: 1428 rows
Test set size after filtering: 992 rows
Validation Accuracy (2021-2022): 0.1849
Test Accuracy (2023-2025): 0.1341

Updated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner

Sample of Updated Test Set:
       Fight ID fighter_x_Fighter Full Name fighter_y_Fighter Full Name  \
10226      5114        Abdul Razak Alhassan             Claudio Ribeiro   
10227      5114        Abdul Razak Alhassan             Claudio Ribeiro   
10232      5117           Raquel Pennington               Ketlen Vieira   
10233      5117               Ketlen Vieira           Raquel Pennington   
10236      5119               Damon Jackson                     Dan Ige   

       Prob_Fighter_X  Prob_Fighter_Y   Predicted_Winner  
10226        0.024348        0.000000         Phil Hawes  
10227        0.024348        0.000000         Phil Hawes  
10232        0.325142        0.016344  Raquel Pennington

## 2nd model training with shuffled data set

In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
from datetime import datetime

# Load the dataset
df = pd.read_csv("ufc_fights_merged.csv")

# Step 1: Handle missing data (replace null values with 0)
df.fillna(0, inplace=True)

# Step 2: Convert Event Date to datetime
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Step 3: Convert Date of Birth columns to datetime, handle errors
for col in ['fighter_x_Date of Birth', 'fighter_y_Date of Birth']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 4: Calculate age for Fighter X and Fighter Y
df['Fighter_X_Age'] = (df['Event Date'] - df['fighter_x_Date of Birth']).dt.days / 365.25
df['Fighter_Y_Age'] = (df['Event Date'] - df['fighter_y_Date of Birth']).dt.days / 365.25
df['Fighter_X_Age'].fillna(0, inplace=True)
df['Fighter_Y_Age'].fillna(0, inplace=True)

# Step 5: Feature engineering (create differential features)
# Example: Assuming columns like 'fighter_x_Strikes_Landed', 'fighter_y_Strikes_Landed' exist
# Replace with actual column names from your dataset
numerical_columns = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col not in [
    'Fight ID', 'Event Date', 'Winning Round', 'Winning Time', 'Fighter_X_Age', 'Fighter_Y_Age']]
for col in numerical_columns:
    if col.startswith('fighter_x_') and col.replace('fighter_x_', 'fighter_y_') in df.columns:
        df[f'Diff_{col[9:]}'] = df[col] - df[col.replace('fighter_x_', 'fighter_y_')]

# Step 6: Deduplicate by Fight ID (keep first occurrence)
df = df.drop_duplicates(subset=['Fight ID'], keep='first')

# Step 7: Create binary target (1 if Fighter X wins, 0 if Fighter Y wins)
df['Winner_Is_Fighter_X'] = df.apply(
    lambda row: 1 if row['Winner Names'] == row['fighter_x_Fighter Full Name'] else 0, axis=1
)

# Step 8: Split the data based on years
train_data = df[df['Event Date'].dt.year <= 2020]
validation_data = df[(df['Event Date'].dt.year >= 2021) & (df['Event Date'].dt.year <= 2022)]
test_data = df[df['Event Date'].dt.year >= 2023]

# Step 9: Filter validation and test sets to include only fighters seen in training
train_fighters = set(train_data['Winner Names'])
validation_data = validation_data[validation_data['Winner Names'].isin(train_fighters)]
test_data = test_data[test_data['Winner Names'].isin(train_fighters)]

# Reset indices to ensure contiguous indexing
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Print the number of rows filtered out
print(f"Validation set size after filtering: {len(validation_data)} rows")
print(f"Test set size after filtering: {len(test_data)} rows")

# Step 10: Define features and target
common_columns = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 
                  'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 
                  'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 
                  'fighter_x_Date of Birth', 'fighter_y_Date of Birth', 'Winner_Is_Fighter_X']
feature_columns = [col for col in df.columns if col not in common_columns]

# Step 11: One-hot encode categorical columns
categorical_columns = ['fighter_x_Stance', 'fighter_y_Stance']
if 'Weight Class' in df.columns:
    categorical_columns.append('Weight Class')  # Encode Weight Class if present
train_data_encoded = pd.get_dummies(train_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
validation_data_encoded = pd.get_dummies(validation_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
test_data_encoded = pd.get_dummies(test_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])

# Align columns across datasets
train_data_encoded, validation_data_encoded = train_data_encoded.align(validation_data_encoded, join='left', axis=1, fill_value=0)
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Prepare features (X) and target (y)
X_train = train_data_encoded
y_train = train_data['Winner_Is_Fighter_X']
X_val = validation_data_encoded
y_val = validation_data['Winner_Is_Fighter_X']
X_test = test_data_encoded
y_test = test_data['Winner_Is_Fighter_X']

# Step 12: Train the model (XGBoost)
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', learning_rate=0.1, max_depth=5, n_estimators=100)
model.fit(X_train, y_train)

# Step 13: Predict on validation and test sets
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# Step 14: Calculate accuracy
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Validation Accuracy (2021-2022): {val_accuracy:.4f}")
print(f"Test Accuracy (2023-2025): {test_accuracy:.4f}")

# Step 15: Calculate winning probabilities and predicted winner for test set
proba_predictions = model.predict_proba(X_test)

# Create a DataFrame for test set with predictions and probabilities
test_data = test_data.copy()
test_data['Predicted_Winner'] = test_data.apply(
    lambda row: row['fighter_x_Fighter Full Name'] if test_predictions[row.name] == 1 else row['fighter_y_Fighter Full Name'], axis=1
)
test_data['Prob_Fighter_X'] = proba_predictions[:, 1]  # Probability of Fighter X winning
test_data['Prob_Fighter_Y'] = proba_predictions[:, 0]  # Probability of Fighter Y winning

# Step 16: Save the updated test set to a new CSV
test_data.to_csv("ufc_fights_merged_updated.csv", index=False)
print("\nUpdated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner")

# Step 17: Display sample of updated test set
print("\nSample of Updated Test Set:")
print(test_data[['Fight ID', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 'Prob_Fighter_X', 'Prob_Fighter_Y', 'Predicted_Winner']].head())

  df = pd.read_csv("ufc_fights_merged.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_X_Age'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_Y_Age'].fillna(0, inplace=True)
  df[f'Diff_{col[9:]}'] = df[col] - df[col.replace('fighter_x_', 'fighter_y_')]
  df[f'Diff_{col[9:]

Validation set size after filtering: 714 rows
Test set size after filtering: 496 rows
Validation Accuracy (2021-2022): 0.5910
Test Accuracy (2023-2025): 0.5665

Updated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner

Sample of Updated Test Set:
   Fight ID fighter_x_Fighter Full Name fighter_y_Fighter Full Name  \
0      5114        Abdul Razak Alhassan             Claudio Ribeiro   
1      5117           Raquel Pennington               Ketlen Vieira   
2      5119               Damon Jackson                     Dan Ige   
3      5120          Nassourdine Imavov             Sean Strickland   
4      5123               Nicolas Dalby               Warlley Alves   

   Prob_Fighter_X  Prob_Fighter_Y      Predicted_Winner  
0        0.585241        0.414759  Abdul Razak Alhassan  
1        0.377907        0.622093         Ketlen Vieira  
2        0.674125        0.325875         Damon Jackson  
3        0.461999        0

## 3rd model training with shuffled data set

In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
from datetime import datetime
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("ufc_fights_merged.csv")

# Step 1: Handle missing data (replace null values with 0)
df.fillna(0, inplace=True)

# Step 2: Convert Event Date to datetime
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Step 3: Convert Date of Birth columns to datetime, handle errors
for col in ['fighter_x_Date of Birth', 'fighter_y_Date of Birth']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 4: Calculate age for Fighter X and Fighter Y
df['Fighter_X_Age'] = (df['Event Date'] - df['fighter_x_Date of Birth']).dt.days / 365.25
df['Fighter_Y_Age'] = (df['Event Date'] - df['fighter_y_Date of Birth']).dt.days / 365.25
df['Fighter_X_Age'].fillna(0, inplace=True)
df['Fighter_Y_Age'].fillna(0, inplace=True)

# Step 5: Feature engineering (create differential features)
numerical_columns = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col not in [
    'Fight ID', 'Event Date', 'Winning Round', 'Winning Time', 'Fighter_X_Age', 'Fighter_Y_Age']]
for col in numerical_columns:
    if col.startswith('fighter_x_') and col.replace('fighter_x_', 'fighter_y_') in df.columns:
        df[f'Diff_{col[9:]}'] = df[col] - df[col.replace('fighter_x_', 'fighter_y_')]

# Step 6: Deduplicate by Fight ID
df = df.drop_duplicates(subset=['Fight ID'], keep='first')

# Step 7: Create binary target
df['Winner_Is_Fighter_X'] = df.apply(
    lambda row: 1 if row['Winner Names'] == row['fighter_x_Fighter Full Name'] else 0, axis=1
)

# Step 8: Split the data based on years
train_data = df[df['Event Date'].dt.year <= 2020]
validation_data = df[(df['Event Date'].dt.year >= 2021) & (df['Event Date'].dt.year <= 2022)]
test_data = df[df['Event Date'].dt.year >= 2023]

# Step 9: Filter validation and test sets
train_fighters = set(train_data['Winner Names'])
validation_data = validation_data[validation_data['Winner Names'].isin(train_fighters)]
test_data = test_data[test_data['Winner Names'].isin(train_fighters)]

# Reset indices
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Print set sizes
print(f"Validation set size after filtering: {len(validation_data)} rows")
print(f"Test set size after filtering: {len(test_data)} rows")

# Step 10: Check class balance
print("\nClass balance in training set:")
print(train_data['Winner_Is_Fighter_X'].value_counts(normalize=True))

# Step 11: Define features and target
common_columns = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 
                  'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 
                  'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 
                  'fighter_x_Date of Birth', 'fighter_y_Date of Birth', 'Winner_Is_Fighter_X']
feature_columns = [col for col in df.columns if col not in common_columns]

# Step 12: One-hot encode categorical columns
categorical_columns = ['fighter_x_Stance', 'fighter_y_Stance', 'Weight Class']
train_data_encoded = pd.get_dummies(train_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
validation_data_encoded = pd.get_dummies(validation_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
test_data_encoded = pd.get_dummies(test_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])

# Align columns
train_data_encoded, validation_data_encoded = train_data_encoded.align(validation_data_encoded, join='left', axis=1, fill_value=0)
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Prepare features (X) and target (y)
X_train = train_data_encoded
y_train = train_data['Winner_Is_Fighter_X']
X_val = validation_data_encoded
y_val = validation_data['Winner_Is_Fighter_X']
X_test = test_data_encoded
y_test = test_data['Winner_Is_Fighter_X']

# Step 13: Train the model with class balance handling
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)  # Handle imbalance
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', learning_rate=0.1, max_depth=5, n_estimators=200, scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

# Step 14: Predict on validation and test sets
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# Step 15: Calculate accuracy
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Validation Accuracy (2021-2022): {val_accuracy:.4f}")
print(f"Test Accuracy (2023-2025): {test_accuracy:.4f}")

# Step 16: Feature importance
print("\nTop 10 Feature Importances:")
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importance.head(10))

# Step 17: Calculate winning probabilities and predicted winner
proba_predictions = model.predict_proba(X_test)
test_data = test_data.copy()
test_data['Predicted_Winner'] = test_data.apply(
    lambda row: row['fighter_x_Fighter Full Name'] if test_predictions[row.name] == 1 else row['fighter_y_Fighter Full Name'], axis=1
)
test_data['Prob_Fighter_X'] = proba_predictions[:, 1]
test_data['Prob_Fighter_Y'] = proba_predictions[:, 0]

# Step 18: Save the updated test set
test_data.to_csv("ufc_fights_merged_updated.csv", index=False)
print("\nUpdated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner")

# Step 19: Display sample of updated test set
print("\nSample of Updated Test Set:")
print(test_data[['Fight ID', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 'Prob_Fighter_X', 'Prob_Fighter_Y', 'Predicted_Winner']].head())

ModuleNotFoundError: No module named 'matplotlib'

## 3rd model train without shuffled

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
from datetime import datetime
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("ufc_fights_feature_engineered_full.csv")

# Step 1: Handle missing data (replace null values with 0)
df.fillna(0, inplace=True)

# Step 2: Convert Event Date to datetime
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Step 3: Convert Date of Birth columns to datetime, handle errors
for col in ['fighter_x_Date of Birth', 'fighter_y_Date of Birth']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Step 4: Calculate age for Fighter X and Fighter Y
df['Fighter_X_Age'] = (df['Event Date'] - df['fighter_x_Date of Birth']).dt.days / 365.25
df['Fighter_Y_Age'] = (df['Event Date'] - df['fighter_y_Date of Birth']).dt.days / 365.25
df['Fighter_X_Age'].fillna(0, inplace=True)
df['Fighter_Y_Age'].fillna(0, inplace=True)

# Step 5: Feature engineering (create differential features)
numerical_columns = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col not in [
    'Fight ID', 'Event Date', 'Winning Round', 'Winning Time', 'Fighter_X_Age', 'Fighter_Y_Age']]
for col in numerical_columns:
    if col.startswith('fighter_x_') and col.replace('fighter_x_', 'fighter_y_') in df.columns:
        df[f'Diff_{col[9:]}'] = df[col] - df[col.replace('fighter_x_', 'fighter_y_')]

# Step 6: Deduplicate by Fight ID
df = df.drop_duplicates(subset=['Fight ID'], keep='first')

# Step 7: Create binary target
df['Winner_Is_Fighter_X'] = df.apply(
    lambda row: 1 if row['Winner Names'] == row['fighter_x_Fighter Full Name'] else 0, axis=1
)

# Step 8: Split the data based on years
train_data = df[df['Event Date'].dt.year <= 2020]
validation_data = df[(df['Event Date'].dt.year >= 2021) & (df['Event Date'].dt.year <= 2022)]
test_data = df[df['Event Date'].dt.year >= 2023]

# Step 9: Filter validation and test sets
train_fighters = set(train_data['Winner Names'])
validation_data = validation_data[validation_data['Winner Names'].isin(train_fighters)]
test_data = test_data[test_data['Winner Names'].isin(train_fighters)]

# Reset indices
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Print set sizes
print(f"Validation set size after filtering: {len(validation_data)} rows")
print(f"Test set size after filtering: {len(test_data)} rows")

# Step 10: Check class balance
print("\nClass balance in training set:")
print(train_data['Winner_Is_Fighter_X'].value_counts(normalize=True))

# Step 11: Define features and target
common_columns = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 
                  'Weight Class', 'Winning Method', 'Winning Round', 'Winning Time', 
                  'Winner Names', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 
                  'fighter_x_Date of Birth', 'fighter_y_Date of Birth', 'Winner_Is_Fighter_X']
feature_columns = [col for col in df.columns if col not in common_columns]

# Step 12: One-hot encode categorical columns
categorical_columns = ['fighter_x_Stance', 'fighter_y_Stance', 'Weight Class']
train_data_encoded = pd.get_dummies(train_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
validation_data_encoded = pd.get_dummies(validation_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])
test_data_encoded = pd.get_dummies(test_data[feature_columns], columns=[col for col in categorical_columns if col in feature_columns])

# Align columns
train_data_encoded, validation_data_encoded = train_data_encoded.align(validation_data_encoded, join='left', axis=1, fill_value=0)
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1, fill_value=0)

# Prepare features (X) and target (y)
X_train = train_data_encoded
y_train = train_data['Winner_Is_Fighter_X']
X_val = validation_data_encoded
y_val = validation_data['Winner_Is_Fighter_X']
X_test = test_data_encoded
y_test = test_data['Winner_Is_Fighter_X']

# Step 13: Train the model with class balance handling
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)  # Handle imbalance
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', learning_rate=0.1, max_depth=5, n_estimators=200, scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

# Step 14: Predict on validation and test sets
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# Step 15: Calculate accuracy
val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Validation Accuracy (2021-2022): {val_accuracy:.4f}")
print(f"Test Accuracy (2023-2025): {test_accuracy:.4f}")

# Step 16: Feature importance
print("\nTop 10 Feature Importances:")
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importance.head(10))

# Step 17: Calculate winning probabilities and predicted winner
proba_predictions = model.predict_proba(X_test)
test_data = test_data.copy()
test_data['Predicted_Winner'] = test_data.apply(
    lambda row: row['fighter_x_Fighter Full Name'] if test_predictions[row.name] == 1 else row['fighter_y_Fighter Full Name'], axis=1
)
test_data['Prob_Fighter_X'] = proba_predictions[:, 1]
test_data['Prob_Fighter_Y'] = proba_predictions[:, 0]

# Step 18: Save the updated test set
test_data.to_csv("ufc_fights_merged_updated.csv", index=False)
print("\nUpdated test set saved to 'ufc_fights_merged_updated.csv' with new columns: Prob_Fighter_X, Prob_Fighter_Y, Predicted_Winner")

# Step 19: Display sample of updated test set
print("\nSample of Updated Test Set:")
print(test_data[['Fight ID', 'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name', 'Prob_Fighter_X', 'Prob_Fighter_Y', 'Predicted_Winner']].head())

  df = pd.read_csv("ufc_fights_feature_engineered_full.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_X_Age'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fighter_Y_Age'].fillna(0, inplace=True)
  df[f'Diff_{col[9:]}'] = df[col] - df[col.replace('fighter_x_', 'fighter_y_')]
  d

Validation set size after filtering: 714 rows
Test set size after filtering: 496 rows

Class balance in training set:
Winner_Is_Fighter_X
0    0.511861
1    0.488139
Name: proportion, dtype: float64
Validation Accuracy (2021-2022): 0.5742
Test Accuracy (2023-2025): 0.5726

Top 10 Feature Importances:
Diff__Reach Inches                                                              0.004092
fighter_y_range_past_4_Significant_Strike_Leg_Attempted                         0.003635
fighter_y_highest_past_4_takedown_attempted                                     0.003344
Diff__Round 2 Significant Strike Ground Attempted Past Round 2 Win-Weighted     0.003072
Diff__strike_increase_trend_takedown_attempted                                  0.002980
fighter_y_Round 4 Significant Strike Ground Landed Past Round 4 Win-Weighted    0.002930
Diff__Round 1 Significant Strike Head Landed Recent Round 1 (Last 4 Fights)     0.002809
Diff__Round 5 Significant Strike Leg Landed Past Round 5 Differential      

In [None]:
! pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_selection import SelectKBest, f_classif, RFE
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# import warnings
# warnings.filterwarnings('ignore')

# # Load the dataset
# df = pd.read_csv('ufc_fights_feature_engineered_full.csv')

# # Create target variable: 1 if fighter_x is the winner, 0 otherwise
# df['target'] = df.apply(lambda x: 1 if x['fighter_x_Fighter Full Name'] in x['Winner Names'] else 0, axis=1)

# # Convert Event Date to datetime
# df['Event Date'] = pd.to_datetime(df['Event Date'])

# # Split data by year
# train_df = df[df['Event Date'].dt.year.between(2012, 2020)]
# val_df = df[df['Event Date'].dt.year.between(2021, 2022)]
# test_df = df[df['Event Date'].dt.year.between(2023, 2025)]

# # Print class balance
# print("Class balance in training set:")
# print(train_df['target'].value_counts(normalize=True))
# print(f"Validation set size: {len(val_df)} rows")
# print(f"Test set size: {len(test_df)} rows")

# # Define features (prioritize differential and include key metrics)
# drop_cols = ['Fight ID', 'Event Title', 'Event Date', 'Event Location', 'Winner Names',
#              'fighter_x_Fighter Full Name', 'fighter_y_Fighter Full Name',
#              'fighter_x_Date of Birth', 'fighter_y_Date of Birth', 'target']
# # Select differential features and round-specific metrics
# feature_cols = [col for col in df.columns if col not in drop_cols and (
#     'Diff__' in col or col in [
#         'Weight Class', 'fighter_x_Stance', 'fighter_y_Stance',
#         'fighter_x_Round 2 Submission Attempted Past Total Round 2',
#         'fighter_y_Round 2 Submission Attempted Past Total Round 2',
#         'fighter_x_Round 2 Significant Strike Ground Attempted Past Round 2 Win-Weighted',
#         'fighter_y_Round 4 Significant Strike Ground Landed Past Round 4 Win-Weighted',
#         # Add other round-specific features from your list as needed
#     ]
# )]
# X_train = train_df[feature_cols]
# y_train = train_df['target']
# X_val = val_df[feature_cols]
# y_val = val_df['target']
# X_test = test_df[feature_cols]
# y_test = test_df['target']

# # Handle categorical variables
# categorical_cols = ['Weight Class', 'fighter_x_Stance', 'fighter_y_Stance']
# le_dict = {}
# for col in categorical_cols:
#     if col in X_train.columns:
#         le = LabelEncoder()
#         le.fit(pd.concat([X_train[col], X_val[col], X_test[col]], axis=0).astype(str))
#         X_train[col] = le.transform(X_train[col].astype(str))
#         X_val[col] = le.transform(X_val[col].astype(str))
#         X_test[col] = le.transform(X_test[col].astype(str))
#         le_dict[col] = le

# # Impute missing values
# numerical_cols = X_train.select_dtypes(include=[np.number]).columns
# imputer = SimpleImputer(strategy='mean')
# X_train[numerical_cols] = imputer.fit_transform(X_train[numerical_cols])
# X_val[numerical_cols] = imputer.transform(X_val[numerical_cols])
# X_test[numerical_cols] = imputer.transform(X_test[numerical_cols])

# # Standardize numerical features
# scaler = StandardScaler()
# X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
# X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# # 1. Correlation Analysis - Heatmap
# plt.figure(figsize=(12, 8))
# corr_matrix = X_train.corr()
# sns.heatmap(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
# plt.title('Correlation Heatmap of Features')
# plt.savefig('correlation_heatmap.png')
# plt.close()
# high_corr_pairs = [(corr_matrix.index[i], corr_matrix.columns[j]) 
#                   for i, j in zip(*np.where(np.abs(corr_matrix) > 0.8)) if i != j and i < j]
# print(f"Highly correlated feature pairs (>0.8): {len(high_corr_pairs)}")
# print("Example pairs:", high_corr_pairs[:5])

# # 2. Feature Importance - Bar Chart
# rf = RandomForestClassifier(random_state=42)
# rf.fit(X_train, y_train)
# feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# feature_importances[:20].plot(kind='bar')
# plt.title('Top 20 Feature Importances (Random Forest)')
# plt.ylabel('Importance')
# plt.tight_layout()
# plt.savefig('feature_importance.png')
# plt.close()
# print("\nTop 10 Feature Importances:")
# print(feature_importances.head(10))

# # Generate Chart.js chart for feature importance
# print("\nGenerating Chart.js bar chart for top 20 feature importances:")
# top_20_features = feature_importances[:20]
# chart_config = {
#     "type": "bar",
#     "data": {
#         "labels": top_20_features.index.tolist(),
#         "datasets": [{
#             "label": "Feature Importance",
#             "data": top_20_features.values.tolist(),
#             "backgroundColor": [
#                 "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
#                 "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
#                 "#aec7e8", "#ffbb78", "#98df8a", "#ff9896", "#c5b0d5",
#                 "#c49c94", "#f7b6d2", "#c7c7c7", "#dbdb8d", "#9edae5"
#             ],
#             "borderColor": "#ffffff",
#             "borderWidth": 1
#         }]
#     },
#     "options": {
#         "scales": {
#             "y": {
#                 "beginAtZero": True,
#                 "title": {
#                     "display": True,
#                     "text": "Importance"
#                 }
#             },
#             "x": {
#                 "title": {
#                     "display": True,
#                     "text": "Feature"
#                 },
#                 "ticks": {
#                     "autoSkip": False,
#                     "maxRotation": 45,
#                     "minRotation": 45
#                 }
#             }
#         },
#         "plugins": {
#             "title": {
#                 "display": True,
#                 "text": "Top 20 Feature Importances (Random Forest)"
#             },
#             "legend": {
#                 "display": False
#             }
#         }
#     }
# }
# print("```chartjs")
# print(chart_config)
# print("```")

# # 3. Univariate Feature Selection - Bar Chart
# selector = SelectKBest(score_func=f_classif, k=20)
# selector.fit(X_train, y_train)
# scores = pd.Series(selector.scores_, index=X_train.columns).sort_values(ascending=False)
# plt.figure(figsize=(10, 6))
# scores[:20].plot(kind='bar')
# plt.title('Top 20 Feature Scores (ANOVA F-test)')
# plt.ylabel('F-score')
# plt.tight_layout()
# plt.savefig('univariate_scores.png')
# plt.close()
# print("\nTop 10 Univariate Feature Scores:")
# print(scores.head(10))

# # 4. Recursive Feature Elimination - Plot
# n_features_list = [10, 20, 50, 100]
# val_accuracies = []
# for n in n_features_list:
#     rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=n)
#     rfe.fit(X_train, y_train)
#     X_train_rfe = X_train[X_train.columns[rfe.support_]]
#     X_val_rfe = X_val[X_train.columns[rfe.support_]]
#     rf_rfe = RandomForestClassifier(random_state=42)
#     rf_rfe.fit(X_train_rfe, y_train)
#     y_val_pred = rf_rfe.predict(X_val_rfe)
#     val_accuracies.append(accuracy_score(y_val, y_val_pred))
# plt.figure(figsize=(8, 5))
# plt.plot(n_features_list, val_accuracies, marker='o')
# plt.title('Validation Accuracy vs. Number of Features (RFE)')
# plt.xlabel('Number of Features')
# plt.ylabel('Validation Accuracy')
# plt.grid(True)
# plt.savefig('rfe_accuracy.png')
# plt.close()

# # 5. Feature Distribution - Box Plots
# top_features = feature_importances.index[:5]
# plt.figure(figsize=(12, 8))
# for i, feature in enumerate(top_features, 1):
#     plt.subplot(2, 3, i)
#     sns.boxplot(x=y_train, y=X_train[feature])
#     plt.title(f'{feature}')
#     plt.xlabel('Winner (0 = Fighter_Y, 1 = Fighter_X)')
# plt.tight_layout()
# plt.savefig('feature_distributions.png')
# plt.close()

# # Final Model with Selected Features (Top 50 from RFE)
# rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=50)
# rfe.fit(X_train, y_train)
# selected_features = X_train.columns[rfe.support_].tolist()
# X_train_selected = X_train[selected_features]
# X_val_selected = X_val[selected_features]
# X_test_selected = X_test[selected_features]

# # Train Random Forest
# rf_final = RandomForestClassifier(random_state=42)
# param_grid_rf = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }
# grid_search_rf = GridSearchCV(rf_final, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search_rf.fit(X_train_selected, y_train)
# best_rf = grid_search_rf.best_estimator_
# print(f"\nBest Random Forest Hyperparameters: {grid_search_rf.best_params_}")

# # Evaluate Random Forest
# y_val_pred_rf = best_rf.predict(X_val_selected)
# y_test_pred_rf = best_rf.predict(X_test_selected)
# print("\nRandom Forest Performance (50 Features):")
# print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf):.4f}")
# print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_rf):.4f}")
# print(f"Validation Precision: {precision_score(y_val, y_val_pred_rf):.4f}")
# print(f"Validation Recall: {recall_score(y_val, y_val_pred_rf):.4f}")
# print(f"Validation F1-Score: {f1_score(y_val, y_val_pred_rf):.4f}")
# print(f"Validation ROC-AUC: {roc_auc_score(y_val, best_rf.predict_proba(X_val_selected)[:, 1]):.4f}")

# # Train XGBoost
# xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
# param_grid_xgb = {
#     'n_estimators': [100, 200],
#     'max_depth': [3, 6, 10],
#     'learning_rate': [0.01, 0.1],
#     'subsample': [0.8, 1.0]
# }
# grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search_xgb.fit(X_train_selected, y_train)
# best_xgb = grid_search_xgb.best_estimator_
# print(f"\nBest XGBoost Hyperparameters: {grid_search_xgb.best_params_}")

# # Evaluate XGBoost
# y_val_pred_xgb = best_xgb.predict(X_val_selected)
# y_test_pred_xgb = best_xgb.predict(X_test_selected)
# print("\nXGBoost Performance (50 Features):")
# print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_xgb):.4f}")
# print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_xgb):.4f}")
# print(f"Validation Precision: {precision_score(y_val, y_val_pred_xgb):.4f}")
# print(f"Validation Recall: {recall_score(y_val, y_val_pred_xgb):.4f}")
# print(f"Validation F1-Score: {f1_score(y_val, y_val_pred_xgb):.4f}")
# print(f"Validation ROC-AUC: {roc_auc_score(y_val, best_xgb.predict_proba(X_val_selected)[:, 1]):.4f}")

# # Save predictions
# test_df['Prob_Fighter_X'] = best_rf.predict_proba(X_test_selected)[:, 1]
# test_df['Prob_Fighter_Y'] = best_rf.predict_proba(X_test_selected)[:, 0]
# test_df['Predicted_Winner'] = test_df.apply(
#     lambda x: x['fighter_x_Fighter Full Name'] if x['Prob_Fighter_X'] > 0.5 else x['fighter_y_Fighter Full Name'], axis=1)
# test_df.to_csv('ufc_fights_merged_updated.csv', index=False)
# print("\nUpdated test set saved to 'ufc_fights_merged_updated.csv'")

Class balance in training set:
target
0    0.511861
1    0.488139
Name: proportion, dtype: float64
Validation set size: 1020 rows
Test set size: 1141 rows
Highly correlated feature pairs (>0.8): 0
Example pairs: []

Top 10 Feature Importances:
fighter_x_Round 2 Significant Strike Ground Attempted Past Round 2 Win-Weighted    0.427149
Weight Class                                                                       0.216271
fighter_y_Round 2 Submission Attempted Past Total Round 2                          0.111968
fighter_x_Round 2 Submission Attempted Past Total Round 2                          0.091655
fighter_y_Stance                                                                   0.067069
fighter_x_Stance                                                                   0.057718
fighter_y_Round 4 Significant Strike Ground Landed Past Round 4 Win-Weighted       0.028171
dtype: float64

Generating Chart.js bar chart for top 20 feature importances:
```chartjs
{'type': 'bar', 'data':

In [None]:
#Rough 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

# Load the data
input_file = "ufc_fights_merged_updated.csv"  # Replace with your actual file path
df = pd.read_csv(input_file)

# Convert Event Date to datetime
df['Event Date'] = pd.to_datetime(df['Event Date'])

# Backtesting parameters
initial_bankroll = 10000  # Starting capital
bet_amount = 100  # Fixed bet amount per fight
confidence_threshold = 0.6  # Bet only if predicted probability > 60%

def american_to_decimal(odds):
    """Convert American odds to decimal odds."""
    if pd.isna(odds):
        return 2.0  # Default to +100 if odds missing
    if odds > 0:
        return (odds / 100) + 1
    else:
        return (100 / abs(odds)) + 1

def backtest_betting(df, bet_amount, confidence_threshold, initial_bankroll):
    """Run backtesting simulation for betting strategy."""
    bankroll = initial_bankroll
    bets_placed = 0
    bets_won = 0
    total_profit = 0
    bet_log = []

    # Sort by date and Fight ID to ensure chronological order
    df = df.sort_values(['Event Date', 'Fight ID']).reset_index(drop=True)

    # Process fights in pairs to avoid double-betting
    processed_fights = set()  # Track processed Fight IDs

    for idx in range(0, len(df), 2):  # Iterate in steps of 2 for paired fighters
        if idx + 1 >= len(df):
            break

        row = df.iloc[idx]  # Take first row of the pair (fighter_x and fighter_y data)

        # Skip if Fight ID already processed, winner missing, or future fight
        fight_id = row['Fight ID']
        if (fight_id in processed_fights or
                pd.isna(row['Winner Names']) or
                row['Event Date'] > pd.to_datetime(datetime.now())):
            continue

        # Extract key data
        event_date = row['Event Date']
        fighter_x = row['fighter_x_Fighter Full Name']
        fighter_y = row['fighter_y_Fighter Full Name']
        prob_x = row['Prob_Fighter_X']
        prob_y = row['Prob_Fighter_Y']
        predicted_winner = row['Predicted_Winner']
        actual_winner = row['Winner Names']
        odds_x = row.get('fighter_x_odds', 100)  # Default to +100 if missing
        odds_y = row.get('fighter_y_odds', 100)

        # Convert odds to decimal
        decimal_odds_x = american_to_decimal(odds_x)
        decimal_odds_y = american_to_decimal(odds_y)

        # Betting decision
        bet_placed = False
        bet_on = None
        bet_odds = None
        bet_profit = 0
        bet_prob = 0

        if prob_x > confidence_threshold and predicted_winner == fighter_x:
            bet_placed = True
            bet_on = fighter_x
            bet_odds = decimal_odds_x
            bet_prob = prob_x
        elif prob_y > confidence_threshold and predicted_winner == fighter_y:
            bet_placed = True
            bet_on = fighter_y
            bet_odds = decimal_odds_y
            bet_prob = prob_y

        # Process the bet
        if bet_placed:
            bets_placed += 1
            if bet_on == actual_winner:
                bets_won += 1
                bet_profit = (bet_odds - 1) * bet_amount  # Profit = (decimal odds - 1) * bet amount
                bankroll += bet_profit
                total_profit += bet_profit
            else:
                bet_profit = -bet_amount
                bankroll += bet_profit
                total_profit += bet_profit

            # Log the bet
            bet_log.append({
                'Fight ID': fight_id,
                'Event Date': event_date,
                'Bet On': bet_on,
                'Predicted Winner': predicted_winner,
                'Actual Winner': actual_winner,
                'Bet Amount': bet_amount,
                'Odds (Decimal)': bet_odds,
                'Probability': bet_prob,
                'Profit': bet_profit,
                'Bankroll': bankroll
            })

        # Mark fight as processed
        processed_fights.add(fight_id)

    # Calculate metrics
    win_rate = bets_won / bets_placed if bets_placed > 0 else 0
    roi = (total_profit / (bets_placed * bet_amount)) * 100 if bets_placed > 0 else 0
    # Filter historical fights for accuracy calculation
    historical_df = df[df['Event Date'] <= pd.to_datetime(datetime.now())]
    accuracy = (historical_df['Predicted_Winner'] == historical_df['Winner Names']).mean() if 'Winner Names' in historical_df.columns else 0

    # Print results
    print(f"Backtesting Results:")
    print(f"Initial Bankroll: ${initial_bankroll:.2f}")
    print(f"Final Bankroll: ${bankroll:.2f}")
    print(f"Total Bets Placed: {bets_placed}")
    print(f"Bets Won: {bets_won}")
    print(f"Win Rate: {win_rate:.4f}")
    print(f"Total Profit: ${total_profit:.2f}")
    print(f"ROI: {roi:.2f}%")
    print(f"Model Accuracy: {accuracy:.4f}")

    # Save bet log
    bet_log_df = pd.DataFrame(bet_log)
    bet_log_df.to_csv("ufc_betting_log.csv", index=False)
    print("\nBetting log saved to 'ufc_betting_log.csv'")

    # Plot cumulative profit
    if not bet_log_df.empty:
        bet_log_df['Cumulative Profit'] = bet_log_df['Profit'].cumsum()
        plt.figure(figsize=(10, 6))
        plt.plot(bet_log_df['Event Date'], bet_log_df['Cumulative Profit'], label='Cumulative Profit')
        plt.xlabel('Event Date')
        plt.ylabel('Cumulative Profit ($)')
        plt.title('Cumulative Profit Over Time')
        plt.grid(True)
        plt.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig('cumulative_profit_plot.png')
        plt.close()
        print("Cumulative profit plot saved to 'cumulative_profit_plot.png'")

    return bet_log_df
P
# Run backtesting
bet_log_df = backtest_betting(df, bet_amount, confidence_threshold, initial_bankroll)

# Display sample of bet log
print("\nSample of Betting Log:")
print(bet_log_df[['Fight ID', 'Event Date', 'Bet On', 'Predicted Winner', 'Actual Winner',
                 'Bet Amount', 'Odds (Decimal)', 'Probability', 'Profit']].head())

Backtesting Results:
Initial Bankroll: $10000.00
Final Bankroll: $9400.00
Total Bets Placed: 74
Bets Won: 34
Win Rate: 0.4595
Total Profit: $-600.00
ROI: -8.11%
Model Accuracy: 0.4741

Betting log saved to 'ufc_betting_log.csv'
Cumulative profit plot saved to 'cumulative_profit_plot.png'

Sample of Betting Log:
   Fight ID Event Date          Bet On Predicted Winner         Actual Winner  \
0      5116 2023-01-14  Raoni Barcelos   Raoni Barcelos     Umar Nurmagomedov   
1      5122 2023-01-21   Josiane Nunes    Josiane Nunes         Josiane Nunes   
2      5166 2023-02-18      Jim Miller       Jim Miller   Alexander Hernandez   
3      5186 2023-03-04  Julian Marquez   Julian Marquez  Marc-Andre Barriault   
4      5194 2023-03-04       Jon Jones        Jon Jones             Jon Jones   

   Bet Amount  Odds (Decimal)  Probability  Profit  
0         100             2.0     0.621971  -100.0  
1         100             2.0     0.639243   100.0  
2         100             2.0     0.61778