# Project description
This project uses the The Big Dataset of Ultra Marathon Running to explore long distance running performances and build a predictive model.
The goal is to use the full historical dataset to train a model that can predict the finishing time of runners in a chosen race, but only for athletes who have previous race results recorded in the dataset.

Ultra marathons vary greatly in distance, terrain, and difficulty. Runners often participate in many races over multiple years. 
This project focuses on:

cleaning and standardizing the large dataset

analyzing athlete histories

engineering features such as race difficulty and runner consistency

training a model using global trends while giving extra importance to each runner’s past performances

The model will be used to predict the finishing times of runners in the 2022 Western States Ultra Endurance Run.

In [2]:
# 1_load.py
import pandas as pd

def load_raw_data(path: str) -> pd.DataFrame:
    """Load the raw ultramarathon dataset."""
    return pd.read_csv(path, low_memory=False)



# Exploratory Data Analysis

In [13]:
#EDA keep seperate from pipeline
PATH = "/kaggle/input/the-big-dataset-of-ultra-marathon-running/TWO_CENTURIES_OF_UM_RACES.csv"
df_raw = pd.read_csv(PATH, low_memory=False)

print(df_raw.shape)
df_raw.head()
df_raw.isna().sum()

print("Shape: ", df_raw.shape)
print("Head: ", df_raw.head())
print(df_raw.isna().sum())


def clean_event_name(name):
    if not isinstance(name, str):
        return name
    
    # 1. Lowercase for standardization
    name = name.lower()
    
    # 2. Remove year (19xx or 20xx) regardless of where it is
    # Handles 2018, (2018), 9ª (edition numbers)
    name = re.sub(r'\(?\b(19|20)\d{2}\b\)?', '', name)
    name = re.sub(r'\d+ª', '', name) 
    
    # 3. Remove country codes in brackets like (GER), (USA), (ESP)
    name = re.sub(r'\([a-z]{3}\)', '', name)
    
    # 4. Remove Distance/Stage noise
    # Strips "100km", "50 mile", "6 hour", "etappe", "kidsrun"
    noise_patterns = [
        r'\d+\s*(km|k|miles|mile|mi|h|hr|hour|hours|stunden|óra|órás)\b',
        r'etappe[:\s]\d+',
        r'kidsrun',
        r'winter challenge',
        r'trail race',
        r'road race'
    ]
    for pattern in noise_patterns:
        name = re.sub(pattern, '', name)

    # 5. Clean up symbols and extra whitespace
    name = re.sub(r'[^a-z\s]', ' ', name) # Remove everything except letters and spaces
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name
    
df_raw['Event_name_clean'] = df_raw['Event name'].apply(clean_event_name)

print(df_raw['Event_name_clean'].unique()[:20])

# Look for names that definitely contain 4 digits (likely years)
sample_years = df_raw[df_raw['Event_name_clean'].str.contains(r'\d{4}', na=False)]['Event_name_clean'].unique()[:10]
print("\nNames containing digits:")
print(sample_years)

(7461195, 13)
Shape:  (7461195, 13)
Head:     Year of event Event dates           Event name Event distance/length  \
0           2018  06.01.2018  Selva Costera (CHI)                  50km   
1           2018  06.01.2018  Selva Costera (CHI)                  50km   
2           2018  06.01.2018  Selva Costera (CHI)                  50km   
3           2018  06.01.2018  Selva Costera (CHI)                  50km   
4           2018  06.01.2018  Selva Costera (CHI)                  50km   

   Event number of finishers Athlete performance        Athlete club  \
0                         22           4:51:39 h               Tnfrc   
1                         22           5:15:45 h  Roberto Echeverría   
2                         22           5:16:44 h   Puro Trail Osorno   
3                         22           5:34:13 h            Columbia   
4                         22           5:54:14 h      Baguales Trail   

  Athlete country  Athlete year of birth Athlete gender Athlete age categ

Some event names contain years, sponsor names, or inconsistent formatting.
Many missing distances match specific races that do not include distance in the name.
This motivated the later step of inferring numeric distance from textual patterns.

In [6]:
#Check which events have missing distances
df[df["Event distance/length"].isna()][['Event name', 'Year of event']].drop_duplicates()

NameError: name 'df' is not defined

# Data Cleaning

We removed Athlete club and Athlete country because they do not contribute to performance prediction and contain excessive missing values.

We dropped rows missing gender, year of birth, or age category because these represent essential physiological predictors of endurance performance, and imputing them would introduce bias.

Next, see if we can infer Event distance/length from other features


In [15]:
import re
import numpy as np
import pandas as pd

def clean_event_name(name):
    if not isinstance(name, str):
        return name
    
    # 1. Lowercase and basic strip
    name = name.lower().strip()
    
    # 2. Remove year (19xx or 20xx) - Keep the distance!
    # This removes 2018, (2018), etc. but leaves "100km"
    name = re.sub(r'\(?\b(19|20)\d{2}\b\)?', '', name)
    
    # 3. Remove country codes in brackets like (GER), (USA)
    name = re.sub(r'\([a-z]{3}\)', '', name)
    
    # 4. Remove edition numbers (e.g., 9ª, 10th)
    name = re.sub(r'\d+(ª|th|st|nd|rd)', '', name)

    # 6. Clean up symbols (keep numbers for the distances)
    # We remove punctuation but keep alphanumeric so "100km" stays "100km"
    name = re.sub(r'[^a-z0-9\s]', ' ', name)
    
    # 7. Standardize spacing
    name = re.sub(r'\s+', ' ', name).strip()
    
    return name

def convert_to_seconds(time):
    if 'd' in time:
        days, time = time.split('d')
        days = int(days)
        time = time.strip()
    else:
        days = 0
    h, m, s = map(int, time.split(':'))
    return days * 86400 + h * 3600 + m * 60 + s


def clean_data(df):
    """
    Clean ultra-marathon dataset:
    - Drop weak/high-null columns
    - Drop rows missing essential info
    - Remove irrelevant events
    - Clean 'Event distance/length' and convert to numeric km
    - Correct specific event distance anomalies
    - Remove athletes with conflicting gender data
    - Clean and convert 'Athlete performance' to seconds
    - Compute pace (min/km)
    """
    
    # Drop weak-value columns
    df = df.drop(columns=['Athlete club', 'Athlete country'], errors='ignore')

    # Drop rows missing critical fields
    df = df.dropna(subset=[
        "Athlete gender",
        "Athlete year of birth",
        "Athlete age category",
        "Athlete performance"
    ])
    
    # Remove irrelevant events
    events_to_remove = [
    "Stockholm Fotrally \\(SWE\\)",
    "Maratonmarschen Stockholm \\(SWE\\)"
    ]
    df = df[~df['Event name'].str.contains('|'.join(events_to_remove), case=False)]

    # Clean and convert 'Event distance/length' to numeric km
    df['Event distance/length'] = df['Event distance/length'].astype(str).str.strip()
    distance_str = df['Event distance/length'].str.lower().str.strip()
    distance_clean = distance_str.str.replace('km', '').str.replace('k', '').str.strip()
    
    def extract_number(x):
        match = re.search(r'\d+(\.\d+)?', x)
        if match:
            return float(match.group())
        return np.nan
    distance_numeric = distance_clean.apply(extract_number)

    # Convert miles to km
    miles_mask = distance_clean.str.contains(r'm|mi|mile|miles', regex=True, case=False)
    distance_numeric.loc[miles_mask] = distance_numeric.loc[miles_mask] * 1.60934

    # Remove time-based or stage-based events
    time_mask = distance_str.str.contains(r'h|hr|hour|hours|d|day|days|min', regex=True)
    stage_mask = distance_str.str.contains(r'/|:|x', regex=True)
    remove_mask = time_mask | stage_mask
    df = df[~remove_mask].copy()

    # Assign cleaned numeric distance
    df['Event distance_numeric'] = distance_numeric.loc[df.index]

    # Correct specific anomalous event distances
    df.loc[df['Event name'] == "Ultraroztocze 120km (POL)", 'Event distance_numeric'] = 120
    df.loc[df['Event name'] == "Ultraroztocze 120km (POL)", 'Event distance/length'] = '120km'

    # Remove extreme distances (>250 km)
    df = df[df['Event distance_numeric'].notna() & 
                    (df['Event distance_numeric'] <= 250)].copy()

    # Remove athletes with multiple genders
    gender_counts = df.groupby("Athlete ID")["Athlete gender"].nunique()
    conflicting_ids = gender_counts[gender_counts > 1].index
    df = df[~df["Athlete ID"].isin(conflicting_ids)].copy()

    # Clean 'Athlete performance' and convert to seconds
    df['Athlete performance'] = df['Athlete performance'].astype(str).str.replace(' h', '').str.replace(' km', '')

    pattern1 = r'^\d{1,2}:\d{2}:\d{2}$'
    pattern2 = r'^\d+d \d{2}:\d{2}:\d{2}$'
    mask = df['Athlete performance'].apply(lambda x: bool(re.match(pattern1, x)) or bool(re.match(pattern2, x)))


    df.loc[mask, 'time_in_seconds'] = df.loc[mask, 'Athlete performance'].apply(convert_to_seconds)

    # Compute pace (min/km)
    df['pace_min_per_km'] = (df['time_in_seconds'] / 60) / df['Event distance_numeric']

    df['Event_name_clean'] = df['Event name'].apply(clean_event_name)

    unique_before = df['Event name'].nunique()
    unique_after = df['Event_name_clean'].nunique()

    print(f"Unique Event Names (Before): {unique_before:,}")
    print(f"Unique Event Names (After):  {unique_after:,}")
    print(f"Reduction: {((unique_before - unique_after) / unique_before) * 100:.2f}%")

    return df

# Feature Engineering

In [6]:
def engineer_features(df):
    """
    Create cumulative and rolling features for ultramarathon dataset:
    - cum_num_races, cum_avg_pace, cum_best_pace
    - cum_total_distance, cum_avg_distance, cum_shortest_distance, cum_longest_distance
    - cum_ws_finishes, recent_avg_distance, distance_gap_from_longest
    - athlete_age
    """
    
    # Sort for cumulative calculations
    df = df.sort_values(by=["Athlete ID", "Year of event", "Event name"]).reset_index(drop=True)

    # Cumulative number of races
    df["cum_num_races"] = df.groupby("Athlete ID").cumcount()

    # Cumulative average pace (excluding current race)
    df["cum_avg_pace"] = df.groupby("Athlete ID")["pace_min_per_km"].expanding().mean().shift(1).reset_index(level=0, drop=True)

    # Cumulative best pace (excluding current race)
    df["cum_best_pace"] = df.groupby("Athlete ID")["pace_min_per_km"].expanding().min().shift(1).reset_index(level=0, drop=True)

    # Cumulative distance stats
    df["cum_total_distance"] = df.groupby("Athlete ID")["Event distance_numeric"].cumsum()
    df["cum_avg_distance"] = df["cum_total_distance"] / df["cum_num_races"]
    grp = df.groupby("Athlete ID")["Event distance_numeric"]
    df["cum_shortest_distance"] = grp.cummin()
    df["cum_longest_distance"] = grp.cummax()

    # Cumulative Western States finishes
    df["cum_ws_finishes"] = (
        df["Event name"].eq("Western States")# True for WS
          .groupby(df["Athlete ID"])# per athlete
          .cumsum()# cumulative sum
          .shift(1)# exclude current race
          .fillna(0)# first race has 0
          .astype(int)
    )

    # Recent average distance (rolling 3 races)
    df["recent_avg_distance"] = (
        df.groupby("Athlete ID")["Event distance_numeric"]
          .rolling(3, min_periods=1).mean()
          .reset_index(level=0, drop=True)
          .shift(1)
    )

    # Distance gap from longest
    df["distance_gap_from_longest"] = df["Event distance_numeric"] - df["cum_longest_distance"]

    # Athlete age
    df["athlete_age"] = df["Year of event"] - df["Athlete year of birth"]

    return df

# Test/Train Split

In [7]:
def split_train_test(df):
    """
    Split dataset into train/test while preventing leakage from future races:
    - Test: Western States 2022
    - Train: all prior races of athletes in test set + other athletes
    """
    
    feature_cols = ['Year of event', 'Event number of finishers', 'Athlete gender', 
                    'Event distance_numeric', 'cum_num_races', 'cum_avg_pace', 
                    'cum_best_pace', 'cum_ws_finishes', 'cum_total_distance', 
                    'cum_avg_distance', 'cum_shortest_distance', 'cum_longest_distance', 
                    'recent_avg_distance', 'distance_gap_from_longest', 'athlete_age']

    # Define test set: Western States 2022
    df_test = df[
        (df["Year of event"] == 2022) &
        (df["Event name"].str.contains("Western States", case=False, na=False))
    ].copy()

    # Identify runners in test set
    ws_mask = (df["Year of event"] == 2022) & df["Event name"].str.contains("Western States", case=False, na=False)
    ws_2022_runners = df.loc[ws_mask, "Athlete ID"].unique()

    # Cutoff cumulative races per athlete
    ws_cutoffs = df[ws_mask].set_index("Athlete ID")["cum_num_races"]

    # Map cutoff to all data
    df["ws_cutoff"] = df["Athlete ID"].map(ws_cutoffs)  # NaN for runners not in WS 2022

    # Keep only races before test race OR athletes not in test set
    mask = (df["ws_cutoff"].notna() & (df["cum_num_races"] < df["ws_cutoff"])) | df["ws_cutoff"].isna()
    df_train = df[mask].copy()

    # Drop temporary column
    df_train = df_train.drop(columns="ws_cutoff")

    print("Test set shape:", df_test.shape)
    print("Train set shape:", df_train.shape)

    return df_train, df_test, feature_cols

# Feature and Target Preparation

In [None]:
def apply_smoothed_target_encoding(train_df, test_df, column='Event_name_clean', target='pace_min_per_km', m=10):
    """
    Computes smoothed target encoding for the event names.
    m is the 'smoothing' factor (higher m = more conservative).
    """
    # Calculate global mean from training data only (No leakage!)
    global_mean = train_df[target].mean()

    # Calculate count and mean for each race
    agg = train_df.groupby(column)[target].agg(['count', 'mean'])
    
    # Calculate the smoothed value
    # Formula: (count * mean + m * global_mean) / (count + m)
    smooth_weights = (agg['count'] * agg['mean'] + m * global_mean) / (agg['count'] + m)
    
    # Map the weights back to the dataframes
    train_df['Race_Pace_Mean_Encoded'] = train_df[column].map(smooth_weights).fillna(global_mean)
    test_df['Race_Pace_Mean_Encoded'] = test_df[column].map(smooth_weights).fillna(global_mean)
    
    return train_df, test_df

def prepare_model_data(df_train, df_test, feature_cols, target_col='pace_min_per_km'):
    """
    Prepare train/test datasets for modeling:
    - Select features
    - One-hot encode categorical variables
    - Align columns
    - Define target
    - Clean infinities
    """
    
    df_train, df_test = apply_difficulty_encoding(df_train, df_test)
    
    # Select features
    X_train = df_train[feature_cols].copy()
    X_test  = df_test[feature_cols].copy()

    # One-hot encode categorical variables (gender)
    X_train = pd.get_dummies(X_train, columns=['Athlete gender'], drop_first=True)
    X_test  = pd.get_dummies(X_test,  columns=['Athlete gender'], drop_first=True)

    # Align columns
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Define target
    y_train = df_train[target_col]
    y_test  = df_test[target_col]

    # Clean infinities
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Fill NaNs (including former infinities) with -1
    X_train.fillna(-1, inplace=True)
    X_test.fillna(-1, inplace=True)


    print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test


# Modelling

In [16]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def train_evaluate_lgbm(X_train, y_train, X_test, y_test, params=None):
    """
    Train a LightGBM regressor and evaluate performance on test set.
    Returns the fitted model and predictions.
    """
    if params is None:
        params = {
            "n_estimators": 500,
            "learning_rate": 0.05,
            "num_leaves": 64,
            "max_depth": -1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 42,
            "verbose": -1
        }

    # Initialize model
    model = lgb.LGBMRegressor(**params)

    # Fit model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2  = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"R2 : {r2:.4f}")

    return model, y_pred


In [None]:
#Pipeline
PATH = "/kaggle/input/the-big-dataset-of-ultra-marathon-running/TWO_CENTURIES_OF_UM_RACES.csv"

# -----------------------------
# Imports
# -----------------------------
import pandas as pd

# Assume all your functions are already defined:
# clean_data(), engineer_features(), split_train_test(), prepare_model_data(), train_evaluate_lgbm()

# -----------------------------
# Pipeline Function
# -----------------------------
def run_pipeline(raw_csv_path):
    """
    Full Kaggle-ready ML pipeline for ultra-marathon pace prediction:
    1. Load raw data
    2. Clean data
    3. Feature engineering
    4. Train/Test split
    5. Prepare features & target
    6. Train & evaluate LightGBM model
    """
    # 1. Load data
    df = load_raw_data(raw_csv_path)
    print("Raw data shape:", df.shape)

    # 2. Clean data
    df_clean = clean_data(df)
    print("After cleaning:", df_clean.shape)

    # 3. Feature engineering
    df_features = engineer_features(df_clean)
    print("After feature engineering:", df_features.shape)

    # 4. Train/Test split
    df_train, df_test, feature_cols = split_train_test(df_features)
    print("Train/Test split done. Train:", df_train.shape, "Test:", df_test.shape)

    # 5. Prepare features & target
    X_train, X_test, y_train, y_test = prepare_model_data(df_train, df_test, feature_cols)
    print("Feature & target preparation done. X_train:", X_train.shape)

    # 6. Train & evaluate model
    model, y_pred = train_evaluate_lgbm(X_train, y_train, X_test, y_test)

    return model, X_train, X_test, y_train, y_test, y_pred

# -----------------------------
# Run the pipeline
# -----------------------------
model, X_train, X_test, y_train, y_test, y_pred = run_pipeline(PATH)


Raw data shape: (7461195, 13)
Unique Event Names (Before): 20,058
Unique Event Names (After):  19,587
Reduction: 2.35%
After cleaning: (6255867, 15)
After feature engineering: (6255867, 26)


  return op(a, b)


Test set shape: (301, 26)
Train set shape: (6255548, 26)
Train/Test split done. Train: (6255548, 26) Test: (301, 26)
Shapes: (6255548, 15) (6255548,) (301, 15) (301,)
Feature & target preparation done. X_train: (6255548, 15)
