In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import cpu_count

from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Add, LeakyReLU, Activation
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.activations import gelu

from statsmodels.tsa.holtwinters import Holt
import scipy

In [2]:
# ------------------------------------------------------------------
# 1) Enable multi-threading in TensorFlow
# ------------------------------------------------------------------
os.environ["OMP_NUM_THREADS"] = "20"
os.environ["MKL_NUM_THREADS"] = "20"
tf.config.threading.set_intra_op_parallelism_threads(20)
tf.config.threading.set_inter_op_parallelism_threads(20)

# ------------------------------------------------------------------
# 2) Load & Basic Cleaning
# ------------------------------------------------------------------
data_path = "MCM_2025\\2025_MCM-ICM_Problems\\2025_Problem_C_Data\\summerOly_athletes.csv"
df = pd.read_csv(data_path)

df = df[~df['Team'].str.contains(r'\d|\.', na=False)]
roman_pattern = r'(?:\s|-)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})(?:\s|$)'
df = df[~df['Team'].str.contains(roman_pattern, na=False, flags=re.IGNORECASE)]

recent_years = [2020, 2024]
df = df[df["NOC"].isin(df[df["Year"].isin(recent_years)]["NOC"].unique())]

# Remove events that appeared <4 times
event_counts = df["Event"].value_counts()
df = df[df["Event"].isin(event_counts[event_counts >= 4].index)]

# Convert Year to int
df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype(int)

# ------------------------------------------------------------------
# 3) Feature Engineering (Row-level)
# ------------------------------------------------------------------
df["Seed_Player"] = df.groupby("Name")["Medal"].transform(
    lambda x: (x == "Gold").cumsum() + (x == "Silver").cumsum()
).fillna(0).astype(int)

df["Elite_Player"] = df.groupby("Name")["Medal"].transform(
    lambda x: max(0, (x != "No medal").sum() - 1 + (x == "Gold").sum())
).fillna(0).astype(int)

df["Participation_Count"] = df.groupby("Name")["Year"].transform("count")
df["Retiring_Player"] = df["Participation_Count"].apply(lambda x: max(0, x - 3)).astype(int)

host_cities = {2008: "CHN", 2012: "GBR", 2016: "BRA", 2020: "JPN", 2024: "FRA"}
df["Is_Host"] = df.apply(lambda row: 1 if host_cities.get(row['Year'], None) == row['NOC'] else 0, axis=1)

df["NOC"] = df["NOC"].astype(str)
df["Name"] = df["Name"].astype(str)

# ------------------------------------------------------------------
# 4) Aggregation by (Year, NOC)
# ------------------------------------------------------------------
df_num_cols = df.select_dtypes(include=["number"]).columns.difference(["Year"]).tolist()

df_noc_year = (
    df.groupby(["Year", "NOC"], as_index=False)
      .apply(lambda subdf: subdf[df_num_cols].sum(numeric_only=True))
      .reset_index(drop=True)
)

# Summation of medal counts
medal_counts = df.groupby(["Year", "NOC"])["Medal"].value_counts().unstack(fill_value=0).reset_index()
medal_counts.rename(columns={"Gold": "Medal_Gold", "Silver": "Medal_Silver", "Bronze": "Medal_Bronze"}, inplace=True)

df_noc_year = df_noc_year.merge(
    medal_counts, on=["Year", "NOC"], how="left"
).fillna(0)

# ------------------------------------------------------------------
# 5) One-Hot Encoding NOC
# ------------------------------------------------------------------
ohe_noc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_noc = ohe_noc.fit_transform(df_noc_year[["NOC"]])
noc_ohe_cols = [f"NOC_{cat}" for cat in ohe_noc.categories_[0]]

df_noc_year.reset_index(drop=True, inplace=True)
noc_ohe_df = pd.DataFrame(encoded_noc, columns=noc_ohe_cols)
df_noc_year_ohe = pd.concat([df_noc_year.drop(columns=["NOC"]), noc_ohe_df], axis=1)

with open("noc_encoder.pkl", "wb") as f:
    pickle.dump(ohe_noc, f)

# ------------------------------------------------------------------
# 6) Scaling
# ------------------------------------------------------------------
target_cols = ["Medal_Gold", "Medal_Silver", "Medal_Bronze"]
feature_cols = df_noc_year_ohe.columns.difference(["Year"] + target_cols)

scaler_features = StandardScaler()
scaler_targets = StandardScaler()

X_all = scaler_features.fit_transform(df_noc_year_ohe[feature_cols])
y_all = scaler_targets.fit_transform(df_noc_year_ohe[target_cols])

with open("scaler_features.pkl", "wb") as f:
    pickle.dump(scaler_features, f)
with open("scaler_targets.pkl", "wb") as f:
    pickle.dump(scaler_targets, f)

# Keep a reference to unscaled aggregator
df_unscaled = df_noc_year_ohe.copy()

df_noc_year_ohe

  .apply(lambda subdf: subdf[df_num_cols].sum(numeric_only=True))


Unnamed: 0,Year,Elite_Player,Is_Host,Participation_Count,Retiring_Player,Seed_Player,Medal_Bronze,Medal_Gold,No medal,Medal_Silver,...,NOC_URU,NOC_USA,NOC_UZB,NOC_VAN,NOC_VEN,NOC_VIE,NOC_VIN,NOC_YEM,NOC_ZAM,NOC_ZIM
0,1896,20,0,25,10,5,1,2,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1896,16,0,29,10,2,2,1,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1896,35,0,132,87,19,3,1,9,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1896,30,0,90,22,13,2,5,15,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1896,24,0,139,76,12,3,3,16,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,2024,0,0,41,4,0,0,0,20,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3093,2024,0,0,5,0,0,0,0,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3094,2024,0,0,4,0,0,0,0,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3095,2024,0,0,47,0,0,1,0,31,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# ------------------------------------------------------------------
# 7) Define model building & callbacks
# ------------------------------------------------------------------

def build_medal_prediction_model(input_dim):
    def huber_loss(y_true, y_pred, delta=1.0):
        error = y_true - y_pred
        condition = tf.abs(error) < delta
        return tf.reduce_mean(tf.where(condition, 0.5 * tf.square(error), delta * (tf.abs(error) - 0.5 * delta)))

    inputs = Input(shape=(input_dim,))

    x = Dense(4096, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(inputs)
    x = BatchNormalization()(x)
    x = Activation('swish')(x)
    x = Dropout(0.3)(x)

    x1 = Dense(2048, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x1 = BatchNormalization()(x1)
    x1 = Activation('swish')(x1)

    x2 = Dense(2048, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x1)
    x2 = BatchNormalization()(x2)
    x2 = Activation('swish')(x2)
    x2 = Dropout(0.3)(x2)

    x_residual = Add()([x1, x2])

    x3 = Dense(1024, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x_residual)
    x3 = BatchNormalization()(x3)
    x3 = Activation('swish')(x3)
    x3 = Dropout(0.2)(x3)

    x4 = Dense(512, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x3)
    x4 = BatchNormalization()(x4)
    x4 = Activation('swish')(x4)
    x4 = Dropout(0.2)(x4)

    # Output layer
    outputs = Dense(3, activation="linear")(x4)  # Output: Gold, Silver, Bronze

    model = tf.keras.models.Model(inputs, outputs)

    optimizer = tf.keras.optimizers.AdamW(learning_rate=0.0001, weight_decay=1e-5)
    model.compile(optimizer=optimizer, loss=huber_loss, metrics=["mae"])
    
    return model


# ------------------------------------------------------------------
# Baseline function with a small "grid" of approaches
# ------------------------------------------------------------------

def build_baseline_model(x_vals, y_vals):
    best_mae = float("inf")
    best_pred_func = None

    # 1) Linear Regression
    lr = LinearRegression()
    lr.fit(x_vals.reshape(-1,1), y_vals)
    mae = mean_absolute_error(y_vals, lr.predict(x_vals.reshape(-1,1)))
    if mae < best_mae:
        best_mae = mae
        best_pred_func = ("Linear", lr)

    # 2) Polynomial Regression (degrees 2-4)
    for deg in [2,3,4]:
        poly_pipeline = make_pipeline(PolynomialFeatures(deg), LinearRegression())
        poly_pipeline.fit(x_vals.reshape(-1,1), y_vals)
        mae_poly = mean_absolute_error(y_vals, poly_pipeline.predict(x_vals.reshape(-1,1)))
        if mae_poly < best_mae:
            best_mae = mae_poly
            best_pred_func = (f"Poly_{deg}", poly_pipeline)

    # 3) Bayesian Ridge Regression
    br = BayesianRidge()
    br.fit(x_vals.reshape(-1,1), y_vals)
    mae_br = mean_absolute_error(y_vals, br.predict(x_vals.reshape(-1,1)))
    if mae_br < best_mae:
        best_mae = mae_br
        best_pred_func = ("BayesianRidge", br)

    # 4) Exponential Fit
    def exp_func(x, a, b):
        return a * np.exp(b * x)
    try:
        x_shift = x_vals - x_vals.min()
        popt, _ = scipy.optimize.curve_fit(exp_func, x_shift, y_vals, maxfev=10000)
        mae_exp = mean_absolute_error(y_vals, exp_func(x_shift, *popt))
        if mae_exp < best_mae:
            best_mae = mae_exp
            best_pred_func = ("Exponential", lambda x: exp_func(x - x_vals.min(), *popt))
    except:
        pass

    return best_pred_func

def predict_baseline_value(model_info, x_target):
    """
    Use the best_pred_func from build_baseline_model to predict a single y value for x_target.
    Handles different types of models gracefully.
    """
    name, model = model_info

    if name in ["Linear", "BayesianRidge"] or name.startswith("Poly_"):
        return model.predict([[x_target]])[0]

    elif name == "Exponential":
        return model(x_target)

    elif name == "Holt":
        # Forecasting using Holt-Winters
        try:
            forecast_steps = x_target - model.fittedvalues.index[-1]
            if forecast_steps < 1:
                forecast_steps = 1

            yhat_f = model.forecast(steps=forecast_steps)

            if len(yhat_f) > 0:
                return yhat_f.iloc[-1] 

        except Exception as e:
            print(f"Warning: Holt-Winters forecast failed for {x_target}: {e}")
            return 0.0 

    return 0.0  # Default return if no valid prediction can be made

In [4]:
# ------------------------------------------------------------------
# Workflow A: train up to <2024, EXTRAPOLATE to 2024, then compare
# ------------------------------------------------------------------
def timeseries_cv_train(X_train, y_train, scaler_targets, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    cv_results = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

        model_cv = build_medal_prediction_model(X_fold_train.shape[1])
        es = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
        rlr = ReduceLROnPlateau(monitor='val_loss', patience=15, factor=0.5, verbose=1)

        model_cv.fit(X_fold_train, y_fold_train, epochs=200, batch_size=16,
                     validation_data=(X_fold_val, y_fold_val),
                     callbacks=[es, rlr], verbose=2)

        val_preds = scaler_targets.inverse_transform(model_cv.predict(X_fold_val))
        val_actual = scaler_targets.inverse_transform(y_fold_val)
        val_mae = np.mean(np.abs(val_preds - val_actual))
        cv_results.append(val_mae)
        print(f"Fold {fold+1} MAE: {val_mae:.3f}")

    print(f"[TimeSeriesSplit] Average CV MAE: {np.mean(cv_results):.3f}")

    final_model = build_medal_prediction_model(X_train.shape[1])
    final_model.fit(X_train, y_train, epochs=300, batch_size=64, verbose=2, callbacks=[es])
    return final_model

def train_up_to_2024(X_all, y_all, df_agg):
    mask_train_2024 = (df_agg["Year"] < 2024)
    X_train_2024 = X_all[mask_train_2024]
    y_train_2024 = y_all[mask_train_2024]
    model_2024 = timeseries_cv_train(X_train_2024, y_train_2024, scaler_targets, n_splits=3)
    return model_2024

def build_baseline_and_predict_2024(model, df_agg, feature_cols, target_cols, scaler_features, scaler_targets):
    df_2024 = df_agg[df_agg["Year"] == 2020].copy()
    df_2024["Year"] = 2024

    noc_columns = [col for col in df_agg.columns if col.startswith("NOC_")]

    if 'NOC' not in df_agg.columns:
        df_agg["NOC"] = df_agg[noc_columns].idxmax(axis=1).str.replace("NOC_", "")

    df_2024["NOC"] = df_2024[noc_columns].idxmax(axis=1).str.replace("NOC_", "")

    if "NOC_FRA" in df_2024.columns:
        df_2024["Is_Host"] = df_2024["NOC_FRA"]  # Assuming FRA hosts 2024

    numeric_feats = [c for c in feature_cols if c not in noc_columns and c != "Year" and c != "Is_Host"]

    historical_data = df_agg[df_agg["Year"] < 2024].copy()
    historical_data["NOC"] = historical_data[noc_columns].idxmax(axis=1).str.replace("NOC_", "")

    for feat in numeric_feats:
        pivot_df = historical_data.pivot(index="Year", columns="NOC", values=feat).fillna(0)

        for noc in pivot_df.columns:
            xvals = np.array(pivot_df.index, dtype=float)
            yvals = pivot_df[noc].values

            if len(xvals) < 2:
                continue

            model_info = build_baseline_model(xvals, yvals)
            df_2024.loc[df_2024["NOC"] == noc, feat] = predict_baseline_value(model_info, 2024)

    for col in df_agg.columns:
        if col not in df_2024.columns:
            df_2024[col] = 0

    # Scale & predict
    X_2024_unscaled = df_2024[feature_cols].values
    X_2024_scaled = scaler_features.transform(X_2024_unscaled)

    preds_2024_sc = model.predict(X_2024_scaled)
    preds_2024 = scaler_targets.inverse_transform(preds_2024_sc)

    df_real_2024 = df_agg[df_agg["Year"] == 2024].reset_index(drop=True)
    df_real_2024["NOC"] = df_real_2024[noc_columns].idxmax(axis=1).str.replace("NOC_", "")

    actual_2024_sc = df_real_2024[target_cols].values
    actual_2024 = scaler_targets.inverse_transform(actual_2024_sc)

    df_result_2024 = pd.DataFrame({
        "NOC": df_real_2024["NOC"].astype(str),
        "Gold_Actual": actual_2024[:, 0],
        "Silver_Actual": actual_2024[:, 1],
        "Bronze_Actual": actual_2024[:, 2],
        "Gold_Predicted": preds_2024[:, 0],
        "Silver_Predicted": preds_2024[:, 1],
        "Bronze_Predicted": preds_2024[:, 2],
    })

    return df_result_2024

# Visualization for 2024 results
def visualize_2024_results(df_result_2024):
    df = df_result_2024.copy()
    df["Gold_Error_%"] = np.where(df["Gold_Actual"]>0, 
                                  abs(df["Gold_Predicted"]-df["Gold_Actual"])/df["Gold_Actual"]*100,0)
    df["Silver_Error_%"] = np.where(df["Silver_Actual"]>0,
                                  abs(df["Silver_Predicted"]-df["Silver_Actual"])/df["Silver_Actual"]*100,0)
    df["Bronze_Error_%"] = np.where(df["Bronze_Actual"]>0,
                                  abs(df["Bronze_Predicted"]-df["Bronze_Actual"])/df["Bronze_Actual"]*100,0)

    error_cols = ["Gold_Error_%","Silver_Error_%","Bronze_Error_%"]
    filtered_df = df[(df["Gold_Actual"]>0)|(df["Silver_Actual"]>0)|(df["Bronze_Actual"]>0)]

    plt.figure(figsize=(18,6))
    for i,col in enumerate(error_cols):
        plt.subplot(1,3,i+1)
        plt.hist(filtered_df[col], bins=50, range=(0,100), edgecolor='black', alpha=0.7)
        plt.title(f"Distribution of {col}")
        plt.xlabel("Error Percentage")
        plt.ylabel("Frequency")
        plt.grid(True)
    plt.tight_layout()
    plt.show()

    exceed_100 = (filtered_df[error_cols]>100).sum()/len(filtered_df)*100
    print("Percentage of data points exceeding 100% error (ignoring zero actual values):")
    print(exceed_100)

    # line chart
    filtered_df["Total_Actual"] = filtered_df["Gold_Actual"]+filtered_df["Silver_Actual"]+filtered_df["Bronze_Actual"]
    filtered_df.sort_values(by="Total_Actual", ascending=False, inplace=True)

    nocs = filtered_df["NOC"].values
    gold_actual = filtered_df["Gold_Actual"].values
    gold_pred = filtered_df["Gold_Predicted"].values
    silver_actual = filtered_df["Silver_Actual"].values
    silver_pred = filtered_df["Silver_Predicted"].values
    bronze_actual = filtered_df["Bronze_Actual"].values
    bronze_pred = filtered_df["Bronze_Predicted"].values

    # gold
    plt.figure(figsize=(12,4))
    plt.plot(nocs, gold_actual,label="Actual Gold",marker='o')
    plt.plot(nocs, gold_pred,label="Predicted Gold",marker='x')
    plt.xticks(rotation=90)
    plt.xlabel("NOC")
    plt.ylabel("Gold Medals")
    plt.title("Gold Medals: Actual vs Predicted (2024 Extrapolated)")
    plt.legend()
    plt.grid()
    plt.show()

    # silver
    plt.figure(figsize=(12,4))
    plt.plot(nocs, silver_actual,label="Actual Silver",marker='o')
    plt.plot(nocs, silver_pred,label="Predicted Silver",marker='x')
    plt.xticks(rotation=90)
    plt.xlabel("NOC")
    plt.ylabel("Silver Medals")
    plt.title("Silver Medals: Actual vs Predicted (2024 Extrapolated)")
    plt.legend()
    plt.grid()
    plt.show()

    # bronze
    plt.figure(figsize=(12,4))
    plt.plot(nocs, bronze_actual,label="Actual Bronze",marker='o')
    plt.plot(nocs, bronze_pred,label="Predicted Bronze",marker='x')
    plt.xticks(rotation=90)
    plt.xlabel("NOC")
    plt.ylabel("Bronze Medals")
    plt.title("Bronze Medals: Actual vs Predicted (2024 Extrapolated)")
    plt.legend()
    plt.grid()
    plt.show()

    # final MAE
    gold_mae = mean_absolute_error(gold_actual, gold_pred)
    silver_mae = mean_absolute_error(silver_actual, silver_pred)
    bronze_mae = mean_absolute_error(bronze_actual, bronze_pred)
    print(f"Gold Medal MAE: {gold_mae:.2f}")
    print(f"Silver Medal MAE: {silver_mae:.2f}")
    print(f"Bronze Medal MAE: {bronze_mae:.2f}")

In [5]:
import os
import numpy as np
import tensorflow as tf
from itertools import product
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout, Input, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
from statsmodels.tsa.holtwinters import Holt

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
tf.config.run_functions_eagerly(False)

# ------------------------------------------------------------------
# Define custom weighted loss function for top countries
# ------------------------------------------------------------------

def weighted_huber_loss(delta=1.0):
    def loss_fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        weights_batch = tf.cast(tf.gather(sample_weights, tf.range(tf.shape(y_true)[0])), tf.float32)

        error = y_true - y_pred
        condition = tf.abs(error) < delta
        huber_loss = tf.where(condition, 0.5 * tf.square(error), delta * (tf.abs(error) - 0.5 * delta))

        weighted_loss = tf.reduce_mean(weights_batch * huber_loss)
        return weighted_loss
    return loss_fn


# Function to create custom weights based on medal counts
def generate_sample_weights(y_true, df, top_countries):
    weights = np.ones((len(y_true), y_true.shape[1]))
    for i in range(len(y_true)):
        if df.iloc[i]["NOC"] in top_countries:
            weights[i] *= 10.0  # Increase weight for top countries
    return weights

# ------------------------------------------------------------------
# Select top 20 medal-winning countries
# ------------------------------------------------------------------

# Ensure NOC column exists by reconstructing from one-hot encoding
if 'NOC' not in df_unscaled.columns:
    noc_columns = [col for col in df_unscaled.columns if col.startswith("NOC_")]
    df_unscaled["NOC"] = df_unscaled[noc_columns].idxmax(axis=1).str.replace("NOC_", "")

# Calculate total medals for each NOC
df_unscaled['Total_Medals'] = df_unscaled["Medal_Gold"] + df_unscaled["Medal_Silver"] + df_unscaled["Medal_Bronze"]

# Identify the top 20 medal-winning countries
top_20_countries = df_unscaled.groupby("NOC")["Total_Medals"].sum().nlargest(20).index.tolist()

# Generate the sample weights
sample_weights = generate_sample_weights(y_all, df_unscaled, top_20_countries)
sample_weights = np.array(sample_weights, dtype=np.float32)

# ------------------------------------------------------------------
# Model creation function with flexible architecture
# ------------------------------------------------------------------

def build_medal_prediction_model(input_dim, loss_function, layer_sizes, 
                                 dropout_rate, activation_function, focus_top):
    inputs = Input(shape=(input_dim,))

    x = Dense(layer_sizes[0], activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(inputs)
    x = BatchNormalization()(x)
    x = Activation(activation_function)(x)
    x = Dropout(dropout_rate)(x)

    for size in layer_sizes[1:]:
        x1 = Dense(size, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
        x1 = BatchNormalization()(x1)
        x1 = Activation(activation_function)(x1)

        x2 = Dense(size, activation=None, kernel_regularizer=tf.keras.regularizers.l2(0.001))(x1)
        x2 = BatchNormalization()(x2)
        x2 = Activation(activation_function)(x2)
        x2 = Dropout(dropout_rate)(x2)

        x = Add()([x1, x2])

    outputs = Dense(3, activation="linear")(x)

    model = Model(inputs, outputs)

    if focus_top:
        model.compile(optimizer=AdamW(learning_rate=0.0001, weight_decay=1e-5),
                      loss=loss_function(),
                      metrics=["mae"])
    else:
        model.compile(optimizer=AdamW(learning_rate=0.0001, weight_decay=1e-5), 
                      loss=tf.keras.losses.MeanAbsoluteError(), 
                      metrics=["mae"])
    
    return model


# ------------------------------------------------------------------
# Grid Search for Hyperparameter Optimization
# ------------------------------------------------------------------

param_grid = {
    "loss_function": [weighted_huber_loss],
    "layer_sizes": [(1024, 512, 64), (2048, 1024, 512, 64), (1024, 512, 64, 64)],
    "dropout_rate": [0.3, 0.2],
    "batch_size": [32, 16],
    "activation_function": ["relu", "swish", "elu"],
    "focus_top": [True, False]  # Whether to focus on top 20 countries
}

# Extract training data before 2024
mask_train_2024 = (df_unscaled["Year"] < 2024)
X_train_2024 = X_all[mask_train_2024]
y_train_2024 = y_all[mask_train_2024]

X_train_2024 = X_train_2024.astype(np.float32)
y_train_2024 = y_train_2024.astype(np.float32)


# Store the best model and results
best_model = None
best_mae = float("inf")
best_params = None

for params in product(*param_grid.values()):
    param_dict = dict(zip(param_grid.keys(), params))
    print(f"Testing parameters: {param_dict}")

    try:
        model = build_medal_prediction_model(
            input_dim=X_all.shape[1],
            loss_function=param_dict["loss_function"],
            layer_sizes=param_dict["layer_sizes"],
            dropout_rate=param_dict["dropout_rate"],
            activation_function=param_dict["activation_function"],
            focus_top=param_dict["focus_top"]
        )

        # Train on data before 2024
        early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
        lr_reduce = ReduceLROnPlateau(monitor='val_loss', patience=15, factor=0.5, verbose=1)

        model.fit(X_train_2024, y_train_2024, 
                  epochs=100, 
                  batch_size=param_dict["batch_size"], 
                  verbose=2, 
                  callbacks=[early_stopping, lr_reduce])

        # Evaluate extrapolation performance on 2024 data
        df_result_2024_extrap = build_baseline_and_predict_2024(
            model, df_unscaled, feature_cols, target_cols, scaler_features, scaler_targets
        )

        # Calculate MAE for each medal type
        gold_mae = mean_absolute_error(df_result_2024_extrap["Gold_Actual"], df_result_2024_extrap["Gold_Predicted"])
        silver_mae = mean_absolute_error(df_result_2024_extrap["Silver_Actual"], df_result_2024_extrap["Silver_Predicted"])
        bronze_mae = mean_absolute_error(df_result_2024_extrap["Bronze_Actual"], df_result_2024_extrap["Bronze_Predicted"])

        total_mae = gold_mae + silver_mae + bronze_mae
        print(f"Total MAE for current configuration: {total_mae:.2f}")

        # Track the best model
        if total_mae < best_mae:
            best_mae = total_mae
            best_model = model
            best_params = param_dict

    except Exception as e:
        print(f"Skipping configuration due to error: {e}")
        continue


print("\nBest Model Configuration:")
print(best_params)
print(f"Best Total MAE: {best_mae:.2f}")

# Save the best model
if best_model:
    best_model.save("best_medal_prediction_model_top20.h5")

    # Visualize the best model results
    df_result_2024_extrap = build_baseline_and_predict_2024(
        best_model, df_unscaled, feature_cols, target_cols, scaler_features, scaler_targets
    )
    visualize_2024_results(df_result_2024_extrap)
else:
    print("No valid model configurations found.")

Testing parameters: {'loss_function': <function weighted_huber_loss at 0x0000027783E5C680>, 'layer_sizes': (1024, 512, 64), 'dropout_rate': 0.3, 'batch_size': 32, 'activation_function': 'relu', 'focus_top': True}
Epoch 1/100
91/91 - 2s - 24ms/step - loss: 7.0688 - mae: 1.2630 - learning_rate: 1.0000e-04
Epoch 2/100
91/91 - 0s - 5ms/step - loss: 5.6527 - mae: 1.0144 - learning_rate: 1.0000e-04
Epoch 3/100
91/91 - 0s - 5ms/step - loss: 5.1642 - mae: 0.9219 - learning_rate: 1.0000e-04
Epoch 4/100
91/91 - 0s - 5ms/step - loss: 4.8283 - mae: 0.8627 - learning_rate: 1.0000e-04
Epoch 5/100
91/91 - 0s - 5ms/step - loss: 4.6327 - mae: 0.8174 - learning_rate: 1.0000e-04
Epoch 6/100
91/91 - 0s - 5ms/step - loss: 4.4260 - mae: 0.7846 - learning_rate: 1.0000e-04
Epoch 7/100
91/91 - 0s - 5ms/step - loss: 4.2498 - mae: 0.7572 - learning_rate: 1.0000e-04
Epoch 8/100
91/91 - 0s - 5ms/step - loss: 4.0942 - mae: 0.7149 - learning_rate: 1.0000e-04
Epoch 9/100
91/91 - 0s - 5ms/step - loss: 3.9715 - mae: 0.

KeyboardInterrupt: 