In [1]:
import os
import re

import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, AdamW, RMSprop
from tensorflow.keras.regularizers import l2

In [2]:
# ------------------------------------------------------------------
# 0) Load & Basic Cleaning
# ------------------------------------------------------------------
data_path = "MCM_2025\\2025_MCM-ICM_Problems\\2025_Problem_C_Data\\summerOly_athletes.csv"
data = pd.read_csv(data_path)

# Remove rows with digits or '.' in 'Team'
df_no_digits = data[~data['Team'].str.contains(r'\d|\.', na=False)]

# Remove rows with Roman numerals in 'Team'
roman_pattern = r'(?:\s|-)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})(?:\s|$)'
df_no_roman = df_no_digits[~df_no_digits['Team'].str.contains(roman_pattern, na=False, flags=re.IGNORECASE)]

df_raw = df_no_roman.copy()

# Convert categorical columns to strings and fill missing values
for col in ["Name", "Sex", "Team", "NOC", "City", "Sport", "Event", "Medal"]:
    df_raw[col] = df_raw[col].astype(str).fillna("Unknown")

# Ensure 'Year' is numeric and drop missing values
df_raw["Year"] = pd.to_numeric(df_raw["Year"], errors="coerce")
df_raw.dropna(subset=["Year"], inplace=True)
df_raw["Year"] = df_raw["Year"].astype(int)

print("Initial df_raw shape:", df_raw.shape)

# ------------------------------------------------------------------
# 1) Encoding Transformations with Storage
# ------------------------------------------------------------------
df_filtered = df_raw.copy()

# (1a) Label-encode & scale "Name"
label_encoder_name = LabelEncoder()
df_filtered["Name_Label"] = label_encoder_name.fit_transform(df_filtered["Name"])

# Save label encoder for later use
with open("name_label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder_name, f)

# Initialize and fit MinMaxScaler
scaler_name = MinMaxScaler(feature_range=(0, 1))
df_filtered["Name_Label"] = scaler_name.fit_transform(df_filtered[["Name_Label"]])

# Save the scaler for future use
with open("name_scaler.pkl", "wb") as f:
    pickle.dump(scaler_name, f)

# Create the mapping correctly by using the encoder's classes_
name_mapping_df = pd.DataFrame({
    "Name": label_encoder_name.classes_,
    "Encoded_Value": range(len(label_encoder_name.classes_))
})

# Save the correct mapping
name_mapping_df.to_csv("name_encoding.csv", index=False)

# Drop the original Name column after encoding
df_filtered.drop(columns=["Name"], inplace=True)

print("Name encoding saved and Name column removed successfully.")

# (1b) FeatureHash "Team", "Sport", "Event"
n_features = 2100  # Adjustable number of hashed features for better performance
hashers = {
    "Team": FeatureHasher(n_features=n_features, input_type='string'),
    "Sport": FeatureHasher(n_features=n_features, input_type='string'),
    "Event": FeatureHasher(n_features=n_features, input_type='string')
}

for col, hasher_obj in hashers.items():
    hashed = hasher_obj.transform(df_filtered[col].apply(lambda x: [x]))
    hashed_df = pd.DataFrame.sparse.from_spmatrix(
        hashed, columns=[f"{col}_hashed_{i}" for i in range(n_features)]
    )
    df_filtered.drop(columns=[col], inplace=True)
    df_filtered.reset_index(drop=True, inplace=True)
    hashed_df.reset_index(drop=True, inplace=True)
    df_filtered = pd.concat([df_filtered, hashed_df], axis=1)

# Save the hashers for later use
with open("hashers.pkl", "wb") as f:
    pickle.dump(hashers, f)

print("Feature hashing completed.")

# (1c) One-hot encode "Sex", "City", "Medal"
ohe = OneHotEncoder(handle_unknown="ignore")
ohe_array = ohe.fit_transform(df_filtered[["Sex", "City", "Medal"]])
ohe_cols = ohe.get_feature_names_out(["Sex", "City", "Medal"])
ohe_df = pd.DataFrame(ohe_array.toarray(), columns=ohe_cols)

df_filtered.drop(columns=["Sex", "City", "Medal"], inplace=True)
df_filtered = pd.concat([df_filtered, ohe_df], axis=1)

# Save one-hot encoder
with open("onehot_encoder.pkl", "wb") as f:
    pickle.dump(ohe, f)

print("After row-level transforms, df_filtered shape:", df_filtered.shape)

# ------------------------------------------------------------------
# 2) Group by (Year, NOC)
# ------------------------------------------------------------------
df_filtered["Year"] = df_filtered["Year"].astype(int)

grouped_cols = df_filtered.drop(
    columns=["Medal_Gold", "Medal_Silver", "Medal_Bronze", "Year"],
    errors="ignore"
).columns.difference(["NOC"])

df_agg = df_filtered.groupby(["Year", "NOC"], as_index=False).agg({c: "sum" for c in grouped_cols})
df_agg = df_agg.merge(
    df_filtered.groupby(["Year", "NOC"], as_index=False).agg({
        "Medal_Gold": "sum", "Medal_Silver": "sum", "Medal_Bronze": "sum"
    }),
    on=["Year", "NOC"], how="left"
)

print("Grouped by (Year, NOC) shape:", df_agg.shape)

# ------------------------------------------------------------------
# 3) Encode NOC at the aggregated level
# ------------------------------------------------------------------
ohe_noc = OneHotEncoder(handle_unknown="ignore")
noc_arr = ohe_noc.fit_transform(df_agg[["NOC"]])
noc_cols = [f"NOC_{cat}" for cat in ohe_noc.categories_[0]]
noc_df = pd.DataFrame(noc_arr.toarray(), columns=noc_cols)

df_agg.reset_index(drop=True, inplace=True)
noc_df.reset_index(drop=True, inplace=True)

df_agg = pd.concat([df_agg.drop(columns=["NOC"]), noc_df], axis=1)

# Save NOC encoder
with open("noc_encoder.pkl", "wb") as f:
    pickle.dump(ohe_noc, f)

print("After NOC encoding, df_agg shape:", df_agg.shape)

# Convert numeric
num_cols = df_agg.select_dtypes(include=["number"]).columns
df_agg[num_cols] = df_agg[num_cols].astype("float32")
df_agg["Year"] = df_agg["Year"].astype("int32")

# ------------------------------------------------------------------
# Scale Features / Targets
# ------------------------------------------------------------------

from sklearn.preprocessing import StandardScaler

feature_cols = df_agg.drop(columns=["Year", "Medal_Gold", "Medal_Silver", "Medal_Bronze"]).columns
target_cols = ["Medal_Gold", "Medal_Silver", "Medal_Bronze"]

scaler_features = StandardScaler()
scaler_targets = StandardScaler()

X_scaled = scaler_features.fit_transform(df_agg[feature_cols])
y_scaled = scaler_targets.fit_transform(df_agg[target_cols])

df_agg[feature_cols] = X_scaled
df_agg[target_cols] = y_scaled

# Save scalers
with open("scaler_features.pkl", "wb") as f:
    pickle.dump(scaler_features, f)
with open("scaler_targets.pkl", "wb") as f:
    pickle.dump(scaler_targets, f)

print("df_agg_scaled shape:", df_agg.shape)

df_agg.to_csv("processed_athlete_data.csv", index=False)

Initial df_raw shape: (249329, 9)
Name encoding saved and Name column removed successfully.
Feature hashing completed.
After row-level transforms, df_filtered shape: (249329, 6332)
Grouped by (Year, NOC) shape: (3222, 6332)
After NOC encoding, df_agg shape: (3222, 6565)
df_agg_scaled shape: (3222, 6565)


In [7]:
# ------------------------------------------------------------------
# 4) Train-Test Split (chronological on Year)
# ------------------------------------------------------------------
split_year = 2016
train_df = df_agg[df_agg["Year"] < split_year].copy()
test_df  = df_agg[df_agg["Year"] >= split_year].copy()

X_train = train_df.drop(columns=["Year"] + target_cols)
y_train = train_df[target_cols]

X_test  = test_df.drop(columns=["Year"] + target_cols)
y_test  = test_df[target_cols]

model_input_columns = X_train.columns.tolist()

print("Train shape:", X_train.shape, y_train.shape)
print("Test  shape:", X_test.shape, y_test.shape)

# ------------------------------------------------------------------
# 5) Build & Train a FFNN with Callbacks & Cross-Validation
# ------------------------------------------------------------------

# Multi-threading
os.environ["OMP_NUM_THREADS"] = "20"
os.environ["MKL_NUM_THREADS"] = "20"

tf.config.threading.set_intra_op_parallelism_threads(20)
tf.config.threading.set_inter_op_parallelism_threads(20)

tf.random.set_seed(42)
np.random.seed(42)

def build_nn_model(input_dim):
    model = Sequential([
        tf.keras.Input(shape=(input_dim,)),

        Dense(2048, activation='swish', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(512, activation='swish', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(256, activation='swish', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        BatchNormalization(),
        Dropout(0.2),

        Dense(128, activation='swish', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.2),

        Dense(64, activation='swish'),
        Dropout(0.1),

        Dense(64, activation='swish'),
        Dense(3, activation='linear')
    ])
    optimizer = AdamW(learning_rate=0.0005, weight_decay=1e-5)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

def learning_rate_scheduler(epoch, lr):
    if epoch == 0:
        return lr * 5
    return lr * 0.99

# Initializing
nn_model = build_nn_model(X_train.shape[1])

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=15, verbose=1)

# ------------------------------------------------------------------
# Time Series Cross-Validation
# ------------------------------------------------------------------
tscv = TimeSeriesSplit(n_splits=3)
cv_results = []

for train_index, val_index in tscv.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    print(f"\nTraining on {len(train_index)} samples, validating on {len(val_index)} samples.")

    lr_scheduler = LearningRateScheduler(learning_rate_scheduler)
    history = nn_model.fit(
        X_train_fold, y_train_fold,
        epochs=300,
        batch_size=32,
        validation_data=(X_val_fold, y_val_fold),
        callbacks=[early_stop, reduce_lr, lr_scheduler],
        verbose=2
    )

    val_predictions = nn_model.predict(X_val_fold)
    val_mae = mean_absolute_error(y_val_fold, val_predictions)
    cv_results.append(val_mae)
    print(f"Fold Validation MAE: {val_mae:.4f}")

print(f"\nAverage CV MAE: {np.mean(cv_results):.4f}")

# ------------------------------------------------------------------
# Final Training & Testing
# ------------------------------------------------------------------
if not X_test.empty:
    print("\nTraining on the full train dataset and testing on unseen data...")
    history = nn_model.fit(
        X_train, y_train,
        epochs=300,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stop, reduce_lr, lr_scheduler],
        verbose=2
    )
    final_loss, final_mae = nn_model.evaluate(X_test, y_test, verbose=0)
    print("Final Test Loss:", final_loss, "MAE:", final_mae)
else:
    print("No test set to evaluate.")

Train shape: (2603, 6561) (2603, 3)
Test  shape: (619, 6561) (619, 3)

Training on 653 samples, validating on 650 samples.
Epoch 1/300
21/21 - 3s - 164ms/step - loss: 20.9390 - mae: 0.4884 - val_loss: 18.6146 - val_mae: 0.7868 - learning_rate: 0.0025
Epoch 2/300
21/21 - 1s - 71ms/step - loss: 12.7381 - mae: 0.4075 - val_loss: 10.2509 - val_mae: 0.3279 - learning_rate: 0.0025
Epoch 3/300
21/21 - 1s - 68ms/step - loss: 8.1458 - mae: 0.3492 - val_loss: 6.3709 - val_mae: 0.3147 - learning_rate: 0.0025
Epoch 4/300
21/21 - 1s - 68ms/step - loss: 5.2550 - mae: 0.3363 - val_loss: 4.6883 - val_mae: 0.3870 - learning_rate: 0.0024
Epoch 5/300
21/21 - 1s - 68ms/step - loss: 3.7880 - mae: 0.3503 - val_loss: 3.9534 - val_mae: 0.3776 - learning_rate: 0.0024
Epoch 6/300
21/21 - 1s - 67ms/step - loss: 3.0918 - mae: 0.3315 - val_loss: 3.3928 - val_mae: 0.3239 - learning_rate: 0.0024
Epoch 7/300
21/21 - 1s - 67ms/step - loss: 2.6688 - mae: 0.3333 - val_loss: 3.0776 - val_mae: 0.3999 - learning_rate: 0.00

In [13]:
def predict_extrapolated_noc_year(
    noc_code, 
    year_value,
    df_agg,              
    scaler_features,     
    scaler_targets,      
    nn_model,            
    feature_cols,        
    encoder_files,       
    additional_features=None,  
    target_cols=["Medal_Gold","Medal_Silver","Medal_Bronze"],
    max_history_medals=10,
    max_close_points=8
):
    """
    Predict medals for a given NOC and year using historical data and optional additional features.
    """

    noc_col = f"NOC_{noc_code}"
    if noc_col not in df_agg.columns:
        print(f"[Extrapolation] Column {noc_col} not found in df_agg. No data for NOC={noc_code}.")
        return None

    df_noc_all = df_agg[df_agg[noc_col] == 1].copy()
    df_noc_all_sorted = df_noc_all.sort_values("Year")

    df_noc_all_sorted["year_diff"] = (df_noc_all_sorted["Year"] - year_value).abs()
    df_noc_closest = df_noc_all_sorted.nsmallest(max_close_points, "year_diff").copy()

    empty_row = pd.DataFrame(columns=feature_cols, index=[0]).fillna(0.0)
    empty_row["Year"] = float(year_value)

    for c in df_agg.columns:
        if c.startswith("NOC_"):
            empty_row[c] = 1.0 if c == noc_col else 0.0

    def encode_additional_features(new_data, encoder_files, feature_cols):
        with open(encoder_files["hashers"], "rb") as f:
            hashers = pickle.load(f)

        encoded_features = pd.DataFrame(columns=feature_cols).fillna(0)
        for feature, value in new_data.items():
            if feature in hashers:
                hashed_feature = hashers[feature].transform([[value]])
                hashed_df = pd.DataFrame.sparse.from_spmatrix(
                    hashed_feature, columns=[f"{feature}_hashed_{i}" for i in range(2100)]
                )
                for col in hashed_df.columns:
                    if col in encoded_features.columns:
                        encoded_features[col] = hashed_df[col]

        return encoded_features

    if additional_features:
        encoded_new_features = encode_additional_features(additional_features, encoder_files, feature_cols)
        empty_row.update(encoded_new_features)

    empty_row = empty_row.reindex(columns=feature_cols, fill_value=0)

    # Moving average for extrapolation
    def moving_average_baseline(x_vals, y_vals, year_value, window=3):
        valid_indices = x_vals < year_value
        if np.sum(valid_indices) < window:
            window = np.sum(valid_indices)
        if window < 1:
            return np.mean(y_vals)  
        sorted_indices = np.argsort(x_vals[valid_indices])
        moving_avg = np.mean(y_vals[valid_indices][sorted_indices][-window:])
        return moving_avg

    for col in feature_cols:
        if col.startswith("NOC_") or col.startswith("Medal_") or col == "Year":
            continue
        if col not in df_noc_closest.columns:
            empty_row[col] = 0.0
            continue

        valid_rows = df_noc_closest[~df_noc_closest[col].isna()].copy()
        if valid_rows.empty:
            empty_row[col] = 0.0
            continue

        x_vals = valid_rows["Year"].to_numpy().astype(float)
        y_vals = valid_rows[col].to_numpy().astype(float)

        pred_val = moving_average_baseline(x_vals, y_vals, year_value)
        empty_row[col] = pred_val

    # Ensure the feature order matches training data
    X_unscaled = empty_row[feature_cols].copy()
    X_scaled = scaler_features.transform(X_unscaled)

    preds_scaled = nn_model.predict(X_scaled)
    preds_real = scaler_targets.inverse_transform(preds_scaled)
    gold, silver, bronze = preds_real[0]

    print(f"\n[Extrapolated] Predicted medals for NOC={noc_code}, Year={year_value}:")
    print(f"  Gold={gold:.1f}, Silver={silver:.1f}, Bronze={bronze:.1f}")

    return (gold, silver, bronze)

encoder_files = {
    "hashers": "hashers.pkl"
}

predicted_medals = predict_extrapolated_noc_year(
    noc_code="CHN",
    year_value=2024,
    df_agg=df_agg,
    scaler_features=scaler_features,
    scaler_targets=scaler_targets,
    nn_model=nn_model,
    feature_cols=feature_cols,
    encoder_files=encoder_files,
    additional_features={"Sport": "Basketball"}
)

In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

def predict_and_compare(noc_list, df_agg, scaler_features, scaler_targets, nn_model, feature_cols, encoder_files, check_years=[2012, 2016, 2020, 2024], show_history=False, additional_features=None):
    """
    Predicts medal counts using the trained model and compares with actual data.

    Parameters:
        noc_list (list): List of NOCs to predict and compare.
        df_agg (pd.DataFrame): Aggregated dataframe with historical data.
        scaler_features (StandardScaler): Scaler for feature normalization.
        scaler_targets (StandardScaler): Scaler for target values.
        nn_model (Sequential): Trained neural network model.
        feature_cols (list): Columns used for model input.
        encoder_files (dict): Dictionary containing paths to saved encoders.
        check_years (list): Olympic years to check.
        show_history (bool): Whether to display the country's medal count history.
        additional_features (dict, optional): Additional text features such as Sport, Event, Team.

    Returns:
        pd.DataFrame: A summary of actual vs. predicted medal counts.
    """

    results = []

    for year in check_years:
        for noc in noc_list:
            print(f"\nPredicting for NOC={noc}, Year={year}")

            # Get predictions for NOC and year
            try:
                pred_result = predict_extrapolated_noc_year(
                    noc_code=noc,
                    year_value=year,
                    df_agg=df_agg,
                    scaler_features=scaler_features,
                    scaler_targets=scaler_targets,
                    nn_model=nn_model,
                    feature_cols=feature_cols,
                    encoder_files=encoder_files,
                    additional_features=additional_features  # Pass extra features
                )

                if pred_result is not None:
                    gold_pred, silver_pred, bronze_pred = pred_result
                    print(f"Predicted medals: Gold={gold_pred:.1f}, Silver={silver_pred:.1f}, Bronze={bronze_pred:.1f}")
                else:
                    gold_pred, silver_pred, bronze_pred = None, None, None
                    print(f"Prediction unavailable for NOC={noc}, Year={year}")

            except Exception as e:
                print(f"Prediction failed for NOC={noc}, Year={year}. Error: {str(e)}")
                gold_pred, silver_pred, bronze_pred = None, None, None

            # Get actual data
            noc_col = f"NOC_{noc}"  # Adjusting for one-hot encoded columns
            if noc_col not in df_agg.columns:
                print(f"[Check] No data for NOC={noc}. Skipping...")
                continue

            df_noc_all = df_agg[df_agg[noc_col] == 1].copy()
            df_noc_filtered = df_noc_all[df_noc_all["Year"] == year]

            if show_history and not df_noc_all.empty:
                print(f"[History] Medal history for NOC={noc} up to year {year}:")
                display(df_noc_all[["Year", "Medal_Gold", "Medal_Silver", "Medal_Bronze"]].tail(10))

            if not df_noc_filtered.empty:
                gold_actual = df_noc_filtered["Medal_Gold"].values[0]
                silver_actual = df_noc_filtered["Medal_Silver"].values[0]
                bronze_actual = df_noc_filtered["Medal_Bronze"].values[0]
                print(f"Actual medals: Gold={gold_actual}, Silver={silver_actual}, Bronze={bronze_actual}")
            else:
                gold_actual, silver_actual, bronze_actual = None, None, None
                print(f"No actual data for NOC={noc}, Year={year}")

            # Store results
            results.append({
                "NOC": noc,
                "Year": year,
                "Gold_Predicted": gold_pred,
                "Silver_Predicted": silver_pred,
                "Bronze_Predicted": bronze_pred,
                "Gold_Actual": gold_actual,
                "Silver_Actual": silver_actual,
                "Bronze_Actual": bronze_actual
            })

    results_df = pd.DataFrame(results)

    # Calculate absolute differences and percentage errors
    results_df["Gold_Error_Pct"] = (abs(results_df["Gold_Predicted"] - results_df["Gold_Actual"]) / results_df["Gold_Actual"]) * 100
    results_df["Silver_Error_Pct"] = (abs(results_df["Silver_Predicted"] - results_df["Silver_Actual"]) / results_df["Silver_Actual"]) * 100
    results_df["Bronze_Error_Pct"] = (abs(results_df["Bronze_Predicted"] - results_df["Bronze_Actual"]) / results_df["Bronze_Actual"]) * 100

    # Replace NaN values with -1
    results_df.fillna(-1, inplace=True)

    print("\nPrediction vs Actual Summary:")
    display(results_df)

    return results_df

noc_list = ["CHN", "USA", "GBR", "RUS", "GER"]
encoder_files = {
    "hashers": "hashers.pkl"
}

comparison_results = predict_and_compare(
    noc_list=noc_list,
    df_agg=df_agg,
    scaler_features=scaler_features,
    scaler_targets=scaler_targets,
    nn_model=nn_model,
    feature_cols=feature_cols,
    encoder_files=encoder_files,
    show_history=False,
    additional_features={"Sport": "Basketball", "Event": "100m Sprint"}
)

def visualize_selected_nocs(results_df, selected_nocs, selected_years, color='Red'):
    """
    Generate heatmaps for selected NOCs and years using a single-color theme with grayscale.

    Parameters:
        results_df (pd.DataFrame): DataFrame containing prediction vs. actual medal count results.
        selected_nocs (list): List of NOCs to visualize.
        selected_years (list): List of Olympic years to visualize.
        color (str): Color theme for the heatmap (e.g., 'Blues', 'Greens', 'Reds').
    """
    
    # Filter the dataframe for selected NOCs and years
    filtered_df = results_df[(results_df["NOC"].isin(selected_nocs)) & 
                             (results_df["Year"].isin(selected_years))]

    if filtered_df.empty:
        print("No data available for selected NOCs and years. Skipping visualization.")
        return

    # Generate heatmaps for Gold, Silver, and Bronze prediction errors
    for metric in ["Gold_Error_Pct", "Silver_Error_Pct", "Bronze_Error_Pct"]:
        heatmap_data = filtered_df.pivot(index="NOC", columns="Year", values=metric)

        if not heatmap_data.empty:
            plt.figure(figsize=(10, 6))
            sns.heatmap(heatmap_data, annot=True, cmap=color, fmt=".1f", linewidths=0.5)
            plt.title(f"{metric.replace('_', ' ')} Heatmap for Selected NOCs")
            plt.xlabel("Year")
            plt.ylabel("NOC")
            plt.xticks(rotation=45)
            plt.yticks(rotation=0)
            plt.show()
        else:
            print(f"Skipping heatmap for {metric} due to lack of valid data.")

noc_list = ["CHN", "USA", "GBR", "RUS", "GER"]
comparison_results = predict_and_compare(
    noc_list=noc_list,
    df_agg=df_agg,
    scaler_features=scaler_features,
    scaler_targets=scaler_targets,
    nn_model=nn_model,
    feature_cols=feature_cols,
    show_history=False
)
selected_nocs = noc_list
selected_years = [2012, 2016, 2020, 2024]

visualize_selected_nocs(comparison_results, selected_nocs, selected_years, color="Reds")


Predicting for NOC=CHN, Year=2012
[Debug] No existing data for NOC=CHN in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step

[Extrapolated] Predicted medals for NOC=CHN, Year=2012:
  Gold=-0.6, Silver=0.4, Bronze=0.8
Predicted medals: Gold=-0.6, Silver=0.4, Bronze=0.8
No actual data for NOC=CHN, Year=2012

Predicting for NOC=USA, Year=2012
[Debug] No existing data for NOC=USA in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step

[Extrapolated] Predicted medals for NOC=USA, Year=2012:
  Gold=6.4, Silver=0.6, Bronze=0.4
Predicted medals: Gold=6.4, Silver=0.6, Bronze=0.4
No actual data for NOC=USA, Year=2012

Predicting for NOC=GBR, Year=2012
[Debug] No existing data for NOC=GBR in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=GBR, Year=2012:
  Gold=-0.6, Silver=0.4, Bronze=1.4
Predicted medals: Gold=-0.6, Silver=0.4, Bronze=1.4
No actual data for NOC=GBR, Year=2012

Predicting for NOC=RUS, Year=2012
[Debug] No existing data for NOC=RUS in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step

[Extrapolated] Predicted medals for NOC=RUS, Year=2012:
  Gold=-1.0, Silver=0.0, Bronze=0.3
Predicted medals: Gold=-1.0, Silver=0.0, Bronze=0.3
No actual data for NOC=RUS, Year=2012

Predicting for NOC=GER, Year=2012
[Debug] No existing data for NOC=GER in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=GER, Year=2012:
  Gold=0.4, Silver=2.5, Bronze=1.7
Predicted medals: Gold=0.4, Silver=2.5, Bronze=1.7
No actual data for NOC=GER, Year=2012

Predicting for NOC=CHN, Year=2016
[Debug] No existing data for NOC=CHN in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=CHN, Year=2016:
  Gold=-0.6, Silver=0.4, Bronze=0.8
Predicted medals: Gold=-0.6, Silver=0.4, Bronze=0.8
No actual data for NOC=CHN, Year=2016

Predicting for NOC=USA, Year=2016
[Debug] No existing data for NOC=USA in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=USA, Year=2016:
  Gold=6.4, Silver=0.6, Bronze=0.4
Predicted medals: Gold=6.4, Silver=0.6, Bronze=0.4
No actual data for NOC=USA, Year=2016

Predicting for NOC=GBR, Year=2016
[Debug] No existing data for NOC=GBR in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

[Extrapolated] Predicted medals for NOC=GBR, Year=2016:
  Gold=-0.6, Silver=0.4, Bronze=1.4
Predicted medals: Gold=-0.6, Silver=0.4, Bronze=1.4
No actual data for NOC=GBR, Year=2016

Predicting for NOC=RUS, Year=2016
[Debug] No existing data for NOC=RUS in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=RUS, Year=2016:
  Gold=-1.0, Silver=0.0, Bronze=0.3
Predicted medals: Gold=-1.0, Silver=0.0, Bronze=0.3
No actual data for NOC=RUS, Year=2016

Predicting for NOC=GER, Year=2016
[Debug] No existing data for NOC=GER in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step

[Extrapolated] Predicted medals for NOC=GER, Year=2016:
  Gold=0.4, Silver=2.5, Bronze=1.7
Predicted medals: Gold=0.4, Silver=2.5, Bronze=1.7
No actual data for NOC=GER, Year=2016

Predicting for NOC=CHN, Year=2020
[Debug] No existing data for NOC=CHN in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

[Extrapolated] Predicted medals for NOC=CHN, Year=2020:
  Gold=-0.6, Silver=0.4, Bronze=0.8
Predicted medals: Gold=-0.6, Silver=0.4, Bronze=0.8
No actual data for NOC=CHN, Year=2020

Predicting for NOC=USA, Year=2020
[Debug] No existing data for NOC=USA in df_agg.


  empty_row = pd.DataFrame(columns=df_agg.columns, index=[0]).fillna(0.0)


KeyboardInterrupt: 