# ML-CUP Neural Network with Keras/TensorFlow
## Using Mean Euclidean Error (MEE) as Evaluation Metric

This notebook tackles the ML-CUP regression task with a neural network built in Keras and TensorFlow. Since the official test set is blind (its targets are not provided), we split the training data into train/validation/internal-test sets and use MEE (Mean Euclidean Error) as the metric throughout.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

# Seed the NumPy and TensorFlow random generators with one shared value
# so that repeated runs start from the same random state
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## 2. Load and Explore the Dataset

In [None]:
# Load the training dataset.
# The file has no header row; anything after a '#' is treated as a comment
# and skipped by the parser.
df_train = pd.read_csv('ML-CUP25-TR.csv', comment='#', header=None)

print("Dataset shape:", df_train.shape)
print("\nFirst rows:")
print(df_train.head())

print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df_train.info()

print("\nBasic statistics:")
print(df_train.describe())

print("\nMissing values:")
print(df_train.isnull().sum())

# Dataset structure: ID, 12 input features, 4 targets
# Columns: 0=ID, 1-12=inputs, 13-16=targets
print("\nColumn count:", len(df_train.columns))

## 3. Data Preprocessing

In [None]:
# Carve the raw table into model inputs and regression targets.
# Column 0 is the sample ID and is left out of both arrays.
feature_columns = slice(1, 13)   # columns 1-12 -> 12 input features
target_columns = slice(13, 17)   # columns 13-16 -> 4 targets

X = df_train.iloc[:, feature_columns].values
y = df_train.iloc[:, target_columns].values

print("Features shape:", X.shape)
print("Targets shape:", y.shape)

# Standardize the inputs: learn per-column statistics, then apply them
scaler_X = StandardScaler()
scaler_X.fit(X)
X_scaled = scaler_X.transform(X)

feature_means = X_scaled.mean(axis=0)
feature_stds = X_scaled.std(axis=0)
print("\nFeatures after scaling - mean:", feature_means[:5])
print("Features after scaling - std:", feature_stds[:5])

# The targets get their own scaler so predictions can later be mapped back
# to the original units with scaler_y.inverse_transform
scaler_y = StandardScaler()
scaler_y.fit(y)
y_scaled = scaler_y.transform(y)

target_means = y_scaled.mean(axis=0)
target_stds = y_scaled.std(axis=0)
print("\nTargets after scaling - mean:", target_means)
print("Targets after scaling - std:", target_stds)

## 4. Define MEE (Mean Euclidean Error) Metric

In [None]:
# Keras-compatible MEE metric
def mee(y_true, y_pred):
    """
    Mean Euclidean Error (MEE).

    Subtracts y_pred from y_true, sums the squared differences along axis 1
    to obtain one squared distance per row, takes the square root of each,
    and returns the mean of those per-row Euclidean distances.
    """
    residuals = y_true - y_pred
    squared_norms = tf.reduce_sum(tf.square(residuals), axis=1)
    per_sample_distance = tf.sqrt(squared_norms)
    return tf.reduce_mean(per_sample_distance)

# Training objective: the MEE computation exposed under a loss-style name
def mee_loss(y_true, y_pred):
    """
    Loss wrapper around mee().

    Returns exactly the value of mee(y_true, y_pred), so the quantity being
    minimized is the same one reported as the metric.
    """
    loss_value = mee(y_true, y_pred)
    return loss_value

# Confirmation output for this cell (two lines, emitted by a single call)
print(
    "MEE metric defined successfully",
    "MEE measures the mean distance between true and predicted points in 4D space",
    sep="\n",
)

## 5. Split Data into Training, Validation, and Test Sets

In [None]:
# Split data into train (70%), validation (15%), and test (15%).
#
# The split is performed on the RAW arrays and the scalers are then (re-)fitted
# on the training rows only. Fitting the scalers on the whole dataset before
# splitting lets validation / internal-test statistics leak into the
# preprocessing, which makes the held-out scores optimistically biased.
# The same random_state values and row count give the same row assignment
# as splitting the pre-scaled arrays would.

# First split: 70% train, 30% temp (which will be split into val and test)
X_train_raw, X_temp_raw, y_train_raw, y_temp_raw = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Second split: split temp into 50% validation, 50% test (15% each of original)
X_val_raw, X_test_raw, y_val_raw, y_test_raw = train_test_split(
    X_temp_raw, y_temp_raw, test_size=0.5, random_state=42
)

# Fit the scalers on the training portion only. These are the scalers used
# from here on (blind-test feature transform, inverse transform of predictions).
scaler_X = StandardScaler().fit(X_train_raw)
scaler_y = StandardScaler().fit(y_train_raw)

# Apply the training-set statistics to every partition
X_train = scaler_X.transform(X_train_raw)
X_val = scaler_X.transform(X_val_raw)
X_test = scaler_X.transform(X_test_raw)

y_train = scaler_y.transform(y_train_raw)
y_val = scaler_y.transform(y_val_raw)
y_test = scaler_y.transform(y_test_raw)

print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Internal Test set shape:", X_test.shape, y_test.shape)

n_total = len(X)  # total number of labelled samples before splitting
print("\nData split percentages:")
print(f"Train: {X_train.shape[0]/n_total*100:.1f}%")
print(f"Validation: {X_val.shape[0]/n_total*100:.1f}%")
print(f"Test: {X_test.shape[0]/n_total*100:.1f}%")

## 6. Build the Neural Network Model

In [None]:
# Assemble the regression network from a flat list of layers.
# Layers are created in the same order as they appear in the network.
l2_strength = 0.001

# (units, dropout rate) for the hidden blocks that are each followed by
# batch normalization and dropout
regularized_blocks = [(128, 0.2), (64, 0.2), (32, 0.1)]

network_layers = [layers.Input(shape=(12,))]  # 12 input features
for n_units, drop_rate in regularized_blocks:
    network_layers.append(
        layers.Dense(n_units, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))
    )
    network_layers.append(layers.BatchNormalization())
    network_layers.append(layers.Dropout(drop_rate))

# Last hidden layer: L2-regularized, without batch normalization or dropout
network_layers.append(
    layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(l2_strength))
)

# Output layer: 4 targets, linear activation for regression
network_layers.append(layers.Dense(4, activation='linear'))

model = keras.Sequential(network_layers)

# Train on the MEE loss and also report MEE as a metric
adam_optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam_optimizer,
    loss=mee_loss,
    metrics=[mee]
)

print("Model compiled successfully")
print("\nModel architecture:")
model.summary()

## 7. Train the Model

In [None]:
# Define callbacks for training.
#
# Both callbacks watch the validation MEE metric ('val_mee') instead of
# 'val_loss': the compiled loss also accumulates the L2 kernel penalties of
# the Dense layers, so 'val_loss' is not a pure MEE value and the epoch with
# the best 'val_loss' is not necessarily the one with the best validation MEE.
# mode='min' is passed explicitly because the improvement direction of a
# custom metric name should not be left to automatic inference.
early_stopping = EarlyStopping(
    monitor='val_mee',
    mode='min',
    patience=30,
    restore_best_weights=True,  # roll back to the epoch with the lowest val_mee
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_mee',
    mode='min',
    factor=0.5,       # halve the learning rate on a plateau
    patience=15,
    min_lr=0.00001,
    verbose=1
)

# Train the model
print("Starting training...")
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=300,
    batch_size=16,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

print("\nTraining completed!")
print(f"Total epochs trained: {len(history.history['loss'])}")

## 8. Visualize Training History

In [None]:
# Function to smooth curves using exponential moving average
def exponential_moving_average(data, alpha=0.1):
    """
    Smooth a sequence of numbers with an exponential moving average (EMA).

    The first output equals the first input; every later output is
    ``alpha * current_value + (1 - alpha) * previous_output``.

    Args:
        data: iterable of numeric values (list, tuple, array, iterator, ...).
        alpha: weight given to the newest value, expected in [0, 1].
            LOWER values give MORE smoothing; alpha=1 reproduces the input
            unchanged and alpha=0 repeats the first value.

    Returns:
        A list with one smoothed value per input value. An empty input
        yields an empty list.
    """
    values = list(data)
    if not values:
        # Nothing to smooth (e.g. an empty training history)
        return []

    smoothed = [values[0]]
    for value in values[1:]:
        smoothed.append(alpha * value + (1 - alpha) * smoothed[-1])
    return smoothed

# Draw loss and MEE curves side by side: faint raw traces plus bold
# EMA-smoothed traces for both the training and validation series
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
recorded = history.history

# EMA-smoothed copy of every recorded series
loss_smooth = exponential_moving_average(recorded['loss'], alpha=0.15)
val_loss_smooth = exponential_moving_average(recorded['val_loss'], alpha=0.15)
mee_smooth = exponential_moving_average(recorded['mee'], alpha=0.15)
val_mee_smooth = exponential_moving_average(recorded['val_mee'], alpha=0.15)

# One entry per panel:
# (history key, legend tag, smoothed train, smoothed validation, y-label, title)
panel_specs = [
    ('loss', 'Loss', loss_smooth, val_loss_smooth, 'MEE Loss', 'Model Loss over Epochs'),
    ('mee', 'MEE', mee_smooth, val_mee_smooth, 'MEE Metric', 'MEE Metric over Epochs'),
]

for ax, (key, tag, train_smooth, val_smooth, y_label, title) in zip(axes, panel_specs):
    # Draw order: train raw, train smoothed, validation raw, validation smoothed
    ax.plot(recorded[key], label=f'Training {tag} (Raw)', linewidth=1, alpha=0.3)
    ax.plot(train_smooth, label=f'Training {tag} (Smoothed)', linewidth=2.5)
    ax.plot(recorded[f'val_{key}'], label=f'Validation {tag} (Raw)', linewidth=1, alpha=0.3)
    ax.plot(val_smooth, label=f'Validation {tag} (Smoothed)', linewidth=2.5)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Values recorded for the last epoch that was run
last_train_mee = recorded['mee'][-1]
last_val_mee = recorded['val_mee'][-1]
print(f"Final Training MEE: {last_train_mee:.6f}")
print(f"Final Validation MEE: {last_val_mee:.6f}")

## 9. Evaluate on Validation and Internal Test Sets

In [None]:
# Score the trained model on the validation split; evaluate() returns the
# loss followed by the compiled metric.
# NOTE: the loss value may differ from the MEE metric because the compiled
# loss can include the layers' L2 penalty terms.
banner = "=" * 60
val_loss, val_mee = model.evaluate(X_val, y_val, verbose=0)
print(banner)
print("VALIDATION SET RESULTS")
print(banner)
print(f"Validation Loss (MEE): {val_loss:.6f}")
print(f"Validation MEE Metric: {val_mee:.6f}")

# Same scoring on the internal test split
test_loss, test_mee = model.evaluate(X_test, y_test, verbose=0)
print("\n" + banner)
print("INTERNAL TEST SET RESULTS")
print(banner)
print(f"Test Loss (MEE): {test_loss:.6f}")
print(f"Test MEE Metric: {test_mee:.6f}")

# Predict the internal test targets (still in scaled target space)
y_test_pred = model.predict(X_test, verbose=0)

# Map both predictions and ground truth back to the original target units
y_test_pred_original = scaler_y.inverse_transform(y_test_pred)
y_test_original = scaler_y.inverse_transform(y_test)

# Per-sample Euclidean distance in original units, then its mean (MEE)
residuals_original = y_test_original - y_test_pred_original
euclidean_distances = np.sqrt(np.sum(residuals_original**2, axis=1))
mee_original = np.mean(euclidean_distances)

print(f"\nTest MEE in Original Scale: {mee_original:.6f}")
distance_stats = (
    ("Min", euclidean_distances.min()),
    ("Max", euclidean_distances.max()),
    ("Std", euclidean_distances.std()),
)
for stat_name, stat_value in distance_stats:
    print(f"{stat_name} distance: {stat_value:.6f}")

## 10. Generate Predictions for Blind Test Set

In [None]:
# Read the blind test file (no header row; text after '#' is skipped)
df_test = pd.read_csv('ML-CUP25-TS.csv', comment='#', header=None)

print("Blind test set shape:", df_test.shape)
print("\nFirst rows of blind test set:")
print(df_test.head())

# Blind test layout: ID in column 0, then the 12 input features (no targets)
blind_feature_columns = slice(1, 13)
X_blind = df_test.iloc[:, blind_feature_columns].values

# Reuse the already-fitted feature scaler; it is only applied here, not refitted
X_blind_scaled = scaler_X.transform(X_blind)

print(f"\nBlind test set features shape: {X_blind_scaled.shape}")

# The network outputs scaled targets ...
y_blind_pred_scaled = model.predict(X_blind_scaled, verbose=0)

# ... which are mapped back to the original target units
y_blind_pred_original = scaler_y.inverse_transform(y_blind_pred_scaled)

print(f"Predictions shape: {y_blind_pred_original.shape}")
print("\nFirst 5 predictions:")
preview_rows = y_blind_pred_original[:5]
print(preview_rows)

In [None]:
# Write the blind-test predictions to disk.
# Row layout: ID, TARGET_1, TARGET_2, TARGET_3, TARGET_4

# Start from the four prediction columns, then put the ID column in front.
# The IDs come from the first column of the blind test frame (integer
# column label 0, not a string label).
target_names = [f'TARGET_{k}' for k in range(1, 5)]
output_df = pd.DataFrame(y_blind_pred_original, columns=target_names)
output_df.insert(0, 'ID', df_test[0].values)

print("Output dataframe shape:", output_df.shape)
print("\nFirst 5 rows of output:")
print(output_df.head())

# Submission-style file: no index column and no header row
output_filename = 'ML-CUP25-TS-predictions.csv'
output_df.to_csv(output_filename, header=False, index=False)

print(f"\nPredictions saved to: {output_filename}")

# Companion file that keeps the column names, for reference
output_filename_with_header = 'ML-CUP25-TS-predictions-with-header.csv'
output_df.to_csv(output_filename_with_header, header=True, index=False)

print(f"Predictions with header saved to: {output_filename_with_header}")

## Summary and Results

### Key Points:
- **Training Data**: Split into 70% training, 15% validation, and 15% internal test
- **Loss Function**: Mean Euclidean Error (MEE) - measures Euclidean distance in 4D space
- **Data Preprocessing**: Features and targets standardized using StandardScaler
- **Model Architecture**: 4 hidden layers (128→64→32→16 neurons); the first three are each followed by batch normalization and dropout
- **Regularization**: L2 weight penalty, dropout, and early stopping; in addition, the learning rate is reduced when validation performance plateaus

### Performance:
- See the evaluation metrics printed above
- The model was trained with early stopping to prevent overfitting
- Validation MEE is used to monitor generalization

### Predictions:
- Blind test set predictions saved in the required format
- Predictions are in the original (unscaled) target space