# In [None]:
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import time

import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix


# Load the data
data_dir = Path("~/Data").expanduser() / "bci-i-idun-eeg-analysis-challenge"
X = np.load(data_dir / "S001_X.npy")
Y = np.load(data_dir / "S001_Y.npy")

# Split the data into training and testing sets.
# The split is stratified on Y and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

# Standardize the data.
# Each split is first reshaped to rows of width X.shape[-1], so the scaler
# learns one mean/std per position along the last axis (from the training
# split only), and is then flattened to one feature vector per sample.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape[0], -1)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape[0], -1)

# Convert Y from text labels to integer labels.
# The labels are sorted so the label -> integer mapping is the same on every
# run; iterating a bare set of strings has no stable order between
# interpreter sessions, which would silently change the class indices (and
# the axis order of any plot labelled with _markers).
_markers = sorted(set(Y))
_marker_to_index = {marker: index for index, marker in enumerate(_markers)}
y_train = np.array([_marker_to_index[m] for m in y_train])
y_test = np.array([_marker_to_index[m] for m in y_test])

# Define the XGBoost model
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# confirm it is still wanted for the installed version.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(_markers), eval_metric='mlogloss', use_label_encoder=False)

# Training function
def train_model(model, X_train, y_train, X_test, y_test, num_epochs=50, patience=4):
    """Train ``model`` epoch by epoch with patience-based early stopping.

    Each epoch calls ``model.fit`` once. From the second epoch on, the booster
    produced by the previous epoch is passed as ``xgb_model`` so that training
    continues from it instead of restarting from scratch; every epoch
    therefore adds another ``n_estimators`` boosting rounds to the model.

    Whenever the validation loss improves, a snapshot of the booster is kept
    and written to ``best_model.json`` in the current working directory.

    Args:
        model: Classifier exposing the XGBoost scikit-learn interface
            (``fit``, ``predict``, ``evals_result``, ``get_booster``) and
            configured with the ``mlogloss`` evaluation metric.
        X_train: Training features.
        y_train: Training labels.
        X_test: Validation features.
        y_test: Validation labels.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive epochs without a validation-loss
            improvement after which training stops.

    Returns:
        A tuple ``(best_model, train_losses, val_losses, train_accuracies,
        val_accuracies)``. ``best_model`` is a copy of the booster with the
        lowest validation loss (``None`` if no epoch ran); the four lists hold
        one entry per completed epoch, with accuracies in percent.
    """
    best_val_loss = np.inf
    best_model = None
    epochs_no_improve = 0
    train_losses, val_losses, train_accuracies, val_accuracies = [], [], [], []

    # Evaluate on both splits so the two losses really are different numbers:
    # XGBoost reports them as 'validation_0' (train) and 'validation_1'
    # (validation), following the order of this list.
    eval_set = [(X_train, y_train), (X_test, y_test)]
    booster = None  # no previous booster before the first epoch

    start_time = time.time()

    for epoch in range(num_epochs):
        # Training phase: resume from the previous epoch's booster (if any)
        model.fit(X_train, y_train, eval_set=eval_set, verbose=False, xgb_model=booster)
        booster = model.get_booster()
        history = model.evals_result()

        # Compute training loss and accuracy
        train_pred = model.predict(X_train)
        train_loss = history['validation_0']['mlogloss'][-1]
        train_acc = accuracy_score(y_train, train_pred) * 100
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        # Validation phase
        val_pred = model.predict(X_test)
        val_loss = history['validation_1']['mlogloss'][-1]
        val_acc = accuracy_score(y_test, val_pred) * 100
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        # Report before the early-stopping check so the last epoch's metrics
        # are logged as well as recorded in the history lists.
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
        print('--------------------')

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # save_model() returns None, so keep the snapshot itself as the
            # return value and persist it separately.
            best_model = booster.copy()
            best_model.save_model("best_model.json")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping after {epoch+1} epochs.")
                break

    end_time = time.time()
    print(f"Training took {end_time - start_time:.2f} seconds.")

    return best_model, train_losses, val_losses, train_accuracies, val_accuracies

# Upper bound on the number of training epochs
num_epochs = 50

# Run training and keep the per-epoch loss/accuracy history
best_model, train_losses, val_losses, train_accuracies, val_accuracies = train_model(
    model, X_train, y_train, X_test, y_test, num_epochs=num_epochs
)

# Restore the checkpoint that training wrote to disk
model.load_model("best_model.json")

# Score the restored model on the held-out split
y_pred = model.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Confusion matrix, normalised over the true labels, drawn as a heatmap
conf_mat = confusion_matrix(y_test, y_pred, normalize='true')
_, cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=_markers,
    yticklabels=_markers,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title(f"XGBoost - Balanced Accuracy: {balanced_acc:.4f}, F1: {f1:.4f}")
plt.show()

# Learning curves: loss on the left panel, accuracy on the right panel
epochs = range(1, len(train_losses) + 1)

_, curve_axes = plt.subplots(1, 2, figsize=(12, 4))
curve_panels = (
    (train_losses, 'Training Loss', val_losses, 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    (train_accuracies, 'Training Accuracy', val_accuracies, 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)
for panel_ax, panel in zip(curve_axes, curve_panels):
    train_series, train_label, val_series, val_label, y_label, panel_title = panel
    panel_ax.plot(epochs, train_series, label=train_label)
    panel_ax.plot(epochs, val_series, label=val_label)
    panel_ax.set_xlabel('Epochs')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()