In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# === Load data ===
# Column layout shared by both CSVs: columns 4..37 are the model inputs and
# everything from column 38 onwards is the label block.
FEATURE_COLS = slice(4, 38)
LABEL_COLS = slice(38, None)

dataTrain = pd.read_csv("allAtt_onehot_large_train_new8.csv")
dataTest = pd.read_csv("allAtt_onehot_large_test_new8.csv")

x_train = dataTrain.iloc[:, FEATURE_COLS].values
y_train = dataTrain.iloc[:, LABEL_COLS].values
x_test = dataTest.iloc[:, FEATURE_COLS].values
y_test = dataTest.iloc[:, LABEL_COLS].values

# Collapse each label row to an integer class id (position of its largest entry).
y_train_int = y_train.argmax(axis=1)
y_test_int = y_test.argmax(axis=1)

print(f"Train set shape: {x_train.shape}, Test set shape: {x_test.shape}")
print(f"Class distribution — Train: {np.bincount(y_train_int)}, Test: {np.bincount(y_test_int)}")

# === Generate stacking features via cross-validation ===
n_folds = 5
n_classes = 2
n_train, n_test = x_train.shape[0], x_test.shape[0]

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Out-of-fold class probabilities for every training row, one array per base model.
train_pred_shape = (n_train, n_classes)
xgb_train_preds = np.zeros(train_pred_shape)
lgb_train_preds = np.zeros(train_pred_shape)
rf_train_preds = np.zeros(train_pred_shape)

# Test-set class probabilities; each fold contributes 1/n_folds of the final value.
test_pred_shape = (n_test, n_classes)
xgb_test_preds = np.zeros(test_pred_shape)
lgb_test_preds = np.zeros(test_pred_shape)
rf_test_preds = np.zeros(test_pred_shape)

print("Starting cross-validation to generate stacking features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(x_train)):
    print(f"Processing fold {fold+1}/{n_folds}...")

    X_fold_train = x_train[train_idx]
    X_fold_val = x_train[val_idx]
    y_fold_train = y_train_int[train_idx]

    # Fresh, identically-configured base models for this fold.
    fold_xgb = XGBClassifier(
        objective="multi:softprob",
        num_class=2,
        eval_metric="mlogloss",
        use_label_encoder=False,
        random_state=42,
    )
    fold_lgb = LGBMClassifier(objective='multiclass', num_class=2, random_state=42)
    fold_rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # (model, out-of-fold buffer, test accumulator) for each base learner.
    fold_learners = (
        (fold_xgb, xgb_train_preds, xgb_test_preds),
        (fold_lgb, lgb_train_preds, lgb_test_preds),
        (fold_rf, rf_train_preds, rf_test_preds),
    )

    for learner, _, _ in fold_learners:
        learner.fit(X_fold_train, y_fold_train)

    for learner, oof_buffer, test_accumulator in fold_learners:
        # Rows of the held-out fold get predictions from a model that never saw them.
        oof_buffer[val_idx] = learner.predict_proba(X_fold_val)
        # In-place add on the ndarray, so the module-level array is updated.
        test_accumulator += learner.predict_proba(x_test) / n_folds

# === Prepare stacking features for LSTM ===
n_models = 3  # XGBoost, LightGBM, RandomForest

oof_by_model = (xgb_train_preds, lgb_train_preds, rf_train_preds)
test_by_model = (xgb_test_preds, lgb_test_preds, rf_test_preds)

# Stacking on a new axis 1 gives arrays of shape (rows, models, classes).
train_probs_reshaped = np.stack(oof_by_model, axis=1)
test_probs_reshaped = np.stack(test_by_model, axis=1)

print(f"Reshaped stacking features — Train: {train_probs_reshaped.shape}, Test: {test_probs_reshaped.shape}")

# === Evaluate base learners on CV predictions ===
base_models = ['XGBoost', 'LightGBM', 'RandomForest']
# Hard labels from the out-of-fold probabilities, in the same order as base_models.
base_preds = [probs.argmax(axis=1) for probs in oof_by_model]

print("\nBase learner CV performance:")
for name, preds in zip(base_models, base_preds):
    acc = accuracy_score(y_train_int, preds)
    print(f"{name} CV Accuracy: {acc:.4f}")

# === Define LSTM stacking meta-learner ===
class LSTMStack(nn.Module):
    """Attention-pooled (bi)LSTM classifier used as the stacking meta-learner.

    Input is a batch of sequences shaped ``[batch, seq_len, input_size]``.
    The LSTM output at every step is scored by a small MLP, the scores are
    soft-maxed over the sequence axis, and the weighted sum of LSTM outputs
    is fed to the classification head.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it probabilities would apply
    softmax twice and flatten the gradients. Use ``predict_proba`` when
    normalised class probabilities are needed; ``argmax`` gives the same
    labels for logits and probabilities.

    Args:
        input_size: Feature size of each sequence element.
        hidden_size: LSTM hidden size (per direction) and head width.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout used between LSTM layers (only when
            ``num_layers > 1``) and inside the classification head.
        bidirectional: Whether the LSTM runs in both directions.
        output_dim: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.1, bidirectional=True, output_dim=2):
        super().__init__()
        self.directions = 2 if bidirectional else 1
        lstm_out_dim = hidden_size * self.directions

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer network.
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # Produces one scalar relevance score per sequence step.
        self.attention = nn.Sequential(
            nn.Linear(lstm_out_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1)
        )

        self.fc = nn.Sequential(
            nn.Linear(lstm_out_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_dim)
        )

        # Not applied in forward(); used by predict_proba() only.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``[batch, output_dim]``."""
        # x: [batch, seq_len, input_size]
        lstm_out, _ = self.lstm(x)  # [batch, seq_len, hidden*dirs]
        attn_scores = self.attention(lstm_out)  # [batch, seq_len, 1]
        # Normalise over the sequence axis so the weights of each sample sum to 1.
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)  # [batch, hidden*dirs]
        return self.fc(context)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits) for ``x``."""
        return self.softmax(self.forward(x))

# Convert to tensors and DataLoader
train_tensor = torch.FloatTensor(train_probs_reshaped)
test_tensor  = torch.FloatTensor(test_probs_reshaped)
y_train_t    = torch.LongTensor(y_train_int)
y_test_t     = torch.LongTensor(y_test_int)

# Hold out 20% of the training rows for validation. The held-out rows are
# excluded from the training loader; otherwise early stopping and the LR
# scheduler would be driven by data the model has already been fitted on.
# A seeded random permutation is used instead of the file tail so the split
# does not depend on the row order of the CSV.
val_size = int(0.2 * len(train_tensor))
if not 0 < val_size < len(train_tensor):
    raise ValueError("Not enough training rows to carve out a validation split")
split_perm = torch.randperm(len(train_tensor), generator=torch.Generator().manual_seed(42))
stack_val_rows, stack_fit_rows = split_perm[:val_size], split_perm[val_size:]

train_ds = TensorDataset(train_tensor[stack_fit_rows], y_train_t[stack_fit_rows])
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

# Initialize LSTMStack
input_size = n_classes
hidden_size = 64
model = LSTMStack(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=2,
    dropout=0.3,
    bidirectional=True,
    output_dim=2
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# `verbose=` is deprecated in recent PyTorch releases; LR reductions are
# reported explicitly in the training loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# === Train LSTM stacking meta-learner ===
epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# The validation tensors never change, so move them to the device once.
val_inputs = train_tensor[stack_val_rows].to(device)
val_labels = y_train_t[stack_val_rows].to(device)
val_labels_np = val_labels.cpu().numpy()

best_val_loss = float('inf')
patience = 10  # epochs without validation improvement before stopping
counter = 0
best_model_path = 'best_lstm_stack_model.pt'

train_losses = []
val_accuracies = []

print("\nTraining LSTM stacking meta-learner...")
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Validation on the held-out 20% of the training data
    model.eval()
    with torch.no_grad():
        val_outputs = model(val_inputs)
        # Keep a plain float so no tensor is retained between epochs.
        val_loss = criterion(val_outputs, val_labels).item()
        _, val_preds = torch.max(val_outputs, 1)

    val_acc = accuracy_score(val_labels_np, val_preds.cpu().numpy())
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_acc:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        # Checkpoint the best weights; they are restored after the loop.
        torch.save(model.state_dict(), best_model_path)
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after < lr_before:
        print(f"Reducing learning rate to {lr_after:.4e}")

# Load best model
model.load_state_dict(torch.load(best_model_path, map_location=device))

# === Plot training history ===
# One row, two panels: per-epoch training loss (left), validation accuracy (right).
history_fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(train_losses)
loss_ax.set_title('Training Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')

acc_ax.plot(val_accuracies)
acc_ax.set_title('Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')

history_fig.tight_layout()
history_fig.savefig('lstm_training_history.png')
plt.close(history_fig)

# === Evaluate LSTM stacking meta-learner ===
model.eval()
with torch.no_grad():
    outputs = model(test_tensor.to(device))
# Predicted class = index of the largest output per row.
preds_stack = outputs.max(dim=1).indices.cpu().numpy()

stack_acc = accuracy_score(y_test_int, preds_stack)
stack_report = classification_report(y_test_int, preds_stack)
stack_cm = confusion_matrix(y_test_int, preds_stack)

print("\n=== LSTM Stacking Ensemble Performance ===")
print(f"✅ Accuracy: {stack_acc:.4f}")
print("📊 Classification Report:")
print(stack_report)

# Confusion-matrix heatmap, saved to disk rather than shown.
stack_cm_fig, stack_cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    stack_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['0', '1'],
    yticklabels=['0', '1'],
    ax=stack_cm_ax,
)
stack_cm_ax.set_xlabel('Predicted Label')
stack_cm_ax.set_ylabel('True Label')
stack_cm_ax.set_title('LSTM Stacking Confusion Matrix')
stack_cm_fig.savefig('lstm_stack_confusion_matrix.png')
plt.close(stack_cm_fig)

# ============ Single LSTM Model ============
print("\n=== Training standalone LSTM model ===")

# Each row becomes a length-1 sequence: (rows, time_steps, features).
time_steps = 1
feature_dim = x_train.shape[1]

x_train_lstm = np.reshape(x_train, (n_train, time_steps, feature_dim))
x_test_lstm = np.reshape(x_test, (n_test, time_steps, feature_dim))

x_train_tensor = torch.FloatTensor(x_train_lstm)
x_test_tensor = torch.FloatTensor(x_test_lstm)

lstm_ds = TensorDataset(x_train_tensor, y_train_t)
lstm_loader = DataLoader(dataset=lstm_ds, batch_size=64, shuffle=True)

class SimpleLSTM(nn.Module):
    """(Bi)LSTM classifier that classifies from the last sequence step.

    Input is a batch of sequences shaped ``[batch, seq_len, input_size]``.
    The LSTM output at the final step is passed through a small MLP head.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it probabilities would apply
    softmax twice and flatten the gradients. Use ``predict_proba`` when
    normalised class probabilities are needed; ``argmax`` gives the same
    labels for logits and probabilities.

    Args:
        input_size: Feature size of each sequence element.
        hidden_size: LSTM hidden size (per direction) and head width.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout used between LSTM layers (only when
            ``num_layers > 1``) and inside the classification head.
        bidirectional: Whether the LSTM runs in both directions.
        output_dim: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.1, bidirectional=True, output_dim=2):
        super().__init__()
        self.directions = 2 if bidirectional else 1

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is disabled
            # for a single-layer network.
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_size * self.directions, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_dim)
        )

        # Not applied in forward(); used by predict_proba() only.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``[batch, output_dim]``."""
        lstm_out, _ = self.lstm(x)  # [batch, seq_len, hidden*dirs]
        last_out = lstm_out[:, -1, :]  # output at the final sequence step
        return self.fc(last_out)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits) for ``x``."""
        return self.softmax(self.forward(x))

single_lstm = SimpleLSTM(
    input_size=feature_dim,
    hidden_size=hidden_size,
    num_layers=2,
    dropout=0.1,
    bidirectional=True,
    output_dim=2
).to(device)

lstm_criterion = nn.CrossEntropyLoss()
lstm_optimizer = optim.Adam(single_lstm.parameters(), lr=0.001, weight_decay=1e-5)
# `verbose=` is deprecated in recent PyTorch releases; LR reductions are
# reported explicitly in the training loop instead.
lstm_scheduler = optim.lr_scheduler.ReduceLROnPlateau(lstm_optimizer, mode='min', factor=0.5, patience=3)

# Hold out 20% of the training rows for validation and train on the rest.
# Validating on rows that are also in the training loader would make early
# stopping and LR scheduling track training loss instead of generalisation.
# A seeded random permutation keeps the split independent of CSV row order.
lstm_val_size = int(0.2 * len(x_train_tensor))
if not 0 < lstm_val_size < len(x_train_tensor):
    raise ValueError("Not enough training rows to carve out a validation split")
lstm_perm = torch.randperm(len(x_train_tensor), generator=torch.Generator().manual_seed(42))
lstm_val_rows, lstm_fit_rows = lstm_perm[:lstm_val_size], lstm_perm[lstm_val_size:]

lstm_fit_loader = DataLoader(
    TensorDataset(x_train_tensor[lstm_fit_rows], y_train_t[lstm_fit_rows]),
    batch_size=lstm_loader.batch_size,  # same batch size as the full-data loader
    shuffle=True,
)

# The validation tensors never change, so move them to the device once.
v_inputs = x_train_tensor[lstm_val_rows].to(device)
v_labels = y_train_t[lstm_val_rows].to(device)
v_labels_np = v_labels.cpu().numpy()

lstm_epochs = 50
best_val_loss2 = float('inf')
patience2 = 10  # epochs without validation improvement before stopping
counter2 = 0
best_single_path = 'best_single_lstm_model.pt'

train_losses2 = []
val_acc2 = []

for epoch in range(lstm_epochs):
    single_lstm.train()
    total = 0.0

    for inputs, labels in lstm_fit_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = single_lstm(inputs)
        loss = lstm_criterion(outputs, labels)

        lstm_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(single_lstm.parameters(), max_norm=1.0)
        lstm_optimizer.step()

        total += loss.item()

    avg = total / len(lstm_fit_loader)
    train_losses2.append(avg)

    single_lstm.eval()
    with torch.no_grad():
        v_out = single_lstm(v_inputs)
        # Keep a plain float so no tensor is retained between epochs.
        v_loss = lstm_criterion(v_out, v_labels).item()
        _, v_preds = torch.max(v_out, 1)

    v_acc = accuracy_score(v_labels_np, v_preds.cpu().numpy())
    val_acc2.append(v_acc)

    print(f"Epoch {epoch+1}/{lstm_epochs}, Loss: {avg:.4f}, Val Acc: {v_acc:.4f}")

    if v_loss < best_val_loss2:
        best_val_loss2 = v_loss
        counter2 = 0
        # Checkpoint the best weights; they are restored after the loop.
        torch.save(single_lstm.state_dict(), best_single_path)
    else:
        counter2 += 1
        if counter2 >= patience2:
            print(f"Early stopping at epoch {epoch+1}")
            break

    lr_before2 = lstm_optimizer.param_groups[0]['lr']
    lstm_scheduler.step(v_loss)
    lr_after2 = lstm_optimizer.param_groups[0]['lr']
    if lr_after2 < lr_before2:
        print(f"Reducing learning rate to {lr_after2:.4e}")

single_lstm.load_state_dict(torch.load(best_single_path, map_location=device))

# Plot standalone LSTM history
# One row, two panels: per-epoch training loss (left), validation accuracy (right).
single_fig, (single_loss_ax, single_acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

single_loss_ax.plot(train_losses2)
single_loss_ax.set_title('Standalone LSTM Training Loss')
single_loss_ax.set_xlabel('Epoch')
single_loss_ax.set_ylabel('Loss')

single_acc_ax.plot(val_acc2)
single_acc_ax.set_title('Standalone LSTM Validation Accuracy')
single_acc_ax.set_xlabel('Epoch')
single_acc_ax.set_ylabel('Accuracy')

single_fig.tight_layout()
single_fig.savefig('single_lstm_training_history.png')
plt.close(single_fig)

# Evaluate standalone LSTM
single_lstm.eval()
with torch.no_grad():
    outs = single_lstm(x_test_tensor.to(device))
# Predicted class = index of the largest output per row.
preds_single = outs.max(dim=1).indices.cpu().numpy()

single_acc = accuracy_score(y_test_int, preds_single)
single_report = classification_report(y_test_int, preds_single)
single_cm = confusion_matrix(y_test_int, preds_single)

print("\n=== Standalone LSTM Performance ===")
print(f"✅ Accuracy: {single_acc:.4f}")
print("📊 Classification Report:")
print(single_report)

# Confusion-matrix heatmap, saved to disk rather than shown.
single_cm_fig, single_cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    single_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['0', '1'],
    yticklabels=['0', '1'],
    ax=single_cm_ax,
)
single_cm_ax.set_xlabel('Predicted Label')
single_cm_ax.set_ylabel('True Label')
single_cm_ax.set_title('Standalone LSTM Confusion Matrix')
single_cm_fig.savefig('single_lstm_confusion_matrix.png')
plt.close(single_cm_fig)

# === Majority Voting Ensemble ===
# Hard test-set labels per base model, taken from the fold-averaged probabilities.
test_base_preds = [
    fold_avg_probs.argmax(axis=1)
    for fold_avg_probs in (xgb_test_preds, lgb_test_preds, rf_test_preds)
]

# One row per test sample, one column per base model.
votes = np.column_stack(test_base_preds)
# The most frequent label in each row wins.
vote_result = np.array([np.bincount(ballot).argmax() for ballot in votes])

vote_acc = accuracy_score(y_test_int, vote_result)
vote_report = classification_report(y_test_int, vote_result)

print("\n=== Majority Voting Performance ===")
print(f"✅ Accuracy: {vote_acc:.4f}")
print("📊 Classification Report:")
print(vote_report)

# === Baseline model reports ===
# Per-model test accuracy and classification report for each base learner.
print("\n=== Baseline Model Reports ===")
for name, preds in zip(base_models, test_base_preds):
    baseline_acc = accuracy_score(y_test_int, preds)
    baseline_report = classification_report(y_test_int, preds)
    print(f"\n---- {name} ----")
    print(f"Accuracy: {baseline_acc:.4f}")
    print("Classification Report:")
    print(baseline_report)

# === Compare all models ===
base_accs = [accuracy_score(y_test_int, preds) for preds in test_base_preds]
model_names = [*base_models, 'Voting', 'LSTM Stack', 'Single LSTM']
model_accs = [*base_accs, vote_acc, stack_acc, single_acc]

comparison_fig, comparison_ax = plt.subplots(figsize=(12, 6))
bars = comparison_ax.bar(model_names, model_accs)

# Write each accuracy just above its bar.
for bar, acc in zip(bars, model_accs):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    comparison_ax.text(label_x, label_y, f'{acc:.4f}', ha='center', va='bottom')

# Leave headroom above the tallest bar for its label.
comparison_ax.set_ylim(0, max(model_accs) + 0.1)
comparison_ax.set_xlabel('Model')
comparison_ax.set_ylabel('Accuracy')
comparison_ax.set_title('Test Set Performance Comparison')
plt.xticks(rotation=45, ha='right')
comparison_fig.tight_layout()
comparison_fig.savefig('all_models_comparison.png')
plt.close(comparison_fig)

print("\n=== Test Set Performance Comparison ===")
for name, acc in zip(model_names, model_accs):
    print(f"{name}: {acc:.4f}")

print("\nAll training and evaluation complete!")


Train set shape: (4940, 34), Test set shape: (380, 34)
Class distribution — Train: [2237 2703], Test: [175 205]
Starting cross-validation to generate stacking features...
Processing fold 1/5...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2187
[LightGBM] [Info] Number of data points in the train set: 3952, number of used features: 34
[LightGBM] [Info] Start training from score -0.800421
[LightGBM] [Info] Start training from score -0.596274
Processing fold 2/5...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2187
[LightGBM] [Info] Number of data points in the train set: 3952,