<a href="https://colab.research.google.com/github/LoPA607/IE643/blob/main/programming_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate this session with Kaggle before any download is attempted.
# NOTE(review): presumably prompts for credentials interactively — confirm
# against the kagglehub documentation before running unattended.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition files and keep the location kagglehub reports.
# NOTE(review): the data-loading cell reads the CSVs from a hard-coded
# "/kaggle/input/ie-643-2025-programming-challenge-1/" path rather than from
# this variable — confirm the two locations agree in the target environment.
ie_643_2025_programming_challenge_1_path = kagglehub.competition_download('ie-643-2025-programming-challenge-1')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from scipy.stats import skew, kurtosis

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# ================================
# Load the competition CSV files
# ================================
print("Loading Train Data & Labels...")
# Features and labels are stored in two separate CSV files.
train_data = pd.read_csv(
    "/kaggle/input/ie-643-2025-programming-challenge-1/Train_Data.csv"
)  # shape noted by the author: (68599, 1024)
train_labels = pd.read_csv(
    "/kaggle/input/ie-643-2025-programming-challenge-1/Train_Labels.csv"
)  # shape noted by the author: (68599, 25)
for caption, frame in (("Train Data:", train_data), ("Train Labels:", train_labels)):
    print(caption, frame.shape)

print("Loading Test Data...")
test_data = pd.read_csv(
    "/kaggle/input/ie-643-2025-programming-challenge-1/Test_Data.csv"
)  # shape noted by the author: (17150, 1025) = ID column + features
print("Test Data:", test_data.shape)

# Separate the identifier column from the numeric feature matrix; the IDs are
# kept so they can be attached to the predictions later.
test_ids = test_data["ID"].values
test_features_frame = test_data.drop("ID", axis=1)
X_test = test_features_frame.values.astype(np.float32)


In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

In [None]:
# Raw training matrices as float32 NumPy arrays.
X = train_data.values.astype(np.float32)
y = train_labels.values.astype(np.float32)

# Step 1 — project onto 50 principal components so the outlier detector
# works in a much smaller space than the raw feature matrix.
pca = PCA(n_components=50, random_state=42)
X_reduced = pca.fit_transform(X)

# Step 2 — flag outliers; contamination=0.2 encodes the assumption that
# roughly 20% of the training rows are noisy.
iso = IsolationForest(contamination=0.2, random_state=42, n_jobs=-1)
outlier_flags = iso.fit_predict(X_reduced)  # 1 = inlier (clean), -1 = outlier (noisy)

# Step 3 — keep only the rows flagged as inliers, for features and labels alike.
clean_idx = np.flatnonzero(outlier_flags == 1)
X_clean = X[clean_idx]
y_clean = y[clean_idx]

n_total = X.shape[0]
n_kept = X_clean.shape[0]
print("Original Train Samples:", n_total)
print("Removed Noisy Samples:", n_total - n_kept)
print("Remaining Clean Samples:", n_kept)

In [None]:
def make_features(X_in, pca=None, iso=None):
    """Build the engineered feature matrix for a 2-D sample matrix.

    Three groups of columns are stacked horizontally, in this order:
    the PCA projection, six per-row statistics (mean, std, min, max,
    skewness, kurtosis) and a single isolation-forest flag column
    (+1 = flagged as anomaly, -1 = normal, i.e. the negated estimator label).

    Args:
        X_in: Array of shape (n_samples, n_features).
        pca: Optional already-fitted PCA-like object exposing ``transform``.
            When ``None`` (default) a new 50-component PCA is fitted on
            ``X_in``, exactly as before.
        iso: Optional already-fitted IsolationForest-like object exposing
            ``predict``. When ``None`` (default) a new forest is fitted on
            ``X_in``, exactly as before.

    Returns:
        Tuple ``(features, pca, iso)`` where ``features`` is the stacked
        matrix and ``pca`` / ``iso`` are the transformers that were used
        (newly fitted ones, or the objects passed in), so the same
        transformers can be applied to other data.
    """
    # PCA features: fit only when no fitted transformer was supplied, so that
    # held-out data can be projected with the training-time components.
    if pca is None:
        pca = PCA(n_components=50, random_state=42)
        pca_feats = pca.fit_transform(X_in)
    else:
        pca_feats = pca.transform(X_in)

    # Per-row summary statistics, each as an (n_samples, 1) column.
    stats = np.hstack([
        X_in.mean(axis=1, keepdims=True),
        X_in.std(axis=1, keepdims=True),
        X_in.min(axis=1, keepdims=True),
        X_in.max(axis=1, keepdims=True),
        skew(X_in, axis=1).reshape(-1, 1),
        kurtosis(X_in, axis=1).reshape(-1, 1),
    ])

    # Anomaly flag: the estimator returns 1 for normal rows and -1 for
    # anomalies; the sign is flipped so anomalies become +1.
    if iso is None:
        iso = IsolationForest(contamination=0.1, random_state=0, n_jobs=-1)
        labels = iso.fit_predict(X_in)
    else:
        labels = iso.predict(X_in)
    flags = (-labels).reshape(-1, 1)

    return np.hstack([pca_feats, stats, flags]), pca, iso
# Fit the feature pipeline (PCA + isolation forest) on the cleaned training
# samples only and keep the fitted transformers for reuse on the test set.
X_feat, pca, iso2 = make_features(X_clean)

# Build the test features with the SAME fitted pca & iso2.
pca_feats_test = pca.transform(X_test)
stats_test = np.hstack([
    X_test.mean(axis=1, keepdims=True),
    X_test.std(axis=1, keepdims=True),
    X_test.min(axis=1, keepdims=True),
    X_test.max(axis=1, keepdims=True),
    skew(X_test, axis=1).reshape(-1, 1),
    kurtosis(X_test, axis=1).reshape(-1, 1),
])
# Use predict(), not fit_predict(): fit_predict() would refit the forest on
# the test set, so the anomaly flag would no longer mean the same thing as
# the training-time column (and test data would influence the features).
scores_test = -iso2.predict(X_test).reshape(-1, 1)

# Column order must match make_features: PCA, statistics, anomaly flag.
X_test_feat = np.hstack([pca_feats_test, stats_test, scores_test])

print("Engineered Train Features:", X_feat.shape)
print("Engineered Test Features :", X_test_feat.shape)

In [None]:
# Hold out 15% of the engineered training features for validation; the fixed
# seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_feat,
    y_clean,
    test_size=0.15,
    random_state=42,
)

# Standardise with statistics learned from the training split only, then
# apply that same transform to the validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test_feat = [scaler.transform(part) for part in (X_val, X_test_feat)]

In [None]:
# ================================
# Preprocessing (raw cleaned features)
# ================================
# 85/15 train/validation split of the cleaned samples, reproducible via the seed.
split_parts = train_test_split(X_clean, y_clean, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Feature scaling: the scaler is fitted on the training split alone and then
# reused, unchanged, for the validation and test matrices.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# NOTE: this rebinds X_test to its scaled version, so re-running this cell
# without reloading the data would scale the test matrix a second time.
X_test = scaler.transform(X_test)

# ================================
# Torch Dataset
# ================================
class MultiLabelDataset(Dataset):
    """In-memory dataset of feature rows with optional label rows.

    ``X`` (and ``y`` when given) are NumPy arrays converted to float32
    tensors through ``torch.from_numpy``. Indexing yields the feature tensor
    alone when no labels were supplied, otherwise a ``(features, labels)``
    pair.
    """

    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X).to(torch.float32)
        # Labels are optional so the same class can wrap the unlabeled test set.
        self.y = torch.from_numpy(y).to(torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

# Training split: labeled, reshuffled on every pass over the loader.
train_ds = MultiLabelDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_ds, batch_size=128, shuffle=True)

# Validation split: labeled, fixed order.
val_ds = MultiLabelDataset(X_val, y_val)
val_loader = DataLoader(dataset=val_ds, batch_size=256, shuffle=False)

# Test split: no labels; order is preserved so predictions stay aligned with
# the rows (and IDs) of the test file.
test_ds = MultiLabelDataset(X_test)
test_loader = DataLoader(dataset=test_ds, batch_size=256, shuffle=False)


In [None]:
# ================================
# Define MLP Model
# ================================
import torch
import torch.nn as nn
import torch.nn.functional as F

# ================================
# Define Deeper MLP with Skip Connections
# ================================
class MLP(nn.Module):
    """Six-hidden-layer MLP with two concatenation skip connections.

    Every hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout. The fifth
    layer consumes the concatenation of the layer-4 and layer-2 activations,
    and the sixth layer consumes the concatenation of the layer-5 and layer-1
    activations. The output head returns raw logits (no activation applied).

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Sequence with the widths of the six hidden layers. Only
            the first six entries are used; extra entries are ignored.
        output_dim: Number of output logits.
        dropout: Dropout probability shared by all hidden layers.

    Raises:
        ValueError: If ``hidden_dims`` has fewer than six entries.
    """

    def __init__(self, input_dim=1024, hidden_dims=(1024, 768, 512, 384, 256, 128), output_dim=25, dropout=0.3):
        super().__init__()

        # The default is an immutable tuple (not a list) so it can never be
        # mutated and shared between instances.
        if len(hidden_dims) < 6:
            raise ValueError(
                f"hidden_dims must provide 6 layer widths, got {len(hidden_dims)}"
            )
        d1, d2, d3, d4, d5, d6 = hidden_dims[:6]

        self.fc1 = nn.Linear(input_dim, d1)
        self.bn1 = nn.BatchNorm1d(d1)

        self.fc2 = nn.Linear(d1, d2)
        self.bn2 = nn.BatchNorm1d(d2)

        self.fc3 = nn.Linear(d2, d3)
        self.bn3 = nn.BatchNorm1d(d3)

        self.fc4 = nn.Linear(d3, d4)
        self.bn4 = nn.BatchNorm1d(d4)

        # Input width grows by d2 because layer-2 activations are concatenated.
        self.fc5 = nn.Linear(d4 + d2, d5)
        self.bn5 = nn.BatchNorm1d(d5)

        # Input width grows by d1 because layer-1 activations are concatenated.
        self.fc6 = nn.Linear(d5 + d1, d6)
        self.bn6 = nn.BatchNorm1d(d6)

        self.out = nn.Linear(d6, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for input (batch, input_dim)."""
        h1 = F.relu(self.bn1(self.fc1(x)))
        h1 = self.dropout(h1)

        h2 = F.relu(self.bn2(self.fc2(h1)))
        h2 = self.dropout(h2)

        h3 = F.relu(self.bn3(self.fc3(h2)))
        h3 = self.dropout(h3)

        h4 = F.relu(self.bn4(self.fc4(h3)))
        h4 = self.dropout(h4)

        # Skip connections: concatenate earlier activations along the feature axis.
        h5_in = torch.cat([h4, h2], dim=1)
        h5 = F.relu(self.bn5(self.fc5(h5_in)))
        h5 = self.dropout(h5)

        h6_in = torch.cat([h5, h1], dim=1)
        h6 = F.relu(self.bn6(self.fc6(h6_in)))
        h6 = self.dropout(h6)

        return self.out(h6)



from torch.optim.lr_scheduler import CosineAnnealingLR

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The input width is taken from the scaled training matrix so the network
# always matches the features it is trained on.
model = MLP(input_dim=X_train.shape[1])
model = model.to(device)

# Loss operates on raw logits, matching the model's un-activated output head.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, weight_decay=1e-5)

# Cosine annealing of the learning rate with T_max=10 scheduler steps.
scheduler = CosineAnnealingLR(optimizer, T_max=10)

# ================================
# Training Loop
# ================================
def evaluate(model, loader):
    """Return the mean per-label ROC-AUC of ``model`` over ``loader``.

    The model is switched to eval mode and run without gradient tracking on
    every ``(features, labels)`` batch; sigmoid probabilities are collected
    and scored label by label. Labels for which ``roc_auc_score`` raises
    ``ValueError`` contribute NaN and are ignored by the final ``nanmean``.
    Batches are moved to the module-level ``device``.
    """
    model.eval()
    targets, probs = [], []
    with torch.no_grad():
        for features, labels in loader:
            logits = model(features.to(device))
            probs.append(torch.sigmoid(logits).cpu().numpy())
            targets.append(labels.numpy())
    targets = np.vstack(targets)
    probs = np.vstack(probs)

    def _label_auc(col):
        # AUC for a single label column; NaN when it cannot be computed.
        try:
            return roc_auc_score(targets[:, col], probs[:, col])
        except ValueError:
            return np.nan

    per_label = [_label_auc(col) for col in range(targets.shape[1])]
    return np.nanmean(per_label)


In [None]:
# Train for a fixed number of epochs; each epoch runs one pass over the
# training loader, one pass over the validation loader, then advances the
# learning-rate scheduler by one step.
EPOCHS = 10
for epoch in range(1, EPOCHS+1):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}")
    for xb, yb in loop:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch figure is a
        # true per-sample average even when the last batch is smaller.
        train_loss += loss.item() * xb.size(0)
        loop.set_postfix(loss=loss.item())

    train_loss /= len(train_loader.dataset)

    # ---- validation pass (no gradients, eval-mode batch norm / dropout) ----
    model.eval()
    val_loss = 0.0
    ys, preds = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            prob = torch.sigmoid(out).cpu().numpy()
            loss = criterion(out, yb)
            val_loss += loss.item() * xb.size(0)
            ys.append(yb.cpu().numpy())
            preds.append(prob)

    val_loss /= len(val_loader.dataset)
    ys = np.vstack(ys)
    preds = np.vstack(preds)

    # Per-label ROC-AUC; labels for which the score cannot be computed
    # (roc_auc_score raises ValueError) are recorded as NaN and skipped
    # by nanmean.
    aucs = []
    for i in range(ys.shape[1]):
        try:
            auc = roc_auc_score(ys[:, i], preds[:, i])
        except ValueError:
            auc = np.nan
        aucs.append(auc)
    val_auc = np.nanmean(aucs)

    # Capture the learning rate this epoch was actually trained with BEFORE
    # the scheduler advances; reading it after step() would report the next
    # epoch's rate (and 0 after the final step of the cosine cycle).
    epoch_lr = optimizer.param_groups[0]['lr']

    # step scheduler (after each epoch)
    scheduler.step()

    print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, "
          f"Val Loss={val_loss:.4f}, Val ROC-AUC={val_auc:.4f}, "
          f"LR={epoch_lr:.6f}")


In [None]:
# ================================
# Inference on Test Data
# ================================
# Eval mode + no_grad: deterministic forward passes without gradient tracking.
model.eval()
with torch.no_grad():
    batch_probs = [
        torch.sigmoid(model(xb.to(device))).cpu().numpy()
        for xb in test_loader
    ]

# One row of per-label probabilities per test sample
# (shape noted by the author: (17150, 25)).
all_preds = np.vstack(batch_probs)

# Binarise the probabilities: a label is predicted when its probability
# is strictly greater than 0.5.
final_labels = (all_preds > 0.5).astype(int)

# ================================
# Build Submission File
# ================================
# Column names are taken from the training label file; the ID column goes first.
label_cols = list(train_labels.columns)
submission = pd.DataFrame(final_labels, columns=label_cols)
submission.insert(0, "ID", test_ids)

# Kaggle kernels must write outputs under /kaggle/working.
submission_path = "/kaggle/working/submission5.csv"
submission.to_csv(submission_path, index=False)

print(submission.head())
