In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# === 1. Load the data ===
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
data = np.load("training_data.npz", allow_pickle=True)
feature_labels = data["feature_labels"]
X = np.array(data["data"], dtype=np.float32)  # important: cast to float32

# Load the labels: the first CSV column is used as the label vector
labels_frame = pd.read_csv("training_labels.csv")
y = labels_frame.iloc[:, 0].values

In [67]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# X is assumed to be the raw input of shape (n_samples, 12, 77)
n_features = X.shape[2]

# Step 1: mean imputation. Each feature's NaNs are replaced by that feature's
# mean taken over all samples and time steps.
X_imputed = X.copy()
for feat_idx in range(n_features):
    raw_slice = X[:, :, feat_idx]
    X_imputed[:, :, feat_idx] = np.nan_to_num(raw_slice, nan=np.nanmean(raw_slice))

# Step 2: outlier handling. Values are clipped (not dropped) to mean ± 5 * std,
# with mean/std computed per feature on the imputed data.
X_no_outliers = X_imputed.copy()
for feat_idx in range(n_features):
    imputed_slice = X_no_outliers[:, :, feat_idx]
    center = np.nanmean(imputed_slice)
    spread = np.nanstd(imputed_slice)
    X_no_outliers[:, :, feat_idx] = np.clip(
        imputed_slice, center - 5 * spread, center + 5 * spread
    )

# Step 3: standard scaling per feature, pooling all samples and time steps.
# The array is flattened to (n * t, f) for the scaler and reshaped back after.
n, t, f = X_no_outliers.shape
scaler = StandardScaler()
X_flat = X_no_outliers.reshape(-1, f)
X_scaled_flat = scaler.fit_transform(X_flat)
X_ready = X_scaled_flat.reshape(n, t, f)  # final shape: (n_samples, 12, 77)


In [68]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split; y is expected to be a 1D numpy array of 0/1 labels
X_train, X_val, y_train, y_val = train_test_split(
    X_ready,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)


Train shape: (42921, 12, 77)
Val shape: (10731, 12, 77)


In [69]:
from torch.utils.data import Sampler
import random
import numpy as np

class BalancedBatchSampler(Sampler):
    """Batch sampler yielding batches with equal numbers of positives and negatives.

    Every batch holds ``batch_size // 2`` indices whose label is 1 and the same
    number whose label is 0, so an odd ``batch_size`` produces batches of
    ``batch_size - 1`` indices. An epoch ends as soon as either class cannot
    fill another half batch; the remaining indices are not used in that epoch.

    Args:
        labels: array-like of 0/1 labels, one per dataset item.
        batch_size: requested batch size; must be at least 2.

    Raises:
        ValueError: if ``batch_size`` is smaller than 2 (a half batch would be
            empty, which previously led to an endless stream of empty batches
            and a division by zero in ``__len__``).
    """

    def __init__(self, labels, batch_size):
        if batch_size < 2:
            raise ValueError(
                f"batch_size must be >= 2 to hold one sample per class, got {batch_size}"
            )
        self.labels = np.array(labels)
        self.batch_size = batch_size
        self.pos_indices = np.where(self.labels == 1)[0].tolist()
        self.neg_indices = np.where(self.labels == 0)[0].tolist()
        self.batch_half = batch_size // 2

    def __iter__(self):
        # Reshuffle both classes at the start of every epoch
        random.shuffle(self.pos_indices)
        random.shuffle(self.neg_indices)

        half = self.batch_half
        for batch_idx in range(len(self)):
            start = batch_idx * half
            stop = start + half
            batch = self.pos_indices[start:stop] + self.neg_indices[start:stop]
            # Mix the two classes so positives are not always first in the batch
            random.shuffle(batch)
            yield batch

    def __len__(self):
        # Number of full balanced batches the smaller class can supply
        return min(len(self.pos_indices), len(self.neg_indices)) // self.batch_half


In [70]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split



# Custom Dataset class
class DiabetesDataset(Dataset):
    """Dataset pairing each sample in ``X`` with its label in ``y``.

    Both inputs are copied into float32 tensors when the dataset is built.
    """

    def __init__(self, X, y):
        # X and y are expected to be NumPy arrays (or anything torch.tensor accepts)
        self.X, self.y = (torch.tensor(arr, dtype=torch.float32) for arr in (X, y))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Create datasets
train_dataset = DiabetesDataset(X_train, y_train)
val_dataset = DiabetesDataset(X_val, y_val)

# Class-balanced alternative for training. It has its own name so that it is no
# longer silently overwritten by the shuffled loader below; pass
# `balanced_train_loader` to the training function to train on balanced batches.
train_sampler = BalancedBatchSampler(y_train, batch_size=64)
balanced_train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)

# Loaders used by the training cells: plain shuffled training batches and
# ordered validation batches.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [71]:
import torch
import torch.nn as nn

class ImprovedLSTM(nn.Module):
    """(Bi)LSTM sequence classifier producing one logit per sequence.

    The LSTM output at the last time step is layer-normalised and projected to
    a single logit by a linear layer.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers (passed to ``nn.LSTM``).
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, input_size=77, hidden_size=128, num_layers=2, dropout=0.3, bidirectional=True):
        super(ImprovedLSTM, self).__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional
        )
        # A bidirectional LSTM concatenates both directions, doubling the width
        direction_multiplier = 2 if bidirectional else 1
        self.norm = nn.LayerNorm(hidden_size * direction_multiplier)
        self.fc = nn.Linear(hidden_size * direction_multiplier, 1)

    def forward(self, x):
        """Return logits of shape (batch,) for input of shape (batch, seq_len, input_size)."""
        lstm_out, _ = self.lstm(x)
        # NOTE(review): with bidirectional=True the backward half of this slice
        # has only processed the final time step — confirm this is intended.
        last_output = lstm_out[:, -1, :]
        normalized = self.norm(last_output)
        logits = self.fc(normalized)
        # Drop only the trailing singleton dim: a bare squeeze() would also
        # remove the batch dim for a batch of one and return a 0-d tensor.
        return logits.squeeze(-1)


In [72]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    Per element: ``alpha * (1 - p_t) ** gamma * BCE``, where ``p_t`` is the
    predicted probability of the true class.

    Args:
        alpha: constant multiplier applied to every element.
        gamma: focusing exponent.
        reduction: 'mean' or 'sum'; any other value returns the unreduced loss.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, inputs, targets):
        per_elem_bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        prob_pos = torch.sigmoid(inputs)
        # Probability the model assigns to the true class of each element
        prob_true = prob_pos * targets + (1 - prob_pos) * (1 - targets)
        per_elem = self.alpha * (1 - prob_true) ** self.gamma * per_elem_bce

        if self.reduction == 'mean':
            return per_elem.mean()
        if self.reduction == 'sum':
            return per_elem.sum()
        return per_elem

In [73]:
import torch

# Class counts in the training labels
positive_mask = (y_train == 1)
negative_mask = (y_train == 0)
n_pos = positive_mask.sum()
n_neg = negative_mask.sum()

# pos_weight is the number of negatives per positive, i.e. how much more the
# positive class would have to be weighted to balance the two classes
pos_weight_value = n_neg / n_pos
pos_weight = torch.tensor([pos_weight_value], dtype=torch.float32)

print(f"📊 Positive samples: {n_pos}, Negative samples: {n_neg}")
print(f"⚖️ Using pos_weight = {pos_weight.item():.4f}")


📊 Positive samples: 2714, Negative samples: 40207
⚖️ Using pos_weight = 14.8147


# ***LSTM***

In [74]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module-level model instance. train_model() below builds and trains its own
# model, so this instance is never trained; it is kept only in case other cells
# refer to the global name `model`.
model = ImprovedLSTM().to(device)

# The module-level pos_weight / criterion / optimizer / scheduler that used to
# be created here were removed: train_model() creates its own criterion,
# optimizer and scheduler, and FocalLoss does not take a pos_weight.


def train_model(train_loader, val_loader, y_train, model_factory=None,
                checkpoint_path="best_model_lstm.pth", num_epochs=30):
    """Train a model with focal loss and return it with its best-epoch weights.

    After every epoch the validation F1 is maximised over 81 thresholds in
    [0.1, 0.9]. Whenever that F1 improves, the weights are written to
    ``checkpoint_path`` and remembered together with the threshold.

    Args:
        train_loader: iterable of (X_batch, y_batch) training batches.
        val_loader: iterable of (X_batch, y_batch) validation batches.
        y_train: unused; kept so existing calls keep working (FocalLoss takes
            no class weights).
        model_factory: zero-argument callable returning a fresh ``nn.Module``;
            defaults to ``ImprovedLSTM``.
        checkpoint_path: file the best ``state_dict`` is saved to.
        num_epochs: number of training epochs.

    Returns:
        (model, threshold): the model with the weights of the best epoch loaded
        and the threshold that maximised F1 at that epoch. If no epoch reached
        an F1 above 0, the last-epoch weights and a threshold of 0.5 are
        returned.
    """
    import copy

    if model_factory is None:
        model_factory = ImprovedLSTM
    model = model_factory().to(device)

    criterion = FocalLoss(alpha=1, gamma=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # mode='max' because the scheduler is stepped with the validation F1
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

    def evaluate(model, dataloader):
        """Return (labels, probabilities) for every sample in ``dataloader``."""
        model.eval()
        y_true, y_prob = [], []
        with torch.no_grad():
            for X_batch, y_batch in dataloader:
                logits = model(X_batch.to(device))
                probs = torch.sigmoid(logits).cpu().numpy()
                # atleast_1d: a batch of one sample may come back as a 0-d array
                y_prob.extend(np.atleast_1d(probs))
                y_true.extend(np.atleast_1d(y_batch.numpy()))
        return np.array(y_true), np.array(y_prob)

    thresholds = np.linspace(0.1, 0.9, 81)
    best_f1, best_epoch = 0, 0
    best_thresh = 0.5
    best_state = None

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0  # sum of per-batch mean losses over the epoch
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        y_true, y_prob = evaluate(model, val_loader)
        f1_scores = [f1_score(y_true, y_prob > t) for t in thresholds]
        epoch_thresh = thresholds[np.argmax(f1_scores)]
        best_epoch_f1 = max(f1_scores)
        acc = accuracy_score(y_true, y_prob > 0.5)
        auc = roc_auc_score(y_true, y_prob)

        if best_epoch_f1 > best_f1:
            best_f1 = best_epoch_f1
            best_epoch = epoch
            # Remember the threshold and weights that belong to this epoch so
            # the returned model/threshold match the reported best F1
            best_thresh = epoch_thresh
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), checkpoint_path)

        scheduler.step(best_epoch_f1)

        print(f"Epoch {epoch:02d} | Loss: {total_loss:.4f} | Acc: {acc:.4f} | F1@best_thresh: {best_epoch_f1:.4f} | AUC: {auc:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"\n✅ Best F1-score: {best_f1:.4f} achieved at epoch {best_epoch}")
    return model, best_thresh


trained_model, threshold = train_model(train_loader, val_loader, y_train)


Epoch 01 | Loss: 34.9065 | Acc: 0.9380 | F1@best_thresh: 0.4076 | AUC: 0.8951
Epoch 02 | Loss: 29.0027 | Acc: 0.9369 | F1@best_thresh: 0.4519 | AUC: 0.9048
Epoch 03 | Loss: 27.2355 | Acc: 0.9396 | F1@best_thresh: 0.4409 | AUC: 0.9056
Epoch 04 | Loss: 26.2248 | Acc: 0.9399 | F1@best_thresh: 0.4468 | AUC: 0.9086
Epoch 05 | Loss: 24.8561 | Acc: 0.9412 | F1@best_thresh: 0.4340 | AUC: 0.8994
Epoch 06 | Loss: 23.9555 | Acc: 0.9373 | F1@best_thresh: 0.4467 | AUC: 0.9076
Epoch 07 | Loss: 22.1093 | Acc: 0.9405 | F1@best_thresh: 0.4544 | AUC: 0.9027
Epoch 08 | Loss: 21.1600 | Acc: 0.9409 | F1@best_thresh: 0.4548 | AUC: 0.9016
Epoch 09 | Loss: 20.6512 | Acc: 0.9397 | F1@best_thresh: 0.4492 | AUC: 0.8982
Epoch 10 | Loss: 20.0116 | Acc: 0.9414 | F1@best_thresh: 0.4467 | AUC: 0.8899
Epoch 11 | Loss: 19.5735 | Acc: 0.9391 | F1@best_thresh: 0.4482 | AUC: 0.8908
Epoch 12 | Loss: 18.9067 | Acc: 0.9405 | F1@best_thresh: 0.4387 | AUC: 0.8846
Epoch 13 | Loss: 17.7164 | Acc: 0.9391 | F1@best_thresh: 0.4464 

# ***CNN***

In [75]:
import torch
import torch.nn as nn

class ImprovedCNNModel(nn.Module):
    """Two-layer 1D CNN over the time axis producing one logit per sequence.

    Two Conv1d + BatchNorm + ReLU stages (64 then 128 channels) are followed by
    global average pooling over time, dropout and a linear layer.

    Args:
        input_channels: number of features per time step.
        kernel_size: convolution kernel size (padding is fixed at 1).
        dropout: dropout probability applied before the final linear layer.
    """

    def __init__(self, input_channels=77, kernel_size=3, dropout=0.3):
        super(ImprovedCNNModel, self).__init__()

        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size, padding=1)
        self.bn1 = nn.BatchNorm1d(64)

        self.conv2 = nn.Conv1d(64, 128, kernel_size, padding=1)
        self.bn2 = nn.BatchNorm1d(128)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        self.pool = nn.AdaptiveAvgPool1d(1)  # Global Average Pooling
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        """Return logits of shape (batch,) for input of shape (batch, seq_len, features)."""
        # Conv1d expects channels first: (batch, seq_len, features) → (batch, features, seq_len)
        x = x.permute(0, 2, 1)
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)  # Shape: (batch, 128)
        x = self.dropout(x)
        x = self.fc(x)
        # Drop only the trailing singleton dim: a bare squeeze() would also
        # remove the batch dim for a batch of one and return a 0-d tensor.
        return x.squeeze(-1)  # Output: (batch,)


In [76]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ImprovedCNNModel and FocalLoss must already be defined by earlier cells.
#
# The module-level pos_weight / criterion / optimizer / scheduler that used to
# be created here were removed: the optimizer was bound to a `model` left over
# from an earlier cell, and train_model() below creates its own criterion,
# optimizer and scheduler.

def train_model(train_loader, val_loader, y_train, model_factory=None,
                checkpoint_path="best_model_cnn.pth", num_epochs=30):
    """Train a model with focal loss and return it with its best-epoch weights.

    This definition replaces any ``train_model`` defined earlier in the kernel.
    After every epoch the validation F1 is maximised over 81 thresholds in
    [0.1, 0.9]. Whenever that F1 improves, the weights are written to
    ``checkpoint_path`` and remembered together with the threshold.

    Args:
        train_loader: iterable of (X_batch, y_batch) training batches.
        val_loader: iterable of (X_batch, y_batch) validation batches.
        y_train: unused; kept so existing calls keep working (FocalLoss takes
            no class weights).
        model_factory: zero-argument callable returning a fresh ``nn.Module``;
            defaults to ``ImprovedCNNModel``.
        checkpoint_path: file the best ``state_dict`` is saved to.
        num_epochs: number of training epochs.

    Returns:
        (model, threshold): the model with the weights of the best epoch loaded
        and the threshold that maximised F1 at that epoch. If no epoch reached
        an F1 above 0, the last-epoch weights and a threshold of 0.5 are
        returned.
    """
    import copy

    if model_factory is None:
        model_factory = ImprovedCNNModel
    model = model_factory().to(device)

    criterion = FocalLoss(alpha=1, gamma=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # mode='max' because the scheduler is stepped with the validation F1
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

    def evaluate(model, dataloader):
        """Return (labels, probabilities) for every sample in ``dataloader``."""
        model.eval()
        y_true, y_prob = [], []
        with torch.no_grad():
            for X_batch, y_batch in dataloader:
                logits = model(X_batch.to(device))
                probs = torch.sigmoid(logits).cpu().numpy()
                # atleast_1d: a batch of one sample may come back as a 0-d array
                y_prob.extend(np.atleast_1d(probs))
                y_true.extend(np.atleast_1d(y_batch.numpy()))
        return np.array(y_true), np.array(y_prob)

    thresholds = np.linspace(0.1, 0.9, 81)
    best_f1, best_epoch = 0, 0
    best_thresh = 0.5
    best_state = None

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_loss = 0  # sum of per-batch mean losses over the epoch
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        y_true, y_prob = evaluate(model, val_loader)
        f1_scores = [f1_score(y_true, y_prob > t) for t in thresholds]
        epoch_thresh = thresholds[np.argmax(f1_scores)]
        best_epoch_f1 = max(f1_scores)
        acc = accuracy_score(y_true, y_prob > 0.5)
        auc = roc_auc_score(y_true, y_prob)

        if best_epoch_f1 > best_f1:
            best_f1 = best_epoch_f1
            best_epoch = epoch
            # Remember the threshold and weights that belong to this epoch so
            # the returned model/threshold match the reported best F1
            best_thresh = epoch_thresh
            best_state = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), checkpoint_path)

        scheduler.step(best_epoch_f1)

        print(f"Epoch {epoch:02d} | Loss: {total_loss:.4f} | Acc: {acc:.4f} | F1@best_thresh: {best_epoch_f1:.4f} | AUC: {auc:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"\n✅ Best F1-score: {best_f1:.4f} achieved at epoch {best_epoch}")
    return model, best_thresh


trained_model, threshold = train_model(train_loader, val_loader, y_train)


Epoch 01 | Loss: 37.0272 | Acc: 0.9372 | F1@best_thresh: 0.3976 | AUC: 0.8915
Epoch 02 | Loss: 30.9534 | Acc: 0.9390 | F1@best_thresh: 0.4252 | AUC: 0.9010
Epoch 03 | Loss: 29.3927 | Acc: 0.9416 | F1@best_thresh: 0.4272 | AUC: 0.9017
Epoch 04 | Loss: 28.4255 | Acc: 0.9408 | F1@best_thresh: 0.4428 | AUC: 0.9071
Epoch 05 | Loss: 27.6088 | Acc: 0.9420 | F1@best_thresh: 0.4391 | AUC: 0.9044
Epoch 06 | Loss: 26.9163 | Acc: 0.9405 | F1@best_thresh: 0.4493 | AUC: 0.9053
Epoch 07 | Loss: 26.3853 | Acc: 0.9405 | F1@best_thresh: 0.4458 | AUC: 0.9059
Epoch 08 | Loss: 26.1959 | Acc: 0.9415 | F1@best_thresh: 0.4643 | AUC: 0.9093
Epoch 09 | Loss: 25.4831 | Acc: 0.9425 | F1@best_thresh: 0.4441 | AUC: 0.9029
Epoch 10 | Loss: 25.1691 | Acc: 0.9416 | F1@best_thresh: 0.4613 | AUC: 0.9087
Epoch 11 | Loss: 24.7205 | Acc: 0.9412 | F1@best_thresh: 0.4588 | AUC: 0.9088
Epoch 12 | Loss: 24.2569 | Acc: 0.9407 | F1@best_thresh: 0.4571 | AUC: 0.9009
Epoch 13 | Loss: 23.1989 | Acc: 0.9402 | F1@best_thresh: 0.4535 

# ***fusion***

In [77]:
# Rebuild both architectures and load the best checkpoints written during training
lstm_model = ImprovedLSTM().to(device)
cnn_model = ImprovedCNNModel().to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU can still be loaded on a CPU-only machine
lstm_model.load_state_dict(torch.load("best_model_lstm.pth", map_location=device))
cnn_model.load_state_dict(torch.load("best_model_cnn.pth", map_location=device))

# Inference mode: disables dropout and uses BatchNorm running statistics
lstm_model.eval()
cnn_model.eval()


ImprovedCNNModel(
  (conv1): Conv1d(77, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (pool): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [78]:
def get_model_probs(model, dataloader):
    """Return the sigmoid probabilities of ``model`` for every sample in ``dataloader``.

    The model is evaluated in eval mode without gradients; its previous
    train/eval mode is restored afterwards. Inputs are moved to the device the
    model's parameters live on (CPU if the model has no parameters).

    Args:
        model: module mapping an input batch to logits.
        dataloader: iterable of (X_batch, y_batch) pairs; labels are ignored.

    Returns:
        np.ndarray with one entry per sample, in loader order.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    probs = []
    try:
        with torch.no_grad():
            for X_batch, _ in dataloader:
                logits = model(X_batch.to(target_device))
                prob = torch.sigmoid(logits).cpu().numpy()
                # atleast_1d: a batch of one sample may come back as a 0-d
                # array, which list.extend cannot iterate over
                probs.extend(np.atleast_1d(prob))
    finally:
        model.train(was_training)
    return np.array(probs)

# Ground-truth labels in loader order (val_loader is created without shuffling,
# so they line up with the probabilities collected below)
y_true = np.array([label for _, y_batch in val_loader for label in y_batch.numpy()])

# Per-model validation probabilities
probs_lstm = get_model_probs(lstm_model, val_loader)
probs_cnn = get_model_probs(cnn_model, val_loader)


In [91]:
# Weighted late fusion of the two models' probabilities (weights sum to 1)
lstm_weight, cnn_weight = 0.4, 0.6
probs_fused = lstm_weight * probs_lstm + cnn_weight * probs_cnn


In [92]:
# Sweep decision thresholds and keep the one with the highest validation F1
thresholds = np.linspace(0.1, 0.9, 81)
f1s = [f1_score(y_true, probs_fused > t) for t in thresholds]
best_idx = int(np.argmax(f1s))
best_thresh = thresholds[best_idx]
final_f1 = f1s[best_idx]

# Accuracy is reported at the fixed 0.5 threshold; AUC is threshold-free
final_auc = roc_auc_score(y_true, probs_fused)
final_acc = accuracy_score(y_true, probs_fused > 0.5)

print(f"\n📊 Fusion Results:")
print(f"AUC: {final_auc:.4f} | Acc: {final_acc:.4f} | Best F1: {final_f1:.4f} @ Threshold: {best_thresh:.2f}")



📊 Fusion Results:
AUC: 0.9138 | Acc: 0.9427 | Best F1: 0.4638 @ Threshold: 0.38


In [None]:
# Fusion result on the validation split: AUC: 0.9138 | Acc: 0.9427 | Best F1: 0.4638 @ Threshold: 0.38

# ***testing***

In [94]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler

# ───────────────────────────────────────────────────────────────
# Load test data
# ───────────────────────────────────────────────────────────────
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on archives from a trusted source.
data = np.load("evaluation_data.npz", allow_pickle=True)
feature_labels = data["feature_labels"]
X_test = np.array(data["data"], dtype=np.float32)  # important: cast to float32



# ───────────────────────────────────────────────────────────────
# Apply preprocessing (same as training pipeline)
# ───────────────────────────────────────────────────────────────
# All statistics come from the TRAINING data (raw `X`, imputed `X_imputed` and
# the fitted `scaler` from the training preprocessing cell). Re-estimating
# them on the test set would feed the models inputs on a different scale than
# the one they were trained on.
assert X_test.shape[2] == X.shape[2], "test and training data must have the same features"

X_test_clean = X_test.copy()
for feat_idx in range(X_test.shape[2]):
    # Step 1: mean imputation with the training mean of this feature
    train_fill = np.nanmean(X[:, :, feat_idx])
    filled = np.nan_to_num(X_test[:, :, feat_idx], nan=train_fill)

    # Step 2: outlier clipping to the training mean ± 5 * std of this feature
    train_imputed = X_imputed[:, :, feat_idx]
    train_mean = np.nanmean(train_imputed)
    train_std = np.nanstd(train_imputed)
    X_test_clean[:, :, feat_idx] = np.clip(
        filled, train_mean - 5 * train_std, train_mean + 5 * train_std
    )

# Step 3: standard scaling with the scaler fitted on the training data
# (transform only — the scaler is not refitted here)
n_test, n_steps, n_feats = X_test_clean.shape
X_test_ready = scaler.transform(X_test_clean.reshape(-1, n_feats)).reshape(
    n_test, n_steps, n_feats
)

# ───────────────────────────────────────────────────────────────
# Convert to PyTorch dataloader (no labels here)
# ───────────────────────────────────────────────────────────────
from torch.utils.data import DataLoader, TensorDataset

X_tensor = torch.tensor(X_test_ready, dtype=torch.float32)
# The prediction helper unpacks (X_batch, label) pairs, so all-zero placeholder
# labels are attached; their values are never read
dummy_labels = torch.zeros(len(X_tensor))
test_dataset = TensorDataset(X_tensor, dummy_labels)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# ───────────────────────────────────────────────────────────────
# Load both trained models
# ───────────────────────────────────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lstm_model = ImprovedLSTM().to(device)
cnn_model = ImprovedCNNModel().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU can still be loaded on a CPU-only machine
lstm_model.load_state_dict(torch.load("best_model_lstm.pth", map_location=device))
cnn_model.load_state_dict(torch.load("best_model_cnn.pth", map_location=device))
# Inference mode: disables dropout and uses BatchNorm running statistics
lstm_model.eval()
cnn_model.eval()

# ───────────────────────────────────────────────────────────────
# Predict probabilities and apply weighted fusion
# ───────────────────────────────────────────────────────────────
# get_model_probs is already defined in the validation fusion section; it is
# reused here instead of being defined a second time.
probs_lstm = get_model_probs(lstm_model, test_loader)
probs_cnn  = get_model_probs(cnn_model, test_loader)
# Same fusion weights as on the validation set
probs_fused = 0.4 * probs_lstm + 0.6 * probs_cnn

# ───────────────────────────────────────────────────────────────
# Convert probs to class labels using the validated threshold
# ───────────────────────────────────────────────────────────────
# `best_thresh` is the F1-optimal threshold found for the fused probabilities
# in the validation fusion section. It is reused as-is instead of being
# overridden by a hard-coded value that can drift from the validated one.
labels = (probs_fused > best_thresh).astype(int).flatten()

# ───────────────────────────────────────────────────────────────
# Generate final submission file
# ───────────────────────────────────────────────────────────────
# One row per test sample: a running 0-based Id and the predicted binary label
n_rows = len(labels)
submission = pd.DataFrame({"Id": np.arange(n_rows), "Label": labels})

submission.to_csv("final_submission.csv", index=False)
print("✅ Submission file saved as final_submission.csv with binary labels.")


✅ Submission file saved as final_submission.csv with binary labels.
