In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imageio.v2 as imageio

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.manifold import TSNE

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# conf
RANDOM_STATE = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.2
BATCH_SIZE = 64
EPOCHS = 120
LR = 1e-3
WEIGHT_DECAY = 1e-4
FRAME_EVERY = 2  # make a frame every N epochs

OUT_DIR = "outputs_bc_cnn_tsne_gif"
os.makedirs(OUT_DIR, exist_ok=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Data
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")  # 1=benign, 0=malignant

# Split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=VAL_SIZE, stratify=y_temp, random_state=RANDOM_STATE)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

# Embedding
X_all_s = scaler.transform(X)
tsne = TSNE(n_components=2, random_state=RANDOM_STATE, init="pca", learning_rate="auto", perplexity=30)
X_all_2d = tsne.fit_transform(X_all_s)

# ? 
idx_train = X_train.index.to_numpy()
idx_val   = X_val.index.to_numpy()
idx_test  = X_test.index.to_numpy()


In [4]:
# Torch tensors
def to_tensor(x):
    return torch.tensor(x, dtype=torch.float32).unsqueeze(1)

X_train_t = to_tensor(X_train_s)
X_val_t   = to_tensor(X_val_s)
X_test_t  = to_tensor(X_test_s)

y_train_t = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_val_t   = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)
y_test_t  = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(TensorDataset(X_test_t, y_test_t), batch_size=BATCH_SIZE, shuffle=False)

# ---------------------------
# Model
class CNN1D(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32, 1),
        )
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

model = CNN1D().to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

@torch.no_grad()
def predict_proba_tensor(X_t):
    model.eval()
    probs = []
    for i in range(0, X_t.size(0), BATCH_SIZE):
        xb = X_t[i:i+BATCH_SIZE].to(DEVICE)
        logits = model(xb)
        p = torch.sigmoid(logits).cpu().numpy().ravel()
        probs.append(p)
    return np.concatenate(probs)

@torch.no_grad()
def predict_proba(loader):
    model.eval()
    probs, labels = [], []
    for xb, yb in loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        p = torch.sigmoid(logits).cpu().numpy().ravel()
        probs.append(p)
        labels.append(yb.numpy().ravel())
    return np.concatenate(probs), np.concatenate(labels)

def train_one_epoch():
    model.train()
    total = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total += loss.item() * xb.size(0)
    return total / len(train_loader.dataset)


# Training + frames
frames = []
train_losses, val_aucs = [], []

X_all_t = to_tensor(X_all_s)

for epoch in range(1, EPOCHS+1):
    loss = train_one_epoch()
    train_losses.append(loss)

    val_probs, val_labels = predict_proba(val_loader)
    try:
        val_auc = roc_auc_score(val_labels, val_probs)
    except ValueError:
        val_auc = float("nan")
    val_aucs.append(val_auc)

    if epoch % FRAME_EVERY == 0 or epoch == 1 or epoch == EPOCHS:
      
        all_probs = predict_proba_tensor(X_all_t)

        # Validation cm
        val_pred = (val_probs >= 0.5).astype(int)
        cm = confusion_matrix(val_labels, val_pred)
      
        fig = plt.figure(figsize=(10, 4.8))

        # Left: t-SNE colored by prob
        ax1 = fig.add_subplot(1, 2, 1)
        sc = ax1.scatter(X_all_2d[:, 0], X_all_2d[:, 1], c=all_probs, s=20, alpha=0.85)
        cb = fig.colorbar(sc, ax=ax1)
        cb.set_label("Predicted P(benign)")
        ax1.set_title(f"t-SNE colored by P(benign) — Epoch {epoch}")
        ax1.set_xlabel("t-SNE 1")
        ax1.set_ylabel("t-SNE 2")

        benign_mask = (y.values == 1)
        malignant_mask = (y.values == 0)
        ax1.scatter(X_all_2d[benign_mask, 0], X_all_2d[benign_mask, 1], s=8, alpha=0.4, marker="o")
        ax1.scatter(X_all_2d[malignant_mask, 0], X_all_2d[malignant_mask, 1], s=8, alpha=0.4, marker="x")

        # Right: CM
        ax2 = fig.add_subplot(1, 2, 2)
        im = ax2.imshow(cm)
        ax2.set_title(f"Validation Confusion Matrix (thr=0.5)\nEpoch {epoch} | AUC={val_auc:.3f}")
        ax2.set_xlabel("Predicted")
        ax2.set_ylabel("True")
        for (i, j), v in np.ndenumerate(cm):
            ax2.text(j, i, int(v), ha="center", va="center")
        fig.tight_layout()

        frame_path = os.path.join(OUT_DIR, f"_frame_{epoch:03d}.png")
        fig.savefig(frame_path, dpi=140)
        plt.close(fig)

        frames.append(imageio.imread(frame_path))

In [5]:
# Save GIF
gif_path = os.path.join(OUT_DIR, "tsne_training.gif")
imageio.mimsave(gif_path, frames, duration=0.09)

# Final test metrics

test_probs, test_labels = predict_proba(test_loader)
test_pred = (test_probs >= 0.5).astype(int)
test_acc = accuracy_score(test_labels, test_pred)
try:
    test_auc = roc_auc_score(test_labels, test_probs)
except ValueError:
    test_auc = float("nan")
cm_test = confusion_matrix(test_labels, test_pred)
report = classification_report(test_labels, test_pred, target_names=["malignant (0)", "benign (1)"])

print("Done. Files saved to:", os.path.abspath(OUT_DIR))
print("GIF:", gif_path)


Done. Files saved to: /Users/amusuokamoto/Documents/Vscode_rando_/Random Projects/outputs_bc_cnn_tsne_gif
GIF: outputs_bc_cnn_tsne_gif/tsne_training.gif
