In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader
from scipy.interpolate import interp1d

In [3]:
# ─── Utilities ────────────────────────────────────────────────────────────────
lam, p, niter = 1e4, 0.01, 10
def baseline_als(y):
    L = len(y)
    D = np.diff(np.eye(L), 2)
    D = lam * D.dot(D.T)
    w = np.ones(L)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + D, w * y)
        w = p * (y > b) + (1 - p) * (y < b)
    return b

def preprocess(arr):
    out = np.zeros_like(arr)
    for i, s in enumerate(arr):
        b = baseline_als(s)
        c = s - b
        norm = np.linalg.norm(c)
        out[i] = c / norm if norm > 0 else c
    return out

def floatify_cols(df):
    new = []
    for c in df.columns:
        if c in ('Label', 'Label 1', 'Label 2'):
            new.append(c)
        else:
            new.append(float(c))
    df.columns = new

# --- Augmentation Functions ---
def add_baseline_drift(spec, degree=3, scale=0.1):
    """Add a random polynomial baseline of given degree."""
    n = len(spec)
    x = np.linspace(0, 1, n)
    coeffs = np.random.randn(degree + 1) * scale
    baseline = np.polyval(coeffs, x)
    return spec + baseline

def jitter_wavenumber(spec, max_shift=2):
    """Apply a small horizontal shift to simulate calibration jitter."""
    n = len(spec)
    x = np.arange(n)
    shift = np.random.uniform(-max_shift, max_shift)
    f = interp1d(x, spec, kind='cubic', fill_value="extrapolate")
    return f(x + shift)

In [4]:
# ─── 1) Load reference_v2 and preprocess ──────────────────────────────────────
ref_df = pd.read_csv('reference_v2.csv')

floatify_cols(ref_df)
wav_cols = [c for c in ref_df.columns if c != 'Label']
ref_specs  = ref_df[wav_cols].values       # (n_ref_samples, n_waves)
ref_labels = ref_df['Label'].values        # (n_ref_samples,)
ref_proc   = preprocess(ref_specs)         # (n_ref_samples, n_waves)

In [22]:
# --- Refactored Synthetic Mixture Generation ---
classes       = sorted(np.unique(ref_labels))
class_to_i    = {c: i for i, c in enumerate(classes)}
ratios        = np.arange(0.05, 1.0, 0.05)
noise_level   = 0.01
base_n        = 10  # base number of spectra per pair/ratio
oversample_set = {
    'meoh','6-mercapto-1-hexanol'
}

synth_specs  = []
synth_labels = []

for (ci, cj) in combinations(classes, 2):
    idx_i = np.where(ref_labels == ci)[0]
    idx_j = np.where(ref_labels == cj)[0]
    # Determine how many per ratio
    n_per_ratio = base_n * 2 if (ci in oversample_set or cj in oversample_set) else base_n
    
    for r in ratios:
        for _ in range(n_per_ratio):
            # pick random pure spectra
            spec_i = ref_specs[np.random.choice(idx_i)]
            spec_j = ref_specs[np.random.choice(idx_j)]
            # linear mix
            mix = r * spec_i + (1 - r) * spec_j
            # augmentations
            mix = add_baseline_drift(mix, degree=3, scale=0.1)
            mix = jitter_wavenumber(mix, max_shift=2)
            mix += np.random.normal(scale=noise_level, size=mix.shape)
            # store
            synth_specs.append(mix)
            synth_labels.append((ci, cj))

synth_specs = np.array(synth_specs)
print("Synthetic raw spectra after oversampling & augmentation:", synth_specs.shape)

Synthetic raw spectra after oversampling & augmentation: (16530, 1024)


In [23]:
from joblib import Parallel, delayed

def preprocess_single(spectrum):
    """
    Baseline‐correct + L2‐normalize + abs, for one 1D array.
    """
    b = baseline_als(spectrum)
    c = spectrum - b
    norm = np.linalg.norm(c)
    out = c / norm if norm > 0 else c
    return np.abs(out)

# 2) Parallel map over all spectra
#    n_jobs=-1 uses all CPUs; you can set e.g. n_jobs=4 to use 4 cores.
synth_proc = np.vstack(
    Parallel(n_jobs=-1, verbose=10)(
        delayed(preprocess_single)(spec) 
        for spec in synth_specs
    )
)

print("Parallel preprocess done:", synth_proc.shape)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.7s finished
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 177 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 227 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 254 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 281 tasks      | e

Parallel preprocess done: (16530, 1024)


[Parallel(n_jobs=-1)]: Done 16530 out of 16530 | elapsed:  6.4min finished


In [24]:
# ─── 4) Embed synthetic mixtures via Siamese ──────────────────────────────────
# load siamese
class SiameseNet(nn.Module):
    def __init__(self, input_len, embed_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1,16,7,padding=3), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(16,32,5,padding=2), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Flatten(),
            nn.Linear((input_len//4)*32, embed_dim), nn.ReLU()
        )
    def forward(self,x):
        z = self.encoder(x)
        return F.normalize(z, dim=1)

siamese = SiameseNet(input_len=ref_proc.shape[1], embed_dim=64)
siamese.load_state_dict(torch.load('siamese_raman_resampled.pth', map_location='cpu'))
siamese.eval()

with torch.no_grad():
    tensor = torch.tensor(synth_proc, dtype=torch.float32).unsqueeze(1)
    syn_embeds = siamese(tensor).cpu().numpy()  # (n_synth, 64)
print("Synthetic embeddings:", syn_embeds.shape)

Synthetic embeddings: (16530, 64)


In [25]:
# ─── 5) Build X_synth, Y_synth ─────────────────────────────────────────────────
C = len(classes)
N = len(syn_embeds)
X_synth = syn_embeds
Y_synth = np.zeros((N, C), dtype=int)
for k, (ci, cj) in enumerate(synth_labels):
    Y_synth[k, class_to_i[ci]] = 1
    Y_synth[k, class_to_i[cj]] = 1

In [26]:
# ─── 6) Split synthetic into train/val/test (80/10/10) ─────────────────────────
X_tmp, X_test_s, Y_tmp, Y_test_s = train_test_split(X_synth, Y_synth, test_size=0.10, random_state=0)
X_train_s, X_val_s, Y_train_s, Y_val_s = train_test_split(X_tmp, Y_tmp, test_size=0.1111, random_state=0)
print("Synthetic train/val/test:", len(X_train_s), len(X_val_s), len(X_test_s))

Synthetic train/val/test: 13224 1653 1653


In [27]:
# ─── 7) DataLoaders for synthetic ─────────────────────────────────────────────
batch_size = 64
train_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_train_s, dtype=torch.float32),
        torch.tensor(Y_train_s, dtype=torch.float32)   # ← make this float
    ),
    batch_size=batch_size,
    shuffle=True
)
val_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_val_s, dtype=torch.float32),
        torch.tensor(Y_val_s, dtype=torch.float32)     # ← and this
    ),
    batch_size=batch_size
)
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_test_s, dtype=torch.float32),
        torch.tensor(Y_test_s, dtype=torch.float32)    # ← and this
    ),
    batch_size=batch_size
)


In [28]:
# ─── 8) Define & train MLP on synthetic ────────────────────────────────────────
class PresenceNet(nn.Module):
    def __init__(self, D, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(D,128), nn.ReLU(),
            nn.Linear(128,64), nn.ReLU(),
            nn.Linear(64,C),   nn.Sigmoid()
        )
    def forward(self,x):
        return self.net(x)

D = syn_embeds.shape[1]
model_p = PresenceNet(D, C)
criterion = nn.BCELoss()
opt = optim.Adam(model_p.parameters(), lr=1e-3)
epochs = 200

for e in range(1, epochs+1):
    model_p.train()
    loss_tr = 0
    for xb, yb in train_loader:
        pred = model_p(xb)
        loss = criterion(pred, yb)
        opt.zero_grad(); loss.backward(); opt.step()
        loss_tr += loss.item()*xb.size(0)
    loss_tr /= len(train_loader.dataset)
    model_p.eval()
    loss_val = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            loss_val += criterion(model_p(xb), yb).item()*xb.size(0)
    loss_val /= len(val_loader.dataset)
    print(f"Epoch {e}/{epochs} — Train Loss: {loss_tr:.4f} | Val Loss: {loss_val:.4f}")

Epoch 1/200 — Train Loss: 0.4409 | Val Loss: 0.3367
Epoch 2/200 — Train Loss: 0.3035 | Val Loss: 0.2857
Epoch 3/200 — Train Loss: 0.2735 | Val Loss: 0.2687
Epoch 4/200 — Train Loss: 0.2611 | Val Loss: 0.2590
Epoch 5/200 — Train Loss: 0.2527 | Val Loss: 0.2520
Epoch 6/200 — Train Loss: 0.2466 | Val Loss: 0.2464
Epoch 7/200 — Train Loss: 0.2417 | Val Loss: 0.2422
Epoch 8/200 — Train Loss: 0.2381 | Val Loss: 0.2401
Epoch 9/200 — Train Loss: 0.2352 | Val Loss: 0.2349
Epoch 10/200 — Train Loss: 0.2322 | Val Loss: 0.2321
Epoch 11/200 — Train Loss: 0.2297 | Val Loss: 0.2310
Epoch 12/200 — Train Loss: 0.2274 | Val Loss: 0.2277
Epoch 13/200 — Train Loss: 0.2252 | Val Loss: 0.2253
Epoch 14/200 — Train Loss: 0.2234 | Val Loss: 0.2228
Epoch 15/200 — Train Loss: 0.2214 | Val Loss: 0.2207
Epoch 16/200 — Train Loss: 0.2197 | Val Loss: 0.2199
Epoch 17/200 — Train Loss: 0.2182 | Val Loss: 0.2174
Epoch 18/200 — Train Loss: 0.2166 | Val Loss: 0.2162
Epoch 19/200 — Train Loss: 0.2149 | Val Loss: 0.2148
Ep

In [29]:
# ─── 9) Evaluate synthetic test set ────────────────────────────────────────────
model_p.eval()
yp, yt = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        yp.append(model_p(xb).numpy())
        yt.append(yb.numpy())
y_pred = (np.vstack(yp)>0.5).astype(int)
y_true = np.vstack(yt)
print("\nSynthetic Test Classification Report:")
print(classification_report(y_true, y_pred, target_names=classes))


Synthetic Test Classification Report:
                              precision    recall  f1-score   support

           1,9-nonanedithiol       0.99      0.71      0.83       266
             1-dodecanethiol       0.63      0.56      0.59       226
             1-undecanethiol       0.59      0.30      0.40       224
        6-mercapto-1-hexanol       0.85      0.63      0.72       441
                     benzene       1.00      0.95      0.97       238
                benzenethiol       0.96      0.82      0.88       269
                        dmmp       1.00      0.87      0.93       257
                        etoh       0.98      0.71      0.82       246
                        meoh       0.89      0.85      0.87       414
       n,n-dimethylformamide       0.96      0.77      0.86       244
                    pyridine       0.94      0.90      0.91       258
tris(2-ethylhexyl) phosphate       0.97      0.72      0.83       223

                   micro avg       0.90      0.74

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# ─── 10) Final validation on real mixtures ─────────────────────────────────────
mix_df = pd.read_csv('mixtures_dataset.csv')
floatify_cols(mix_df)
mix_specs = mix_df[wav_cols].values
mix_proc  = preprocess(mix_specs)
pairs     = list(zip(mix_df['Label 1'], mix_df['Label 2']))

In [30]:
with torch.no_grad():
    mix_embeds = siamese(torch.tensor(mix_proc, dtype=torch.float32).unsqueeze(1)).cpu().numpy()

# build real multi-hot
N_real = len(mix_df)
Y_real = np.zeros((N_real, C), dtype=int)
for i, (l1, l2) in enumerate(pairs):
    Y_real[i, class_to_i[l1]] = 1
    Y_real[i, class_to_i[l2]] = 1

# predict real
model_p.eval()
preds = model_p(torch.tensor(mix_embeds, dtype=torch.float32)).detach().numpy()
Y_pred_real = (preds>0.5).astype(int)

from sklearn.metrics import classification_report

# 1) Compute support for each class
supports = Y_real.sum(axis=0)   # length C array of counts

# 2) Select only the classes with support > 0
valid_idx = [i for i, s in enumerate(supports) if s > 0]
valid_labels = [classes[i] for i in valid_idx]

# 3) Filter y_true and y_pred to these columns
y_true_filt = Y_real[:, valid_idx]
y_pred_filt = Y_pred_real[:, valid_idx]

# 4) Print report on the filtered set
print("\nReal Mixtures Validation Report (labels with support > 0):")
print(classification_report(
    y_true_filt,
    y_pred_filt,
    target_names=valid_labels,
    zero_division=0
))



Real Mixtures Validation Report (labels with support > 0):
                       precision    recall  f1-score   support

      1-dodecanethiol       1.00      0.79      0.88       243
 6-mercapto-1-hexanol       1.00      0.33      0.50       108
              benzene       1.00      1.00      1.00       193
         benzenethiol       0.96      1.00      0.98        72
                 etoh       0.99      0.70      0.82       121
                 meoh       1.00      0.79      0.89       243
n,n-dimethylformamide       1.00      1.00      1.00        72
             pyridine       1.00      1.00      1.00       108

            micro avg       1.00      0.82      0.90      1160
            macro avg       0.99      0.83      0.88      1160
         weighted avg       1.00      0.82      0.88      1160
          samples avg       1.00      0.82      0.88      1160



In [32]:
# 1) Define the PresenceNet WITHOUT final Sigmoid
class PresenceNetLogits(nn.Module):
    def __init__(self, D, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(D, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, C)   # raw logits
        )
    def forward(self, x):
        return self.net(x)

In [33]:
# 2) Instantiate model
D = X_train_s.shape[1]  # embedding dimension
C = len(classes)
model_boost = PresenceNetLogits(D, C)



In [34]:
# 3) Build pos_weight to up-weight specific classes

pos = Y_train_s.sum(axis=0)
neg = len(Y_train_s) - pos
pos_weight = torch.tensor((neg/pos).clip(min=1.0), dtype=torch.float32)


print("pos_weight:", pos_weight)

pos_weight: tensor([5.7298, 5.6087, 5.5336, 2.9891, 5.7401, 5.7711, 5.7815, 5.7366, 2.9381,
        5.6386, 5.7366, 5.6054])


In [35]:
# 4) Use BCEWithLogitsLoss with pos_weight
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model_boost.parameters(), lr=1e-3)

# 5) Training loop skeleton
num_epochs = 200
for epoch in range(1, num_epochs+1):
    model_boost.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        logits = model_boost(xb)
        loss = criterion(logits, yb)  # yb must be FloatTensor
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation...
    model_boost.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            logits = model_boost(xb)
            val_loss += criterion(logits, yb).item() * xb.size(0)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch}/{num_epochs} — "
          f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

Epoch 1/200 — Train Loss: 0.9252 | Val Loss: 0.7790
Epoch 2/200 — Train Loss: 0.7477 | Val Loss: 0.7370
Epoch 3/200 — Train Loss: 0.7107 | Val Loss: 0.6995
Epoch 4/200 — Train Loss: 0.6785 | Val Loss: 0.6736
Epoch 5/200 — Train Loss: 0.6561 | Val Loss: 0.6537
Epoch 6/200 — Train Loss: 0.6393 | Val Loss: 0.6372
Epoch 7/200 — Train Loss: 0.6262 | Val Loss: 0.6267
Epoch 8/200 — Train Loss: 0.6154 | Val Loss: 0.6141
Epoch 9/200 — Train Loss: 0.6065 | Val Loss: 0.6074
Epoch 10/200 — Train Loss: 0.5980 | Val Loss: 0.5991
Epoch 11/200 — Train Loss: 0.5903 | Val Loss: 0.5905
Epoch 12/200 — Train Loss: 0.5849 | Val Loss: 0.5832
Epoch 13/200 — Train Loss: 0.5789 | Val Loss: 0.5799
Epoch 14/200 — Train Loss: 0.5726 | Val Loss: 0.5716
Epoch 15/200 — Train Loss: 0.5677 | Val Loss: 0.5653
Epoch 16/200 — Train Loss: 0.5625 | Val Loss: 0.5654
Epoch 17/200 — Train Loss: 0.5579 | Val Loss: 0.5572
Epoch 18/200 — Train Loss: 0.5533 | Val Loss: 0.5518
Epoch 19/200 — Train Loss: 0.5487 | Val Loss: 0.5506
Ep

In [36]:
with torch.no_grad():
    mix_embeds = siamese(torch.tensor(mix_proc, dtype=torch.float32).unsqueeze(1)).cpu().numpy()

# build real multi-hot
N_real = len(mix_df)
Y_real = np.zeros((N_real, C), dtype=int)
for i, (l1, l2) in enumerate(pairs):
    Y_real[i, class_to_i[l1]] = 1
    Y_real[i, class_to_i[l2]] = 1

# predict real
model_boost.eval()
preds = model_boost(torch.tensor(mix_embeds, dtype=torch.float32)).detach().numpy()
Y_pred_real = (preds>0.5).astype(int)

from sklearn.metrics import classification_report

# 1) Compute support for each class
supports = Y_real.sum(axis=0)   # length C array of counts

# 2) Select only the classes with support > 0
valid_idx = [i for i, s in enumerate(supports) if s > 0]
valid_labels = [classes[i] for i in valid_idx]

# 3) Filter y_true and y_pred to these columns
y_true_filt = Y_real[:, valid_idx]
y_pred_filt = Y_pred_real[:, valid_idx]

# 4) Print report on the filtered set
print("\nReal Mixtures Validation Report (labels with support > 0):")
print(classification_report(
    y_true_filt,
    y_pred_filt,
    target_names=valid_labels,
    zero_division=0
))


Real Mixtures Validation Report (labels with support > 0):
                       precision    recall  f1-score   support

      1-dodecanethiol       1.00      0.79      0.89       243
 6-mercapto-1-hexanol       1.00      0.33      0.50       108
              benzene       1.00      1.00      1.00       193
         benzenethiol       0.94      1.00      0.97        72
                 etoh       0.98      0.70      0.82       121
                 meoh       1.00      0.80      0.89       243
n,n-dimethylformamide       0.67      1.00      0.80        72
             pyridine       1.00      1.00      1.00       108

            micro avg       0.96      0.82      0.88      1160
            macro avg       0.95      0.83      0.86      1160
         weighted avg       0.97      0.82      0.87      1160
          samples avg       0.97      0.82      0.87      1160

