In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# ─── Utilities ────────────────────────────────────────────────────────────────
lam, p, niter = 1e4, 0.01, 10
def baseline_als(y):
    L = len(y)
    D = np.diff(np.eye(L), 2)
    D = lam * D.dot(D.T)
    w = np.ones(L)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + D, w * y)
        w = p * (y > b) + (1 - p) * (y < b)
    return b

def preprocess(arr):
    out = np.zeros_like(arr)
    for i, s in enumerate(arr):
        b = baseline_als(s)
        c = s - b
        norm = np.linalg.norm(c)
        out[i] = c / norm if norm > 0 else c
    return out

def floatify_cols(df):
    new = []
    for c in df.columns:
        if c in ('Label', 'Label 1', 'Label 2'):
            new.append(c)
        else:
            new.append(float(c))
    df.columns = new

In [3]:
# ─── 1) Load reference_v2 and preprocess ──────────────────────────────────────
ref_df = pd.read_csv('reference_v2.csv')

floatify_cols(ref_df)
wav_cols = [c for c in ref_df.columns if c != 'Label']
ref_specs  = ref_df[wav_cols].values       # (n_ref_samples, n_waves)
ref_labels = ref_df['Label'].values        # (n_ref_samples,)

In [4]:
# Unique chemical classes
classes    = sorted(np.unique(ref_labels))
C = len(classes)
class_to_i = {c:i for i,c in enumerate(classes)}

# ─── 2) Generate synthetic mixtures ────────────────────────────────────────────
ratios = np.arange(0.05, 1.0, 0.05)
noise_level = 0.01
n_per_ratio = 10  # number of random spectra per pair/ratio

synth_specs = []
synth_labels = []
for (i, ci), (j, cj) in combinations(enumerate(classes), 2):
    # indices of pure spectra for each class
    idx_i = np.where(ref_labels == ci)[0]
    idx_j = np.where(ref_labels == cj)[0]
    for r in ratios:
        for _ in range(n_per_ratio):
            spec_i = ref_specs[np.random.choice(idx_i)]
            spec_j = ref_specs[np.random.choice(idx_j)]
            mix = r * spec_i + (1-r) * spec_j
            mix += np.random.normal(scale=noise_level, size=mix.shape)
            synth_specs.append(mix)
            synth_labels.append((ci, cj))
synth_specs = np.array(synth_specs)        # (n_synth, n_waves)
print("Synthetic raw spectra:", synth_specs.shape)

Synthetic raw spectra: (12540, 1024)


In [5]:
from joblib import Parallel, delayed

def preprocess_single(spectrum):
    """
    Baseline‐correct + L2‐normalize + abs, for one 1D array.
    """
    b = baseline_als(spectrum)
    c = spectrum - b
    norm = np.linalg.norm(c)
    out = c / norm if norm > 0 else c
    return np.abs(out)

# 2) Parallel map over all spectra
#    n_jobs=-1 uses all CPUs; you can set e.g. n_jobs=4 to use 4 cores.
synth_proc = np.vstack(
    Parallel(n_jobs=-1, verbose=10)(
        delayed(preprocess_single)(spec) 
        for spec in synth_specs
    )
)

print("Parallel preprocess done:", synth_proc.shape)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 201 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 226 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 253 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 309 tasks      | elapsed:  

Parallel preprocess done: (12540, 1024)


[Parallel(n_jobs=-1)]: Done 12540 out of 12540 | elapsed:  4.9min finished


In [6]:
# ─── 4) Embed synthetic mixtures via Siamese ──────────────────────────────────
# load siamese
class SiameseNet(nn.Module):
    def __init__(self, input_len, embed_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv1d(1,16,7,padding=3), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(16,32,5,padding=2), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Flatten(),
            nn.Linear((input_len//4)*32, embed_dim), nn.ReLU()
        )
    def forward(self,x):
        z = self.encoder(x)
        return F.normalize(z, dim=1)

siamese = SiameseNet(input_len=ref_specs.shape[1], embed_dim=64)
siamese.load_state_dict(torch.load('siamese_mixture.pth', map_location='cpu'))
siamese.eval()

with torch.no_grad():
    tensor = torch.tensor(synth_proc, dtype=torch.float32).unsqueeze(1)
    syn_embeds = siamese(tensor).cpu().numpy()  # (n_synth, 64)
print("Synthetic embeddings:", syn_embeds.shape)

Synthetic embeddings: (12540, 64)


In [7]:
# ─── 5) Build X_synth, Y_synth ─────────────────────────────────────────────────
N = len(syn_embeds)
X_synth = syn_embeds
Y_synth = np.zeros((N, C), dtype=int)
for k, (ci, cj) in enumerate(synth_labels):
    Y_synth[k, class_to_i[ci]] = 1
    Y_synth[k, class_to_i[cj]] = 1

In [8]:
# ─── 6) Split synthetic into train/val/test (80/10/10) ─────────────────────────
X_tmp, X_test_s, Y_tmp, Y_test_s = train_test_split(X_synth, Y_synth, test_size=0.10, random_state=0)
X_train_s, X_val_s, Y_train_s, Y_val_s = train_test_split(X_tmp, Y_tmp, test_size=0.1111, random_state=0)
print("Synthetic train/val/test:", len(X_train_s), len(X_val_s), len(X_test_s))

Synthetic train/val/test: 10032 1254 1254


In [9]:
# ─── 7) DataLoaders for synthetic ─────────────────────────────────────────────
batch_size = 64
train_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_train_s, dtype=torch.float32),
        torch.tensor(Y_train_s, dtype=torch.float32)   # ← make this float
    ),
    batch_size=batch_size,
    shuffle=True
)
val_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_val_s, dtype=torch.float32),
        torch.tensor(Y_val_s, dtype=torch.float32)     # ← and this
    ),
    batch_size=batch_size
)
test_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_test_s, dtype=torch.float32),
        torch.tensor(Y_test_s, dtype=torch.float32)    # ← and this
    ),
    batch_size=batch_size
)


In [10]:
# ─── 8) Define & train MLP on synthetic ────────────────────────────────────────
class PresenceNet(nn.Module):
    def __init__(self, D, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(D,128), nn.ReLU(),
            nn.Linear(128,64), nn.ReLU(),
            nn.Linear(64,C),   nn.Sigmoid()
        )
    def forward(self,x):
        return self.net(x)

D = syn_embeds.shape[1]
model_p = PresenceNet(D, C)
criterion = nn.BCELoss()
opt = optim.Adam(model_p.parameters(), lr=1e-3)
epochs = 200

for e in range(1, epochs+1):
    model_p.train()
    loss_tr = 0
    for xb, yb in train_loader:
        pred = model_p(xb)
        loss = criterion(pred, yb)
        opt.zero_grad(); loss.backward(); opt.step()
        loss_tr += loss.item()*xb.size(0)
    loss_tr /= len(train_loader.dataset)
    model_p.eval()
    loss_val = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            loss_val += criterion(model_p(xb), yb).item()*xb.size(0)
    loss_val /= len(val_loader.dataset)
    print(f"Epoch {e}/{epochs} — Train Loss: {loss_tr:.4f} | Val Loss: {loss_val:.4f}")

Epoch 1/200 — Train Loss: 0.4826 | Val Loss: 0.3819
Epoch 2/200 — Train Loss: 0.3118 | Val Loss: 0.2525
Epoch 3/200 — Train Loss: 0.2278 | Val Loss: 0.2089
Epoch 4/200 — Train Loss: 0.2023 | Val Loss: 0.1948
Epoch 5/200 — Train Loss: 0.1899 | Val Loss: 0.1822
Epoch 6/200 — Train Loss: 0.1781 | Val Loss: 0.1716
Epoch 7/200 — Train Loss: 0.1673 | Val Loss: 0.1605
Epoch 8/200 — Train Loss: 0.1575 | Val Loss: 0.1528
Epoch 9/200 — Train Loss: 0.1498 | Val Loss: 0.1463
Epoch 10/200 — Train Loss: 0.1438 | Val Loss: 0.1409
Epoch 11/200 — Train Loss: 0.1387 | Val Loss: 0.1364
Epoch 12/200 — Train Loss: 0.1344 | Val Loss: 0.1326
Epoch 13/200 — Train Loss: 0.1308 | Val Loss: 0.1299
Epoch 14/200 — Train Loss: 0.1273 | Val Loss: 0.1261
Epoch 15/200 — Train Loss: 0.1245 | Val Loss: 0.1236
Epoch 16/200 — Train Loss: 0.1215 | Val Loss: 0.1200
Epoch 17/200 — Train Loss: 0.1190 | Val Loss: 0.1198
Epoch 18/200 — Train Loss: 0.1167 | Val Loss: 0.1171
Epoch 19/200 — Train Loss: 0.1148 | Val Loss: 0.1144
Ep

In [11]:
# ─── 9) Evaluate synthetic test set ────────────────────────────────────────────
model_p.eval()
yp, yt = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        yp.append(model_p(xb).numpy())
        yt.append(yb.numpy())
y_pred = (np.vstack(yp)>0.5).astype(int)
y_true = np.vstack(yt)
print("\nSynthetic Test Classification Report:")
print(classification_report(y_true, y_pred, target_names=classes))


Synthetic Test Classification Report:
                              precision    recall  f1-score   support

           1,9-nonanedithiol       0.87      0.86      0.86       196
             1-dodecanethiol       0.75      0.74      0.74       225
             1-undecanethiol       0.57      0.42      0.48       208
        6-mercapto-1-hexanol       0.98      0.70      0.82       204
                     benzene       1.00      1.00      1.00       235
                benzenethiol       0.96      0.94      0.95       195
                        dmmp       1.00      1.00      1.00       220
                        etoh       0.97      0.91      0.94       217
                        meoh       1.00      0.92      0.96       184
       n,n-dimethylformamide       1.00      0.97      0.99       212
                    pyridine       1.00      0.99      1.00       223
tris(2-ethylhexyl) phosphate       0.97      0.98      0.97       189

                   micro avg       0.93      0.87

In [12]:
# --- Load mixtures and convert columns ---
mix_df = pd.read_csv('mixtures_dataset.csv')

# Re-use your floatify_cols helper:
def floatify_cols(df):
    new_cols = []
    for c in df.columns:
        if c in ('Label 1', 'Label 2'):
            new_cols.append(c)
        else:
            new_cols.append(float(c))  # convert wavenumber strings → floats
    df.columns = new_cols

floatify_cols(mix_df)

# --- Now select wavenumber columns (they are all numeric) ---
wav_cols = [c for c in mix_df.columns if c not in ('Label 1', 'Label 2')]

# --- Extract spectra as a pure float array ---
mix_specs = mix_df[wav_cols].values.astype(float)  # ensure float64 dtype

# --- Parallel preprocessing now works because mix_specs is numeric ---
mix_proc = np.vstack(
    Parallel(n_jobs=-1, verbose=5)(
        delayed(preprocess_single)(spec) for spec in mix_specs
    )
)
print("Mixtures preprocessed:", mix_proc.shape)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:    8.3s


Mixtures preprocessed: (580, 1024)


[Parallel(n_jobs=-1)]: Done 580 out of 580 | elapsed:   13.4s finished


In [13]:
pairs     = list(zip(mix_df['Label 1'], mix_df['Label 2']))

In [14]:
with torch.no_grad():
    mix_embeds = siamese(torch.tensor(mix_proc, dtype=torch.float32).unsqueeze(1)).cpu().numpy()

# build real multi-hot
N_real = len(mix_df)
Y_real = np.zeros((N_real, C), dtype=int)
for i, (l1, l2) in enumerate(pairs):
    Y_real[i, class_to_i[l1]] = 1
    Y_real[i, class_to_i[l2]] = 1

# predict real
model_p.eval()
preds = model_p(torch.tensor(mix_embeds, dtype=torch.float32)).detach().numpy()
Y_pred_real = (preds>0.5).astype(int)

from sklearn.metrics import classification_report

# 1) Compute support for each class
supports = Y_real.sum(axis=0)   # length C array of counts

# 2) Select only the classes with support > 0
valid_idx = [i for i, s in enumerate(supports) if s > 0]
valid_labels = [classes[i] for i in valid_idx]

# 3) Filter y_true and y_pred to these columns
y_true_filt = Y_real[:, valid_idx]
y_pred_filt = Y_pred_real[:, valid_idx]

# 4) Print report on the filtered set
print("\nReal Mixtures Validation Report (labels with support > 0):")
print(classification_report(
    y_true_filt,
    y_pred_filt,
    target_names=valid_labels,
    zero_division=0
))



Real Mixtures Validation Report (labels with support > 0):
                       precision    recall  f1-score   support

      1-dodecanethiol       1.00      0.44      0.62       243
 6-mercapto-1-hexanol       0.90      0.33      0.49       108
              benzene       1.00      1.00      1.00       193
         benzenethiol       0.33      0.50      0.40        72
                 etoh       1.00      0.70      0.83       121
                 meoh       1.00      0.97      0.99       243
n,n-dimethylformamide       1.00      1.00      1.00        72
             pyridine       1.00      1.00      1.00       108

            micro avg       0.92      0.75      0.83      1160
            macro avg       0.90      0.74      0.79      1160
         weighted avg       0.95      0.75      0.81      1160
          samples avg       0.93      0.75      0.81      1160



In [15]:
# 1) Define the PresenceNet WITHOUT final Sigmoid
class PresenceNetLogits(nn.Module):
    def __init__(self, D, C):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(D, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, C)   # raw logits
        )
    def forward(self, x):
        return self.net(x)

In [16]:
# 2) Instantiate model
D = X_train_s.shape[1]  # embedding dimension
C = len(classes)
model_boost = PresenceNetLogits(D, C)



In [18]:
# 3) Build pos_weight to up-weight specific classes

pos = Y_train_s.sum(axis=0)
neg = len(Y_train_s) - pos
pos_weight = torch.tensor((neg/pos).clip(min=1.0), dtype=torch.float32)


print("pos_weight:", pos_weight)

pos_weight: tensor([4.9714, 5.0726, 5.0216, 4.9928, 5.0800, 4.9326, 5.1096, 5.0000, 4.9361,
        4.9750, 5.0144, 4.9012])


In [19]:
# 4) Use BCEWithLogitsLoss with pos_weight
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model_boost.parameters(), lr=1e-3)

# 5) Training loop skeleton
num_epochs = 200
for epoch in range(1, num_epochs+1):
    model_boost.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        logits = model_boost(xb)
        loss = criterion(logits, yb)  # yb must be FloatTensor
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_loss /= len(train_loader.dataset)

    # Validation...
    model_boost.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            logits = model_boost(xb)
            val_loss += criterion(logits, yb).item() * xb.size(0)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch}/{num_epochs} — "
          f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

Epoch 1/200 — Train Loss: 0.9226 | Val Loss: 0.6375
Epoch 2/200 — Train Loss: 0.5625 | Val Loss: 0.5154
Epoch 3/200 — Train Loss: 0.4966 | Val Loss: 0.4706
Epoch 4/200 — Train Loss: 0.4548 | Val Loss: 0.4323
Epoch 5/200 — Train Loss: 0.4206 | Val Loss: 0.4058
Epoch 6/200 — Train Loss: 0.3944 | Val Loss: 0.3803
Epoch 7/200 — Train Loss: 0.3748 | Val Loss: 0.3639
Epoch 8/200 — Train Loss: 0.3580 | Val Loss: 0.3549
Epoch 9/200 — Train Loss: 0.3453 | Val Loss: 0.3412
Epoch 10/200 — Train Loss: 0.3342 | Val Loss: 0.3331
Epoch 11/200 — Train Loss: 0.3245 | Val Loss: 0.3191
Epoch 12/200 — Train Loss: 0.3135 | Val Loss: 0.3098
Epoch 13/200 — Train Loss: 0.3057 | Val Loss: 0.3043
Epoch 14/200 — Train Loss: 0.2972 | Val Loss: 0.2978
Epoch 15/200 — Train Loss: 0.2897 | Val Loss: 0.2889
Epoch 16/200 — Train Loss: 0.2818 | Val Loss: 0.2816
Epoch 17/200 — Train Loss: 0.2745 | Val Loss: 0.2726
Epoch 18/200 — Train Loss: 0.2694 | Val Loss: 0.2664
Epoch 19/200 — Train Loss: 0.2631 | Val Loss: 0.2628
Ep

In [20]:
with torch.no_grad():
    mix_embeds = siamese(torch.tensor(mix_proc, dtype=torch.float32).unsqueeze(1)).cpu().numpy()

# build real multi-hot
N_real = len(mix_df)
Y_real = np.zeros((N_real, C), dtype=int)
for i, (l1, l2) in enumerate(pairs):
    Y_real[i, class_to_i[l1]] = 1
    Y_real[i, class_to_i[l2]] = 1

# predict real
model_boost.eval()
preds = model_boost(torch.tensor(mix_embeds, dtype=torch.float32)).detach().numpy()
Y_pred_real = (preds>0.5).astype(int)

from sklearn.metrics import classification_report

# 1) Compute support for each class
supports = Y_real.sum(axis=0)   # length C array of counts

# 2) Select only the classes with support > 0
valid_idx = [i for i, s in enumerate(supports) if s > 0]
valid_labels = [classes[i] for i in valid_idx]

# 3) Filter y_true and y_pred to these columns
y_true_filt = Y_real[:, valid_idx]
y_pred_filt = Y_pred_real[:, valid_idx]

# 4) Print report on the filtered set
print("\nReal Mixtures Validation Report (labels with support > 0):")
print(classification_report(
    y_true_filt,
    y_pred_filt,
    target_names=valid_labels,
    zero_division=0
))


Real Mixtures Validation Report (labels with support > 0):
                       precision    recall  f1-score   support

      1-dodecanethiol       0.84      0.77      0.80       243
 6-mercapto-1-hexanol       0.88      0.33      0.48       108
              benzene       1.00      1.00      1.00       193
         benzenethiol       0.67      1.00      0.80        72
                 etoh       1.00      1.00      1.00       121
                 meoh       1.00      0.96      0.98       243
n,n-dimethylformamide       1.00      1.00      1.00        72
             pyridine       1.00      1.00      1.00       108

            micro avg       0.93      0.88      0.91      1160
            macro avg       0.92      0.88      0.88      1160
         weighted avg       0.93      0.88      0.89      1160
          samples avg       0.94      0.88      0.89      1160



In [21]:
# ─── 9) Evaluate synthetic test set ────────────────────────────────────────────
model_boost.eval()
yp, yt = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        yp.append(model_boost(xb).numpy())
        yt.append(yb.numpy())
y_pred = (np.vstack(yp)>0.5).astype(int)
y_true = np.vstack(yt)
print("\nSynthetic Test Classification Report:")
print(classification_report(y_true, y_pred, target_names=classes))


Synthetic Test Classification Report:
                              precision    recall  f1-score   support

           1,9-nonanedithiol       0.86      0.90      0.88       196
             1-dodecanethiol       0.57      0.88      0.69       225
             1-undecanethiol       0.48      0.88      0.62       208
        6-mercapto-1-hexanol       0.81      0.92      0.86       204
                     benzene       1.00      1.00      1.00       235
                benzenethiol       0.93      0.99      0.96       195
                        dmmp       1.00      1.00      1.00       220
                        etoh       0.89      0.98      0.93       217
                        meoh       0.96      0.95      0.95       184
       n,n-dimethylformamide       1.00      0.98      0.99       212
                    pyridine       1.00      0.99      1.00       223
tris(2-ethylhexyl) phosphate       0.95      0.99      0.97       189

                   micro avg       0.83      0.96