In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.decomposition import non_negative_factorization
from SiameseNetwork import SiameseNetwork

In [3]:
def baseline_AsLS(y, lam=1e4, p=0.01, niter=10):
    """Estimate a smooth baseline with asymmetric least squares (AsLS).

    Every iteration solves ``(diag(w) + lam * D @ D.T) b = w * y``, where ``D``
    is the second-order difference operator, and then re-weights the samples:
    points lying above the current baseline get weight ``p``; all other
    points get ``1 - p``.

    Args:
        y: 1-D sequence of intensities.
        lam: Smoothness penalty; larger values give a stiffer baseline.
        p: Weight given to samples that lie above the baseline.
        niter: Number of re-weighting iterations; must be at least 1.

    Returns:
        ndarray with the same length as ``y`` holding the baseline estimate.

    Raises:
        ValueError: If ``niter`` is smaller than 1 (no baseline would be computed).
    """
    if niter < 1:
        raise ValueError("niter must be at least 1")

    y = np.asarray(y, dtype=float)
    n_points = len(y)

    # Dense second-difference penalty; independent of the weights, so built once.
    diff2 = np.diff(np.eye(n_points), 2)
    penalty = lam * diff2.dot(diff2.T)

    w = np.ones(n_points)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + penalty, w * y)
        # Samples exactly on the baseline (y == b) are weighted 1 - p instead
        # of 0. With zero weights a flat or all-zero spectrum would leave only
        # the rank-deficient penalty matrix in the next solve.
        w = np.where(y > b, p, 1 - p)
    return b

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """Baseline-correct, L2-normalise and rectify every spectrum in ``arr``.

    For each row the AsLS baseline is subtracted, the residual is divided by
    its Euclidean norm (skipped when the norm is 0), and the absolute value is
    taken so the result is non-negative.

    Args:
        arr: 2-D array-like, one spectrum per row.
        lam: Smoothness penalty forwarded to ``baseline_AsLS``.
        p: Asymmetry weight forwarded to ``baseline_AsLS``.
        niter: Iteration count forwarded to ``baseline_AsLS``.

    Returns:
        ndarray with the same shape as ``arr``. Floating-point input keeps its
        dtype; any other input dtype yields a float64 result.
    """
    arr = np.asarray(arr)
    # np.zeros_like would inherit an integer dtype from integer input and
    # silently truncate the normalised values to 0 on assignment.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else np.float64
    out = np.zeros(arr.shape, dtype=out_dtype)

    for i, spec in enumerate(arr):
        bkg = baseline_AsLS(spec, lam=lam, p=p, niter=niter)
        corr = spec - bkg
        nrm = np.linalg.norm(corr)
        if nrm:
            corr = corr / nrm
        out[i] = np.abs(corr)
    return out

def minmax_normalize(arr):
    """Rescale ``arr`` by its global minimum and maximum.

    The minimum maps to 0 and the maximum to just under 1. A constant of 1e-8
    is added to the range so that a constant input yields zeros instead of a
    division by zero.
    """
    lo, hi = np.min(arr), np.max(arr)
    span = hi - lo + 1e-8
    return (arr - lo) / span

In [5]:
# Load the query table: one row per spectrum, two label columns plus one
# column per spectral feature.
query_df = pd.read_csv("query_v2_multiples.csv")

# Ground-truth pair for every row.
labels = query_df[["Label 1", "Label 2"]].values

# A column counts as a feature when its header is a plain non-negative number
# (digits with at most one decimal point).
feature_cols = [
    header
    for header in query_df.columns
    if header.replace('.', '', 1).isdigit()
]

# Intensities as floats, then baseline-corrected / normalised.
spectra = query_df[feature_cols].values.astype(float)
spectra_proc = preprocess(spectra)

In [29]:
# The encoder is sized by the number of features per preprocessed spectrum.
input_len = spectra_proc.shape[1]

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# NOTE(review): torch.load unpickles the checkpoint; only load files from a
# trusted source.
state_dict = torch.load("siamese_raman.pth", map_location=device)

model = SiameseNetwork(input_len=input_len, embed_dim=64)
model.load_state_dict(state_dict)
model.to(device)
# Switch to inference mode; as the last expression of the cell this also
# displays the module summary.
model.eval()

SiameseNetwork(
  (encoder): Sequential(
    (0): Conv1d(1, 16, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv1d(16, 32, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): ReLU()
    (5): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=8192, out_features=64, bias=True)
    (8): ReLU()
  )
)

In [26]:
# Number of NMF components, i.e. how many constituents every mixture is
# assumed to contain. Defined once so the factorisation and the loops agree.
N_COMPONENTS = 2

# ---------- Factorise the query spectra ----------
# spectra_proc is approximated by W_all @ H_all, with
# W_all: (n_spectra, N_COMPONENTS) and H_all: (N_COMPONENTS, n_features).
W_all, H_all, _ = non_negative_factorization(
    spectra_proc, n_components=N_COMPONENTS, init='random', random_state=0
)

# ---------- Normalize Components ----------
H_norm = np.array([minmax_normalize(component) for component in H_all])  # shape: (N_COMPONENTS, n_features)

# ---------- Load Reference Data ----------
ref_df = pd.read_csv("reference_siamese.csv")
ref_labels = ref_df["Label"].values
ref_specs = ref_df.drop(columns=["Label"]).values.astype(float)
ref_proc = preprocess(ref_specs)

# Fail early with a readable message instead of a shape error inside the encoder.
assert ref_proc.shape[1] == spectra_proc.shape[1], (
    "reference and query spectra must have the same number of features"
)

# ---------- Embed references and classify each component ----------
predictions = []
with torch.no_grad():  # inference only; one context covers every forward pass
    # Add the channel axis expected by the Conv1d encoder: (n_refs, 1, n_features).
    ref_input = torch.tensor(ref_proc, dtype=torch.float32).unsqueeze(1).to(device)
    ref_embeddings = model(ref_input).cpu().numpy()

    for i in range(len(W_all)):  # For each query spectrum
        pred_components = []
        for j in range(N_COMPONENTS):  # Each component (row of H_all)
            # NOTE(review): minmax_normalize cancels the W_all[i, j] scale
            # (up to its 1e-8 term), so for positive weights the model input
            # is almost the same for every query spectrum i. Confirm this is
            # the intended per-spectrum decomposition.
            comp = H_all[j] * W_all[i, j]
            # NOTE(review): references go through preprocess() (L2 norm) while
            # components are min-max scaled; confirm which scaling the model
            # was trained on.
            comp_norm = minmax_normalize(comp)
            comp_tensor = torch.tensor(comp_norm, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
            comp_embed = model(comp_tensor).cpu().numpy()
            # Nearest reference in embedding space (Euclidean distance).
            dists = np.linalg.norm(ref_embeddings - comp_embed, axis=1)
            nearest_idx = np.argmin(dists)
            pred_components.append(ref_labels[nearest_idx])
        predictions.append(pred_components)


In [27]:
import numpy as np

# Promote both containers to ndarrays so columns can be sliced and compared.
predictions = np.array(predictions)
labels = np.array(labels)

# Position-wise accuracy: predicted component k against label column k.
# NOTE(review): this assumes component 0 corresponds to "Label 1" and
# component 1 to "Label 2"; confirm the component order is meaningful.
first_hits = predictions[:, 0] == labels[:, 0]
second_hits = predictions[:, 1] == labels[:, 1]
acc_label1 = np.mean(first_hits)
acc_label2 = np.mean(second_hits)

# Order-insensitive accuracy: a row counts as correct when the set of
# predicted labels equals the set of true labels.
set_hits = []
for pred_pair, true_pair in zip(predictions, labels):
    set_hits.append(set(pred_pair) == set(true_pair))
acc_flexible = np.mean(set_hits)

print(f"Flexible Match Accuracy (Set Match): {acc_flexible:.2%}")
print(f"\nLabel 1 Accuracy: {acc_label1:.2%}")
print(f"Label 2 Accuracy: {acc_label2:.2%}")


Flexible Match Accuracy (Set Match): 8.33%

Label 1 Accuracy: 25.00%
Label 2 Accuracy: 16.67%
