In [41]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from scipy.optimize import nnls
from scipy.signal import find_peaks
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import itertools

In [42]:
# Upper bound for cropping the spectral axis (presumably a wavenumber
# cutoff -- confirm units against the column headers of the CSV files).
crop_max = 1500

# Reference table (one "Label" column plus spectral columns) and query
# table (labelled through "Label 1" / "Label 2"), read in that order.
ref_df, query_df = (
    pd.read_csv(csv_path) for csv_path in ("reference_v2.csv", "query_v2.csv")
)

In [43]:
def baseline_AsLS(y, lam=1e4, p=0.01, niter=10):
    """Estimate a smooth baseline with asymmetric least squares (AsLS).

    Each iteration solves the weighted, second-difference-penalised system
    ``(diag(w) + lam * D @ D.T) b = w * y`` and then re-weights the samples:
    points above the current baseline get weight ``p``; every other point
    (below it or exactly on it) gets ``1 - p``.

    Parameters
    ----------
    y : array-like, 1-D
        Signal whose baseline is estimated.
    lam : float
        Multiplier on the second-difference penalty; larger values give a
        stiffer (smoother) baseline.
    p : float
        Weight for samples above the baseline; ``1 - p`` is used otherwise.
    niter : int
        Number of solve / re-weight rounds. Must be at least 1.

    Returns
    -------
    numpy.ndarray
        Baseline estimate with the same length as ``y``.

    Raises
    ------
    ValueError
        If ``niter`` is smaller than 1 (no baseline would ever be computed).
    """
    if niter < 1:
        raise ValueError("niter must be at least 1")

    y = np.asarray(y, dtype=float)
    L = len(y)
    # Second-order difference operator; D has shape (L, L - 2).
    D = np.diff(np.eye(L), 2)
    # Dense L x L penalty matrix, built once and reused in every iteration.
    D = lam * D.dot(D.T)
    w = np.ones(L)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + D, w * y)
        # Samples exactly on the baseline must keep a non-zero weight: if they
        # were weighted 0, a flat or very short signal would zero out every
        # weight and leave only the rank-deficient penalty matrix to solve.
        w = np.where(y > b, p, 1 - p)
    return b

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """Baseline-correct and L2-normalise every spectrum in ``arr``.

    For each row the AsLS baseline is subtracted, the result is divided by
    its Euclidean norm (rows whose norm is zero are left undivided) and the
    absolute value is stored.

    Parameters
    ----------
    arr : array-like
        Spectra to process; iterated along the first axis, so it is assumed
        to be 2-D with one spectrum per row.
    lam, p, niter
        Forwarded unchanged to ``baseline_AsLS``.

    Returns
    -------
    numpy.ndarray
        Floating-point array with the same shape as ``arr``.
    """
    arr = np.asarray(arr)
    # The result must be floating point. Inheriting an integer dtype from the
    # input (e.g. raw counts read from CSV) would truncate the normalised
    # values, which all lie in [0, 1], to 0 or 1 on assignment.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else float
    out = np.zeros(arr.shape, dtype=out_dtype)
    for i, spec in enumerate(arr):
        bkg = baseline_AsLS(spec, lam=lam, p=p, niter=niter)
        corr = spec - bkg
        nrm = np.linalg.norm(corr)
        # Guard against division by zero for an all-zero corrected spectrum.
        normed = corr / nrm if nrm else corr
        out[i] = np.abs(normed)
    return out

def extract_cps(spectra, num_peaks=15, height=0.01, prominence=0.01):
    """Pick the column indices where peaks occur most often across spectra.

    ``find_peaks`` is run on every row; each detected peak position gets one
    vote. The ``num_peaks`` positions with the most votes are returned.

    Parameters
    ----------
    spectra : array-like, 2-D
        One spectrum per row.
    num_peaks : int
        Maximum number of indices to return. Values <= 0 yield an empty list.
    height, prominence : float
        Passed straight to ``scipy.signal.find_peaks``.

    Returns
    -------
    list
        Selected column indices in ascending order. When fewer than
        ``num_peaks`` distinct positions received votes, the remainder is
        filled with zero-vote positions chosen by the sort order.
    """
    # A slice of the form [-0:] selects everything, and negative counts slice
    # from the wrong end, so non-positive requests are answered explicitly.
    if num_peaks <= 0:
        return []

    spectra = np.asarray(spectra)
    counts = np.zeros(spectra.shape[1])
    for spec in spectra:
        peaks, _ = find_peaks(spec, height=height, prominence=prominence)
        counts[peaks] += 1
    # argsort is ascending, so the most frequent positions sit at the end.
    top_peaks = np.argsort(counts)[-num_peaks:]
    return sorted(top_peaks)

def l1_nnls(A, b, alpha=0.001):
    """Return non-negative, L1-penalised coefficients expressing ``b`` via ``A``.

    A scikit-learn ``Lasso`` with ``positive=True`` and up to 10000
    iterations is fitted with ``A`` as the design matrix and ``b`` as the
    target; its coefficient vector (one entry per column of ``A``) is
    returned.

    Parameters
    ----------
    A : array-like, shape (n_samples, n_features)
    b : array-like, shape (n_samples,)
    alpha : float
        L1 penalty strength handed to ``Lasso``.
    """
    # NOTE(review): Lasso keeps its default fit_intercept=True here, so an
    # intercept is fitted alongside the coefficients and then discarded --
    # confirm that is intended for an NNLS-style unmixing.
    fitted = Lasso(alpha=alpha, positive=True, max_iter=10000).fit(A, b)
    return fitted.coef_

In [44]:
# Every reference column except the class label is a spectral channel; the
# headers are parsed as numbers (presumably wavenumbers -- confirm units).
all_cols = ref_df.columns.drop("Label")
wavenumbers = pd.to_numeric(all_cols, errors="coerce")
# Crop with the `crop_max` setting from the data-loading cell instead of
# repeating the literal. Headers that fail to parse become NaN, compare
# False, and are therefore dropped as well.
valid_cols = all_cols[wavenumbers < crop_max]

# The query table is sliced with the reference column names, so both
# matrices share the same channel layout.
ref_data = ref_df[valid_cols].values
query_data = query_df[valid_cols].values
ref_labels = ref_df["Label"].values

X_ref = preprocess(ref_data)
X_query = preprocess(query_data)

# Build reference spectra dictionary: one mean preprocessed spectrum per
# class, stacked in the (sorted) order returned by np.unique.
classes = np.unique(ref_labels)
ref_spectra_avg = {c: X_ref[ref_labels == c].mean(axis=0) for c in classes}
ref_matrix = np.array([ref_spectra_avg[c] for c in classes])

# Use peak-domain CPs: keep only the channels where reference peaks occur
# most often, for both the reference matrix and the queries.
cps = extract_cps(X_ref, num_peaks=15)
ref_matrix_cps = ref_matrix[:, cps]
X_query_cps = X_query[:, cps]

In [45]:
# Get weights for each query using l1-penalized NNLS: one coefficient per
# reference class, fitted on the selected peak channels only.
weights_l1 = np.array([l1_nnls(ref_matrix_cps.T, q) for q in X_query_cps])

# Build binary targets for meta-model: y_multi[n, i] is True when class i is
# one of the two labels of query n.
y_multi = np.array([[label in (l1, l2) for label in classes]
                    for l1, l2 in zip(query_df["Label 1"], query_df["Label 2"])])

# Train a random forest meta-classifier per chemical
# NOTE(review): the forests are fitted on the query set and, below, asked to
# predict that same query set, so every downstream metric is a training-set
# score. Confirm whether a held-out split or cross-validation is intended.
meta_models = {}
for i, chem in enumerate(classes):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(weights_l1, y_multi[:, i])
    meta_models[chem] = clf

# Use meta-models to predict presence. Despite the variable name these are
# hard True/False labels from predict(), not probabilities. One batched call
# per chemical replaces a separate single-row call for every
# (query, chemical) pair; the resulting (n_queries, n_classes) array is the
# same.
proba_preds = np.column_stack([meta_models[c].predict(weights_l1) for c in classes])

# Get binary predictions
binary_preds = (proba_preds > 0.5).astype(int)
predicted_labels = [[c for j, c in enumerate(classes) if row[j]] for row in binary_preds]

# Evaluation: put the ground-truth pair next to the predicted label set.
true1 = query_df["Label 1"].values
true2 = query_df["Label 2"].values
results_df = pd.DataFrame({"True1": true1, "True2": true2, "Predicted": predicted_labels})

In [46]:
# Score every chemical as its own binary detection task: it counts as truly
# present in a query when it equals either of that query's two labels.
metrics = {}
for i, chem in enumerate(classes):
    y_pred = binary_preds[:, i]
    y_true = [chem in (l1, l2) for l1, l2 in zip(true1, true2)]
    acc, prec, rec = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
    )
    metrics[chem] = dict(Accuracy=acc, Precision=prec, Recall=rec)

metrics_df = pd.DataFrame.from_dict(metrics, orient="index")

# Keep a chemical only if its precision or its recall is non-zero.
# NOTE(review): this removes every chemical with zero true positives, which
# covers chemicals absent from the queries but also ones that were present
# and never detected -- confirm that hiding the latter is intended.
has_signal = (metrics_df["Precision"] != 0) | (metrics_df["Recall"] != 0)
filtered_metrics_df = metrics_df[has_signal]

# Total accuracy (both labels must be predicted); additional predicted
# labels beyond the true pair do not make a query count as wrong.
correct = [
    all(lbl in pred for lbl in (l1, l2))
    for l1, l2, pred in zip(true1, true2, predicted_labels)
]
total_accuracy = sum(correct) / len(correct)


In [47]:
# Plain-text dump of the per-chemical metrics that passed the
# zero-precision / zero-recall filter.
print(filtered_metrics_df)

                              Accuracy  Precision  Recall
1,9-nonanedithiol             1.000000       1.00    1.00
1-dodecanethiol               0.966667       1.00    0.90
1-undecanethiol               1.000000       1.00    1.00
6-mercapto-1-hexanol          0.966667       1.00    0.75
benzene                       0.933333       0.75    1.00
benzenethiol                  0.966667       1.00    0.50
etoh                          1.000000       1.00    1.00
meoh                          0.966667       1.00    0.90
n,n-dimethylformamide         1.000000       1.00    1.00
pyridine                      1.000000       1.00    1.00
tris(2-ethylhexyl) phosphate  1.000000       1.00    1.00


In [49]:
# Bare last expression so the notebook displays the filtered per-chemical
# metrics table as the cell's output.
filtered_metrics_df

Unnamed: 0,Accuracy,Precision,Recall
"1,9-nonanedithiol",1.0,1.0,1.0
1-dodecanethiol,0.966667,1.0,0.9
1-undecanethiol,1.0,1.0,1.0
6-mercapto-1-hexanol,0.966667,1.0,0.75
benzene,0.933333,0.75,1.0
benzenethiol,0.966667,1.0,0.5
etoh,1.0,1.0,1.0
meoh,0.966667,1.0,0.9
"n,n-dimethylformamide",1.0,1.0,1.0
pyridine,1.0,1.0,1.0
