In [29]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import minmax_scale
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [6]:
def baseline_als(y, lam=1e4, p=0.01, niter=10):
    """Estimate a smooth baseline of a 1-D signal by asymmetric least squares.

    Each iteration solves the penalised weighted least-squares system
    ``(diag(w) + lam * D @ D.T) b = w * y``, where ``D`` is the
    second-difference operator, and then re-weights the samples: points lying
    above the current baseline get weight ``p``, every other point gets
    weight ``1 - p``.

    Parameters
    ----------
    y : array-like, shape (L,)
        Signal to fit. It is converted to a float array.
    lam : float, default 1e4
        Multiplier on the second-difference penalty; larger values give a
        stiffer baseline.
    p : float, default 0.01
        Weight given to samples that lie above the baseline.
    niter : int, default 10
        Number of solve/re-weight iterations; must be at least 1.

    Returns
    -------
    numpy.ndarray, shape (L,)
        The baseline obtained in the final iteration.

    Raises
    ------
    ValueError
        If ``niter`` is smaller than 1 (no baseline would be computed).
    """
    if niter < 1:
        raise ValueError("niter must be >= 1")
    y = np.asarray(y, dtype=float)
    L = len(y)
    # Dense second-difference operator (L x (L - 2)); the penalty matrix is L x L.
    D = np.diff(np.eye(L), 2)
    D = lam * D.dot(D.T)
    w = np.ones(L)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + D, w * y)
        # Samples sitting exactly on the baseline must keep a non-zero weight.
        # With a strict `y < b` test they would get weight 0, and an all-zero
        # signal would zero out every weight, leaving only the singular
        # penalty matrix for the next solve.
        w = np.where(y > b, p, 1 - p)
    return b

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """Baseline-correct and L2-normalise every row of ``arr``.

    For each row, the baseline returned by ``baseline_als`` is subtracted and
    the remainder is divided by its Euclidean norm. A row whose corrected
    signal has zero norm is stored as-is (all zeros) instead of being divided.

    Parameters
    ----------
    arr : array-like, shape (n_rows, n_points)
        One signal per row.
    lam, p, niter
        Forwarded unchanged to ``baseline_als``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr``. The dtype is the input's dtype
        when that is already floating point, and ``float64`` otherwise.
    """
    arr = np.asarray(arr)
    # Never allocate an integer buffer: `np.zeros_like` on integer input
    # (e.g. integer intensities read from CSV) would truncate the normalised
    # values, which all lie in [-1, 1], to -1, 0 or 1 on assignment.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else np.float64
    out = np.zeros(arr.shape, dtype=out_dtype)
    for i, s in enumerate(arr):
        c = s - baseline_als(s, lam=lam, p=p, niter=niter)
        norm = np.linalg.norm(c)
        out[i] = c / norm if norm > 0 else c
    return out

def smooth(spec, K=3):
    """Apply a K-point moving average to a 1-D signal.

    The signal is convolved with a uniform kernel of ``K`` taps, each equal
    to ``1 / K``, using NumPy's ``mode='same'`` convolution.
    """
    box = np.ones(K)
    box /= K
    return np.convolve(spec, box, mode='same')

In [7]:
# Reference spectra (one "Label" column) and the mixture spectra to classify.
ref_df = pd.read_csv("reference_v2.csv")
query_df = pd.read_csv("query_only_mixed.csv")

# --- Parameters ---
# Cropping: feature columns whose numeric header is below this value are kept.
crop_max = 1500

# Baseline settings forwarded to preprocess().
lam, p, niter = 1e4, 0.01, 10

# Peak picking: moving-average width, number of peak positions kept per class,
# and the height / prominence thresholds passed to find_peaks.
K_smooth = 3
N_peak = 12
height = prominence = 0.01

# Feature extraction: width (in samples) of the window searched for a maximum
# around each selected peak position.
w_max = 15

# Number of neighbours used by each KNN classifier.
n_neighbors = 3

In [8]:
# Every reference column except the label is a feature. Column headers are
# parsed as numbers (unparseable ones become NaN) and only those below
# `crop_max` are retained.
all_feature_cols = ref_df.columns.drop("Label")
wavenumbers = pd.to_numeric(all_feature_cols, errors='coerce')
keep_mask = ~wavenumbers.isna() & (wavenumbers < crop_max)
keep_cols = all_feature_cols[keep_mask]

# Pull the same column set from both tables so reference and query align.
R_labels = ref_df["Label"].values
R_raw, Q_raw = (frame[keep_cols].values for frame in (ref_df, query_df))

# Identical preprocessing for reference and query spectra.
R_proc, Q_proc = (
    preprocess(raw, lam=lam, p=p, niter=niter) for raw in (R_raw, Q_raw)
)

In [10]:
# Characteristic peaks (CPs): for every reference class, the N_peak sample
# positions at which a peak is detected most often across that class's
# preprocessed spectra.
CPs = {}
classes = np.unique(R_labels)
for chem in classes:
    specs = R_proc[R_labels == chem]
    # Size the tally from the reference spectra it is indexed by, so this
    # reference-only step does not depend on the query array.
    counts = np.zeros(R_proc.shape[1], int)
    for s in specs:
        pks, _ = find_peaks(smooth(s, K_smooth), height=height, prominence=prominence)
        counts[pks] += 1
    # NOTE(review): when fewer than N_peak positions have a non-zero count,
    # the remaining slots are filled with positions where no peak was found.
    CPs[chem] = sorted(np.argsort(counts)[-N_peak:])




In [11]:
# One binary KNN per reference class. Each reference spectrum is described by
# the maximum intensity inside a window (up to w_max samples wide) around each
# of that class's characteristic peaks, min-max scaled per spectrum; the
# target is 1 for spectra of the class and 0 for all others.
chemical_classifiers = {}
for chem in classes:
    windows = [slice(max(0, cp - w_max // 2), cp + w_max // 2 + 1) for cp in CPs[chem]]
    X_train = [minmax_scale([np.max(s[win]) for win in windows]) for s in R_proc]
    y_train = [int(label == chem) for label in R_labels]
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine')
    clf.fit(X_train, y_train)
    chemical_classifiers[chem] = clf

In [12]:
# Run every per-class classifier on every query spectrum and collect the
# classes whose classifier answers 1. Features are built the same way as for
# training: windowed maxima around the class's characteristic peaks, min-max
# scaled per spectrum.
predictions = []
for s in Q_proc:
    detected = []
    for chem in classes:
        windows = (s[max(0, cp - w_max // 2):cp + w_max // 2 + 1] for cp in CPs[chem])
        cp_vals = minmax_scale([np.max(win) for win in windows])
        if chemical_classifiers[chem].predict([cp_vals])[0] == 1:
            detected.append(chem)
    predictions.append(detected)

In [15]:
# Side-by-side table of the two labels recorded for each query row and the
# list of classes predicted for it.
results_df = pd.DataFrame(
    dict(
        True_Label_1=query_df["Label 1"],
        True_Label_2=query_df["Label 2"],
        Predicted_Labels=predictions,
    )
)

In [30]:
# Per-class binary evaluation: for each reference class, compare "is the class
# one of the two true labels" against "is the class in the predicted list".
label_scores = {}
truth_pairs = list(zip(results_df["True_Label_1"], results_df["True_Label_2"]))

for chem in classes:
    y_true = [chem in pair for pair in truth_pairs]
    y_pred = [chem in preds for preds in results_df["Predicted_Labels"]]

    # zero_division=0 reports 0 instead of warning when a class has no
    # predicted (precision) or no true (recall) positives.
    label_scores[chem] = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
    }

# One row per class, best accuracy first.
label_scores_df = (
    pd.DataFrame.from_dict(label_scores, orient='index')
    .sort_values(by='Accuracy', ascending=False)
)

print("Per-label binary classification performance:")
print(label_scores_df.round(3))

# ✅ Total Accuracy: full prediction must contain BOTH true labels
def is_correct_prediction(true1, true2, pred):
    """Return True when both true labels appear in ``pred``.

    Extra entries in ``pred`` do not make the prediction incorrect.
    """
    return all(label in pred for label in (true1, true2))

# Count the rows whose predicted list contains both true labels, then report
# that count as a fraction of all rows.
total_correct = sum(
    is_correct_prediction(first, second, predicted)
    for first, second, predicted in zip(
        results_df["True_Label_1"],
        results_df["True_Label_2"],
        results_df["Predicted_Labels"],
    )
)

total_accuracy = total_correct / len(results_df)
print(f"\nTotal Mixture Prediction Accuracy (both labels correct): {total_accuracy:.2%} ({total_correct} / {len(results_df)})")

Per-label binary classification performance:
                              Accuracy  Precision  Recall
n,n-dimethylformamide            1.000        1.0   1.000
pyridine                         1.000        1.0   1.000
dmmp                             1.000        0.0   0.000
benzene                          1.000        1.0   1.000
tris(2-ethylhexyl) phosphate     1.000        0.0   0.000
benzenethiol                     0.944        1.0   0.500
1-undecanethiol                  0.944        0.0   0.000
1,9-nonanedithiol                0.944        0.0   0.000
etoh                             0.944        1.0   0.667
1-dodecanethiol                  0.889        1.0   0.778
6-mercapto-1-hexanol             0.889        1.0   0.333
meoh                             0.500        0.5   0.111

Total Mixture Prediction Accuracy (both labels correct): 33.33% (6 / 18)


In [28]:
# Print the plain-text rendering of the true-labels vs. predicted-labels table.
print(results_df)

             True_Label_1          True_Label_2  \
0    6-mercapto-1-hexanol               benzene   
1    6-mercapto-1-hexanol              pyridine   
2                 benzene          benzenethiol   
3                 benzene                  etoh   
4                 benzene                  meoh   
5                    etoh       1-dodecanethiol   
6                    etoh                  meoh   
7                    meoh       1-dodecanethiol   
8                    meoh       1-dodecanethiol   
9                    meoh       1-dodecanethiol   
10                   meoh       1-dodecanethiol   
11                   meoh       1-dodecanethiol   
12                   meoh       1-dodecanethiol   
13                   meoh       1-dodecanethiol   
14  n,n-dimethylformamide  6-mercapto-1-hexanol   
15  n,n-dimethylformamide              pyridine   
16               pyridine          benzenethiol   
17                benzene       1-dodecanethiol   

                           Pre