In [1]:
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
def baseline_AsLS(y, lam=1e4, p=0.01, niter=10):
    """Estimate the baseline of a 1-D signal by asymmetric least squares.

    Every pass solves ``(diag(w) + lam * D @ D.T) b = w * y``, where ``D`` is
    the second-order difference of the identity matrix, and then recomputes
    the weights from the new baseline.

    Parameters
    ----------
    y : array-like, 1-D
        Signal whose baseline is wanted.
    lam : float
        Multiplier applied to the second-difference penalty matrix.
    p : float
        Weight given to points above the current baseline; points below it
        get ``1 - p`` and points exactly on it get 0.
    niter : int
        Number of solve/re-weight passes. Must be at least 1, otherwise no
        baseline is ever computed.

    Returns
    -------
    numpy.ndarray
        Baseline from the last pass, one value per element of ``y``.
    """
    n_points = len(y)
    # Dense (n_points x n_points) penalty built from second differences.
    second_diff = np.diff(np.eye(n_points), 2)
    penalty = lam * second_diff.dot(second_diff.T)

    weights = np.ones(n_points)
    for _ in range(niter):
        system = np.diag(weights) + penalty
        baseline = np.linalg.solve(system, weights * y)
        above = y > baseline
        below = y < baseline
        weights = p * above + (1 - p) * below
    return baseline

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """Baseline-correct, L2-normalise and rectify each spectrum in `arr`.

    For every row: subtract the `baseline_AsLS` estimate, divide by the
    Euclidean norm of the corrected row (skipped when that norm is 0), and
    take the element-wise absolute value.

    Parameters
    ----------
    arr : array-like, 2-D
        One spectrum per row.
    lam, p, niter
        Forwarded unchanged to `baseline_AsLS`.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `arr`. Floating/complex input keeps its
        dtype; any other dtype (e.g. integer counts) is promoted to float64.
    """
    arr = np.asarray(arr)
    # The normalised values lie in [0, 1]; an output buffer that inherited an
    # integer dtype from `arr` would silently truncate almost all of them to 0.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.inexact) else np.float64
    out = np.zeros(arr.shape, dtype=out_dtype)
    for i, spec in enumerate(arr):
        bkg = baseline_AsLS(spec, lam=lam, p=p, niter=niter)
        corr = spec - bkg
        nrm = np.linalg.norm(corr)
        # A zero norm means the corrected spectrum is all zeros; leave it as is.
        normed = corr / nrm if nrm else corr
        out[i] = np.abs(normed)  # make all values positive, element-wise
    return out

def smooth(spec, K_smooth=3):
    """Apply a moving-average filter of width `K_smooth` to a 1-D spectrum.

    The spectrum is convolved with a uniform kernel whose entries sum to 1,
    using numpy's ``mode='same'`` output length.
    """
    window = np.ones(K_smooth)
    window /= K_smooth
    smoothed = np.convolve(spec, window, mode='same')
    return smoothed

# Extract valid wavenumber columns
def extract_wavenumber_cols(df):
    """Return the column names of `df` that look like unsigned decimal numbers.

    A name qualifies when, after removing at most one '.', everything left
    passes ``str.isdigit``. Names with a sign, an exponent, or more than one
    dot are therefore rejected. Column order is preserved.
    """
    def _is_numeric_name(name):
        return name.replace('.', '', 1).isdigit()

    return list(filter(_is_numeric_name, df.columns))

# Identify with CaPSim + kNN multilabel
def _window_max_features(spectra, centers, w_max):
    """Return one min-max-scaled row per spectrum: the maximum in a window around each centre."""
    half = w_max // 2
    rows = []
    for s in spectra:
        # Clamp the left edge at 0; slicing already clamps the right edge.
        vec = [np.max(s[max(0, c - half):c + half + 1]) for c in centers]
        rows.append(minmax_scale(vec))
    return np.array(rows)


def identify_multilabel_knn(query_df, ref_df,
    crop_max=1700, lam=1e4, p=0.01, niter=10,
    K_smooth=3, N_peak=12, w_max=15,
    height=0.01, prominence=0.01,
    n_neighbors=3):
    """Predict the label pair of each query spectrum with a multilabel kNN.

    Steps:
      1. Keep the numeric-named columns of `query_df` whose value is below
         `crop_max`; the same columns are read from `ref_df`.
      2. Preprocess query and reference spectra with identical `lam`, `p`
         and `niter`.
      3. For every class, count how often each position is detected as a
         peak among that class's smoothed reference spectra and keep the
         `N_peak` positions with the highest counts.
      4. Describe every spectrum by the maximum within a `w_max`-wide window
         around each selected position (union over classes), min-max scaled
         per spectrum.
      5. Fit a cosine-metric `KNeighborsClassifier` on the reference
         features and predict the query features.

    Parameters
    ----------
    query_df, ref_df : pandas.DataFrame
        Must contain 'Label 1' and 'Label 2' columns plus the spectral
        columns selected in step 1.
    crop_max : float
        Exclusive upper bound on the column values that are kept.
    lam, p, niter
        Forwarded to `preprocess` for both data sets.
    K_smooth : int
        Moving-average width applied before peak detection.
    N_peak : int
        Number of positions kept per class; must be >= 1.
    w_max : int
        Window width used when building features.
    height, prominence : float
        Passed to `scipy.signal.find_peaks`.
    n_neighbors : int
        Neighbour count for the kNN classifier.

    Returns
    -------
    (y_true, y_pred, mlb)
        Indicator matrices for the query labels and the predictions, and the
        `MultiLabelBinarizer` fitted on the reference labels.

    Raises
    ------
    ValueError
        If `N_peak` is smaller than 1.
    """
    if N_peak < 1:
        # counts-sorted[-0:] would select every position, and a negative value
        # would select an arbitrary tail; neither is a meaningful "top N".
        raise ValueError("N_peak must be a positive integer")

    wav_cols = extract_wavenumber_cols(query_df)
    wavs = np.array(wav_cols, dtype=float)
    keep_cols = [col for col, w in zip(wav_cols, wavs) if w < crop_max]

    Q_raw = query_df[keep_cols].values.astype(float)
    R_raw = ref_df[keep_cols].values.astype(float)

    # Build multilabel targets
    ref_labels = list(zip(ref_df['Label 1'], ref_df['Label 2']))
    query_labels = list(zip(query_df['Label 1'], query_df['Label 2']))
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(ref_labels)

    # Preprocess: both sets must go through the same baseline settings,
    # otherwise query and reference features are not comparable.
    Q = preprocess(Q_raw, lam, p, niter)
    R = preprocess(R_raw, lam, p, niter)

    # CP detection
    CPs = {}
    for i, class_name in enumerate(mlb.classes_):
        specs = R[y[:, i] == 1]
        counts = np.zeros(Q.shape[1], int)
        for s in specs:
            pks, _ = find_peaks(smooth(s, K_smooth), height=height, prominence=prominence)
            counts[pks] += 1
        # NOTE: when a class has fewer than N_peak distinct peak positions,
        # zero-count positions fill the remaining slots.
        CPs[class_name] = sorted(np.argsort(counts)[-N_peak:])

    global_cp = sorted({i for idxs in CPs.values() for i in idxs})

    # Reference feature matrix
    X = _window_max_features(R, global_cp, w_max)

    # Train multilabel kNN
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(X, y)

    # Query feature matrix (same construction as the reference features)
    Q_feat = _window_max_features(Q, global_cp, w_max)
    y_pred = knn.predict(Q_feat)
    y_true = mlb.transform(query_labels)

    return y_true, y_pred, mlb

In [3]:
# Load the reference table (used to fit the classifier) and the query table
# (spectra to identify). Both paths are relative to the working directory.
ref_df = pd.read_csv("mixtures_dataset.csv")
query_df = pd.read_csv("query_only_mixed.csv")


In [4]:
# Run the full identification pipeline with its default settings.
y_true, y_pred, mlb = identify_multilabel_knn(query_df, ref_df)

# Turn each indicator row back into a comma-separated list of class names
# so true and predicted label sets can be compared side by side.
results_df = pd.DataFrame({
    column: [", ".join(mlb.classes_[mask.astype(bool)]) for mask in indicator]
    for column, indicator in (('True Labels', y_true), ('Predicted Labels', y_pred))
})

In [5]:
from IPython.display import display
# Render the results table as rich notebook output.
display(results_df)



Unnamed: 0,True Labels,Predicted Labels
0,"6-mercapto-1-hexanol, benzene","6-mercapto-1-hexanol, benzene"
1,"6-mercapto-1-hexanol, pyridine","6-mercapto-1-hexanol, pyridine"
2,"benzene, benzenethiol","benzene, benzenethiol"
3,"benzene, etoh","benzene, etoh"
4,"benzene, meoh","benzene, meoh"
5,"1-dodecanethiol, etoh","1-dodecanethiol, etoh"
6,"etoh, meoh","etoh, meoh"
7,"1-dodecanethiol, meoh","1-dodecanethiol, meoh"
8,"1-dodecanethiol, meoh","1-dodecanethiol, meoh"
9,"1-dodecanethiol, meoh","1-dodecanethiol, meoh"


In [6]:
# Per-class precision / recall / F1 / support as a nested dict (output_dict=True).
# zero_division=0 makes undefined ratios report 0 instead of emitting a warning.
report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0, output_dict=True)
# Transpose so each class or average is a row, then expose the row labels as a 'Class' column.
pd.DataFrame(report).T.reset_index().rename(columns={'index': 'Class'})

Unnamed: 0,Class,precision,recall,f1-score,support
0,1-dodecanethiol,1.0,1.0,1.0,9.0
1,6-mercapto-1-hexanol,1.0,1.0,1.0,3.0
2,benzene,1.0,1.0,1.0,5.0
3,benzenethiol,1.0,1.0,1.0,2.0
4,etoh,1.0,1.0,1.0,3.0
5,meoh,1.0,1.0,1.0,9.0
6,"n,n-dimethylformamide",1.0,1.0,1.0,2.0
7,pyridine,1.0,1.0,1.0,3.0
8,micro avg,1.0,1.0,1.0,36.0
9,macro avg,1.0,1.0,1.0,36.0
