In [9]:
import numpy as np
import pandas as pd

In [None]:
def baseline_als(y, lam=1e4, p=0.01, niter=10):
    """
    Estimate a smooth baseline of a 1-D signal by asymmetric least squares.

    Every iteration solves the dense linear system
        (diag(w) + lam * D @ D.T) b = w * y
    where D is the second-difference operator, and then re-weights the
    samples: those above the current baseline get weight p, those below get
    1 - p, and samples exactly on the baseline get weight 0.

    y : 1-D sequence of intensities (converted to a float array)
    lam : multiplier of the second-difference (roughness) penalty
    p : weight given to samples lying above the current baseline
    niter : number of solve / re-weight iterations, must be >= 1

    Returns the baseline as a float array with the same length as y.
    Raises ValueError if niter < 1 (no baseline would ever be computed).
    """
    if niter < 1:
        # Without at least one solve there is nothing to return.
        raise ValueError("niter must be >= 1")

    y = np.asarray(y, dtype=float)
    L = len(y)

    # Second-difference operator, shape (L, L - 2). Dense L x L matrices are
    # built below, so memory grows quadratically with the signal length.
    D = np.diff(np.eye(L), 2)
    D = lam * D @ D.T

    w = np.ones(L)
    for _ in range(niter):
        b = np.linalg.solve(np.diag(w) + D, w * y)
        # Asymmetric re-weighting pulls the fit towards the lower envelope.
        w = p * (y > b) + (1 - p) * (y < b)
    return b

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """
    Baseline-correct and L2-normalise every row of arr.

    For each row the ALS baseline (see baseline_als) is subtracted and the
    result is divided by its Euclidean norm; a row whose corrected signal is
    all zeros is left un-normalised to avoid dividing by zero.

    arr : 2-D array-like, one spectrum per row
    lam, p, niter : forwarded unchanged to baseline_als

    Returns a floating-point array with the same shape as arr. Inputs that
    are already floating keep their dtype; anything else (e.g. integer
    counts) is promoted to float so the normalised values, which lie in
    [-1, 1], are not truncated to zero when stored.
    """
    spectra = np.asarray(arr)
    if not np.issubdtype(spectra.dtype, np.floating):
        spectra = spectra.astype(float)

    out = np.zeros_like(spectra)
    for i, s in enumerate(spectra):
        c = s - baseline_als(s, lam=lam, p=p, niter=niter)
        norm = np.linalg.norm(c)
        if norm > 0:
            c = c / norm
        out[i] = c
    return out



def filter_flat_spectra(df, label_col='Label', noise_pct=5, absolute_thr=None):
    """
    Drop spectra that are (nearly) flat after preprocessing.

    Every column except label_col is treated as spectral data. The rows are
    run through preprocess(), the standard deviation of each preprocessed
    row is computed, and only rows whose std is strictly greater than the
    threshold are kept.

    df : DataFrame holding the spectral columns plus the label_col column
    label_col : name of the non-spectral column to exclude from the maths
    noise_pct : percentile of the per-spectrum std used as the threshold
                when absolute_thr is None
    absolute_thr : if not None, used directly as the std threshold and
                   noise_pct is ignored

    Prints how many spectra were dropped and returns the surviving rows as
    a DataFrame with a fresh 0..n-1 index.
    """
    spectral_cols = df.columns.drop(label_col)
    processed = preprocess(df[spectral_cols].values)

    # One std value per spectrum (row).
    row_std = processed.std(axis=1)

    # An explicit absolute threshold takes precedence over the percentile.
    thr = np.percentile(row_std, noise_pct) if absolute_thr is None else absolute_thr

    keep = row_std > thr
    n_dropped = np.count_nonzero(~keep)
    print(f"Dropping {n_dropped} / {len(keep)} spectra (std ≤ {thr:.4f})")

    survivors = df.iloc[keep]
    return survivors.reset_index(drop=True)

In [16]:
# Load the spectra table, drop every spectrum whose preprocessed std is at or
# below the 5th percentile, and write the survivors to "cleaned_<fname>".
fname = "AgNP.csv"
df = pd.read_csv(fname)
cleaned = filter_flat_spectra(df, noise_pct=5, label_col="Label")
cleaned.to_csv("cleaned_" + fname, index=False)

Dropping 2 / 25 spectra (std ≤ 0.0209)
