# FTIR Spectral Analysis - Dynamic Variable Selection and PCA/Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# --- USER SETTINGS ---
# CSV loaded in Step 1; its first column becomes the row index.
input_csv = "soya-horas-normalized-1744max.csv"

# Columns whose header, parsed as a float, is below this value are discarded.
skip_initial_hours = 5

# When apply_range is True, only rows whose index lies in
# [range_min, range_max] are kept (presumably wavenumbers in cm^-1 -- confirm
# against the input file).
apply_range = True
range_min, range_max = 500, 4000

# Forwarded positionally to savgol_filter right after the data argument,
# i.e. (window_length, polyorder).
savgol_params = (11, 3)

# (start, end) bounds of the regions integrated in Step 4; start is inclusive
# and end is exclusive there.
auc_regions = [
    (500, 800),
    (900, 1000),
    (1100, 1200),
    (1300, 1400),
    (1600, 1800),
]

In [None]:
# --- STEP 1: Load and preprocess data ---
# The first CSV column becomes the row index; every other column header is
# expected to parse as a number (float() raises ValueError otherwise).
df = pd.read_csv(input_csv, index_col=0)
# Keep only the columns whose numeric header is >= skip_initial_hours.
df = df[[col for col in df.columns if float(col) >= skip_initial_hours]]

# Smooth each column along the row axis in a single call on the raw values,
# then attach the original labels to the resulting ndarray.
# Why not DataFrame.apply: a function returning an ndarray does not reliably
# keep the row labels, and pd.DataFrame(<DataFrame>, index=...) *reindexes* by
# label rather than relabelling, so a label mismatch silently becomes NaN.
smoothed = pd.DataFrame(
    savgol_filter(df.to_numpy(dtype=float), *savgol_params, axis=0),
    index=df.index,
    columns=df.columns,
)
# Scale every column by its own maximum after smoothing.
norm_df = smoothed.apply(lambda col: col / col.max(), axis=0)

if apply_range:
    # Both bounds are inclusive.
    norm_df = norm_df[(norm_df.index >= range_min) & (norm_df.index <= range_max)]

In [None]:
# --- STEP 2: Compute slopes ---
# Column headers parsed as floats form the x-axis shared by every fit.
times = np.fromiter((float(label) for label in norm_df.columns), dtype=float)


def _linear_slope(row):
    """Return the slope of a degree-1 least-squares fit of ``row`` against ``times``."""
    gradient, _intercept = np.polyfit(times, row.values, 1)
    return gradient


# One slope per row of norm_df, indexed like norm_df.
slopes = norm_df.apply(_linear_slope, axis=1)

In [None]:
# --- STEP 3: Select dynamic wavenumbers ---
def select_dynamic_wavenumbers(slopes, step=100, top_n=5, wn_min=None, wn_max=None):
    """Pick the index labels with the steepest slopes, at most one per window.

    The span ``[wn_min, wn_max]`` is cut into consecutive windows of width
    ``step``. In each window the label with the largest absolute slope becomes
    a candidate; the ``top_n`` candidates with the largest magnitudes are
    returned.

    Parameters
    ----------
    slopes : pandas.Series
        Slope values indexed by position label.
    step : int
        Window width passed to ``range``.
    top_n : int
        Maximum number of labels returned.
    wn_min, wn_max : int, optional
        Span bounds. When left as ``None`` the module-level ``range_min`` /
        ``range_max`` settings are used, which is the original behaviour.

    Returns
    -------
    list
        Selected index labels, ordered by decreasing absolute slope.
    """
    lower = range_min if wn_min is None else wn_min
    upper = range_max if wn_max is None else wn_max

    # NaN slopes cannot be ranked; dropping them keeps idxmax well defined and
    # lets windows containing only NaN be skipped like empty ones.
    magnitudes = slopes.abs().dropna()
    positions = magnitudes.index

    candidates = []
    for start in range(lower, upper, step):
        stop = start + step
        in_window = (positions >= start) & (positions < stop)
        if stop == upper:
            # Step 1's range filter keeps labels equal to the upper bound, so
            # the last window is closed on the right to avoid dropping it.
            in_window = in_window | (positions == upper)
        window = magnitudes[in_window]
        if not window.empty:
            candidates.append((window.idxmax(), window.max()))

    # sorted() is stable, so ties keep their window order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return [wn for wn, _ in ranked[:top_n]]

# Run the selection on the Step 2 slopes with the function's default step and
# top_n; the result is a list of index labels of norm_df.
top_wavenumbers = select_dynamic_wavenumbers(slopes)

In [None]:
# --- STEP 4: Compute AUCs ---
# NumPy 2.0 renamed np.trapz to np.trapezoid and deprecated the old name;
# use the new one when available and fall back on older installs.
_trapezoid = getattr(np, "trapezoid", None) or np.trapz

auc_data = []
for start, end in auc_regions:
    # start is inclusive, end is exclusive.
    region = norm_df[(norm_df.index >= start) & (norm_df.index < end)]
    # Integrate over ascending x so that a descending row index does not
    # flip the sign of the area.
    region = region.sort_index()
    auc = _trapezoid(region.to_numpy(dtype=float), x=region.index.to_numpy(), axis=0)
    auc_data.append(auc)

# One row per column of norm_df and one column per region. The rows carry the
# norm_df column labels (same row labels as X_top) instead of a bare 0..n-1
# index, so the saved summary stays traceable to its source columns.
X_auc = pd.DataFrame(
    auc_data,
    index=[f"{start}-{end}" for start, end in auc_regions],
    columns=norm_df.columns,
).T

In [None]:
# --- STEP 5: PCA and Clustering ---
def apply_pca_clustering(X, label, *, n_clusters=3, random_state=42):
    """Standardise ``X``, project it on two principal components, cluster and plot.

    Parameters
    ----------
    X : array-like
        Feature table handed to ``StandardScaler.fit_transform``.
    label : str
        Text inserted in the plot title and in the printed variance summary.
    n_clusters : int, keyword-only
        Number of KMeans clusters (default 3).
    random_state : int, keyword-only
        Seed passed to KMeans (default 42).

    Returns
    -------
    tuple
        ``(clusters, explained_variance_ratio)``: the KMeans label of each row
        of ``X`` and the PCA explained-variance ratios of the two components.
    """
    scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(scaled)

    # Clusters are fitted on the standardised features, not on the PCA scores;
    # the PCA projection is only used for display.
    # n_init is pinned because scikit-learn changed its default from 10 to
    # "auto" in 1.4, which would make results depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    clusters = kmeans.fit_predict(scaled)

    plt.figure(figsize=(6, 5))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='Set1')
    plt.title(f"KMeans Clustering - {label}")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[:2]
    print(f"Explained variance ({label}): PC1 = {pc1_ratio:.3f}, PC2 = {pc2_ratio:.3f}")
    return clusters, pca.explained_variance_ratio_

In [None]:
# Feature set 1: the rows of norm_df picked in Step 3, transposed so that
# each norm_df column becomes one sample row.
X_top = norm_df.loc[top_wavenumbers].transpose()
clusters_top, var_top = apply_pca_clustering(
    X=X_top,
    label="Top 5 Dynamic Wavenumbers",
)

# Feature set 2: the per-region areas computed in Step 4.
clusters_auc, var_auc = apply_pca_clustering(
    X=X_auc,
    label="AUC of Spectral Regions",
)

In [None]:
# Save selected variables (optional): write both feature tables to CSV files
# in the current working directory.
for frame, destination in (
    (X_top, "selected_wavenumbers_top5.csv"),
    (X_auc, "auc_regions_summary.csv"),
):
    frame.to_csv(destination)