In [48]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [49]:
# --- Preprocessing functions from user ---
def baseline_AsLS(y, lam=1e4, p=0.01, niter=10):
    """Estimate a smooth baseline with asymmetric least squares (AsLS).

    Every pass solves the dense linear system
    ``(diag(w) + lam * D @ D.T) b = w * y``, where ``D`` is the second-order
    difference of the identity matrix, and then re-weights the samples:
    points above the current baseline get weight ``p``, points below it get
    ``1 - p``, and points exactly on it get weight 0.

    Parameters
    ----------
    y : 1-D array-like
        Signal whose baseline is estimated.
    lam : float
        Multiplier on the second-difference (smoothness) penalty.
    p : float
        Asymmetry weight given to samples lying above the baseline.
    niter : int
        Number of solve / re-weight passes. With ``niter=0`` no baseline is
        ever computed and the final ``return`` fails.

    Returns
    -------
    numpy.ndarray
        Baseline estimate with one value per sample of ``y``.
    """
    n_points = len(y)
    second_diff = np.diff(np.eye(n_points), 2)
    penalty = lam * (second_diff @ second_diff.T)
    weights = np.ones(n_points)
    for _ in range(niter):
        system = np.diag(weights) + penalty
        baseline = np.linalg.solve(system, weights * y)
        above = y > baseline
        below = y < baseline
        weights = p * above + (1 - p) * below
    return baseline

def preprocess(arr, lam=1e4, p=0.01, niter=10):
    """Baseline-correct, L2-normalise and rectify each row of ``arr``.

    For every row the AsLS baseline is subtracted, the residual is divided by
    its Euclidean norm (left untouched when the norm is zero), and the
    absolute value is stored.

    Parameters
    ----------
    arr : array-like
        Collection of spectra, iterated along the first axis.
    lam, p, niter
        Forwarded unchanged to ``baseline_AsLS``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr``. The dtype is ``arr``'s own
        dtype when that is already floating point, otherwise ``float64``.
    """
    arr = np.asarray(arr)
    # The results are fractions in [0, 1]. An output buffer that inherited an
    # integer dtype from the input would truncate almost all of them to 0, so
    # force a floating-point buffer for non-float inputs.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.floating) else np.float64
    out = np.zeros(arr.shape, dtype=out_dtype)
    for i, spec in enumerate(arr):
        bkg = baseline_AsLS(spec, lam=lam, p=p, niter=niter)
        corr = spec - bkg
        nrm = np.linalg.norm(corr)
        # Skip the division for an all-zero residual to avoid 0/0.
        normed = corr / nrm if nrm else corr
        out[i] = np.abs(normed)
    return out

class MixtureDataset(Dataset):
    """Dataset wrapper around an array of spectra with optional labels.

    Parameters
    ----------
    spectra : numpy.ndarray
        Converted with ``torch.from_numpy`` and cast to float32.
    label_pairs : indexable, optional
        When given, ``label_pairs[idx]`` is returned alongside each sample.
    """

    def __init__(self, spectra, label_pairs=None):
        self.labels = label_pairs
        self.X = torch.from_numpy(spectra).float()

    def __len__(self):
        """Number of samples (length of the first tensor dimension)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``X[idx]``, or ``(X[idx], labels[idx])`` when labels exist."""
        sample = self.X[idx]
        if self.labels is None:
            return sample
        return sample, self.labels[idx]


In [50]:
# --- Load and preprocess data ---
# Raw table read straight from the CSV file
mixtures = pd.read_csv('mixtures_dataset.csv')
# Keep everything except the last two columns (chemical IDs: Label 1, Label 2)
spec_df = mixtures.iloc[:, :-2]
# Parse the column headers as numbers; headers that are not numeric become NaN
wn = pd.to_numeric(spec_df.columns, errors='coerce')
# Keep columns whose header parsed as a number and is at most 1700 cm^-1
mask = ~wn.isna() & (wn <= 1700.0)
# Retained spectral columns as a NumPy array
spec_selected = spec_df.loc[:, mask].to_numpy()

In [51]:
# Baseline-correct, L2-normalise and rectify each selected spectrum (row by row),
# using preprocess()'s default lam / p / niter settings.
spectra_pp = preprocess(spec_selected)

In [52]:
# Train/validation split (unsupervised): 75 % / 25 %, fixed split seed
X_train, X_val = train_test_split(spectra_pp, test_size=0.25, random_state=42)
# Wrap each partition in an unlabelled dataset, then batch it
train_dataset = MixtureDataset(X_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = MixtureDataset(X_val)
val_loader = DataLoader(val_dataset, batch_size=32)


In [53]:
# --- Model definition ---
class Encoder(nn.Module):
    """Map a batch of spectra to per-endmember weights that sum to one.

    Two length-preserving 1-D convolutions with ReLU are followed by a
    64-unit dense layer and a final dense layer with ``n_endmembers``
    outputs; a softmax over those outputs makes each row non-negative and
    sum to 1.

    Parameters
    ----------
    n_bands : int
        Number of values per input spectrum.
    n_endmembers : int
        Number of output weights per spectrum.
    """

    def __init__(self, n_bands, n_endmembers):
        super().__init__()
        # Single input channel -> 16 feature maps; padding keeps the length.
        self.conv1 = nn.Conv1d(1, 16, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(16, 16, kernel_size=3, padding=1)
        # All 16 feature maps are flattened together before the dense layers.
        self.fc1 = nn.Linear(16 * n_bands, 64)
        self.fc2 = nn.Linear(64, n_endmembers)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax weights of shape (B, n_endmembers) for x of shape (B, n_bands)."""
        features = x.unsqueeze(1)  # add the channel axis: (B, 1, bands)
        features = torch.relu(self.conv1(features))
        features = torch.relu(self.conv2(features))
        flat = features.reshape(features.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return self.softmax(logits)

class EndmemberDecoder(nn.Module):
    """Linear decoder: each output is a weighted sum of learned endmember rows.

    Parameters
    ----------
    n_endmembers : int
        Number of rows in the learnable matrix ``W``.
    n_bands : int
        Number of columns in ``W`` (values per reconstructed spectrum).
    """

    def __init__(self, n_endmembers, n_bands):
        super().__init__()
        # Learnable endmember matrix, initialised uniformly in [0, 1)
        initial_spectra = torch.rand(n_endmembers, n_bands)
        self.W = nn.Parameter(initial_spectra)

    def forward(self, z):
        """Mix the endmembers: (B, n_endmembers) @ (n_endmembers, n_bands) -> (B, n_bands)."""
        return z @ self.W

class UnmixingAutoencoder(nn.Module):
    """Autoencoder pairing ``Encoder`` with ``EndmemberDecoder``.

    Parameters
    ----------
    n_bands : int
        Number of values per spectrum.
    n_endmembers : int
        Size of the encoder output / number of decoder endmembers.
    """

    def __init__(self, n_bands, n_endmembers):
        super().__init__()
        self.encoder = Encoder(n_bands=n_bands, n_endmembers=n_endmembers)
        self.decoder = EndmemberDecoder(n_endmembers=n_endmembers, n_bands=n_bands)

    def forward(self, x):
        """Return ``(reconstruction, encoder_output)`` for the batch ``x``."""
        abundances = self.encoder(x)
        return self.decoder(abundances), abundances

In [54]:
# --- Loss functions ---
def spectral_angle_distance(x, x_hat, eps=1e-7):
    """Per-sample angle (radians) between the rows of ``x`` and ``x_hat``.

    Parameters
    ----------
    x, x_hat : torch.Tensor
        Batches compared row by row; the reduction runs over ``dim=1``.
    eps : float
        Margin keeping the cosine strictly inside (-1, 1) before ``acos``.
        The default is sized for float32 inputs.

    Returns
    -------
    torch.Tensor
        One angle per row. Because of the margin, identical rows give a
        small positive angle (about ``sqrt(2 * eps)``) rather than exactly 0.
    """
    dot = torch.sum(x * x_hat, dim=1)
    # The 1e-8 offset avoids 0/0 when either row has zero norm.
    denom = torch.norm(x, dim=1) * torch.norm(x_hat, dim=1) + 1e-8
    cos_sim = dot / denom
    # d/dc acos(c) = -1 / sqrt(1 - c^2) is infinite at c = +/-1. Clamping to
    # exactly [-1, 1] therefore lets a perfect (or rounding-overshoot) match
    # back-propagate inf * 0 = NaN. Staying eps inside the interval keeps the
    # derivative finite.
    cos_sim = torch.clamp(cos_sim, -1.0 + eps, 1.0 - eps)
    return torch.acos(cos_sim)

def unmixing_loss(x, x_hat, z, lambda_sparsity=1e-3, sparsity_p=1.0):
    """Batch-mean spectral-angle loss plus an L_p sparsity penalty on ``z``.

    Parameters
    ----------
    x, x_hat : torch.Tensor
        Inputs and reconstructions, compared with ``spectral_angle_distance``.
    z : torch.Tensor
        Per-sample weights; the penalty is summed over ``dim=1``.
    lambda_sparsity : float
        Weight of the sparsity penalty.
    sparsity_p : float
        Exponent of the penalty ``sum(|z| ** p)``. The default 1.0 is the
        plain L1 norm. NOTE: when every row of ``z`` is non-negative and sums
        to one (e.g. a softmax output), the L1 norm equals 1 for every
        sample, so the penalty is a constant with zero gradient. Use a value
        in (0, 1), such as 0.5, to make the penalty favour sparse rows in
        that case.

    Returns
    -------
    torch.Tensor
        Scalar loss averaged over the batch.

    Raises
    ------
    ValueError
        If ``sparsity_p`` is not positive.
    """
    if sparsity_p <= 0:
        raise ValueError("sparsity_p must be positive")
    sad = spectral_angle_distance(x, x_hat)
    magnitude = torch.abs(z)
    if sparsity_p != 1.0:
        # The small offset keeps the gradient of |z| ** p finite at z == 0.
        magnitude = (magnitude + 1e-8).pow(sparsity_p)
    penalty = torch.sum(magnitude, dim=1)
    return torch.mean(sad + lambda_sparsity * penalty)

In [55]:
# --- Training setup ---
# Seed PyTorch's global RNG before anything random happens. Layer
# initialisation, the torch.rand endmember matrix and the shuffle order of
# the training loader all draw from it, so without a seed every
# Restart & Run All gives different losses and abundances.
torch.manual_seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Two endmembers; the band count comes from the preprocessed spectra
model = UnmixingAutoencoder(n_bands=spectra_pp.shape[1], n_endmembers=2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return the mean per-sample loss.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``unmixing_loss``.

    Parameters
    ----------
    loader : iterable of torch.Tensor batches
        Each batch is moved to ``device`` and fed to ``model``; ``loader``
        must also expose ``.dataset`` so the total can be normalised.
    train : bool
        When True the model is put in training mode and the optimizer steps
        after every batch. When False the model is put in eval mode and the
        pass runs with autograd disabled.

    Returns
    -------
    float
        Sum of (batch loss * batch size) divided by ``len(loader.dataset)``.
    """
    total_loss = 0.0
    model.train(train)  # train(False) is equivalent to eval()
    # Disable graph construction during evaluation: no backward pass is
    # taken there, so tracking gradients only costs memory and time.
    with torch.set_grad_enabled(train):
        for x in loader:
            x = x.to(device)
            if train:
                optimizer.zero_grad()
            x_hat, z = model(x)
            loss = unmixing_loss(x, x_hat, z)
            if train:
                loss.backward()
                optimizer.step()
            # Weight by batch size so a smaller final batch is not over-counted.
            total_loss += loss.item() * x.size(0)
    return total_loss / len(loader.dataset)

# --- Run training ---
# One optimisation pass followed by one validation pass per epoch
epochs = 50
for epoch in range(1, epochs + 1):
    train_loss, val_loss = (
        run_epoch(train_loader, train=True),
        run_epoch(val_loader, train=False),
    )
    print(f"Epoch {epoch:02d}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

Epoch 01: Train Loss = 1.2378, Val Loss = 1.2564
Epoch 02: Train Loss = 1.2285, Val Loss = 1.2484
Epoch 03: Train Loss = 1.2197, Val Loss = 1.2403
Epoch 04: Train Loss = 1.2110, Val Loss = 1.2321
Epoch 05: Train Loss = 1.2021, Val Loss = 1.2238
Epoch 06: Train Loss = 1.1930, Val Loss = 1.2154
Epoch 07: Train Loss = 1.1838, Val Loss = 1.2069
Epoch 08: Train Loss = 1.1745, Val Loss = 1.1982
Epoch 09: Train Loss = 1.1651, Val Loss = 1.1894
Epoch 10: Train Loss = 1.1554, Val Loss = 1.1805
Epoch 11: Train Loss = 1.1456, Val Loss = 1.1715
Epoch 12: Train Loss = 1.1358, Val Loss = 1.1623
Epoch 13: Train Loss = 1.1258, Val Loss = 1.1529
Epoch 14: Train Loss = 1.1156, Val Loss = 1.1435
Epoch 15: Train Loss = 1.1053, Val Loss = 1.1341
Epoch 16: Train Loss = 1.0950, Val Loss = 1.1245
Epoch 17: Train Loss = 1.0845, Val Loss = 1.1149
Epoch 18: Train Loss = 1.0740, Val Loss = 1.1050
Epoch 19: Train Loss = 1.0633, Val Loss = 1.0952
Epoch 20: Train Loss = 1.0525, Val Loss = 1.0853
Epoch 21: Train Loss

In [56]:
# --- After your training loop completes ---
# Gather the encoder output for every validation sample
model.eval()
z_batches = []

with torch.no_grad():  # inference only, no autograd graph needed
    for x in val_loader:
        x = x.to(device)
        _, z = model(x)              # z has shape (batch_size, 2)
        z_batches.append(z.cpu().numpy())

# Join the per-batch arrays row-wise into one (N_val × 2) array
all_z = np.concatenate(z_batches, axis=0)

# Print the first few for inspection
print("Predicted abundances for validation samples (first 10):")
print(all_z[:10])




Predicted abundances for validation samples (first 10):
[[0.20834814 0.7916519 ]
 [0.46121046 0.5387896 ]
 [0.21284384 0.7871561 ]
 [0.2041408  0.79585916]
 [0.45925716 0.5407428 ]
 [0.46241504 0.53758496]
 [0.27187666 0.72812337]
 [0.4592773  0.54072267]
 [0.20482196 0.795178  ]
 [0.42990747 0.57009256]]
