<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Models/ANNSingleDescriptor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository and change into its "Data Files" directory;
# later cells open data files by relative path ("Descriptors Data/...").
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git
%cd Average_Weighted_Path_Vector/Data\ Files

In [None]:
!pip install osfclient
from osfclient.api import OSF
import os
from subprocess import run

# Replace with your OSF project ID
project_id = "p5ga2"   # e.g. from https://osf.io/abcd3/
osf = OSF()
project = osf.project(project_id)
store = project.storage("osfstorage")

# Locate the top-level "Descriptors Data" folder. Fail with an explicit
# message if it is missing instead of an AttributeError further down.
desc_folder = next(
    (fold for fold in store.folders if fold.path.strip("/") == "Descriptors Data"),
    None,
)
if desc_folder is None:
    raise FileNotFoundError(
        f"Folder 'Descriptors Data' not found in OSF project '{project_id}'"
    )

# Download all files and keep folder structure
for f in desc_folder.files:
    local_path = f.path.strip("/")            # remote path, made relative to the cwd
    local_dir = os.path.dirname(local_path)   # directory part ("" for a bare file name)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)  # no-op when the directory already exists
    with open(local_path, "wb") as out:
        f.write_to(out)
    if local_path.endswith(".zip"):
        # Pass an argument list (no shell) so quotes or spaces in the file
        # name cannot break the command; extract into the cwd when the
        # archive has no directory component.
        result = run(["unzip", local_path, "-d", local_dir or "."])
        # unzip exits with 1 for warnings where extraction still completed.
        if result.returncode in (0, 1):
            print(f"\nUnzipped {local_path} -> {local_dir}")
        else:
            print(f"\nunzip failed for {local_path} (exit code {result.returncode})")
        continue
    print(f"Downloaded {f.path} -> {local_path}")

In [None]:
import pandas as pd
import numpy as np
import os, gc, math, random
from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [None]:
# ----------------------------
# Config
# ----------------------------
@dataclass
class Config:
    """Hyperparameters and runtime settings shared by the training routines."""
    output_dir: str = "./desc_only_output"  # where checkpoints, predictions and stats are written
    batch_size: int = 16                    # batch size used for every DataLoader
    epochs: int = 5                         # number of passes over the training loader
    lr: float = 1e-5                        # learning rate handed to AdamW
    weight_decay: float = 0.01              # weight decay handed to AdamW
    seed: int = 42                          # value passed to set_seed before each training run
    # Evaluated once, when the class body executes: "cuda" if available, else "cpu".
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    proj_dim: int = 128                     # output width of the model's first linear layer
    dropout: float = 0.1                    # dropout probability used inside the model

# ----------------------------
# Utils
# ----------------------------
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def rmse(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)`` as a Python float."""
    mse_value = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse_value)
    return float(root)

def ensure_dir(path: str):
    """Create directory ``path`` including missing parents; no-op if it already exists."""
    os.makedirs(name=path, exist_ok=True)

# ----------------------------
# Model (Single descriptor)
# ----------------------------
class DescriptorSingle(nn.Module):
    """Feed-forward regressor that maps one descriptor vector to one scalar.

    Args:
        n_desc: Size of the input descriptor vector.
        proj_dim: Output width of the first linear layer.
        hidden: Output width of the second linear layer; the regression head
            narrows it to ``hidden // 2`` and then to 1.
        dropout: Dropout probability used at every dropout layer.
    """

    def __init__(self, n_desc: int, proj_dim: int = 128, hidden: int = 256, dropout: float = 0.1):
        super().__init__()
        half_hidden = hidden // 2

        # Layers are instantiated in a fixed order (encoder first, then head)
        # so that parameter initialisation under a fixed seed is reproducible.
        encoder_layers = [
            nn.Linear(n_desc, proj_dim),
            nn.LayerNorm(proj_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(proj_dim, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
        ]
        self.desc_proj = nn.Sequential(*encoder_layers)

        self.dropout = nn.Dropout(dropout)

        head_layers = [
            nn.Linear(hidden, half_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_hidden, 1),
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, desc):
        """Encode ``desc``, apply dropout, regress, and drop the trailing size-1 dim."""
        encoded = self.dropout(self.desc_proj(desc))
        prediction = self.regressor(encoded)
        return prediction.squeeze(-1)

# ----------------------------
# Dataset / Collate
# ----------------------------
class DescDataset(Dataset):
    """Dataset of optional regression targets and optional descriptor rows.

    Each item is a dict holding ``"labels"`` (scalar float32 tensor) when
    targets were given and ``"descriptors"`` (float32 tensor) when descriptors
    were given. Non-finite descriptor values (NaN, +inf, -inf) are replaced
    by 0.0.

    Args:
        targets: Target values, or ``None`` for an unlabeled dataset.
        descriptors: Descriptor rows, or ``None``.

    Raises:
        ValueError: If both inputs are ``None`` or their lengths differ.
    """

    def __init__(self, targets: Optional[np.ndarray], descriptors: Optional[np.ndarray] = None):
        if targets is None and descriptors is None:
            raise ValueError("DescDataset needs at least one of `targets` or `descriptors`")

        self.targets = None if targets is None else np.asarray(targets, dtype=np.float32)

        if descriptors is None:
            self.desc = None
        else:
            desc = np.asarray(descriptors, dtype=np.float32)
            # Sanitise once here rather than on every __getitem__ call;
            # nan_to_num returns a copy, so the caller's array is not modified.
            if not np.isfinite(desc).all():
                desc = np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
            self.desc = desc

        if (self.targets is not None and self.desc is not None
                and len(self.targets) != len(self.desc)):
            raise ValueError(
                f"targets and descriptors differ in length: "
                f"{len(self.targets)} vs {len(self.desc)}"
            )

    def __len__(self):
        return len(self.targets) if self.targets is not None else len(self.desc)

    def __getitem__(self, i):
        item = {}
        if self.targets is not None:
            item["labels"] = torch.tensor(self.targets[i], dtype=torch.float32)
        if self.desc is not None:
            item["descriptors"] = torch.tensor(self.desc[i], dtype=torch.float32)
        return item

def collate_stack(batch):
    """Stack per-sample tensors into batch tensors.

    Only the ``"labels"`` and ``"descriptors"`` keys are considered, and a key
    is included only when the first sample of ``batch`` contains it.
    """
    first_sample = batch[0]
    return {
        key: torch.stack([sample[key] for sample in batch])
        for key in ("labels", "descriptors")
        if key in first_sample
    }

# ----------------------------
# Training / Evaluation
# ----------------------------
def make_loaders(df: pd.DataFrame, target_col: str, cfg: Config,
                 desc_cols: List[str]) -> Tuple[DataLoader, DataLoader, StandardScaler]:
    """Build train/test loaders with descriptors standardised on the training rows.

    Rows are assigned by the "Training/Test" column (whitespace-stripped,
    case-insensitive). The scaler is fitted on training descriptors only and
    then applied to both splits.

    Returns:
        ``(train_loader, test_loader, scaler)``; only the training loader shuffles.
    """
    split_label = df["Training/Test"].str.strip().str.lower()

    def _subset(name: str) -> pd.DataFrame:
        return df[split_label == name].reset_index(drop=True)

    def _features(frame: pd.DataFrame) -> np.ndarray:
        return frame[desc_cols].to_numpy(dtype=np.float32)

    def _targets(frame: pd.DataFrame) -> np.ndarray:
        return frame[target_col].to_numpy(dtype=np.float32)

    train_df = _subset("training")
    test_df = _subset("test")

    train_raw = _features(train_df)
    scaler = StandardScaler().fit(train_raw)
    train_desc = scaler.transform(train_raw)
    test_desc = scaler.transform(_features(test_df))

    train_loader = DataLoader(
        DescDataset(_targets(train_df), train_desc),
        batch_size=cfg.batch_size,
        shuffle=True,
        collate_fn=collate_stack,
    )
    test_loader = DataLoader(
        DescDataset(_targets(test_df), test_desc),
        batch_size=cfg.batch_size,
        shuffle=False,
        collate_fn=collate_stack,
    )
    return train_loader, test_loader, scaler


def _predict_loader(model: nn.Module, loader: DataLoader, device: str) -> Tuple[list, list]:
    """Run ``model`` over ``loader`` without gradients; return ``(preds, labels)`` lists.

    The caller is responsible for putting the model in eval mode. ``labels``
    stays empty when the batches carry no ``"labels"`` entry.
    """
    preds, labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            out = model(batch["descriptors"])
            preds.extend(out.detach().cpu().numpy())
            if "labels" in batch:
                labels.extend(batch["labels"].detach().cpu().numpy())
    return preds, labels


def train_single_desc_for_prop(prop: str, desc_name: str, cfg: Config,
                               restore_best: bool = False) -> Dict[str, Any]:
    """Train a ``DescriptorSingle`` model for one property/descriptor pair.

    Reads ``Descriptors Data/{prop}_{desc_name}.parquet``, trains for
    ``cfg.epochs`` epochs, evaluates on the test rows after every epoch and
    saves the state dict with the lowest test MSE. It then predicts every row
    of the file, writes the predictions to a parquet file in
    ``cfg.output_dir`` and reports metrics on the test rows.

    Args:
        prop: Property name; the target column is ``f"{prop}-Measured"``.
        desc_name: Descriptor set name (part of the data file name).
        cfg: Training configuration.
        restore_best: If True, reload the best checkpoint saved during this
            run before the final prediction pass. The default (False) keeps
            the historical behaviour of predicting with the last-epoch weights.

    Returns:
        Dict with keys ``"MAE"``, ``"RMSE"`` and ``"R2"`` computed on test rows.
    """
    set_seed(cfg.seed)
    ensure_dir(cfg.output_dir)

    target_col = f"{prop}-Measured"
    data_file = f"Descriptors Data/{prop}_{desc_name}.parquet"
    sheet_name = f"{prop}_{desc_name}"

    df = pd.read_parquet(data_file)
    # Columns from position 9 onward are used as descriptor features.
    desc_cols = df.columns[9:].to_list()

    # Data loaders
    train_loader, test_loader, scaler = make_loaders(df, target_col, cfg, desc_cols)

    # Model
    n_desc = len(desc_cols)
    model = DescriptorSingle(n_desc, proj_dim=cfg.proj_dim, dropout=cfg.dropout).to(cfg.device)
    optim = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

    best_mse = float("inf")
    best_path = os.path.join(cfg.output_dir, f"{sheet_name}_best.pt")
    saved_best = False  # guards against loading a stale checkpoint from an earlier run

    for epoch in tqdm(range(1, cfg.epochs + 1)):
        model.train()
        ep_loss = 0.0
        for batch in tqdm(train_loader):
            batch = {k: v.to(cfg.device) for k, v in batch.items()}
            pred = model(batch["descriptors"])
            loss = F.mse_loss(pred, batch["labels"])
            optim.zero_grad()
            loss.backward()
            optim.step()
            ep_loss += loss.item()
        print(f"Epoch {epoch}/{cfg.epochs} | train MSE: {ep_loss / max(1,len(train_loader)):.6f}")

        model.eval()
        preds, labels = _predict_loader(model, test_loader, cfg.device)
        mse = mean_squared_error(labels, preds)
        print(f"→ Test MSE: {mse:.6f} | RMSE: {math.sqrt(mse):.6f}")
        if mse < best_mse:
            best_mse = mse
            torch.save(model.state_dict(), best_path)
            saved_best = True
            print(f"  ✓ Saved best checkpoint → {best_path}")

    # NOTE(review): without restore_best the final predictions come from the
    # last-epoch weights, not from the saved best checkpoint.
    if restore_best and saved_best:
        model.load_state_dict(torch.load(best_path, map_location=cfg.device))
        print(f"  ↺ Restored best checkpoint ← {best_path}")

    model.eval()

    # Predict every row (training and test) using the scaler fitted on training rows.
    all_desc = scaler.transform(df[desc_cols].to_numpy(dtype=np.float32))
    all_ds = DescDataset(df[target_col].to_numpy(dtype=np.float32), all_desc)
    all_loader = DataLoader(all_ds, batch_size=cfg.batch_size, shuffle=False, collate_fn=collate_stack)
    all_preds, _ = _predict_loader(model, all_loader, cfg.device)

    # Build results DF; the fallback Name column shares df's index so it
    # aligns with the other columns.
    if "NAME" in df.columns:
        name_col = df["NAME"]
    else:
        name_col = pd.Series([None] * len(df), index=df.index)
    new_results = pd.DataFrame({
        "Name": name_col,
        "SMILES": df["SMILES"],
        "Observed": df[target_col],
        "Predicted": all_preds,
        "Training/Test": df["Training/Test"],
    })

    # Final metrics on Test only; normalise the label the same way as
    # make_loaders (strip + lower) so both select the same rows.
    is_test = new_results["Training/Test"].str.strip().str.lower() == "test"
    obs_test = new_results.loc[is_test, "Observed"].values
    pred_test = new_results.loc[is_test, "Predicted"].values
    mae_v = mean_absolute_error(obs_test, pred_test)
    rmse_v = rmse(obs_test, pred_test)
    r2_v = r2_score(obs_test, pred_test)
    print(f"Final Test metrics for {sheet_name} → MAE: {mae_v:.4f} | RMSE: {rmse_v:.4f} | R²: {r2_v:.4f}")

    # Save predictions parquet
    pred_path = os.path.join(cfg.output_dir, f"{sheet_name}.parquet")
    new_results.to_parquet(pred_path, index=False)
    print(f"Saved predictions → {pred_path}")

    return {
        "MAE": mae_v, "RMSE": rmse_v, "R2": r2_v,
    }


# ----------------------------
# Multi-property runner
# ----------------------------
def run_all_properties_single_desc(prop_names: List[str], desc_name: str, cfg: Config):
    """Train one model per property for ``desc_name`` and tabulate the test metrics.

    The table (columns Property, MAE, RMSE, R2) is written to
    ``{cfg.output_dir}/{desc_name}_single_stats.csv`` and returned.
    """
    ensure_dir(cfg.output_dir)

    columns = ["Property", "MAE", "RMSE", "R2"]
    metric_keys = columns[1:]
    records = []
    for prop in prop_names:
        print(f"\n=== Processing: {prop}_{desc_name} ===")
        metrics = train_single_desc_for_prop(prop, desc_name, cfg)
        row = [f"{prop}_{desc_name}"]
        row.extend(metrics[key] for key in metric_keys)
        records.append(row)

    perf_df = pd.DataFrame(records, columns=columns)
    stats_path = os.path.join(cfg.output_dir, f"{desc_name}_single_stats.csv")
    perf_df.to_csv(stats_path, index=False)
    print(f"\n📊 Stats saved → {stats_path}")
    return perf_df


In [None]:
# Run configuration: overrides the Config defaults for epochs, batch size and
# learning rate (proj_dim is passed explicitly but equals its default).
cfg = Config(
    epochs=30,
    batch_size=8,
    proj_dim=128,
    lr = 1e-4
)

# Properties to model; each name is combined with a descriptor name to form
# the data file name "<prop>_<desc>.parquet" and the "<prop>-Measured" target column.
prop_names = ["Log VP", "MP", "BP", "LogBCF", "LogS", "LogP"]
# Single-property alternative (disabled):
# prop_names = ["LogBCF"]

# Descriptor sets to evaluate, one full sweep over prop_names per set.
desc_names = ["MACCS", "Morgan", "pwav"]
# Single-descriptor alternative (disabled):
# desc_names = ["pwav"]

# Train every property for every descriptor set and print each metrics table.
for desc in desc_names:
    perf_df = run_all_properties_single_desc(prop_names, desc, cfg)
    print(perf_df)
