<a href="https://colab.research.google.com/github/Hazem-Abdelsalam/ML-QDs-electronic-properties/blob/main/ML_GQDs_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training

## Install & Setup

In [None]:
# Step 1A: Install, Import & Setup (Run once)
# This cell prepares the notebook session: it optionally installs the
# modelling dependencies, imports everything the later cells rely on, fixes
# the random seed and silences RDKit logging.
import os

# Install chemprop from GitHub if in Colab
# The COLAB_RELEASE_TAG environment variable is used as the "running in
# Colab" signal; when it is unset the install step is skipped entirely.
if os.getenv("COLAB_RELEASE_TAG"):
    try:
        # Import probe only: if chemprop is already importable, nothing is
        # installed.
        import chemprop
    except ImportError:
        # The two lines below are IPython shell escapes ("!"), so this cell
        # only runs inside a notebook kernel, not as a plain Python script.
        !git clone https://github.com/chemprop/chemprop.git && cd chemprop && pip install . && cd ..
        !pip install rdkit-pypi --pre deepchem

# Imports
# NOTE(review): Path is imported here but not referenced in the cells visible
# in this file.
from pathlib import Path
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint
import pandas as pd
import torch
import numpy as np
# NOTE(review): this import is unconditional, so the cell requires the
# google.colab package even though the install step above is Colab-guarded.
from google.colab import files
import re
from typing import List
from rdkit import RDLogger

# Set seed for reproducibility
# A single fixed seed (24) is applied through Lightning's seeding helper;
# workers=True is passed so dataloader workers are presumably seeded as well
# (see the Lightning documentation for the exact guarantees).
pl.seed_everything(24, workers=True)

# Disable RDKit warnings
# The 'rdApp.*' pattern is intended to switch off every rdApp log channel.
RDLogger.DisableLog('rdApp.*')

print("‚úÖ Environment setup complete.")

## Upload CSV & Preview Data

In [None]:
# Step 1B: Upload CSV & Preview Data

def upload_and_combine_csvs(
    expected_base_names: List[str] = ['GQDs_SiCQDs_Data'],
    smiles_columns: List[str] = ['SMILES'],
    name_columns: List[str] = ['Name'],
    target_columns: List[List[str]] = [['HOMO', 'LUMO']],
    output_filename: str = 'combined_cleaned_results.csv'
) -> pd.DataFrame:
    """Prompt for CSV uploads, normalise their columns and merge them.

    The four list arguments are parallel: entry ``i`` of each list describes
    one expected file. For every file the SMILES column is renamed to
    ``'smiles'``, the name column to ``'name'`` (generic ``mol_<i>`` names are
    generated when it is absent) and each target column to its lower-case
    form. Rows with a missing value in any kept column are dropped.

    Args:
        expected_base_names: Base file names (without ``.csv``) to look for
            among the uploaded files. A trailing `` (N)`` suffix before the
            extension is tolerated and matching is case-insensitive.
        smiles_columns: Name of the SMILES column in each file.
        name_columns: Name of the molecule-name column in each file.
        target_columns: Target column names for each file.
        output_filename: Path the combined table is written to as CSV.

    Returns:
        The concatenated DataFrame with columns ``smiles``, ``name`` and the
        lower-cased target columns, re-indexed from zero.

    Raises:
        ValueError: If the spec lists are empty or differ in length.
        FileNotFoundError: If no uploaded file matches a base name.
        RuntimeError: If reading or validating a matched file fails; the
            original exception is attached as ``__cause__``.
    """
    # The list defaults above are only ever read, never mutated, so sharing
    # them between calls is harmless.

    # zip() would silently ignore the tail of the longer lists, so a
    # mis-specified call is rejected up front, before the user is asked to
    # upload anything.
    spec_lengths = {
        len(expected_base_names),
        len(smiles_columns),
        len(name_columns),
        len(target_columns),
    }
    if len(spec_lengths) != 1:
        raise ValueError(
            "expected_base_names, smiles_columns, name_columns and "
            "target_columns must all have the same length."
        )
    if not expected_base_names:
        raise ValueError("At least one expected file must be specified.")

    print("üì§ Please upload your CSV file...")
    # The upload result is used as a mapping whose keys are file names.
    uploaded = files.upload()

    def find_matching_file(base_name):
        """Return the first uploaded file name matching *base_name*, or None."""
        pattern = re.compile(fr'{re.escape(base_name)}(?:\s*\(\d+\))?\.csv', re.IGNORECASE)
        return next((fname for fname in uploaded if pattern.search(fname)), None)

    dfs = []
    for base_name, smi_col, name_col, tgt_cols in zip(
        expected_base_names, smiles_columns, name_columns, target_columns
    ):
        matched_file = find_matching_file(base_name)
        if not matched_file:
            available_files = "\n".join(uploaded.keys())
            raise FileNotFoundError(f"No file matching '{base_name}' found.\nAvailable files:\n{available_files}")
        print(f"\nüìÅ Processing file: {matched_file}")
        try:
            # NOTE(review): this reads by file name from the working
            # directory, i.e. it assumes the upload step saved the file
            # there under that name -- confirm for non-Colab front ends.
            df = pd.read_csv(matched_file)

            # Normalize column names
            df = df.rename(columns={smi_col: 'smiles'})
            if name_col in df.columns:
                df = df.rename(columns={name_col: 'name'})
            else:
                # Report the column that was actually looked up.
                print(f"‚ö†Ô∏è Warning: '{name_col}' column not found. Generating generic names.")
                df['name'] = [f"mol_{i}" for i in range(len(df))]

            lowered_targets = [col.lower() for col in tgt_cols]
            df = df.rename(columns=dict(zip(tgt_cols, lowered_targets)))

            required_cols = ['smiles', 'name'] + lowered_targets
            missing = set(required_cols) - set(df.columns)
            if missing:
                raise ValueError(f"Missing required columns: {missing}")

            df = df[required_cols].dropna()
            dfs.append(df)
            print(f"- Processed {len(df)} rows")
        except Exception as e:
            # Keep the RuntimeError type callers see, but chain the cause so
            # the original traceback is not lost.
            raise RuntimeError(f"Error processing {matched_file}: {e}") from e

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_filename, index=False)
    print(f"\nüíæ Saved combined data: {output_filename}")
    return combined_df

# Load the uploaded data and expose the arrays consumed by the later cells:
# `smis` (SMILES strings), `names` (molecule names) and `ys_all` (the
# homo/lumo target columns). Any failure is reported as a single message
# instead of a traceback.
try:
    df_input = upload_and_combine_csvs()
    smis = df_input.loc[:, 'smiles'].values
    names = df_input.loc[:, 'name'].values
    ys_all = df_input.loc[:, ['homo', 'lumo']].values
    print(f"\n‚úÖ Success! Total molecules: {len(smis)}")

    # The preview shows names and targets only; SMILES strings are left out.
    print("\nüìã First 5 entries (Name + Targets):")
    preview_df = df_input.head(5)[['name', 'homo', 'lumo']]
    display(preview_df)
except Exception as e:
    print(f"‚ùå Failed to process file: {e}")

## Step 2: K-Fold Cross-Validation Training

In [None]:
# Step 2: K-Fold Training & Prediction (Core Model) -- HOMO & LUMO only
#
# Outer loop: k-fold split of all molecules into train+val / test.
# Inner step: one 80/20 split of train+val into train / validation.
# For every fold a fresh MPNN is trained, and the checkpoint with the lowest
# validation loss is used to predict the held-out test molecules. The
# per-fold test predictions are stacked so every molecule is predicted once.
from sklearn.model_selection import KFold
from chemprop import data, featurizers, models, nn
import numpy as np

k_folds = 5

# Explicit split seed: with random_state=None the shuffle draws from NumPy's
# global RNG, whose state advances with use, so re-running this cell would
# otherwise produce different folds.
split_seed = 24
kf = KFold(n_splits=k_folds, shuffle=True, random_state=split_seed)

# These do not depend on the fold, so they are created once.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
num_workers = 4

# Initialize lists to collect results across folds
all_true, all_pred, all_smiles, all_names = [], [], [], []

for fold_idx, (train_val_idx, test_idx) in enumerate(kf.split(smis)):
    print(f"\n--- Fold {fold_idx + 1} / {k_folds} ---")

    # Split data using indices
    smis_train_val = [smis[i] for i in train_val_idx]
    names_train_val = [names[i] for i in train_val_idx]
    ys_train_val = ys_all[train_val_idx]

    smis_test = [smis[i] for i in test_idx]
    names_test = [names[i] for i in test_idx]
    ys_test = ys_all[test_idx]

    # Create MoleculeDatapoint objects
    all_data_train_val = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis_train_val, ys_train_val)]
    all_data_test = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(smis_test, ys_test)]

    # Inner split: train / validation. Only the first of the five inner
    # folds is used, i.e. a single 80/20 split; the splitter needs nothing
    # but the number of samples. The seed is offset per outer fold so each
    # fold gets its own, yet repeatable, validation subset.
    inner_kf = KFold(n_splits=5, shuffle=True, random_state=split_seed + fold_idx)
    train_inner_idx, val_inner_idx = next(inner_kf.split(smis_train_val))

    train_data, val_data, _ = data.split_data_by_indices(
        all_data_train_val, [train_inner_idx], [val_inner_idx], []
    )
    train_data = train_data[0]
    val_data = val_data[0]
    test_data = all_data_test

    # Datasets. The target scaler is fitted on the training split only and
    # then reused for the validation and test datasets.
    train_dset = data.MoleculeDataset(train_data, featurizer)
    scaler = train_dset.normalize_targets()

    val_dset = data.MoleculeDataset(val_data, featurizer)
    val_dset.normalize_targets(scaler)

    # The evaluation below compares predictions against the unscaled
    # `ys_test`, not against the targets stored in this dataset.
    test_dset = data.MoleculeDataset(test_data, featurizer)
    test_dset.normalize_targets(scaler)

    # DataLoaders
    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)
    test_loader = data.build_dataloader(test_dset, num_workers=num_workers, shuffle=False)

    # Build MPNN model: two regression tasks (homo and lumo). The output
    # transform is built from the training scaler so that predictions are
    # presumably returned on the original target scale -- see chemprop docs.
    mp = nn.BondMessagePassing()
    agg = nn.MeanAggregation()
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(output_transform=output_transform, n_tasks=2)
    batch_norm = True
    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE(), nn.metrics.R2Score()]

    mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    # Trainer with checkpointing: keep the weights with the lowest val_loss.
    checkpointing = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        save_last=True,
        dirpath="checkpoints",
        filename=f"fold_{fold_idx}_best"
    )

    trainer = pl.Trainer(
        logger=False,
        enable_checkpointing=True,
        enable_progress_bar=True,
        accelerator="auto",
        devices=1,
        max_epochs=300,
        callbacks=[checkpointing],
        deterministic=True
    )

    # Train
    trainer.fit(mpnn, train_loader, val_loader)

    # Predict on the test set with the best checkpoint. Passing the model
    # without ckpt_path would use the weights from the final epoch and leave
    # the checkpoint selected above unused.
    test_preds = trainer.predict(mpnn, test_loader, ckpt_path="best")
    preds = torch.cat(test_preds, dim=0).detach().cpu().numpy()

    # Store results for this fold
    all_true.append(ys_test)
    all_pred.append(preds)
    all_smiles.extend(smis_test)
    all_names.extend(names_test)

# Combine results from all folds
all_true = np.vstack(all_true)  # Shape: (N, 2)
all_pred = np.vstack(all_pred)  # Shape: (N, 2)
all_smiles = np.array(all_smiles)
all_names = np.array(all_names)

print(f"\n‚úÖ K-Fold completed. Total test predictions: {len(all_names)}")

## Step 3: Results & Plotting

In [None]:
# Step 3: Results & Plotting -- HOMO & LUMO only
#
# Builds a per-molecule results table from the stacked cross-validation
# predictions (`all_names`, `all_true`, `all_pred`), writes it to CSV, draws
# parity and error-distribution plots, prints summary metrics and saves a
# small table of randomly chosen example molecules.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

targets = ['homo', 'lumo']
colors = ['green', 'orange']

# Create results DataFrame (column 0 = homo, column 1 = lumo)
results_df = pd.DataFrame({
    'Name': all_names,
    'True_homo': all_true[:, 0],
    'True_lumo': all_true[:, 1],
    'Pred_homo': all_pred[:, 0],
    'Pred_lumo': all_pred[:, 1]
})

# Add error columns for each target
for tgt in targets:
    signed_error = results_df[f'True_{tgt}'] - results_df[f'Pred_{tgt}']
    results_df[f'Error_{tgt}'] = signed_error.abs()
    results_df[f'Signed_Error_{tgt}'] = signed_error
    # Divide by the magnitude of the true value: with a signed denominator,
    # negative true values would yield negative percentages, and the 1e-8
    # guard would move the denominator toward zero instead of away from it.
    results_df[f'Pct_Error_{tgt}'] = (
        results_df[f'Error_{tgt}'] / (results_df[f'True_{tgt}'].abs() + 1e-8)
    ) * 100

# Save to CSV
results_df.to_csv('test_set_predictions.csv', index=False)
print("‚úÖ Full results saved to 'test_set_predictions.csv'")

# Compute R2 / MAE / RMSE once per target; both the plot annotations and the
# printed summary below read from this dictionary.
summary_metrics = {}
for target in targets:
    t = results_df[f'True_{target}']
    p = results_df[f'Pred_{target}']
    summary_metrics[target] = (
        r2_score(t, p),
        mean_absolute_error(t, p),
        np.sqrt(mean_squared_error(t, p)),
    )

# UNIFORM BOLD FONT SETUP
plt.rcParams.update({
    'font.size': 14,
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titlesize': 14,
    'figure.titlesize': 16
})

# Plotting: True vs Predicted, one panel per target
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

for i, target in enumerate(targets):
    t = results_df[f'True_{target}']
    p = results_df[f'Pred_{target}']
    r2, mae, rmse = summary_metrics[target]

    axes[i].scatter(t, p, alpha=0.7, edgecolors='w', s=60, color=colors[i])
    # Diagonal reference line spanning the range of the true values.
    axes[i].plot([t.min(), t.max()], [t.min(), t.max()], 'r--', label='Ideal')
    axes[i].text(0.05, 0.85, f'R¬≤ = {r2:.3f}\nMAE = {mae:.3f}\nRMSE = {rmse:.3f}',
                 transform=axes[i].transAxes, fontsize=12, weight='bold',
                 bbox=dict(facecolor='white', alpha=0.8))
    axes[i].set_xlabel('True Values (eV)')
    axes[i].set_ylabel('Predicted Values (eV)')
    axes[i].set_title(f'{target.upper()}: True vs Predicted')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Error distribution plots, one panel per target
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for i, (target, color) in enumerate(zip(targets, colors)):
    errors = results_df[f'Signed_Error_{target}']
    mean_error = errors.mean()
    axes[i].hist(errors, bins=15, alpha=0.7, color=color, edgecolor='black')
    axes[i].axvline(x=mean_error, color='red', linestyle='--', linewidth=2,
                    label=f'Mean: {mean_error:.4f}')
    axes[i].axvline(x=0, color='black', linestyle='-', linewidth=1)
    axes[i].set_xlabel('Prediction Error (eV)')
    axes[i].set_ylabel('Frequency')
    axes[i].set_title(f'{target.upper()} Error Distribution')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.suptitle('Prediction Error Distributions', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Final metrics summary
print("\nüìä Performance Summary:")
for target in targets:
    r2, mae, rmse = summary_metrics[target]
    print(f"{target.upper()}: R¬≤={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}")

# Table of up to 10 randomly chosen example molecules
print("\nüìä Table of True vs Predicted Values for Representative Quantum Dots:")

# Cap the sample size at the number of available rows: asking for more rows
# than exist (without replacement) raises a ValueError.
representative_samples = results_df.sample(n=min(10, len(results_df)), random_state=42)

table_data = [
    [
        row['Name'],
        row['True_homo'], row['True_lumo'],
        row['Pred_homo'], row['Pred_lumo'],
        abs(row['True_homo'] - row['Pred_homo']),
        abs(row['True_lumo'] - row['Pred_lumo']),
    ]
    for _, row in representative_samples.iterrows()
]

table_columns = [
    "Name",
    "True EHOMO", "True ELUMO",
    "Pred EHOMO", "Pred ELUMO",
    "Error EHOMO", "Error ELUMO"
]

table_df = pd.DataFrame(table_data, columns=table_columns)
print(table_df.to_string(index=False))

# Save table
table_df.to_csv("representative_predictions_table.csv", index=False)
print("\n‚úÖ Table saved to 'representative_predictions_table.csv'")