In [1]:
"""
Multi-fidelity materials prediction using REAL Materials Project data
This demonstrates the approach on actual materials, not synthetic data
"""

'\nMulti-fidelity materials prediction using REAL Materials Project data\nThis demonstrates the approach on actual materials, not synthetic data\n'

In [1]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings('ignore')
import os

In [3]:
print("=" * 80)
print("MULTI-FIDELITY PREDICTION ON REAL MATERIALS PROJECT DATA")
print("=" * 80)

MULTI-FIDELITY PREDICTION ON REAL MATERIALS PROJECT DATA


In [2]:
# Check if data file exists
data_file = 'materials_project_real_data.csv'
if not os.path.exists(data_file):
    print(f"\n❌ Error: {data_file} not found!")
    print("\nPlease run first: python fetch_real_data.py")
    print("This will download real materials data from Materials Project")
    exit(1)

In [3]:
# Load real data
print("\n[1/6] Loading real Materials Project data...")
df_all = pd.read_csv(data_file)


[1/6] Loading real Materials Project data...


In [4]:
print(f"✓ Loaded {len(df_all)} real materials")
print(f"✓ Features: {len([c for c in df_all.columns if c not in ['material_id', 'formula', 'formation_energy_pbe', 'formation_energy_r2scan']])}")

✓ Loaded 2500 real materials
✓ Features: 11


In [5]:
# Show some example materials
print(f"\nExample materials:")
print(df_all[['formula', 'formation_energy_pbe', 'formation_energy_r2scan']].head(10).to_string(index=False))


Example materials:
formula  formation_energy_pbe  formation_energy_r2scan
     Ac              0.396146                 0.038757
     Ac              0.294846                 0.044631
     Ac              0.408975                 0.001882
     Ac              0.528454                -0.018367
Ac2AgIr             -0.148408                -0.445840
Ac2AgPb             -0.215399                -0.505040
Ac2Br2O             -0.923191                -1.371600
Ac2CdGa              0.012284                -0.365491
Ac2CdGe             -0.235230                -0.505341
Ac2CdHg             -0.052298                -0.473356


In [6]:
# Prepare features
feature_cols = [c for c in df_all.columns if c not in ['material_id', 'formula', 'formation_energy_pbe', 'formation_energy_r2scan']]

In [7]:
# Split into "low-fidelity-only" and "both-fidelity" datasets
# Simulate real scenario: we have many cheap measurements, fewer expensive ones
n_total = len(df_all)
n_high_fidelity = int(0.3 * n_total)  # Only 30% have expensive measurements

In [8]:
# Randomly select which materials get high-fidelity measurements
np.random.seed(42)
high_fidelity_mask = np.random.choice(n_total, n_high_fidelity, replace=False)

In [9]:
df_high = df_all.iloc[high_fidelity_mask].copy()
df_low = df_all.copy()  # All materials have low-fidelity

In [10]:
print(f"\n✓ Simulated realistic scenario:")
print(f"  Total materials: {n_total}")
print(f"  Low-fidelity (PBE) available: {len(df_low)}")
print(f"  High-fidelity (r2SCAN) available: {len(df_high)} ({100*len(df_high)/n_total:.1f}%)")


✓ Simulated realistic scenario:
  Total materials: 2500
  Low-fidelity (PBE) available: 2500
  High-fidelity (r2SCAN) available: 750 (30.0%)


In [11]:
# Analyze fidelity gap
corr = np.corrcoef(df_high['formation_energy_pbe'],
                   df_high['formation_energy_r2scan'])[0, 1]
error = df_high['formation_energy_pbe'] - df_high['formation_energy_r2scan']
print(f"\n✓ Fidelity analysis on real materials:")
print(f"  PBE-r2SCAN correlation: {corr:.3f}")
print(f"  Systematic bias: {error.mean():.3f} ± {error.std():.3f} eV/atom")


✓ Fidelity analysis on real materials:
  PBE-r2SCAN correlation: 0.991
  Systematic bias: 0.305 ± 0.149 eV/atom


In [14]:
# ============================================================================
# PART 2: Baseline Model (High-Fidelity Only)
# ============================================================================
print("\n[2/6] Training baseline model on real materials...")


[2/6] Training baseline model on real materials...


In [12]:
X = df_high[feature_cols].values
y = df_high['formation_energy_r2scan'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
baseline_model = GradientBoostingRegressor(
    n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
)
baseline_model.fit(X_train_scaled, y_train)
y_pred_baseline = baseline_model.predict(X_test_scaled)

In [16]:
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))
r2_baseline = r2_score(y_test, y_pred_baseline)

In [17]:
print(f"✓ Baseline (high-fidelity only):")
print(f"  Training samples: {len(X_train)} real materials")
print(f"  MAE:  {mae_baseline:.4f} eV/atom")
print(f"  RMSE: {rmse_baseline:.4f} eV/atom")
print(f"  R²:   {r2_baseline:.4f}")

✓ Baseline (high-fidelity only):
  Training samples: 600 real materials
  MAE:  0.4656 eV/atom
  RMSE: 0.6430 eV/atom
  R²:   0.5071


In [18]:
# ============================================================================
# PART 3: Delta Learning
# ============================================================================
print("\n[3/6] Training delta learning model on real materials...")


[3/6] Training delta learning model on real materials...


In [19]:
train_size = int(0.8 * len(df_high))
train_idx = df_high.index[:train_size]
test_idx = df_high.index[train_size:]

In [20]:
X_train_delta = df_high.loc[train_idx, feature_cols].values
y_train_pbe = df_high.loc[train_idx, 'formation_energy_pbe'].values
y_train_r2scan = df_high.loc[train_idx, 'formation_energy_r2scan'].values

In [21]:
X_test_delta = df_high.loc[test_idx, feature_cols].values
y_test_pbe = df_high.loc[test_idx, 'formation_energy_pbe'].values
y_test_r2scan = df_high.loc[test_idx, 'formation_energy_r2scan'].values

In [22]:
delta_train = y_train_r2scan - y_train_pbe

In [23]:
scaler_delta = StandardScaler()
X_train_delta_scaled = scaler_delta.fit_transform(X_train_delta)
X_test_delta_scaled = scaler_delta.transform(X_test_delta)

In [24]:
X_train_delta_augmented = np.column_stack([X_train_delta_scaled, y_train_pbe])
X_test_delta_augmented = np.column_stack([X_test_delta_scaled, y_test_pbe])

In [25]:
delta_model = GradientBoostingRegressor(
    n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
)
delta_model.fit(X_train_delta_augmented, delta_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0


In [26]:
delta_pred = delta_model.predict(X_test_delta_augmented)
y_pred_delta = y_test_pbe + delta_pred

In [27]:
mae_delta = mean_absolute_error(y_test_r2scan, y_pred_delta)
rmse_delta = np.sqrt(mean_squared_error(y_test_r2scan, y_pred_delta))
r2_delta = r2_score(y_test_r2scan, y_pred_delta)

In [28]:
print(f"✓ Delta learning (bias correction):")
print(f"  Mean correction learned: {delta_train.mean():.4f} eV/atom")
print(f"  MAE:  {mae_delta:.4f} eV/atom")
print(f"  RMSE: {rmse_delta:.4f} eV/atom")
print(f"  R²:   {r2_delta:.4f}")
improvement_delta = 100*(mae_baseline - mae_delta)/mae_baseline
print(f"  Improvement: {improvement_delta:+.1f}% vs baseline")

✓ Delta learning (bias correction):
  Mean correction learned: -0.3088 eV/atom
  MAE:  0.1349 eV/atom
  RMSE: 0.1665 eV/atom
  R²:   0.9733
  Improvement: +71.0% vs baseline


In [29]:
# ============================================================================
# PART 4: Transfer Learning
# ============================================================================
print("\n[4/6] Training transfer learning model on real materials...")


[4/6] Training transfer learning model on real materials...


In [30]:
X_low_all = df_low[feature_cols].values
y_low_all = df_low['formation_energy_pbe'].values

In [31]:
scaler_transfer = StandardScaler()
X_low_scaled = scaler_transfer.fit_transform(X_low_all)

In [32]:
pretrained_model = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    max_iter=100,
    random_state=42,
    early_stopping=True,
    verbose=False
)
pretrained_model.fit(X_low_scaled, y_low_all)

0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(64, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,100


In [33]:
print(f"✓ Pre-trained on {len(X_low_all)} materials with PBE data")

✓ Pre-trained on 2500 materials with PBE data


In [34]:
X_train_transfer = scaler_transfer.transform(X_train_delta)
X_test_transfer = scaler_transfer.transform(X_test_delta)

In [36]:
transfer_model = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    max_iter=50,
    warm_start=True,
    random_state=42,
    verbose=False
)
transfer_model.fit(X_train_transfer, y_train_r2scan)

0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(64, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,50


In [37]:
y_pred_transfer = transfer_model.predict(X_test_transfer)

In [38]:
mae_transfer = mean_absolute_error(y_test_r2scan, y_pred_transfer)
rmse_transfer = np.sqrt(mean_squared_error(y_test_r2scan, y_pred_transfer))
r2_transfer = r2_score(y_test_r2scan, y_pred_transfer)

In [39]:
print(f"✓ Fine-tuned on {len(X_train_transfer)} materials with r2SCAN data")
print(f"  MAE:  {mae_transfer:.4f} eV/atom")
print(f"  RMSE: {rmse_transfer:.4f} eV/atom")
print(f"  R²:   {r2_transfer:.4f}")
improvement_transfer = 100*(mae_baseline - mae_transfer)/mae_baseline
print(f"  Improvement: {improvement_transfer:+.1f}% vs baseline")

✓ Fine-tuned on 600 materials with r2SCAN data
  MAE:  0.5570 eV/atom
  RMSE: 0.7547 eV/atom
  R²:   0.4525
  Improvement: -19.6% vs baseline


In [40]:
# ============================================================================
# PART 5: Uncertainty Quantification
# ============================================================================
print("\n[5/6] Computing uncertainty on real materials predictions...")


[5/6] Computing uncertainty on real materials predictions...


In [41]:
rf_delta = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_delta.fit(X_train_delta_augmented, delta_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [42]:
tree_predictions = np.array([tree.predict(X_test_delta_augmented)
                             for tree in rf_delta.estimators_])

In [43]:
delta_pred_mean = tree_predictions.mean(axis=0)
delta_pred_std = tree_predictions.std(axis=0)

In [44]:
y_pred_ensemble = y_test_pbe + delta_pred_mean
uncertainty = delta_pred_std
errors = np.abs(y_test_r2scan - y_pred_ensemble)

In [45]:
corr_uncertainty = np.corrcoef(uncertainty, errors)[0, 1]
high_conf_mask = uncertainty < np.median(uncertainty)

In [46]:
print(f"✓ Uncertainty quantification:")
print(f"  Mean uncertainty: {uncertainty.mean():.4f} eV/atom")
print(f"  Uncertainty-error correlation: {corr_uncertainty:.3f}")
if high_conf_mask.sum() > 0:
    print(f"  High-confidence MAE: {mean_absolute_error(y_test_r2scan[high_conf_mask], y_pred_ensemble[high_conf_mask]):.4f} eV/atom")
    print(f"  High-confidence samples: {high_conf_mask.sum()} ({100*high_conf_mask.sum()/len(high_conf_mask):.1f}%)")

✓ Uncertainty quantification:
  Mean uncertainty: 0.1158 eV/atom
  Uncertainty-error correlation: 0.196
  High-confidence MAE: 0.1146 eV/atom
  High-confidence samples: 75 (50.0%)


In [47]:
# ============================================================================
# PART 6: Results on Real Materials
# ============================================================================
print("\n[6/6] Final results on real Materials Project data...")


[6/6] Final results on real Materials Project data...


In [48]:
results_df = pd.DataFrame({
    'Approach': ['Baseline (High-Fidelity Only)', 'Delta Learning', 'Transfer Learning'],
    'MAE (eV/atom)': [mae_baseline, mae_delta, mae_transfer],
    'RMSE (eV/atom)': [rmse_baseline, rmse_delta, rmse_transfer],
    'R²': [r2_baseline, r2_delta, r2_transfer],
})

In [49]:
print("\n" + "=" * 80)
print("RESULTS ON REAL MATERIALS PROJECT DATA")
print("=" * 80)
print(results_df.to_string(index=False))
print("=" * 80)


RESULTS ON REAL MATERIALS PROJECT DATA
                     Approach  MAE (eV/atom)  RMSE (eV/atom)       R²
Baseline (High-Fidelity Only)       0.465552        0.643009 0.507122
               Delta Learning       0.134867        0.166518 0.973347
            Transfer Learning       0.556997        0.754729 0.452477


In [50]:
# Identify best and worst predictions
test_materials = df_high.loc[test_idx].copy()
test_materials['predicted'] = y_pred_delta
test_materials['error'] = np.abs(y_test_r2scan - y_pred_delta)
test_materials = test_materials.sort_values('error')

In [51]:
print(f"\n✓ Best predictions (lowest error):")
print(test_materials[['formula', 'formation_energy_r2scan', 'predicted', 'error']].head(5).to_string(index=False))


✓ Best predictions (lowest error):
formula  formation_energy_r2scan  predicted    error
Ba2Br2F                -1.694878  -1.695139 0.000261
AlSiMo6                -0.292189  -0.291670 0.000520
Al2OsRu                 1.527512   1.531929 0.004417
  AlVN2                -1.128235  -1.138155 0.009920
 AlTlF4                -3.207265  -3.195801 0.011464


In [52]:
print(f"\n✓ Most challenging predictions (highest error):")
print(test_materials[['formula', 'formation_energy_r2scan', 'predicted', 'error']].tail(5).to_string(index=False))


✓ Most challenging predictions (highest error):
       formula  formation_energy_r2scan  predicted    error
Ba2CaCu(HgO3)2                -1.798490  -1.469629 0.328862
         Al4CO                -0.650308  -0.986268 0.335960
          AgN3                 0.616502   0.218006 0.398496
          BC2N                 0.236637  -0.300496 0.537133
          AlIr                -1.049741  -0.485591 0.564150


In [53]:
print("\n" + "=" * 80)
print("VALIDATION COMPLETE!")
print("=" * 80)
print(f"\n✓ Demonstrated multi-fidelity ML on {len(df_all)} REAL materials")
print(f"✓ Improvement: {max(improvement_delta, improvement_transfer):.1f}%")
print(f"✓ Ready for actual lab characterization data (XRD, spectroscopy, etc.)")


VALIDATION COMPLETE!

✓ Demonstrated multi-fidelity ML on 2500 REAL materials
✓ Improvement: 71.0%
✓ Ready for actual lab characterization data (XRD, spectroscopy, etc.)
