In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from megnet.models import MEGNetModel
from megnet_graphs_train import get_8x8_experiments

In [2]:
# Materialized as a list on purpose: a bare `filter` object is a one-shot iterator,
# so it would be empty the second time any cell iterates over it.
experiments = [
  experiment for experiment in get_8x8_experiments()
  if experiment.target == "formation_energy"
]

In [3]:
# Directory the per-experiment prediction CSVs are read from.
save_path = os.path.join(
  "datasets",
  "predicted_dichalcogenides_innopolis_202105_v5",
)
# Sub-directory of `save_path` that receives the generated PDF figures.
plots_path = os.path.join(save_path, "plots")

In [4]:
def get_prediction_column(target):
  """Return the name of the column that stores predictions for `target`.

  The name is `target` prefixed with ``predicted_``.
  """
  return "predicted_{}".format(target)

In [5]:
def process_experiment(experiment):
  """Score the stored formation-energy predictions of one experiment and plot them.

  For both the train and the test split the function:
    1. loads the split from the experiment's pickle and indexes it by ``_id``;
    2. attaches the saved ``predicted_formation_energy`` column read from
       ``{save_path}/{name}_{target}_{split}.csv.gz`` (aligned on ``_id``);
    3. derives a predicted total energy and a predicted energy per atom;
    4. computes the mean and standard deviation of the absolute error on
       energy per atom;
    5. adds the split to a DFT-vs-predicted scatter plot.
  The figure is written as a PDF into ``plots_path`` and then closed.

  Args:
    experiment: object exposing ``name``, ``target``, ``train_path`` and
      ``test_path`` attributes (and a ``__dict__``). ``target`` must be
      ``"formation_energy"``. The pickled frames must provide the ``_id``,
      ``energy``, ``energy_per_atom`` and ``formation_energy`` columns.

  Returns:
    dict containing every entry of ``experiment.__dict__`` plus
    ``{split}_size``, ``{split}_mae`` and ``{split}_mae_std`` for
    ``split`` in ``("train", "test")``.

  Raises:
    AssertionError: if ``experiment.target`` is not ``"formation_energy"``.
  """
  assert experiment.target == "formation_energy"
  results = {}
  results.update(experiment.__dict__)

  # NOTE: unpickling can execute arbitrary code -- only point this at trusted files.
  data = {
    "train": pd.read_pickle(experiment.train_path).set_index("_id"),
    "test": pd.read_pickle(experiment.test_path).set_index("_id"),
  }
  for data_name, this_data in data.items():
    results[f"{data_name}_size"] = len(this_data)

  formation_prediction_column = get_prediction_column(experiment.target)
  energy_column = "energy"
  energy_prediction_column = get_prediction_column(energy_column)
  energy_per_atom_column = "energy_per_atom"
  energy_per_atom_prediction_column = get_prediction_column(energy_per_atom_column)

  for data_name, this_data in data.items():
    prediction_file = pd.read_csv(
      os.path.join(save_path, f"{experiment.name}_{experiment.target}_{data_name}.csv.gz"),
      index_col="_id")
    # Assignment aligns on the `_id` index shared by the frame and the prediction file.
    this_data.loc[:, formation_prediction_column] = prediction_file.loc[:, formation_prediction_column]
    # Swap the true formation energy for the predicted one inside the total energy.
    this_data.loc[:, energy_prediction_column] = (
      this_data.loc[:, energy_column]
      - this_data.loc[:, "formation_energy"]
      + this_data.loc[:, formation_prediction_column])
    # The atom count is recovered from the two energy columns.
    n_atoms = this_data.loc[:, energy_column] / this_data.loc[:, energy_per_atom_column]
    this_data.loc[:, energy_per_atom_prediction_column] = this_data.loc[:, energy_prediction_column] / n_atoms

  fig, ax = plt.subplots()
  try:
    for data_name, data_part in data.items():
      errors = np.abs(
        data_part.loc[:, energy_per_atom_column] - data_part.loc[:, energy_per_atom_prediction_column])
      mae = np.mean(errors)
      mae_std = np.std(errors)
      results[f"{data_name}_mae"] = mae
      results[f"{data_name}_mae_std"] = mae_std
      # `\\pm` keeps the literal backslash for mathtext without an invalid-escape warning.
      ax.scatter(data_part.loc[:, energy_per_atom_column],
                 data_part.loc[:, energy_per_atom_prediction_column],
                 label=f"{data_name}, $MAE={mae:.4f} \\pm {mae_std:.4f}$",
                 alpha=0.5)
    ax.set_xlabel(f"DFT {energy_per_atom_column}, eV")
    ax.set_ylabel(f"Predicted {energy_per_atom_column}, eV")
    ax.legend()

    # Use one common range for both axes so the identity line is the diagonal.
    lims = [
      np.min([ax.get_xlim(), ax.get_ylim()]),
      np.max([ax.get_xlim(), ax.get_ylim()]),
    ]

    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_title(f"{experiment.name}, train size={len(data['train'])}")

    # Several workers may get here at once; exist_ok makes the creation race-free.
    os.makedirs(plots_path, exist_ok=True)
    fig.savefig(os.path.join(plots_path, f"{energy_per_atom_column}_{experiment.name}.pdf"),
                bbox_inches="tight",
                metadata={
                  "Author": "Nikita Kazeev",
                  "Title": f"MEGNet on defect-only representation, {energy_per_atom_column}, {experiment.name}",
                  "Keywords": "2D materials, machine learning, graph neural network, MEGNet"}
               )
  finally:
    # Release the figure so that a worker handling many experiments does not accumulate them.
    plt.close(fig)
  return results

In [6]:
from multiprocessing import Pool

# Process the experiments in parallel on 20 worker processes; `results` holds one
# dict per experiment, in the same order as `experiments`.
with Pool(processes=20) as worker_pool:
  results = worker_pool.map(process_experiment, experiments)