In [1]:
%env CUDA_VISIBLE_DEVICES=

env: CUDA_VISIBLE_DEVICES=


In [2]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from megnet.models import MEGNetModel
from megnet_graphs_train import generate_ss_experiments, generate_promissing_experiments

In [3]:
experiments = list(set(generate_ss_experiments() + generate_promissing_experiments()))



In [4]:
save_path = os.path.join("datasets", "predicted_dichalcogenides_innopolis_202105_v6")
plots_path = os.path.join(save_path, "plots")

In [5]:
def get_prediction_column(target):
  return f"predicted_{target}"

In [6]:
def process_experiment(experiment):
  data = {}
  results = {}
  results.update(experiment.__dict__)
  data["train"] = pd.read_pickle(experiment.train_path)
  data["test"] = pd.read_pickle(experiment.test_path)
  for data_part in ("train", "test"):
    results[f"{data_part}_size"] = len(data[data_part])
  prediction_column = get_prediction_column(experiment.target)
  try:
    model = MEGNetModel.from_file(os.path.join(experiment.model_path))
  except:
    # TODO(kazeevn) investigate the missing experiments
    return dict()
  for this_data in data.values():
    this_data.loc[:, prediction_column] = model.predict_structures(this_data.defect_representation)
  fig, ax = plt.subplots()
  for data_name, data_part in data.items():
    errors = np.abs(data_part.loc[:, experiment.target]-data_part.loc[:, prediction_column])
    mae = np.mean(errors)
    mae_std = np.std(errors)
    results[f"{data_name}_mae"] = mae
    results[f"{data_name}_mae_std"] = mae_std
    ax.scatter(data_part.loc[:, experiment.target], data_part.loc[:, prediction_column],
               label=f"{data_name}, $MAE={mae:.4f} \pm {mae_std:.4f}$",
               alpha=0.5)
  ax.set_xlabel(f"DFT {experiment.target}, eV")
  ax.set_ylabel(f"Predicted {experiment.target}, eV")
  ax.legend()

  lims = [
      np.min([ax.get_xlim(), ax.get_ylim()]),
      np.max([ax.get_xlim(), ax.get_ylim()]),
    ]

  ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
  ax.set_aspect('equal')
  ax.set_xlim(lims)
  ax.set_ylim(lims)
  ax.set_title(f"{experiment.name}, train size={len(data['train'])}")
  fig.savefig(os.path.join(plots_path, f"{experiment.target}_{experiment.name}.pdf"),
              bbox_inches="tight",
              metadata={
                "Author": "Nikita Kazeev",
                "Title": f"MEGNet on defect-only representation, {experiment.target}, {experiment.name}",
                "Keywords": "2D materials, machine learning, graph neural network, MEGNet"}
             )
  columns_to_save = [experiment.target, get_prediction_column(experiment.target)]
  for data_name, data_part in data.items():
    data_part.to_csv(os.path.join(save_path, f"{experiment.name}_{experiment.target}_{data_name}.csv.gz"),
                     columns=columns_to_save, index_label="_id")
  return results

In [7]:
from multiprocessing import Pool
with Pool(20) as p:
  results = p.map(process_experiment, experiments)

In [8]:
results_pd = pd.DataFrame.from_dict(data=results).dropna(how="all")
results_pd.to_csv(os.path.join(save_path, "summary.csv.gz"), index=False)
results_pd

Unnamed: 0,data_name,train_path,test_path,name,target,epochs,atom_features,add_bond_z_coord,model_path,learning_rate,supercell_replication,vacancy_only,train_size,test_size,train_mae,train_mae_std,test_mae,test_mae_std
0,vac_only_no_8x8_in_train,datasets/train_defects_vac_only_no_8x8_in_trai...,datasets/test_defects_vac_only_no_8x8_in_train...,vac_only_no_8x8_in_train_bond_z_Z_1000,formation_energy_per_site,1000.0,Z,True,models/MEGNet-defect-only/formation_energy_per...,0.0002,,True,3200.0,113.0,0.014648,0.017155,0.253286,0.28077
1,vac_only_8x8_split,datasets/train_defects_vac_only_8x8_split.pick...,datasets/test_defects_vac_only_8x8_split.pickl...,vac_only_8x8_split_bond_z_Z_ss_replication_1000,band_gap,1000.0,Z,True,models/MEGNet-defect-only/band_gap/vac_only_8x...,0.0002,"{'epochs_per_replication_variant': 100, 'repli...",True,3256.0,57.0,0.019009,0.020432,0.022363,0.029439
4,full,datasets/train_defects.pickle.gzip,datasets/test_defects.pickle.gzip,full_bond_z_Z_1000,band_gap,1000.0,Z,True,models/MEGNet-defect-only/band_gap/full_bond_z...,0.0002,,False,2694.0,899.0,0.039319,0.042914,0.049172,0.0575
5,vac_only,datasets/train_defects_vac_only.pickle.gzip,datasets/test_defects_vac_only.pickle.gzip,vac_only_bond_z_Z_1000,band_gap,1000.0,Z,True,models/MEGNet-defect-only/band_gap/vac_only_bo...,0.0002,,True,2484.0,829.0,0.018039,0.01816,0.032228,0.042781
6,vac_only_no_8x8_in_train,datasets/train_defects_vac_only_no_8x8_in_trai...,datasets/test_defects_vac_only_no_8x8_in_train...,vac_only_no_8x8_in_train_bond_z_Z_ss_replicati...,formation_energy_per_site,1000.0,Z,True,models/MEGNet-defect-only/formation_energy_per...,0.0002,"{'epochs_per_replication_variant': 100, 'repli...",True,3200.0,113.0,0.020991,0.023176,0.140756,0.174723
9,vac_only,datasets/train_defects_vac_only.pickle.gzip,datasets/test_defects_vac_only.pickle.gzip,vac_only_bond_z_Z_ss_replication_1000,formation_energy_per_site,1000.0,Z,True,models/MEGNet-defect-only/formation_energy_per...,0.0002,"{'epochs_per_replication_variant': 100, 'repli...",True,2484.0,829.0,0.020673,0.022898,0.027996,0.04174
10,vac_only_no_8x8_in_train,datasets/train_defects_vac_only_no_8x8_in_trai...,datasets/test_defects_vac_only_no_8x8_in_train...,vac_only_no_8x8_in_train_bond_z_Z_ss_replicati...,homo,1000.0,Z,True,models/MEGNet-defect-only/homo/vac_only_no_8x8...,0.0002,"{'epochs_per_replication_variant': 100, 'repli...",True,3200.0,113.0,0.014707,0.015298,0.065939,0.040089
11,vac_only_8x8_split,datasets/train_defects_vac_only_8x8_split.pick...,datasets/test_defects_vac_only_8x8_split.pickl...,vac_only_8x8_split_bond_z_Z_1000,formation_energy_per_site,1000.0,Z,True,models/MEGNet-defect-only/formation_energy_per...,0.0002,,True,3256.0,57.0,0.017645,0.020972,0.041797,0.078237
13,vac_only,datasets/train_defects_vac_only.pickle.gzip,datasets/test_defects_vac_only.pickle.gzip,vac_only_bond_z_Z_1000,formation_energy_per_site,1000.0,Z,True,models/MEGNet-defect-only/formation_energy_per...,0.0002,,True,2484.0,829.0,0.017429,0.021423,0.026586,0.047447
14,vac_only,datasets/train_defects_vac_only.pickle.gzip,datasets/test_defects_vac_only.pickle.gzip,vac_only_bond_z_Z_1000,band_gap,1000.0,Z,True,models/MEGNet-defect-only/band_gap/vac_only_bo...,0.0002,,True,2484.0,829.0,0.018039,0.01816,0.032228,0.042781
