In [1]:
import sys
import pandas as pd
# Add the parent directory to the module search path before importing `ai4mat`.
# NOTE(review): '..' is a relative entry, so it presumably resolves against the
# notebook's working directory — confirm the notebook is launched from its own folder.
sys.path.append('..')

# Project helpers used below: the two description readers, the storage path
# resolver, and the fold labels written into the train/test split.
from ai4mat.data.data import (
    read_structures_descriptions,
    read_defects_descriptions,
    StorageResolver,
    TEST_FOLD,
    TRAIN_FOLD,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Materials available in each defect-density family. High-density dataset
# names carry a "_500" suffix; low-density ones use the bare material name.
high_density_materials = ("BP_spin", "GaSe_spin", "hBN_spin", "InSe_spin", "MoS2", "WSe2")
low_density_materials = ("MoS2", "WSe2")

# Relative dataset identifiers, high-density entries first, then low-density.
datasets = [
    *(f"high_density_defects/{name}_500" for name in high_density_materials),
    *(f"low_density_defects/{name}" for name in low_density_materials),
]

In [3]:
storage_resolver = StorageResolver()

# One path per dataset under the 'csv_cif' storage root; both readers are
# pointed at the same per-dataset path.
dataset_paths = [storage_resolver['csv_cif'] / dataset for dataset in datasets]

# Stack the per-dataset tables row-wise into a single frame each.
structures = pd.concat(
    [read_structures_descriptions(dataset_path) for dataset_path in dataset_paths],
    axis=0,
)
defects = pd.concat(
    [read_defects_descriptions(dataset_path) for dataset_path in dataset_paths],
    axis=0,
)

In [4]:
# Defect description that is held out for testing: one Mo vacancy plus one S vacancy.
# The comparison below is plain list equality, so element order matters.
test_defect_spec = [{'type': 'vacancy', 'element': 'Mo'}, {'type': 'vacancy', 'element': 'S'}]
is_test_defect = defects.defects.apply(lambda d: d == test_defect_spec)

# Index label of the first matching row; raises IndexError if nothing matches.
test_defect_id = defects[is_test_defect].index[0]

In [20]:
# Structures whose descriptor is the held-out defect go to the test fold;
# everything else stays in the train fold.
is_test_structure = structures.descriptor_id == test_defect_id
folds = pd.Series(data=TRAIN_FOLD, index=structures.index, name='fold')
folds[is_test_structure] = TEST_FOLD

In [21]:
# Directory that receives this experiment's artifacts (folds.csv.gz, config.yaml).
experiment_path = storage_resolver['experiments'].joinpath('MoS2_V2')
# Create it up front: the later writes (`to_csv`, `open`) do not create missing
# parent directories, so a fresh "Restart & Run All" would otherwise fail.
# `exist_ok=True` keeps the cell idempotent on re-runs.
experiment_path.mkdir(parents=True, exist_ok=True)

In [23]:
# Write the fold assignment into the experiment directory; the index column
# is labelled '_id' in the output file.
folds_path = experiment_path / 'folds.csv.gz'
folds.to_csv(folds_path, index_label='_id')

In [24]:
import yaml

# Experiment description stored alongside the folds file.
config = {
    "datasets": datasets,
    "strategy": "train_test",
    "n-folds": 2,
    "targets": ["formation_energy_per_site"],
}

# Serialize the settings as YAML into the experiment directory.
config_path = experiment_path / "config.yaml"
with open(config_path, "wt") as config_file:
    yaml.dump(config, stream=config_file)