In [1]:
import json
with open('mp_elastic_with_desc0and1.json') as f:
    origin_data = json.load(f)

In [2]:
for k, v in origin_data[0].items():
    print(k, v)

builder_meta {'emmet_version': '0.71.1', 'pymatgen_version': '2023.10.4', 'pull_request': 990, 'database_version': '2023.11.1', 'build_date': '2023-10-20 07:26:54.333000', 'license': None}
nsites 4
elements ['B', 'Fe']
nelements 2
composition {'Fe': 2.0, 'B': 2.0}
composition_reduced {'Fe': 1.0, 'B': 1.0}
formula_pretty FeB
formula_anonymous AB
chemsys B-Fe
volume 32.18772121930709
density 6.87746058623732
density_atomic 8.046930304826773
symmetry {'crystal_system': 'Orthorhombic', 'symbol': 'Cmcm', 'number': 63, 'point_group': 'mmm', 'symprec': 0.1, 'version': '1.16.5'}
property_name elasticity
material_id mp-1007881
deprecated False
deprecation_reasons None
last_updated 2023-10-20 07:26:54.333000
origins []
structure {'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0.0, 'lattice': {'matrix': [[1.44888685, -3.77005152, 0.0], [1.44888685, 3.77005152, -0.0], [0.0, -0.0, 2.94631027]], 'pbc': [True, True, True], 'a': 4.038881227215927, 'b': 4.038881227215927, 'c': 2

In [3]:
from pymatgen.analysis.elasticity import ElasticTensor
def get_elastic_tensor(raw_datapoint: dict):
    return ElasticTensor.from_voigt(raw_datapoint['elastic_tensor']['ieee_format']).tolist()

def get_structure(raw_datapoint: dict):
    return raw_datapoint['structure']

def construct_datapoint(raw_datapoint: dict):
    dic = {}
    dic['crystal_system'] : str = raw_datapoint['symmetry']['crystal_system']
    dic['bulk modulus'] : dict or None = raw_datapoint['bulk_modulus'] if raw_datapoint['bulk_modulus'] else None
    dic['shear modulus'] : dict or None = raw_datapoint['shear_modulus'] if raw_datapoint['shear_modulus'] else None
    dic['young modulus'] : float or None = raw_datapoint['young_modulus'] if raw_datapoint['young_modulus'] else None
    return dic

In [4]:
def construct_dataset(raw_data):
    dataset = {'structure': {}, 'elastic_tensor_full': {}, 'prop_dict': {}}
    for i in range(len(raw_data)):
        dataset['structure'][str(i)] = get_structure(raw_data[i])
        dataset['elastic_tensor_full'][str(i)] = get_elastic_tensor(raw_data[i])
        dataset['prop_dict'][str(i)] = construct_datapoint(raw_data[i])
    print(f"Dataset constructed with {len(dataset)} data points")
    return dataset

def store_dataset(dataset, stored_file_name):
    with open(stored_file_name, 'w') as f:
        json.dump(dataset, f, indent=3)
    return print(f"Dataset stored in {stored_file_name}")

In [5]:
combined_dataset = construct_dataset(origin_data)
store_dataset(combined_dataset, 'preprocessed_dataset.json')

Dataset constructed with 3 data points
Dataset stored in preprocessed_dataset.json


In [6]:
print(len(combined_dataset['structure']))

12127


In [7]:
from pymatgen.analysis.elasticity import ElasticTensor


print("Original data length", len(combined_dataset['elastic_tensor_full']))
id_list = []
for i in range(len(combined_dataset['elastic_tensor_full'])):
    try:
        elastic_constant = combined_dataset['elastic_tensor_full'][str(i)]
        elastic_tensor = ElasticTensor(elastic_constant)
        y_m = round(elastic_tensor.y_mod / 1e9, 3)
    except Exception as e:
        id_list.append(i)

print(id_list)
print(len(id_list))

Original data length 12127
[53, 74, 159, 247, 338, 450, 488, 542, 552, 690, 719, 788, 901, 1044, 1079, 1097, 1154, 1210, 1216, 1229, 1302, 1381, 1399, 1466, 1477, 1520, 1529, 1556, 1564, 1604, 1629, 1649, 1717, 1822, 1920, 1925, 1941, 2025, 2195, 2222, 2232, 2257, 2521, 2571, 2590, 2676, 2692, 2818, 2831, 2893, 2946, 2980, 2985, 3026, 3045, 3060, 3124, 3210, 3215, 3221, 3230, 3251, 3265, 3273, 3352, 3423, 3451, 3504, 3568, 3626, 3739, 3787, 3791, 3792, 3859, 3935, 3988, 4013, 4024, 4032, 4039, 4169, 4174, 4211, 4255, 4304, 4305, 4436, 4507, 4516, 4534, 4579, 4600, 4607, 4648, 4652, 4740, 4767, 4912, 4981, 5005, 5051, 5061, 5142, 5337, 5340, 5431, 5488, 5525, 5553, 5626, 5653, 5715, 5745, 5798, 5855, 6025, 6027, 6091, 6113, 6153, 6172, 6251, 6355, 6416, 6423, 6454, 6457, 6470, 6510, 6516, 6534, 6582, 6595, 6615, 6619, 6674, 6705, 6718, 6746, 6747, 6750, 6758, 6767, 6771, 6791, 6860, 6885, 6893, 6943, 6944, 6990, 7035, 7189, 7197, 7198, 7199, 7208, 7210, 7272, 7282, 7365, 7401, 7407, 755

In [8]:
def remove_items(dataset, id_list):
    for i in id_list:
        dataset['structure'].pop(str(i))
        dataset['elastic_tensor_full'].pop(str(i))
        dataset['prop_dict'].pop(str(i))
    return dataset

In [9]:
remove_items(combined_dataset, id_list)

{'structure': {'0': {'@module': 'pymatgen.core.structure',
   '@class': 'Structure',
   'charge': 0.0,
   'lattice': {'matrix': [[1.44888685, -3.77005152, 0.0],
     [1.44888685, 3.77005152, -0.0],
     [0.0, -0.0, 2.94631027]],
    'pbc': [True, True, True],
    'a': 4.038881227215927,
    'b': 4.038881227215927,
    'c': 2.94631027,
    'alpha': 90.0,
    'beta': 90.0,
    'gamma': 137.95497998182228,
    'volume': 32.18772121930709},
   'properties': {},
   'sites': [{'species': [{'element': 'Fe', 'occu': 1}],
     'abc': [0.85682226, 0.14317774, 0.75],
     'xyz': [1.44888685, -2.69047660736567, 2.2097327025],
     'properties': {'magmom': 1.494},
     'label': 'Fe'},
    {'species': [{'element': 'Fe', 'occu': 1}],
     'abc': [0.14317774, 0.85682226, 0.25],
     'xyz': [1.44888685, 2.69047660736567, 0.7365775675],
     'properties': {'magmom': 1.494},
     'label': 'Fe'},
    {'species': [{'element': 'B', 'occu': 1}],
     'abc': [0.5690593, 0.4309407, 0.75],
     'xyz': [1.448886

In [10]:
print("After removing data length", len(combined_dataset['elastic_tensor_full']))

After removing data length 11787


In [11]:
print("Original data length", len(combined_dataset['prop_dict']))
delete_index = []
for k in combined_dataset['prop_dict'].keys():
    if combined_dataset['prop_dict'][k]['bulk modulus'] == None or combined_dataset['prop_dict'][k]['shear modulus'] == None:
        delete_index.append(k)
    elif any(value < 0 for value in combined_dataset['prop_dict'][k]['bulk modulus'].values()) or any(value > 1000 for value in combined_dataset['prop_dict'][k]['bulk modulus'].values()):
        delete_index.append(k)
    elif any(value < 0 for value in combined_dataset['prop_dict'][k]['shear modulus'].values()) or any(value > 1000 for value in combined_dataset['prop_dict'][k]['shear modulus'].values()):
        delete_index.append(k)


Original data length 11787


In [12]:
remove_items(combined_dataset, delete_index)

{'structure': {'0': {'@module': 'pymatgen.core.structure',
   '@class': 'Structure',
   'charge': 0.0,
   'lattice': {'matrix': [[1.44888685, -3.77005152, 0.0],
     [1.44888685, 3.77005152, -0.0],
     [0.0, -0.0, 2.94631027]],
    'pbc': [True, True, True],
    'a': 4.038881227215927,
    'b': 4.038881227215927,
    'c': 2.94631027,
    'alpha': 90.0,
    'beta': 90.0,
    'gamma': 137.95497998182228,
    'volume': 32.18772121930709},
   'properties': {},
   'sites': [{'species': [{'element': 'Fe', 'occu': 1}],
     'abc': [0.85682226, 0.14317774, 0.75],
     'xyz': [1.44888685, -2.69047660736567, 2.2097327025],
     'properties': {'magmom': 1.494},
     'label': 'Fe'},
    {'species': [{'element': 'Fe', 'occu': 1}],
     'abc': [0.14317774, 0.85682226, 0.25],
     'xyz': [1.44888685, 2.69047660736567, 0.7365775675],
     'properties': {'magmom': 1.494},
     'label': 'Fe'},
    {'species': [{'element': 'B', 'occu': 1}],
     'abc': [0.5690593, 0.4309407, 0.75],
     'xyz': [1.448886

In [13]:
print("Deleted data with bulk_modulus or shear_modulus abnormality", len(combined_dataset['prop_dict']))

Deleted data with bulk_modulus or shear_modulus abnormality 10520


In [14]:
import math
crystal_structure_data = [v['crystal_system'] for k, v in combined_dataset['prop_dict'].items()]
crystal_structure_set = set(crystal_structure_data)
crystal_structure_count = len(crystal_structure_set)
crystal_structure_values_count = {structure: crystal_structure_data.count(structure) for structure in crystal_structure_set}
print(f'Crystal structures: {crystal_structure_count}')
print(f'Crystal structure values count: {crystal_structure_values_count}')
csv_p5 = {k: math.floor(v*0.05) for k, v in crystal_structure_values_count.items() if v >= 5}
print(f'5% for each crystal_system: {csv_p5}')

Crystal structures: 7
Crystal structure values count: {'Cubic': 4175, 'Orthorhombic': 1386, 'Monoclinic': 572, 'Trigonal': 814, 'Hexagonal': 1471, 'Triclinic': 66, 'Tetragonal': 2036}
5% for each crystal_system: {'Cubic': 208, 'Orthorhombic': 69, 'Monoclinic': 28, 'Trigonal': 40, 'Hexagonal': 73, 'Triclinic': 3, 'Tetragonal': 101}


In [15]:
combined_dataset['label'] = {}
for k, v in csv_p5.items():
    for a in combined_dataset['prop_dict'].keys():
        if combined_dataset['prop_dict'][str(a)]['crystal_system'] == k:
            combined_dataset['label'][str(a)] = 'test'
            v -= 1
            if v == 0:
                break

for a in combined_dataset['prop_dict'].keys():
    if combined_dataset['label'].get(str(a)) == None:
        combined_dataset['label'][str(a)] = 'train'

In [16]:
train_dataset = {'structure': {}, 'elastic_tensor_full': {}, 'prop_dict': {}}
test_dataset = {'structure': {}, 'elastic_tensor_full': {}, 'prop_dict': {}}

for i in combined_dataset['prop_dict'].keys():
    if combined_dataset['label'][str(i)] == 'train':
        train_dataset['structure'][str(i)] = combined_dataset['structure'][str(i)]
        train_dataset['elastic_tensor_full'][str(i)] = combined_dataset['elastic_tensor_full'][str(i)]
        train_dataset['prop_dict'][str(i)] = combined_dataset['prop_dict'][str(i)]
    else:
        test_dataset['structure'][str(i)] = combined_dataset['structure'][str(i)]
        test_dataset['elastic_tensor_full'][str(i)] = combined_dataset['elastic_tensor_full'][str(i)]
        test_dataset['prop_dict'][str(i)] = combined_dataset['prop_dict'][str(i)]

In [17]:
print(f"Train dataset: {len(train_dataset['prop_dict'])}")
print(f"Test dataset: {len(test_dataset['prop_dict'])}")

Train dataset: 9998
Test dataset: 522


In [18]:
seq_list = train_dataset['prop_dict'].keys()
print(len(seq_list))


9998


In [29]:
import random
random.seed(42)
seq_list = list(seq_list)  # Ensure seq_list is a list for random sampling
sample_size = round(len(seq_list) * 0.05)  # 5% of the total number of sequences
random_sample = random.sample(seq_list, sample_size)
print(f"Randomly selected 5% of sequences: {random_sample}")

Randomly selected 5% of sequences: ['2492', '973', '5355', '4833', '4455', '2990', '2339', '10783', '2071', '11714', '8259', '1091', '1056', '2186', '4377', '4622', '9953', '11959', '998', '11105', '4025', '10772', '8209', '4407', '8820', '11685', '5410', '598', '3345', '8273', '6555', '5405', '3274', '4320', '6477', '2333', '2170', '7393', '2238', '6960', '6644', '11994', '5170', '1298', '9020', '10590', '2729', '7368', '1924', '10926', '5678', '7020', '11441', '3910', '1761', '1340', '4537', '5606', '1944', '4625', '2311', '7397', '5407', '8904', '7089', '3395', '7202', '6876', '4215', '5212', '1794', '12104', '3544', '10548', '4831', '3408', '9082', '7384', '5265', '11024', '4393', '6224', '1515', '4556', '1096', '6071', '7839', '5225', '1702', '4241', '11233', '6057', '4271', '9834', '7725', '9008', '3048', '5176', '2991', '4863', '11107', '10648', '5139', '11591', '8398', '11568', '7797', '7026', '4388', '2964', '10038', '9720', '2138', '1362', '2464', '3224', '3351', '8264', '118

In [21]:
print(len(random_sample))

500


In [22]:
af_train_dataset = {'structure': {}, 'elastic_tensor_full': {}, 'prop_dict': {}}
val_dataset = {'structure': {}, 'elastic_tensor_full': {}, 'prop_dict': {}}
for i in train_dataset['prop_dict'].keys():
    if i in random_sample:
        val_dataset['structure'][str(i)] = train_dataset['structure'][str(i)]
        val_dataset['elastic_tensor_full'][str(i)] = train_dataset['elastic_tensor_full'][str(i)]
        val_dataset['prop_dict'][str(i)] = train_dataset['prop_dict'][str(i)]
    else:
        af_train_dataset['structure'][str(i)] = train_dataset['structure'][str(i)]
        af_train_dataset['elastic_tensor_full'][str(i)] = train_dataset['elastic_tensor_full'][str(i)]
        af_train_dataset['prop_dict'][str(i)] = train_dataset['prop_dict'][str(i)]

In [23]:
print(f"Train dataset: {len(af_train_dataset['prop_dict'])}")
print(f"Validation dataset: {len(val_dataset['prop_dict'])}")

Train dataset: 9498
Validation dataset: 500


In [24]:
import json
with open('train_dataset.json', 'w') as f:
    json.dump(af_train_dataset, f, indent=3)
with open('validation_dataset.json', 'w') as f:
    json.dump(val_dataset, f, indent=3)
with open('test_dataset.json', 'w') as f:
    json.dump(test_dataset, f, indent=3)