In [1]:
import json

with open('processed_elasticity_dataset.json') as f:
    origin_data = json.load(f)

In [2]:
from pymatgen.analysis.elasticity import ElasticTensor
import numpy as np

print("Original data length", len(origin_data))
id_list = []
for i in range(len(origin_data)):
    try:
        elastic_constant = np.asarray(origin_data[i]['elastic_tensor'])
        elastic_tensor = ElasticTensor.from_voigt(elastic_constant)
        y_m = round(elastic_tensor.y_mod / 1e9, 3)
    except Exception as e:
        id_list.append(i)

print(id_list)
print(len(id_list))

for index in sorted(id_list, reverse=True):
    del origin_data[index]

print("After removing data length", len(origin_data))

Original data length 12127


  return 1 / self.compliance_tensor.voigt[:3, :3].sum()
  return 9.0e9 * self.k_vrh * self.g_vrh / (3 * self.k_vrh + self.g_vrh)


[53, 74, 159, 247, 338, 450, 488, 542, 552, 690, 719, 788, 901, 1044, 1079, 1097, 1154, 1210, 1216, 1229, 1302, 1381, 1399, 1466, 1477, 1520, 1529, 1556, 1564, 1604, 1629, 1649, 1717, 1822, 1920, 1925, 1941, 2025, 2195, 2222, 2232, 2257, 2521, 2571, 2590, 2676, 2692, 2818, 2831, 2893, 2946, 2980, 2985, 3026, 3045, 3060, 3124, 3210, 3215, 3221, 3230, 3251, 3265, 3273, 3352, 3423, 3451, 3504, 3568, 3626, 3739, 3787, 3791, 3792, 3859, 3935, 3988, 4013, 4024, 4032, 4039, 4169, 4174, 4211, 4255, 4304, 4305, 4436, 4507, 4516, 4534, 4579, 4600, 4607, 4648, 4652, 4740, 4767, 4912, 4981, 5005, 5051, 5061, 5142, 5337, 5340, 5431, 5488, 5525, 5553, 5626, 5653, 5715, 5745, 5798, 5855, 6025, 6027, 6091, 6113, 6153, 6172, 6251, 6355, 6416, 6423, 6454, 6457, 6470, 6510, 6516, 6534, 6582, 6595, 6615, 6619, 6674, 6705, 6718, 6746, 6747, 6750, 6758, 6767, 6771, 6791, 6860, 6885, 6893, 6943, 6944, 6990, 7035, 7189, 7197, 7198, 7199, 7208, 7210, 7272, 7282, 7365, 7401, 7407, 7557, 7560, 7574, 7633, 7691, 

  return 15 / (


In [3]:
def calculate_properties(stiffness_matrix: list) -> (float, float, float):
    elastic_constant = np.asarray(stiffness_matrix)
    elastic_tensor = ElasticTensor.from_voigt(elastic_constant)
    youngs_modulus = round(elastic_tensor.y_mod / 1e9, 3)
    return youngs_modulus

In [4]:
#删除体积模量和剪切模量异常的数据
print("Original data length", len(origin_data))
delete_index = []
for i in range(len(origin_data)):
    if origin_data[i]['bulk_modulus'] == None or origin_data[i]['shear_modulus'] == None:
        delete_index.append(i)
    elif any(value < 0 for value in origin_data[i]['bulk_modulus'].values()) or any(value > 1000 for value in origin_data[i]['bulk_modulus'].values()):
        delete_index.append(i)
    elif any(value < 0 for value in origin_data[i]['shear_modulus'].values()) or any(value > 1000 for value in origin_data[i]['shear_modulus'].values()):
        delete_index.append(i)

for index in sorted(delete_index, reverse=True):
    del origin_data[index]

print("Deleted data with bulk_modulus or shear_modulus abnormality", len(origin_data))

Original data length 11787
Deleted data with bulk_modulus or shear_modulus abnormality 10520


In [5]:
with open('delete_abnormality.jsonl', 'w') as f:
    for item in origin_data:
        json.dump(item, f)
        f.write('\n')

In [6]:
with open('delete_abnormality.jsonl', 'r') as f:
    after_deleted_data = [json.loads(line) for line in f]

In [7]:
bulk_modulus = [item['bulk_modulus']['vrh'] for item in after_deleted_data]
shear_modulus = [item['shear_modulus']['vrh'] for item in after_deleted_data]
young_modulus = [calculate_properties(item['elastic_tensor']) for item in after_deleted_data]

In [8]:
for i in range(len(after_deleted_data)):
    after_deleted_data[i]['young_modulus'] = young_modulus[i]

In [9]:
with open('delete_abnormality_and_addyoung.jsonl', 'w') as f:
    for item in after_deleted_data:
        json.dump(item, f)
        f.write('\n')

In [10]:
with open('delete_abnormality_and_addyoung.jsonl', 'r') as f:
    after_deleted_data = [json.loads(line) for line in f]

In [11]:
#划分测试集和训练集
import math
crystal_structure_data = [item['symmetry']['crystal_system'] for item in after_deleted_data]
crystal_structure_set = set(crystal_structure_data)
crystal_structure_count = len(crystal_structure_set)
crystal_structure_values_count = {structure: crystal_structure_data.count(structure) for structure in crystal_structure_set}
print(f'Crystal structures: {crystal_structure_count}')
print(f'Crystal structure values count: {crystal_structure_values_count}')
csv_p5 = {k: math.floor(v*0.05) for k, v in crystal_structure_values_count.items() if v >= 5}
print(f'5% for each crystal_system: {csv_p5}')

for k, v in csv_p5.items():
    for item in after_deleted_data:
        if item['symmetry']['crystal_system'] == k:
            item['label'] = 'test'
            v -= 1
            if v == 0:
                break

for item in after_deleted_data:
    if 'label' not in item:
        item['label'] = 'train'

test_label_count = sum(1 for item in after_deleted_data if item.get('label') == 'test')
print(f"Number of items labeled as 'test': {test_label_count}")

Crystal structures: 7
Crystal structure values count: {'Cubic': 4175, 'Hexagonal': 1471, 'Monoclinic': 572, 'Triclinic': 66, 'Orthorhombic': 1386, 'Tetragonal': 2036, 'Trigonal': 814}
5% for each crystal_system: {'Cubic': 208, 'Hexagonal': 73, 'Monoclinic': 28, 'Triclinic': 3, 'Orthorhombic': 69, 'Tetragonal': 101, 'Trigonal': 40}
Number of items labeled as 'test': 522


In [12]:
with open('labeled_data.jsonl', 'w') as f:
    for item in after_deleted_data:
        json.dump(item, f)
        f.write('\n')

In [21]:
with open('labeled_data.jsonl', 'r') as f:
    labeled_data = [json.loads(line) for line in f]

In [22]:
#打印key看一下
for k, v in labeled_data[0].items():
    print(k, v)

elements ['B', 'Fe']
formula_pretty FeB
symmetry {'crystal_system': 'Orthorhombic', 'symbol': 'Cmcm', 'number': 63, 'point_group': 'mmm', 'symprec': 0.1}
primitive_structure {'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0.0, 'lattice': {'matrix': [[1.448887, -3.770052, 0.0], [1.448887, 3.770052, 0.0], [0.0, 0.0, 2.94631]], 'pbc': [True, True, True], 'a': 4.038881, 'b': 4.038881, 'c': 2.94631, 'alpha': 90.0, 'beta': 90.0, 'gamma': 137.95498, 'volume': 32.187721}, 'properties': {}, 'sites': [{'species': [{'element': 'Fe', 'occu': 1}], 'abc': [0.856822, 0.143178, 0.75], 'xyz': [1.448887, -2.690477, 2.209733], 'properties': {'magmom': 1.494}, 'label': 'Fe'}, {'species': [{'element': 'Fe', 'occu': 1}], 'abc': [0.143178, 0.856822, 0.25], 'xyz': [1.448887, 2.690477, 0.736578], 'properties': {'magmom': 1.494}, 'label': 'Fe'}, {'species': [{'element': 'B', 'occu': 1}], 'abc': [0.569059, 0.430941, 0.75], 'xyz': [1.448887, -0.520714, 2.209733], 'properties': {'magmom': -

In [15]:
def fix_some_keys(re_item):
    re_item = {'material_formula': re_item['formula_pretty'], **re_item}
    del re_item['elements']
    del re_item['formula_pretty']
    del re_item['primitive_structure']
    del re_item['conventional_structure']['@module']
    del re_item['conventional_structure']['@class']
    del re_item['conventional_structure']['charge']
    del re_item['conventional_structure']['lattice']['pbc']
    del re_item['conventional_structure']['properties']
    for site in re_item['conventional_structure']['sites']:
        del site['abc']
        del site['properties']
        del site['label']
    del re_item['label']
    re_item = {key: value for key, value in re_item.items() if key not in ['elastic_tensor','bulk_modulus', 'shear_modulus', 'young_modulus', 'universal_anisotropy', 'isotropic_possion_ratio']}
    return re_item

In [20]:
#构建alpaca数据集
def build_alpaca_dataset(data):
    train_dataset = []
    test_dataset = []
    instruction = f"""Given a material's symmetry and conventional cell structure, predict the elastic tensor of it directly and accurately with scientific logic. Answer without any other comments, descriptions, or explanations. The answer should be a 6x6 Python matrix. The material information is presented in JSON format. """
    for item in data:
        label = item['label']
        input_item = f"Information JSON of Material {item['formula_pretty']}:" + f"{fix_some_keys(item)}"
        output_item = f"{item['elastic_tensor']}"
        if label == 'test':
            test_dataset.append({
                'instruction': instruction,
                'input': input_item,
                'output': output_item,
            })
        else:
            train_dataset.append({
                'instruction': instruction,
                'input': input_item,
                'output': output_item,
            })
    return train_dataset, test_dataset

In [23]:
tr, te = build_alpaca_dataset(labeled_data)


with open('ec_short_train_dataset.json', 'w') as file:
    json.dump(tr, file)

with open('ec_short_test_dataset.json', 'w') as file:
    json.dump(te, file)

In [24]:
print(len(tr))
print(len(te))

9998
522


In [25]:
print(tr[0])
print(te[0])

{'instruction': "Given a material's symmetry and conventional cell structure, predict the elastic tensor of it directly and accurately with scientific logic. Answer without any other comments, descriptions, or explanations. The answer should be a 6x6 Python matrix. The material information is presented in JSON format. ", 'input': "Information JSON of Material TiB2:{'material_formula': 'TiB2', 'symmetry': {'crystal_system': 'Hexagonal', 'symbol': 'P6/mmm', 'number': 191, 'point_group': '6/mmm', 'symprec': 0.1}, 'conventional_structure': {'lattice': {'matrix': [[1.517019, -2.627554, 0.0], [1.517019, 2.627554, 0.0], [0.0, 0.0, 3.227438]], 'a': 3.034038, 'b': 3.034038, 'c': 3.227438, 'alpha': 90.0, 'beta': 90.0, 'gamma': 120.0, 'volume': 25.729454}, 'sites': [{'species': [{'element': 'Ti', 'occu': 1}], 'xyz': [0.0, 0.0, 0.0]}, {'species': [{'element': 'B', 'occu': 1}], 'xyz': [1.517019, -0.875851, 1.613719]}, {'species': [{'element': 'B', 'occu': 1}], 'xyz': [1.517019, 0.875851, 1.613719]}

In [3]:
import json

with open('ec_short_train_dataset.json', 'r') as file:
    train_data = json.load(file)

print(train_data[0]['instruction']+train_data[0]['input'])


Given a material's symmetry and conventional cell structure, predict the elastic tensor of it directly and accurately with scientific logic. Answer without any other comments, descriptions, or explanations. The answer should be a 6x6 Python matrix. The material information is presented in JSON format. Information JSON of Material TiB2:{'material_formula': 'TiB2', 'symmetry': {'crystal_system': 'Hexagonal', 'symbol': 'P6/mmm', 'number': 191, 'point_group': '6/mmm', 'symprec': 0.1}, 'conventional_structure': {'lattice': {'matrix': [[1.517019, -2.627554, 0.0], [1.517019, 2.627554, 0.0], [0.0, 0.0, 3.227438]], 'a': 3.034038, 'b': 3.034038, 'c': 3.227438, 'alpha': 90.0, 'beta': 90.0, 'gamma': 120.0, 'volume': 25.729454}, 'sites': [{'species': [{'element': 'Ti', 'occu': 1}], 'xyz': [0.0, 0.0, 0.0]}, {'species': [{'element': 'B', 'occu': 1}], 'xyz': [1.517019, -0.875851, 1.613719]}, {'species': [{'element': 'B', 'occu': 1}], 'xyz': [1.517019, 0.875851, 1.613719]}]}}
