In [None]:
# Combine stable and unstable data into one file
# import json
# with open('../download_data/mp_elastic_stable_with_desc.json') as f:
#     stable = json.load(f)
# with open('../download_data/mp_elastic_unstable_with_desc.json') as f:
#     unstable = json.load(f)

# data = stable + unstable
# with open('mp_elastic_with_desc.json', 'w') as f:
#         json.dump(data, f, indent=2)


In [1]:
#Add basic features
import json
# Read the combined stable + unstable records that the feature cells below extend.
with open("mp_elastic_with_desc.json") as records_file:
    data = json.load(records_file)

In [None]:
# Inspect one record to see which fields it carries.
sample_record = data[2]
for field_name, field_value in sample_record.items():
    print(field_name, field_value)

In [None]:
from pymatgen.core.periodic_table import Element
import math
def _rounded_ionization_energy(element):
    """Return the element's ionization energy rounded to 3 decimals, or None when it is None or NaN."""
    energy = element.ionization_energy
    if energy is None or math.isnan(energy):
        return None
    return round(energy, 3)


# Electronegativity of every element, keyed by element symbol
electronegativities = {el.symbol: el.X for el in Element}

# Ionization energy of every element (None where unavailable)
ionization_energy = {el.symbol: _rounded_ionization_energy(el) for el in Element}

# Elastic moduli and Poisson's ratio of every element
bulk_modulus = {el.symbol: el.bulk_modulus for el in Element}
youngs_modulus = {el.symbol: el.youngs_modulus for el in Element}
poissons_ratio = {el.symbol: el.poissons_ratio for el in Element}

# Calculated atomic radius of every element
atomic_radius = {el.symbol: el.atomic_radius_calculated for el in Element}



In [None]:
def append_property_info(element, property_value, property_name, unit=""):
    """Format one elemental property as a sentence fragment.

    Args:
        element: Element symbol used as the lookup key.
        property_value: Mapping from element symbol to that element's value.
        property_name: Phrase placed before the value, e.g. "an electronegativity".
        unit: Text appended directly after the value (include the leading
            space if one is wanted, e.g. " eV").

    Returns:
        "<property_name> of <value><unit>, " when a value is available,
        otherwise the empty string.
    """
    value = property_value.get(element)
    # None and NaN both mean "no data". Testing for them explicitly (instead of
    # relying on truthiness) keeps a real 0 value and stops NaN, which is
    # truthy, from being written out as the text "nan".
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return ""
    return f"{property_name} of {value}{unit}, "

In [None]:
# Prepend a generated summary (composition, density, per-element properties)
# to the description of every record.
# NOTE(review): the sample record printed in this notebook has
# density_atomic == volume / nsites (32.188 / 4 = 8.047), so that field looks
# like a volume per atom rather than a density in g/cm^3. The wording is left
# unchanged because it is part of the generated dataset text — confirm the unit.
interval = "The information about the elements contained in the material is as follows. "
for record in data:
    formula = record['formula_pretty']
    composition = record['composition_reduced']
    density = round(record['density'], 3)
    density_per_atom = round(record['density_atomic'], 3)
    intro = f"The material {formula} with a reduced composition of {composition} exhibits a density of {density} g/cm^3 and a density per atom of {density_per_atom} g/cm^3. "

    element_sentences = []
    for element in record['elements']:
        properties = "".join([
            append_property_info(element, electronegativities, "an electronegativity"),
            append_property_info(element, ionization_energy, "an ionization energy", " eV"),
            append_property_info(element, bulk_modulus, "a bulk modulus", ""),
            append_property_info(element, youngs_modulus, "a Young's modulus", ""),
            append_property_info(element, poissons_ratio, "a Poisson's ratio"),
            append_property_info(element, atomic_radius, "an atomic radius", " Å"),
        ])
        # An element with no known property contributes nothing. Building one
        # sentence per element avoids leaving a dangling "<element> has " when
        # such an element is not the first one in the list.
        if not properties:
            continue
        # Each fragment ends with ", "; replace the last separator with a full stop.
        element_sentences.append(f"{element} has {properties[:-2]}. ")

    elem_info = interval + "".join(element_sentences) if element_sentences else ""
    summary = intro + elem_info
    # Re-running this cell must not stack a second copy of the summary.
    if not record['description'].startswith(summary):
        record['description'] = summary + record['description']


In [None]:
# Persist the records together with their extended descriptions.
with open('mp_elastic_with_desc0and1.json', 'w') as out_file:
    json.dump(data, out_file, indent=2)

In [None]:
# Spot-check one of the extended descriptions.
sample_description = data[2]['description']
print(sample_description)

In [6]:
import json
# Reload the records that carry the extended descriptions.
with open("mp_elastic_with_desc0and1.json") as records_file:
    data = json.load(records_file)

In [7]:
# Show every field of the first record.
first_record = data[0]
for field_name, field_value in first_record.items():
    print(field_name, field_value)

builder_meta {'emmet_version': '0.71.1', 'pymatgen_version': '2023.10.4', 'pull_request': 990, 'database_version': '2023.11.1', 'build_date': '2023-10-20 07:26:54.333000', 'license': None}
nsites 4
elements ['B', 'Fe']
nelements 2
composition {'Fe': 2.0, 'B': 2.0}
composition_reduced {'Fe': 1.0, 'B': 1.0}
formula_pretty FeB
formula_anonymous AB
chemsys B-Fe
volume 32.18772121930709
density 6.87746058623732
density_atomic 8.046930304826773
symmetry {'crystal_system': 'Orthorhombic', 'symbol': 'Cmcm', 'number': 63, 'point_group': 'mmm', 'symprec': 0.1, 'version': '1.16.5'}
property_name elasticity
material_id mp-1007881
deprecated False
deprecation_reasons None
last_updated 2023-10-20 07:26:54.333000
origins []
structure {'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0.0, 'lattice': {'matrix': [[1.44888685, -3.77005152, 0.0], [1.44888685, 3.77005152, -0.0], [0.0, -0.0, 2.94631027]], 'pbc': [True, True, True], 'a': 4.038881227215927, 'b': 4.038881227215927, 'c': 2

In [2]:
def construct_datapoint(raw_datapoint: dict):
    """Reduce one raw record to the fields kept in the elasticity dataset.

    Args:
        raw_datapoint: Record with the keys 'symmetry', 'description',
            'elastic_tensor', 'bulk_modulus', 'shear_modulus', 'young_modulus',
            'universal_anisotropy' and 'homogeneous_poisson'.

    Returns:
        A new dict with the keys 'crystal system', 'description',
        'elastic tensor', 'bulk modulus', 'shear modulus', 'young modulus',
        'universal anisotropy' and 'isotropic possion ratio'. A missing or
        empty tensor / modulus dict is stored as None. The input record is
        not modified.
    """
    # The tensor entry itself may be null, not only its 'ieee_format' field.
    tensor_doc = raw_datapoint['elastic_tensor'] or {}
    ieee_tensor = tensor_doc.get('ieee_format') or None
    if ieee_tensor is not None:
        # Build a fresh matrix so the raw record stays untouched, and normalise
        # signed zeros: -0.0 == 0 is true, so every zero entry becomes +0.0.
        ieee_tensor = [[0.0 if value == 0 else value for value in row] for row in ieee_tensor]

    return {
        'crystal system': raw_datapoint['symmetry']['crystal_system'],
        'description': raw_datapoint['description'],
        'elastic tensor': ieee_tensor,
        # Empty dicts carry no information and are stored as None.
        'bulk modulus': raw_datapoint['bulk_modulus'] or None,
        'shear modulus': raw_datapoint['shear_modulus'] or None,
        # Scalars are copied as-is: 0.0 is a valid value (e.g. a perfectly
        # isotropic material has zero anisotropy) and must not become None.
        'young modulus': raw_datapoint['young_modulus'],
        'universal anisotropy': raw_datapoint['universal_anisotropy'],
        'isotropic possion ratio': raw_datapoint['homogeneous_poisson'],
    }

In [3]:
def construct_dataset(raw_data):
    """Convert every raw record with construct_datapoint and print how many were built.

    Args:
        raw_data: Iterable of raw records.

    Returns:
        List with one converted datapoint per input record, in input order.
    """
    dataset = []
    for raw_datapoint in raw_data:
        dataset.append(construct_datapoint(raw_datapoint))
    print(f"Dataset constructed with {len(dataset)} data points")
    return dataset

def store_dataset(dataset, stored_file_name):
    """Write `dataset` to `stored_file_name` as JSON indented by 2 spaces, then print a confirmation.

    Returns:
        None.
    """
    with open(stored_file_name, 'w') as out_file:
        json.dump(dataset, out_file, indent=2)
    print(f"Dataset stored in {stored_file_name}")
    return None

In [4]:
# Reduce the loaded records to the description/elasticity fields and save the result.
combined_dataset = construct_dataset(raw_data=data)
store_dataset(dataset=combined_dataset, stored_file_name='mp_only_desc_elasticity.json')

Dataset constructed with 12127 data points
Dataset stored in mp_only_desc_elasticity.json


In [8]:
#build_alpaca
import json

# Load the reduced description/elasticity records.
with open('mp_only_desc_elasticity.json') as dataset_file:
    origin_data = json.load(dataset_file)

In [9]:
from pymatgen.analysis.elasticity import ElasticTensor
import numpy as np

# Drop every record whose elastic tensor cannot be turned into a Young's modulus.
print("Original data length", len(origin_data))
id_list = []
for position, record in enumerate(origin_data):
    try:
        voigt_matrix = np.asarray(record['elastic tensor'])
        # The value is discarded; only whether the computation succeeds matters here.
        round(ElasticTensor.from_voigt(voigt_matrix).y_mod / 1e9, 3)
    except Exception:
        id_list.append(position)

print(id_list)
print(len(id_list))

# id_list is in ascending order; delete from the back so earlier indices stay valid.
for position in reversed(id_list):
    del origin_data[position]

print("After removing data length", len(origin_data))

def calculate_properties(stiffness_matrix: list) -> float:
    """Compute Young's modulus from a Voigt-notation stiffness matrix.

    Args:
        stiffness_matrix: Nested list passed to ``ElasticTensor.from_voigt``
            (the 'elastic tensor' stored on each record).

    Returns:
        ``ElasticTensor.y_mod`` divided by 1e9 and rounded to 3 decimals.
        NOTE(review): presumably y_mod is in Pa for a GPa-valued tensor, which
        would make this value GPa — confirm against the pymatgen documentation.
    """
    elastic_tensor = ElasticTensor.from_voigt(np.asarray(stiffness_matrix))
    return round(elastic_tensor.y_mod / 1e9, 3)

# Remove records whose bulk or shear modulus is missing, negative, or above 1000,
# then save the remaining records as JSON Lines.
print("Original data length", len(origin_data))
delete_index = []
for position, record in enumerate(origin_data):
    bulk = record['bulk modulus']
    shear = record['shear modulus']
    if bulk is None or shear is None:
        abnormal = True
    else:
        abnormal = (
            any(value < 0 for value in bulk.values())
            or any(value > 1000 for value in bulk.values())
            or any(value < 0 for value in shear.values())
            or any(value > 1000 for value in shear.values())
        )
    if abnormal:
        delete_index.append(position)

# Indices were collected in ascending order; delete from the back so they stay valid.
for position in reversed(delete_index):
    del origin_data[position]

print("Deleted data with bulk_modulus or shear_modulus abnormality", len(origin_data))

with open('delete_abnormality.jsonl', 'w') as jsonl_file:
    for record in origin_data:
        jsonl_file.write(json.dumps(record))
        jsonl_file.write('\n')

Original data length 12127
[53, 74, 159, 247, 338, 450, 488, 542, 552, 690, 719, 788, 901, 1044, 1079, 1097, 1154, 1210, 1216, 1229, 1302, 1381, 1399, 1466, 1477, 1520, 1529, 1556, 1564, 1604, 1629, 1649, 1717, 1822, 1920, 1925, 1941, 2025, 2195, 2222, 2232, 2257, 2521, 2571, 2590, 2676, 2692, 2818, 2831, 2893, 2946, 2980, 2985, 3026, 3045, 3060, 3124, 3210, 3215, 3221, 3230, 3251, 3265, 3273, 3352, 3423, 3451, 3504, 3568, 3626, 3739, 3787, 3791, 3792, 3859, 3935, 3988, 4013, 4024, 4032, 4039, 4169, 4174, 4211, 4255, 4304, 4305, 4436, 4507, 4516, 4534, 4579, 4600, 4607, 4648, 4652, 4740, 4767, 4912, 4981, 5005, 5051, 5061, 5142, 5337, 5340, 5431, 5488, 5525, 5553, 5626, 5653, 5715, 5745, 5798, 5855, 6025, 6027, 6091, 6113, 6153, 6172, 6251, 6355, 6416, 6423, 6454, 6457, 6470, 6510, 6516, 6534, 6582, 6595, 6615, 6619, 6674, 6705, 6718, 6746, 6747, 6750, 6758, 6767, 6771, 6791, 6860, 6885, 6893, 6943, 6944, 6990, 7035, 7189, 7197, 7198, 7199, 7208, 7210, 7272, 7282, 7365, 7401, 7407, 755

In [12]:
# Reload the filtered records, replace 'young modulus' with the value computed
# from each record's elastic tensor, and save the result as JSON Lines.
with open('delete_abnormality.jsonl', 'r') as f:
    after_deleted_data = [json.loads(line) for line in f]

# Do not bind per-record value lists to `bulk_modulus` in this cell: that name
# holds the element-property lookup table used by the description cell, and
# overwriting it breaks any re-run of that cell.
for item in after_deleted_data:
    item['young modulus'] = calculate_properties(item['elastic tensor'])

with open('delete_abnormality_and_addyoung.jsonl', 'w') as f:
    for item in after_deleted_data:
        json.dump(item, f)
        f.write('\n')

In [13]:
import json
with open('delete_abnormality_and_addyoung.jsonl', 'r') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file]

import math

# Count records per crystal system and derive a per-system test quota (5%, rounded down).
crystal_structure_data = [record['crystal system'] for record in data]
crystal_structure_set = set(crystal_structure_data)
crystal_structure_count = len(crystal_structure_set)
crystal_structure_values_count = {}
for structure in crystal_structure_set:
    crystal_structure_values_count[structure] = crystal_structure_data.count(structure)
print(f'Crystal structures: {crystal_structure_count}')
print(f'Crystal structure values count: {crystal_structure_values_count}')

# Systems with fewer than 5 records get no quota entry at all.
csv_p5 = {}
for structure, total in crystal_structure_values_count.items():
    if total >= 5:
        csv_p5[structure] = math.floor(total * 0.05)
print(f'5% for each crystal_system: {csv_p5}')

# Mark the first `quota` records of each crystal system as the test split;
# every other record becomes part of the train split.
for crystal_system, quota in csv_p5.items():
    remaining = quota
    for item in data:
        # Check the quota before labelling. floor(n * 0.05) is 0 for systems
        # with fewer than 20 records; counting down from 0 would skip the
        # "== 0" stop condition and mark the whole system as test.
        if remaining <= 0:
            break
        if item['crystal system'] == crystal_system:
            item['label'] = 'test'
            remaining -= 1

for item in data:
    item.setdefault('label', 'train')

alpaca_train = []
alpaca_test = []
def build_alpaca(datapoint):
    """Wrap a record in an Alpaca-style dict.

    The record's string form becomes the 'instruction'; 'input' and 'output'
    are left as empty strings.
    """
    return {
        'instruction': f"{datapoint}",
        'input': "",
        'output': "",
    }

# Distribute the records over the Alpaca train/test lists and save the training part.
# NOTE(review): alpaca_test is filled here but not written to disk in the visible cells.
for item in data:
    # Remove the temporary split marker before serialising the record, so that
    # "'label': ..." does not end up inside the instruction text.
    label = item.pop('label')
    target = alpaca_train if label == 'train' else alpaca_test
    target.append(build_alpaca(item))

with open('formula_elasticity_dataset.json', 'w') as f:
    json.dump(alpaca_train, f)

Crystal structures: 7
Crystal structure values count: {'Monoclinic': 572, 'Orthorhombic': 1386, 'Hexagonal': 1471, 'Triclinic': 66, 'Cubic': 4175, 'Trigonal': 814, 'Tetragonal': 2036}
5% for each crystal_system: {'Monoclinic': 28, 'Orthorhombic': 69, 'Hexagonal': 73, 'Triclinic': 3, 'Cubic': 208, 'Trigonal': 40, 'Tetragonal': 101}


In [14]:
# Mark the first `quota` records of each crystal system as the test split;
# every other record becomes part of the train split.
for crystal_system, quota in csv_p5.items():
    remaining = quota
    for item in data:
        # Check the quota before labelling. floor(n * 0.05) is 0 for systems
        # with fewer than 20 records; counting down from 0 would skip the
        # "== 0" stop condition and mark the whole system as test.
        if remaining <= 0:
            break
        if item['crystal system'] == crystal_system:
            item['label'] = 'test'
            remaining -= 1

for item in data:
    item.setdefault('label', 'train')

In [15]:
# Show every field of the first labelled record.
first_record = data[0]
for field_name, field_value in first_record.items():
    print(field_name, field_value)

crystal system Orthorhombic
description The material FeB with a reduced composition of {'Fe': 1.0, 'B': 1.0} exhibits a density of 6.877 g/cm^3 and a density per atom of 8.047 g/cm^3. The information about the elements contained in the material is as follows. B has an electronegativity of 2.04, an ionization energy of 8.298 eV, a bulk modulus of 320.0 GPa, an atomic radius of 0.87 Å. Fe has an electronegativity of 1.83, an ionization energy of 7.902 eV, a bulk modulus of 170.0 GPa, a Young's modulus of 211.0 GPa, a Poisson's ratio of 0.29, an atomic radius of 1.56 Å. FeB crystallizes in the orthorhombic Cmcm space group. Fe(1)3+ is bonded in a 7-coordinate geometry to seven equivalent B(1)3- atoms. There are a spread of Fe(1)-B(1) bond distances ranging from 2.14-2.17 Å. B(1)3- is bonded in a 9-coordinate geometry to seven equivalent Fe(1)3+ and two equivalent B(1)3- atoms. Both B(1)-B(1) bond lengths are 1.80 Å.
elastic tensor [[450.0, 195.0, 171.0, 0.0, 0.0, 0.0], [195.0, 470.0, 150.

In [17]:
# Build the Alpaca dataset
def build_alpaca_dataset(data):
    """Turn labelled records into Alpaca-style train and test lists.

    Args:
        data: Iterable of records with the keys 'label', 'description' and
            'elastic tensor'.

    Returns:
        (train_dataset, test_dataset). Records labelled 'test' go to the test
        list and every other record to the train list. Each entry holds the
        fixed instruction, the description as 'input' and the string form of
        the elastic tensor as 'output'.
    """
    instruction = "Given a material description, predict the elastic tensor of it directly and accurately with scientific logic. Answer without any other comments, descriptions, or explanations. The answer should be a 6x6 Python matrix. "
    train_dataset, test_dataset = [], []
    for item in data:
        is_test = item['label'] == 'test'
        example = {
            'instruction': instruction,
            'input': item['description'],
            'output': f"{item['elastic tensor']}",
        }
        bucket = test_dataset if is_test else train_dataset
        bucket.append(example)
    return train_dataset, test_dataset

In [18]:
# Build both splits and write each one to its own JSON file.
tr, te = build_alpaca_dataset(data)

for split_path, split_records in (
    ('ec_desc_train_dataset.json', tr),
    ('ec_desc_test_dataset.json', te),
):
    with open(split_path, 'w') as file:
        json.dump(split_records, file)