In [1]:
import json

# Temperature-dependent dataset: stored as a single JSON document.
with open('../temp_data/combined_temp_data.json') as temp_file:
    temp_data = json.load(temp_file)

# Records later processed with the 'mp' label: JSON Lines, one record per line.
with open('mp_for_reverse_3.jsonl') as jsonl_file:
    combined_data = list(map(json.loads, jsonl_file))

### Infill Task

In [2]:
# Formulas already turned into infill examples, tracked per dataset.
# build_infill_data appends to these lists as a side effect.
mp_formula_list, temp_formula_list = [], []

In [3]:
from pymatgen.core.composition import Composition
def build_infill_data(datapoint, label):
    """Build one formula-infilling training example from a material record.

    The prompt describes the material with its chemical formula replaced by
    '[MASK]'; the expected output is the formula itself.

    Side effect: the record's formula is appended to the module-level
    ``mp_formula_list`` (label 'mp') or ``temp_formula_list`` (label 'temp').

    Args:
        datapoint: Mapping describing one material. For label 'mp' the keys
            'formula_pretty' and 'composition_reduced' are read; for label
            'temp' only 'formula' is read.
        label: Which dataset the record comes from, either 'mp' or 'temp'.

    Returns:
        dict with the keys "instruction", "input" and "output".

    Raises:
        ValueError: If ``label`` is neither 'mp' nor 'temp' (previously the
            function silently returned None in that case).
        KeyError: If ``datapoint`` lacks a key required for the given label.
    """
    # Validate first so that a typo in the label cannot slip a None into the dataset.
    if label not in ('mp', 'temp'):
        raise ValueError(f"Unknown dataset label {label!r}; expected 'mp' or 'temp'.")

    infill_instruction = "Below is a partial description of a material where the chemical formula has been replaced with the string '[MASK]'. "
    infill_input = "Generate the chemical formula that could replace [MASK]: "

    if label == 'mp':
        formula = datapoint['formula_pretty']
        mp_formula_list.append(formula)
        composition = datapoint['composition_reduced']
        description = f"The material is [MASK] with a reduced composition of {composition}. "
    else:
        formula = datapoint['formula']
        temp_formula_list.append(formula)
        reduced_dict = Composition(formula).to_reduced_dict
        sum_value = sum(reduced_dict.values())
        # Express each element's share of the reduced formula as a percentage.
        ratio_dict = {element: round((amount / sum_value)*100, 2) for element, amount in reduced_dict.items()}
        description = f"The material is [MASK] with composition ratio {ratio_dict} (total is 100%). "

    return {"instruction": infill_instruction, "input": description + infill_input, "output": formula}

In [4]:
# build_infill_data appends every processed formula to the module-level tracker
# lists. Reset them here so that re-running this cell does not see formulas left
# over from a previous run (which would previously filter out every record and
# produce empty datasets).
mp_formula_list.clear()
temp_formula_list.clear()

# Local sets give O(1) duplicate checks instead of scanning a growing list.
seen_mp_formulas = set()
mp_infill_data = []
for data in combined_data:
    formula = data['formula_pretty']
    if formula in seen_mp_formulas:
        continue  # keep only the first record for each formula
    seen_mp_formulas.add(formula)
    mp_infill_data.append(build_infill_data(data, 'mp'))

seen_temp_formulas = set()
temp_infill_data = []
for data in temp_data:
    # Same record filter as before: pressure == 1 and a non-zero temperature.
    if data['pressure'] != 1 or data['temperature'] == 0:
        continue
    formula = data['formula']
    if formula in seen_temp_formulas:
        continue  # keep only the first qualifying record for each formula
    seen_temp_formulas.add(formula)
    temp_infill_data.append(build_infill_data(data, 'temp'))

In [5]:
# Show the first generated example from each infill dataset as a sanity check.
for samples in (mp_infill_data, temp_infill_data):
    print(samples[0])

{'instruction': "Below is a partial description of a material where the chemical formula has been replaced with the string '[MASK]'. ", 'input': "The material is [MASK] with a reduced composition of {'Fe': 1.0, 'B': 1.0}. Generate the chemical formula that could replace [MASK]: ", 'output': 'FeB'}
{'instruction': "Below is a partial description of a material where the chemical formula has been replaced with the string '[MASK]'. ", 'input': "The material is [MASK] with composition ratio {'Al': 100.0} (total is 100%). Generate the chemical formula that could replace [MASK]: ", 'output': 'Al'}


In [6]:
# Report how many examples each infill dataset contains.
for records in (mp_infill_data, temp_infill_data):
    print(len(records))

# Persist each dataset to its own JSON file.
for out_path, records in (
    ('mp_infill_data.json', mp_infill_data),
    ('temp_infill_data.json', temp_infill_data),
):
    with open(out_path, 'w') as out_file:
        json.dump(records, out_file, indent=3)

9215
159


In [14]:
# Read the previously saved infill datasets back from disk.
with open('mp_infill_data.json') as mp_file:
    mp_infill_data = json.load(mp_file)

with open('temp_infill_data.json') as temp_file:
    temp_infill_data = json.load(temp_file)

### Generation Task

In [7]:
import json

# JSON Lines input: decode one record per line.
mp_data = []
with open("mp_for_reverse_3.jsonl", 'r') as jsonl_file:
    for raw_line in jsonl_file:
        mp_data.append(json.loads(raw_line))

# Temperature-dependent dataset: a single JSON document.
with open('../temp_data/combined_temp_data.json') as temp_file:
    temp_data = json.load(temp_file)

In [8]:
from pymatgen.analysis.elasticity.elastic import ElasticTensor
def build_gen_data(datapoint, label_for_dataset, label_for_properties):
    """Build one "generate a material with this modulus" training example.

    The prompt asks for a chemical formula and crystal system matching an
    elastic property value (computed from the record's elastic tensor via
    pymatgen's ElasticTensor) at a given temperature; the expected output is
    "<formula>, <crystal system>".

    Args:
        datapoint: Mapping describing one material. Always read:
            'crystal system'. For dataset 'mp': 'elastic tensor' and
            'formula_pretty'. For dataset 'temp': 'elastic tensor voigt
            notation', 'formula' and 'temperature'.
        label_for_dataset: 'mp' (temperature in the prompt is fixed to 0) or
            'temp' (temperature is taken from the record).
        label_for_properties: 'bulk', 'shear' or 'young'.

    Returns:
        dict with the keys "instruction" (always empty), "input" and "output".

    Raises:
        ValueError: If either label is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
        KeyError: If ``datapoint`` lacks a key required for the given labels.
    """
    # dataset label -> (key holding the Voigt-notation tensor, key holding the formula)
    dataset_keys = {
        'mp': ('elastic tensor', 'formula_pretty'),
        'temp': ('elastic tensor voigt notation', 'formula'),
    }
    # property label -> wording used in the prompt
    property_names = {
        'bulk': "Voigt bulk modulus",
        'shear': "Voigt shear modulus",
        'young': "Young's modulus",
    }

    if label_for_dataset not in dataset_keys:
        raise ValueError(
            f"Unknown dataset label {label_for_dataset!r}; expected one of {sorted(dataset_keys)}."
        )
    if label_for_properties not in property_names:
        raise ValueError(
            f"Unknown property label {label_for_properties!r}; expected one of {sorted(property_names)}."
        )

    tensor_key, formula_key = dataset_keys[label_for_dataset]
    elastic_prop = property_names[label_for_properties]

    tensor = ElasticTensor.from_voigt(datapoint[tensor_key])
    if label_for_properties == 'bulk':
        raw_value = tensor.k_voigt
    elif label_for_properties == 'shear':
        raw_value = tensor.g_voigt
    else:
        # Scaled by 1e-9 so the number matches the "GPa" unit in the prompt
        # (presumably y_mod is reported in Pa -- confirm against pymatgen docs).
        raw_value = tensor.y_mod / 1e9
    prop_value = round(raw_value, 2)

    temperature = 0 if label_for_dataset == 'mp' else datapoint['temperature']
    output = f"{datapoint[formula_key]}, {datapoint['crystal system']}"

    prompt = f"Generate a material chemical formula and its crystal system with a {elastic_prop} of {prop_value} GPa at a temperature of {temperature}K. Use scientific reasoning step-by-step and directly output the answer without additional comments, descriptions, or explanations."
    return {"instruction": "", "input": prompt, "output": output}

In [9]:
# Bulk-modulus examples for every record in mp_data.
bulk_mp_data = [build_gen_data(record, 'mp', 'bulk') for record in mp_data]

# Temperature-dataset records are restricted to pressure == 1 and a non-zero temperature.
selected_temp_records = [
    record for record in temp_data
    if record['pressure'] == 1 and record['temperature'] != 0
]
bulk_temp_data = [build_gen_data(record, 'temp', 'bulk') for record in selected_temp_records]

# Shear / Young's modulus variants are currently disabled:
# shear_mp_data = [build_gen_data(data, 'mp', 'shear') for data in mp_data]
# young_mp_data = [build_gen_data(data, 'mp', 'young') for data in mp_data]
# shear_temp_data = [build_gen_data(data, 'temp', 'shear') for data in temp_data if data['pressure'] == 1 and data['temperature'] != 0]
# young_temp_data = [build_gen_data(data, 'temp', 'young') for data in temp_data if data['pressure'] == 1 and data['temperature'] != 0]
# combined_data = bulk_mp_data + shear_mp_data + young_mp_data + bulk_temp_data + shear_temp_data + young_temp_data

# NOTE: this rebinds `combined_data`, which the first cell uses for the raw
# records read from mp_for_reverse_3.jsonl.
combined_data = bulk_mp_data + bulk_temp_data

In [10]:
# Show the first generation example from each dataset as a sanity check.
for samples in (bulk_mp_data, bulk_temp_data):
    print(samples[0])

{'instruction': '', 'input': 'Generate a material chemical formula and its crystal system with a Voigt bulk modulus of 255.22 GPa at a temperature of 0K. Use scientific reasoning step-by-step and directly output the answer without additional comments, descriptions, or explanations.', 'output': 'FeB, Orthorhombic'}
{'instruction': '', 'input': 'Generate a material chemical formula and its crystal system with a Voigt bulk modulus of 77.52 GPa at a temperature of 300K. Use scientific reasoning step-by-step and directly output the answer without additional comments, descriptions, or explanations.', 'output': 'Al, cubic'}


In [11]:
# Write the combined generation examples to disk as indented JSON.
with open('crystal_gen_data.json', 'w') as out_file:
    json.dump(combined_data, out_file, indent=3)