In [1]:
import json
import os
import glob
import argparse
import random
import warnings
import numpy as np
import pandas as pd
from pymatgen.core.structure import Structure
import pickle
from tqdm import tqdm

# Seed Python's `random` module so the random choices made by the task
# builders below are repeatable from a fresh kernel.
random.seed(42)

# Base directory that is joined with "mpcifs_all/<id>" or "gnomecifs_all/<id>"
# when individual CIF files are read from disk further down.
# NOTE(review): hard-coded absolute path; adjust for your environment.
PATH = "/scratch/civil/phd/cez198233/vaibhav_nlp/robocif/"

In [2]:
# Load the GNoME pickle into `data`, then merge the MP pickle into it with
# `update` (presumably both are dicts keyed by CIF id - confirm; on a key
# clash the MP entry would replace the GNoME one).
# NOTE(review): `pickle.load` can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('/scratch/cse/btech/cs1200448/MatLlama/processed_cifs/gnome_cifs.pkl', 'rb') as f:
    data = pickle.load(f)

with open('/scratch/cse/btech/cs1200448/MatLlama/processed_cifs/mp_cifs.pkl', 'rb') as f:
    data.update(pickle.load(f))

In [3]:
# Flatten `data` into a list of [key, value] pairs, in key order.
cif_data_list = [[key, data[key]] for key in list(data.keys())]

In [3]:
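# NOTE: the task-building loop in cell In [6] iterates over `ft_data`, and the
# commented-out code below is the only place in this notebook that defines it.
# Uncomment this cell (or otherwise define `ft_data` as a list of
# [cif_id, data[cif_id]] pairs) before running that loop on a fresh kernel.
# ft_data = []
# with open("ift_files.txt", 'r') as f:
#     for line in f:
#         ft_data.append([line.strip(), data[line.strip()]])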

In [4]:
def get_structure(cif_str):
    """Parse CIF text into a pymatgen ``Structure``.

    Args:
        cif_str: Contents of a CIF file as a string.

    Returns:
        The object produced by ``Structure.from_str(cif_str, fmt="cif")``.
        No random translation of the sites is applied.
    """
    return Structure.from_str(cif_str, fmt="cif")

def get_crystal_string_nate(structure):
    """Render a structure as the plain-text crystal description used as model output.

    Layout (newline separated):
        line 1: the three lattice lengths, one decimal place, space separated
        line 2: the three lattice angles as whole degrees, space separated
        then, per atom: the species on one line and its three fractional
        coordinates (two decimal places, space separated) on the next.

    Args:
        structure: Object exposing ``lattice.parameters`` (six values: three
            lengths followed by three angles), ``species`` and ``frac_coords``.

    Returns:
        The crystal description as a single string.
    """
    lattice_params = structure.lattice.parameters
    length_line = " ".join("{0:.1f}".format(v) for v in lattice_params[:3])
    # Round to the nearest degree. Plain int() truncates, so an angle stored as
    # 59.99999 or 119.99999999999999 would be written as 59 / 119.
    angle_line = " ".join(str(int(round(v))) for v in lattice_params[3:])
    atom_blocks = [
        str(species) + "\n" + " ".join("{0:.2f}".format(v) for v in coords)
        for species, coords in zip(structure.species, structure.frac_coords)
    ]

    return length_line + "\n" + angle_line + "\n" + "\n".join(atom_blocks)

def get_crystal_string(structure):
    """Render a structure as a JSON string with lengths, angles and atom coordinates.

    Args:
        structure: Object exposing ``lattice.parameters`` (six values: three
            lengths followed by three angles), ``species`` and ``frac_coords``.

    Returns:
        A JSON object string with keys:
            "lengths": list of three floats rounded to one decimal place,
            "angles": list of three ints (whole degrees),
            "coordinates": newline-separated text, alternating a species line
                and a line with its fractional coordinates (two decimals).
    """
    lattice_params = structure.lattice.parameters

    lens = [float("{0:.1f}".format(v)) for v in lattice_params[:3]]
    # Round to the nearest degree. Plain int() truncates, so an angle stored as
    # 59.99999 or 119.99999999999999 would be written as 59 / 119.
    angs = [int(round(v)) for v in lattice_params[3:]]
    coords = "\n".join(
        str(species) + "\n" + " ".join("{0:.2f}".format(v) for v in position)
        for species, position in zip(structure.species, structure.frac_coords)
    )

    return json.dumps({
        "lengths": lens,
        "angles": angs,
        "coordinates": coords,
    })

In [5]:
# System prompts shared by every syntactic (CIF-reading) task below; each
# generated record draws one of them at random for its "system" field.
_SYNTACTIC_SYSTEM_PROMPTS = [
    "You are a Material Science expert who works with crystallographic files (CIF files). Use your understanding of the CIF file format to extract information about the unit cell structure.",
    "Utilize your expertise in Material Science to extract data regarding the unit cell structure from CIF files, drawing upon your comprehension of the file format.",
    "As a specialist in Material Science, employ your knowledge of CIF files to extract pertinent details concerning the unit cell structure.",
    "As a Material Science expert, utilize CIF file parsing to extract essential data regarding the unit cell configuration.",
    "Draw upon your Material Science expertise to extract unit cell structure information from CIF files, utilizing your understanding of the file format.",
    "Employ your understanding of Material Science and CIF file format to extract crucial information concerning the unit cell arrangement.",
    "As a specialist in Material Science, employ CIF file analysis to gather insights into the unit cell structure.",
    "Utilize your proficiency in Material Science to parse CIF files and extract relevant details regarding the unit cell configuration.",
    "Draw upon your expertise in Material Science to extract insights into the unit cell structure by analyzing CIF files."
]


def _syntactic_record(question, cif, output, task):
    """Build one task record: CIF preamble + question, with a randomly drawn system prompt."""
    return {
        "system": random.choice(_SYNTACTIC_SYSTEM_PROMPTS),
        "input": "Below is a CIF file.\n" + cif + "\n" + question,
        "output": output,
        "task": task
    }


def syntactic_atom_cnt(atom_ids, cif):
    """Ask how many atoms of one randomly chosen species are in the unit cell.

    Args:
        atom_ids: Non-empty list of species objects exposing ``.name``.
        cif: CIF file text embedded in the prompt.

    Returns:
        Record dict with keys "system", "input", "output", "task". Note that
        "output" is an ``int`` here (``atom_ids.count(...)``), whereas the
        other syntactic tasks return strings.
    """
    atom = random.choice(atom_ids)
    question = f"Find the number of {atom.name} atoms in the unit cell described by the above CIF file.\n"
    return _syntactic_record(question, cif, atom_ids.count(atom), "atom count")


def syntactic_dims(lengths, angles, cif):
    """Ask for either the lattice lengths or the lattice angles (50/50 at random).

    Args:
        lengths: The three lattice lengths.
        angles: The three lattice angles.
        cif: CIF file text embedded in the prompt.

    Returns:
        Record dict whose "output" is a comma-separated string: lengths with
        one decimal place, or angles as whole degrees.
    """
    if random.random() < 0.5:
        question = "Find the lengths of the lattice vectors of the unit cell described by the above CIF file.\n"
        answer = ", ".join("{0:.1f}".format(v) for v in lengths)
    else:
        question = "Find the angles between the lattice vectors of the unit cell described by the above CIF file.\n"
        # Round to the nearest degree. Plain int() truncates, so an angle
        # stored as 59.99999 or 119.99999999999999 would be reported as 59 / 119.
        answer = ", ".join(str(int(round(v))) for v in angles)
    return _syntactic_record(question, cif, answer, "dimensions_synt")


def syntactic_coords(frac_coords, atom_ids, cif):
    """Ask which element sits at the coordinates of one randomly chosen atom.

    Args:
        frac_coords: Per-atom coordinate triples, indexed in step with ``atom_ids``.
        atom_ids: Non-empty list of species objects exposing ``.name``.
        cif: CIF file text embedded in the prompt.

    Returns:
        Record dict whose "output" is the chosen atom's ``.name``.
    """
    idx = random.choice(range(len(atom_ids)))
    location = ", ".join("{0:.3f}".format(v) for v in frac_coords[idx])
    question = "What is the element at " + location + "?"
    return _syntactic_record(question, cif, atom_ids[idx].name, "atom name")


def syntactic_spacegrp(space_grp, cif):
    """Ask for the space-group name; ``space_grp`` is returned unchanged as the answer."""
    question = "What is the name of the symmetry space group of this crystal?"
    return _syntactic_record(question, cif, space_grp, "space group")


def syntactic_volume(cell_volume, cif):
    """Ask for the unit-cell volume; ``cell_volume`` is returned unchanged as the answer."""
    question = "What is the volume of a unit cell of this crystal?"
    return _syntactic_record(question, cif, cell_volume, "cell_volume")


def syntactic_formula(formula, cif):
    """Ask for the chemical formula; ``formula`` is returned unchanged as the answer."""
    question = "What is the chemical formula of the material whose crystal has this particular unit cell?"
    return _syntactic_record(question, cif, formula, "formula")


def syntactic_replacable(cif, frac_coords, atom_ids):
    """Ask whether two distinct, randomly chosen sites hold the same element.

    Args:
        cif: CIF file text embedded in the prompt.
        frac_coords: Per-atom coordinate triples, indexed in step with ``atom_ids``.
        atom_ids: List of at least two species objects exposing ``.name``.

    Returns:
        Record dict whose "output" is "Yes" when both sites have the same
        ``.name`` and "No" otherwise.

    Raises:
        ValueError: If fewer than two atoms are supplied. Two distinct sites
            cannot be drawn, and the retry loop would otherwise never end
            for a single-atom cell.
    """
    n_atoms = len(atom_ids)
    if n_atoms < 2:
        raise ValueError("syntactic_replacable needs at least two atoms to compare")

    idx1 = random.choice(range(n_atoms))
    idx2 = random.choice(range(n_atoms))
    while idx2 == idx1:  # redraw until the second site differs from the first
        idx2 = random.choice(range(n_atoms))

    first = ", ".join("{0:.3f}".format(v) for v in frac_coords[idx1])
    second = ", ".join("{0:.3f}".format(v) for v in frac_coords[idx2])
    question = (
        "Can you replace the atom at co-ordinates " + first
        + " with the atom at co-ordinates " + second
        + " so that the crystal remains unchanged?"
    )
    answer = "Yes" if atom_ids[idx1].name == atom_ids[idx2].name else "No"
    return _syntactic_record(question, cif, answer, "replace")

In [6]:
def _parse_cif_metadata(cif):
    """Read the space-group symbol, cell volume and structural formula from CIF text.

    Returns:
        Tuple ``(space_grp, cell_volume, formula)``. Each entry is a string,
        or None when its tag is absent or carries no value. The space-group
        symbol has surrounding quotes and internal spaces removed; the other
        two are the first whitespace-delimited token after the tag.
    """
    space_grp = cell_volume = formula = None
    space_grp_tag = "_symmetry_space_group_name_H-M"
    for line in cif.split('\n'):
        if line.startswith(space_grp_tag):
            # Accept quoted or unquoted symbols instead of assuming single quotes.
            symbol = line[len(space_grp_tag):].strip().strip("'\"")
            if symbol:
                space_grp = symbol.replace(" ", "")
        elif line.startswith("_cell_volume"):
            tokens = line.split()
            if len(tokens) > 1:
                cell_volume = tokens[1]
        elif line.startswith("_chemical_formula_structural"):
            tokens = line.split()
            if len(tokens) > 1:
                formula = tokens[1]
    return space_grp, cell_volume, formula


# NOTE(review): `ft_data` is not defined by any active cell earlier in this
# notebook (only by commented-out code), so this loop raises NameError on a
# fresh kernel unless `ft_data` is defined first as (id, cif_data) pairs.
syntactic_tasks = []

cnt = 0  # number of materials whose tasks were generated without error
for _id, cif_data in tqdm(ft_data):
    try:
        path = f"mpcifs_all/{_id}" if _id.startswith("mp") else f"gnomecifs_all/{_id}"
        # Read the CIF from disk; `with` makes sure the handle is closed.
        with open(os.path.join(PATH, path), 'r') as cif_file:
            cif = cif_file.read()
        structure = get_structure(cif)
        lengths = structure.lattice.parameters[:3]
        angles = structure.lattice.parameters[3:]
        atom_ids = structure.species
        frac_coords = structure.frac_coords
        space_grp, cell_volume, formula = _parse_cif_metadata(cif)

        # Build all tasks for this material before publishing any of them.
        material_tasks = [
            syntactic_atom_cnt(atom_ids, cif),
            syntactic_dims(lengths, angles, cif),
            syntactic_coords(frac_coords, atom_ids, cif),
        ]
        if space_grp:
            material_tasks.append(syntactic_spacegrp(space_grp, cif))
        if cell_volume:
            material_tasks.append(syntactic_volume(cell_volume, cif))
        if formula:
            material_tasks.append(syntactic_formula(formula, cif))
        # The "replace" task compares two distinct sites, so it needs >= 2 atoms.
        if len(atom_ids) >= 2:
            material_tasks.append(syntactic_replacable(cif, frac_coords, atom_ids))
    except Exception as err:
        # Best effort: skip materials that cannot be read or parsed, but say so.
        # (A bare `except:` would also swallow KeyboardInterrupt.)
        warnings.warn(f"Skipping {_id}: {err!r}")
        continue

    for task in material_tasks:
        task.update({"material": _id})
    syntactic_tasks += material_tasks
    cnt += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:08<00:00, 53.07it/s]


In [7]:
with open("/scratch/cse/btech/cs1200448/MatLlama/ift_cif/syntactic_all.jsonl", 'w') as f:
    for doc in syntactic_tasks:
        f.write(json.dumps(doc)+'\n')

In [8]:
# Scratch cell: run every syntactic task builder on one hard-coded material
# so the resulting records can be inspected by hand.
_id = "mp-371.cif"
# Ids starting with "mp" are read from mpcifs_all/, everything else from gnomecifs_all/.
path = f"mpcifs_all/{_id}" if _id.startswith("mp") else f"gnomecifs_all/{_id}"
cif = open(os.path.join(PATH, path), 'r').read()
structure = get_structure(cif)
# lattice.parameters holds six values: the first three are used as lengths,
# the last three as angles.
lengths = structure.lattice.parameters[:3]
angles = structure.lattice.parameters[3:]
atom_ids = structure.species
frac_coords = structure.frac_coords

# Optional metadata taken directly from the CIF text; each stays None when
# its tag does not appear at the start of any line.
space_grp = None
cell_volume = None
formula = None
for line in cif.split('\n'):
    if line.startswith("_symmetry_space_group_name_H-M"):
        # Text between the first pair of single quotes, with spaces removed.
        space_grp = line.split('\'')[1].replace(" ", "")
    if line.startswith("_cell_volume"):
        cell_volume = line.split()[1]
    if line.startswith("_chemical_formula_structural"):
        formula = line.split()[1]
print(frac_coords)

task1 = syntactic_atom_cnt(atom_ids, cif)
task2 = syntactic_dims(lengths, angles, cif)
task3 = syntactic_coords(frac_coords, atom_ids, cif)
# task4-task6 are only created when the matching metadata was found above.
if space_grp:
    task4 = syntactic_spacegrp(space_grp, cif)
if cell_volume:
    task5 = syntactic_volume(cell_volume, cif)
if formula:
    task6 = syntactic_formula(formula, cif)
task7 = syntactic_replacable(cif, frac_coords, atom_ids)

[[0.  0.5 0.5]
 [0.5 0.5 0. ]
 [0.5 0.  0.5]
 [0.  0.  0. ]]


In [9]:
# Show the system prompt, the full input prompt and the expected answer of the
# "replace" task built in the previous cell, one per line.
print(task7["system"], task7["input"], task7["output"], sep='\n')

Utilize your expertise in Material Science to extract data regarding the unit cell structure from CIF files, drawing upon your comprehension of the file format.
Below is a CIF file.
# generated using pymatgen
data_La3Tl
_symmetry_space_group_name_H-M   'P 1'
_cell_length_a   5.09044228
_cell_length_b   5.09041111
_cell_length_c   5.09043905
_cell_angle_alpha   90.00000000
_cell_angle_beta   90.00000000
_cell_angle_gamma   90.00000000
_symmetry_Int_Tables_number   1
_chemical_formula_structural   La3Tl
_chemical_formula_sum   'La3 Tl1'
_cell_volume   131.90571650
_cell_formula_units_Z   1
loop_
 _symmetry_equiv_pos_site_id
 _symmetry_equiv_pos_as_xyz
  1  'x, y, z'
loop_
 _atom_site_type_symbol
 _atom_site_label
 _atom_site_symmetry_multiplicity
 _atom_site_fract_x
 _atom_site_fract_y
 _atom_site_fract_z
 _atom_site_occupancy
  La  La0  1  -0.00000000  0.50000000  0.50000000  1
  La  La1  1  0.50000000  0.50000000  0.00000000  1
  La  La2  1  0.50000000  -0.00000000  0.50000000  1
  Tl 

In [10]:
def generation_task_nate(cif, input_dict, crystal_str):
    """Build a conditional crystal-generation record.

    A random number (0 to 3) of optional attributes is sampled. When at least
    one is sampled, the prompt also tries to state the chemical formula and
    the element list. Any attribute missing from ``input_dict`` is skipped.

    Args:
        cif: Unused; kept for interface compatibility.
        input_dict: Mapping of property names to values. Keys read here are
            "pretty_formula", "_chemical_formula_sum",
            "formation_energy_per_atom", "e_above_hull" and
            "spacegroup.number". The formula sum is presumably the raw CIF
            value such as "'La3 Tl1'" - confirm against the caller.
        crystal_str: Target crystal description, returned as "output".

    Returns:
        Record dict with keys "system", "input", "output", "task".
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to answer the following question related to generation of stable material when some information about it is described.",
        "Employ your expertise in Material Science, particularly in working with CIF files, to address the question concerning the creation of stable materials with partial descriptive information.",
        "Utilize your proficiency in Material Science and handling CIF files to provide insights into generating stable materials with limited descriptive data.",
        "Apply your knowledge as a Material Science specialist, specifically in manipulating CIF files, to respond to queries regarding the production of stable materials given incomplete information.",
        "Utilize your skills as a Material Science expert, with a focus on CIF files, to tackle the question concerning the development of stable materials based on partial descriptions.",
        "Employ your expertise in Material Science, particularly in the realm of CIF files, to address inquiries related to the creation of stable materials despite incomplete data.",
        "Utilize your proficiency in working with CIF files, as well as your background in Material Science, to answer questions regarding the generation of stable materials with limited descriptive details.",
        "Apply your knowledge and experience in Material Science, including your familiarity with CIF files, to provide solutions for generating stable materials when only partial information is available.",
        "Employ your specialized knowledge in Material Science, specifically your experience with CIF files, to tackle questions related to creating stable materials with partial information.",
        "Apply your skills as a Material Science expert, particularly in managing CIF files, to provide insights into generating stable materials despite incomplete descriptive data."
    ]
    prompt = "Below is a description of a bulk material. "

    all_attributes = [
        "formation_energy_per_atom",
        # "band_gap",
        "e_above_hull",
        "spacegroup.number",
    ]
    prompt_lookup = {
        "formation_energy_per_atom": "The formation energy per atom is",
        "band_gap": "The band gap is",
        "pretty_formula": "The chemical formula is",
        "e_above_hull": "The energy above the convex hull is",
        "elements": "The elements are",
        "spacegroup.number": "The spacegroup number is",
    }

    num_attributes = random.randint(0, len(all_attributes))
    if num_attributes > 0:
        attributes = ["pretty_formula", "elements"] + random.sample(all_attributes, num_attributes)

        for attr in attributes:
            if attr == "elements":
                if "_chemical_formula_sum" not in input_dict:
                    continue
                elems = []
                for token in input_dict["_chemical_formula_sum"].split():
                    # Drop the opening quote unconditionally, so that a token
                    # without a trailing count (e.g. "'Na") is cleaned too.
                    symbol = token.lstrip("'")
                    # Strip trailing counts / closing quote (code points <= '9');
                    # the emptiness check avoids IndexError on all-digit tokens.
                    while symbol and ord(symbol[-1]) <= 57:
                        symbol = symbol[:-1]
                    if symbol:
                        elems.append(symbol)
                prompt += f"{prompt_lookup[attr]} {', '.join(elems)}. "
            elif attr not in input_dict:
                continue
            elif attr in ["formation_energy_per_atom", "band_gap", "e_above_hull"]:
                prompt += f"{prompt_lookup[attr]} {round(float(input_dict[attr]), 4)}. "
            else:
                prompt += f"{prompt_lookup[attr]} {input_dict[attr]}. "

    prompt += (
        "Generate a description of the lengths and angles of the lattice vectors "
        "and then the element type and coordinates for each atom within the lattice.\n"
    )
    prompt += (
        "The output should be of the following format ONLY:\n"
    )
    prompt += (
        "l1, l2, l3\n"
        "a1, a2, a3\n"
        "atom1\nx, y, z\natom2\nx, y, z\n ...\n\n"
    )
    prompt += "l1, l2, l3 should be the predicted cell lengths.\n"
    prompt += "a1, a2, a3 should be the predicted cell angles.\n"
    prompt += "atom1, atom2, atom3, and so on, should be replaced with atom names and corresponding x, y, z with their coordinates in the lattice.\n"
    return {
        "system": random.choice(SYSTEM),
        "input": prompt,
        "output": crystal_str,
        "task": "conditional_generation",
    }

In [11]:
def element_generation(cif, input_dict, crystal_str):
    """Build a record asking for a stable crystal made of a given set of elements.

    Args:
        cif: Unused; kept for interface compatibility.
        input_dict: Mapping that must contain "_chemical_formula_sum"
            (presumably the raw CIF value such as "'La3 Tl1'" - confirm
            against the caller).
        crystal_str: Target crystal description, returned as "output".

    Returns:
        Record dict with keys "system", "input", "output", "task", or None
        when "_chemical_formula_sum" is missing or yields no element symbols.
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to answer the following question related to generation of stable material when some information about it is described.",
        "Employ your expertise in Material Science, particularly in working with CIF files, to address the question concerning the creation of stable materials with partial descriptive information.",
        "Utilize your proficiency in Material Science and handling CIF files to provide insights into generating stable materials with limited descriptive data.",
        "Apply your knowledge as a Material Science specialist, specifically in manipulating CIF files, to respond to queries regarding the production of stable materials given incomplete information.",
        "Utilize your skills as a Material Science expert, with a focus on CIF files, to tackle the question concerning the development of stable materials based on partial descriptions.",
        "Employ your expertise in Material Science, particularly in the realm of CIF files, to address inquiries related to the creation of stable materials despite incomplete data.",
        "Utilize your proficiency in working with CIF files, as well as your background in Material Science, to answer questions regarding the generation of stable materials with limited descriptive details.",
        "Apply your knowledge and experience in Material Science, including your familiarity with CIF files, to provide solutions for generating stable materials when only partial information is available.",
        "Employ your specialized knowledge in Material Science, specifically your experience with CIF files, to tackle questions related to creating stable materials with partial information.",
        "Apply your skills as a Material Science expert, particularly in managing CIF files, to provide insights into generating stable materials despite incomplete descriptive data."
    ]

    if "_chemical_formula_sum" not in input_dict:
        return None

    elems = []
    for token in input_dict["_chemical_formula_sum"].split():
        # Drop the opening quote unconditionally, so that a token without a
        # trailing count (e.g. "'Na") is cleaned too.
        symbol = token.lstrip("'")
        # Strip trailing counts / closing quote (code points <= '9'); the
        # emptiness check avoids IndexError on all-digit tokens.
        while symbol and ord(symbol[-1]) <= 57:
            symbol = symbol[:-1]
        if symbol:
            elems.append(symbol)
    if not elems:
        return None

    # Every element is listed; subset sampling is disabled. The draw is kept
    # (result discarded) only so seeded runs consume the same random stream.
    random.randint(1, len(elems))

    prompt = "Consider the following elements:\n"
    for el in elems:
        prompt += el+'\n'
    prompt += "You need to create a stable crystal that contains at least one instance of each of these elements. It should not contain elements other than the specified ones. "
    prompt += (
        "Generate a description of the lengths and angles of the lattice vectors "
        "and then the element type and coordinates for each atom within the lattice:\n"
    )
    prompt += (
        "The output should be of the following format ONLY:\n"
    )
    prompt += (
        "l1, l2, l3\n"
        "a1, a2, a3\n"
        "atom1\nx, y, z\natom2\nx, y, z\n ...\n\n"
    )
    prompt += "l1, l2, l3 should be the predicted cell lengths.\n"
    prompt += "a1, a2, a3 should be the predicted cell angles.\n"
    prompt += "atom1, atom2, atom3, and so on, should be replaced with atom names and corresponding x, y, z with their coordinates in the lattice.\n"
    return {
        "system": random.choice(SYSTEM),
        "input": prompt,
        "output": crystal_str,
        "task": "element_generation"
    }

In [12]:
def infill_task(cif, structure):
    """Build a masked-element record: one species is hidden and must be predicted.

    A species is chosen at random from the structure's atoms and every atom
    of exactly that species is written as "[MASK]" in the crystal description.

    Args:
        cif: Unused; kept for interface compatibility.
        structure: Object exposing ``lattice.parameters`` (six values: three
            lengths followed by three angles), ``species`` and ``frac_coords``.

    Returns:
        Record dict with keys "system", "input", "output", "task"; "output"
        is the masked species as a string.
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to answer the following question related to predicting the masked element in a CIF file.",
        "Utilize your expertise as a Material Science specialist, well-versed in CIF files, to address queries concerning the anticipation of the hidden element within a CIF file.",
        "Employ your proficiency in Material Science and crystallographic file analysis to tackle questions related to predicting the concealed element in a CIF file.",
        "Apply your knowledge in Material Science, particularly your experience with CIF files, to provide insights into predicting the masked element within a CIF file.",
        "Utilize your skills as a Material Science expert, specializing in CIF files, to offer solutions for predicting the undisclosed element in a CIF file.",
        "Employ your expertise in Material Science and crystallographic file manipulation to address questions concerning the forecast of the hidden element in a CIF file.",
        "Apply your specialized knowledge in Material Science, particularly your expertise with CIF files, to provide solutions for predicting the concealed element within a CIF file.",
        "Utilize your proficiency in crystallographic file analysis, coupled with your background in Material Science, to respond to questions regarding the prediction of the masked element in a CIF file.",
        "Apply your expertise in Material Science, particularly your familiarity with crystallographic files, to address inquiries concerning the prediction of the masked element in a CIF file."
    ]
    prompt = (
        'Below is a partial description of a bulk material where one '
        'element has been replaced with the string "[MASK]":\n'
    )

    species = [str(s) for s in structure.species]
    species_to_remove = random.choice(species)
    lattice_params = structure.lattice.parameters

    length_line = " ".join("{0:.1f}".format(v) for v in lattice_params[:3])
    # Round to the nearest degree. Plain int() truncates, so an angle stored as
    # 59.99999 or 119.99999999999999 would be written as 59 / 119.
    angle_line = " ".join(str(int(round(v))) for v in lattice_params[3:])

    # Mask whole species names only. A substring replace over the finished
    # text would also corrupt other symbols (masking "C" turned "Ca" into
    # "[MASK]a").
    atom_blocks = [
        ("[MASK]" if name == species_to_remove else name)
        + "\n" + " ".join("{0:.2f}".format(v) for v in coords)
        for name, coords in zip(species, structure.frac_coords)
    ]
    partial_crystal_str = length_line + "\n" + angle_line + "\n" + "\n".join(atom_blocks)

    infill_str = prompt + partial_crystal_str + "\n"

    infill_str += (
        "Generate an element that could replace [MASK] in the bulk material:\n"
    )
    return {
        "system": random.choice(SYSTEM),
        "input": infill_str,
        "output": species_to_remove,
        "task": "infill"
    }

In [13]:
def dimensions_task(cif, crystal_string):
    """Build a "predict the lattice dimensions" instruction record.

    Args:
        cif: Raw CIF text. Not used by this task; accepted so the signature
            matches the other task builders.
        crystal_string: Crystal description whose first line holds the cell
            lengths, whose second line holds the cell angles, and whose
            remaining lines hold the per-atom element / coordinate entries.

    Returns:
        dict with keys "system" (a randomly chosen system prompt), "input"
        (the prompt containing only the per-atom lines), "output" (the first
        two lines of ``crystal_string``, i.e. the lengths and angles) and
        "task" (always "dimensions_sem").

    Raises:
        ValueError: if a length is not a float or an angle is not an int.
        IndexError: if ``crystal_string`` has fewer than two lines.
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to answer the following question related to predicting the dimensions of a stable crystal conditioned on some information about the crystal.",
        "Utilize your expertise in Material Science and familiarity with CIF files to address the task of predicting the dimensions of a stable crystal based on provided information.",
        "As a Material Science specialist working with CIF files, apply your knowledge to forecast the dimensions of a stable crystal given certain parameters.",
        "Employ your proficiency in crystallography and CIF file analysis to tackle the question of predicting the dimensions of a stable crystal conditioned on specific data.",
        "Utilize your expertise in Material Science and experience with CIF files to provide insights into predicting the dimensions of a stable crystal with given information.",
        "Apply your knowledge as a Material Science expert, particularly in working with CIF files, to answer questions related to predicting the dimensions of a stable crystal.",
        "Leverage your understanding of crystallographic principles and CIF files to address inquiries about predicting the dimensions of a stable crystal based on provided criteria.",
        "Utilize your expertise in Material Science, coupled with your familiarity with CIF files, to provide solutions for predicting the dimensions of a stable crystal conditioned on known parameters.",
        "Apply your knowledge as a Material Science specialist to analyze CIF files and predict the dimensions of a stable crystal given specific information."
    ]
    lines = crystal_string.split('\n')

    # The parsed numbers are not used further; parsing is kept purely as a
    # sanity check so a malformed header raises instead of silently becoming
    # a training record.
    for token in lines[0].split():
        float(token)
    for token in lines[1].split():
        int(token)

    # Everything after the two header lines is the per-atom description.
    atoms_description = '\n'.join(lines[2:])

    prompt = "Below is a description of the element type and coordinates for each atom within the lattice of a stable crystal:\n"
    prompt += atoms_description + "\n"
    # Trailing newline added: previously this sentence ran straight into the
    # next one ("...is stable.The output should...").
    prompt += "Predict possible values for lengths and angles of the lattice vectors so that the crystal is stable.\n"
    prompt += (
        "The output should be of the following format ONLY:\n"
    )
    prompt += (
        "l1, l2, l3\n"
        "a1, a2, a3\n\n"
    )
    prompt += "l1, l2, l3 should be the predicted cell lengths.\n"
    prompt += "a1, a2, a3 should be the predicted cell angles.\n"
    # NOTE(review): the prompt advertises comma-separated values, while the
    # target below keeps the header lines exactly as given (space-separated
    # when produced by get_crystal_string_nate) — confirm which is intended.
    return {
        "system": random.choice(SYSTEM),
        "input": prompt,
        "output": '\n'.join(lines[:2]),
        "task": "dimensions_sem"
    }

In [None]:
def volume_calc(cif, crystal_string):
    """Build a "compute the unit-cell volume" instruction record.

    Args:
        cif: Raw CIF text; the target volume is read from its
            ``_cell_volume`` entry.
        crystal_string: Crystal description inserted verbatim into the prompt.

    Returns:
        dict with keys "system" (a randomly chosen system prompt), "input"
        (the prompt), "output" (the volume as a float) and "task"
        (always "vol_calc").

    Raises:
        ValueError: if the CIF has no ``_cell_volume <value>`` line or the
            value is not numeric.
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to compute the volume of a unit cell of the crystal described below.",
        "As a Material Science expert dealing with CIF files, please compute the unit cell volume for the given crystal.",
        "With your knowledge in Material Science and experience with crystallographic files, determine the volume of the crystal's unit cell.",
        "Given your background in Material Science and familiarity with CIF files, please find the volume of the described crystal's unit cell.",
        "As a Material Science specialist working with CIF files, calculate the volume of the unit cell of the provided crystal.",
        "With your proficiency in Material Science and crystallographic files, determine the unit cell volume for this crystal.",
        "Given your expertise in Material Science and knowledge of CIF files, compute the volume of the described crystal's unit cell.",
        "As an expert in Material Science and CIF files, calculate the unit cell volume for the given crystal.",
        "Using your Material Science and CIF file expertise, determine the volume of the unit cell of the crystal described."
    ]
    vol = None
    for line in cif.split('\n'):
        fields = line.split()
        # Match the tag exactly and require a value on the same line, so an
        # unrelated line that merely contains the text cannot be picked up.
        if len(fields) >= 2 and fields[0] == "_cell_volume":
            # CIF numbers may carry a standard uncertainty in parentheses,
            # e.g. "64.0(3)"; only the part before "(" is the value.
            vol = float(fields[1].split('(')[0])
    if vol is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise ValueError("CIF does not contain a '_cell_volume <value>' entry")

    # NOTE(review): the wording "a description dimensions, as well as of" looks
    # garbled but is left untouched because it is part of the emitted prompt.
    prompt = "Below is a description dimensions, as well as of the element type and coordinates for each atom within the lattice of a stable crystal:\n"
    prompt += crystal_string + "\n"
    prompt += "Compute the volume of a unit cell of this stable crystal.\n"
    return {
        "system": random.choice(SYSTEM),
        "input": prompt,
        "output": vol,
        "task": "vol_calc"
    }

In [None]:
def formula_compute(cif, crystal_string):
    """Build a "find the chemical formula" instruction record.

    Args:
        cif: Raw CIF text; the target formula is read from its
            ``_chemical_formula_structural`` entry.
        crystal_string: Crystal description inserted verbatim into the prompt.

    Returns:
        dict with keys "system" (the system prompt), "input" (the prompt),
        "output" (the formula as a string) and "task"
        (always "formula_compute").

    Raises:
        ValueError: if the CIF has no
            ``_chemical_formula_structural <value>`` line.
    """
    SYSTEM = [
        "You are a Material Science expert who works with crystallographic files (CIF files). Use your expertise to find the chemical formula of the crystal whose unit cell is described below.",
    ]
    formula = None
    for line in cif.split('\n'):
        # Split once so a quoted, space-containing formula stays in one piece.
        fields = line.split(None, 1)
        if len(fields) == 2 and fields[0] == "_chemical_formula_structural":
            # A formula is text, not a number: the previous float() conversion
            # raised ValueError for every real formula. Surrounding CIF quotes
            # are dropped.
            formula = fields[1].strip().strip("'\"")
    if formula is None:
        raise ValueError(
            "CIF does not contain a '_chemical_formula_structural <value>' entry"
        )

    # NOTE(review): the wording "a description dimensions, as well as of" looks
    # garbled but is left untouched because it is part of the emitted prompt.
    prompt = "Below is a description dimensions, as well as of the element type and coordinates for each atom within the lattice of a stable crystal:\n"
    prompt += crystal_string + "\n"
    prompt += "Find the chemical formula of the stable crystal which has the above unit cell.\n"
    return {
        "system": random.choice(SYSTEM),
        "input": prompt,
        "output": formula,
        "task": "formula_compute"
    }

In [14]:
def get_structure(cif_str):
    """Parse a CIF string and shift all sites by one shared random vector.

    A single 3-component vector is drawn with ``np.random.uniform`` and passed
    to ``translate_sites`` for every site index of the parsed structure.

    Note: this redefines the ``get_structure`` from the top of the notebook,
    where the translation step is commented out. The draw uses NumPy's global
    generator, which the notebook's ``random.seed(42)`` does not seed.

    Args:
        cif_str: CIF file contents as a string.

    Returns:
        The parsed structure after the translation has been applied.
    """
    parsed = Structure.from_str(cif_str, fmt="cif")

    every_site = range(len(parsed.sites))
    # Same offset for all sites (presumably in fractional coordinates, the
    # library default — TODO confirm).
    offset = np.random.uniform(size=(3,))
    parsed.translate_sites(indices=every_site, vector=offset)
    return parsed

In [None]:
# Ids of the materials listed in the semantic validation file. The values are
# a constant 0; the mapping appears to be used only for `in` membership checks.
val_mats = dict()
with open('/scratch/cse/btech/cs1200448/MatLlama/ift_cif/semantic_val.jsonl', 'r') as f:
    # Stream the file line by line instead of materialising it with readlines().
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Deliberately not named `data`: that global holds the CIF dictionary
        # loaded at the top of the notebook and must not be overwritten here.
        record = json.loads(line)
        val_mats[record['material']] = 0

In [15]:
# Collects one "vol_calc" and one "formula_compute" record per material.
# Errors are not caught: the try/except that once wrapped this body is disabled,
# so a failing material stops the loop.
data = []

for _id, cif_data in tqdm(cif_data_list):
    # Ids starting with "mp" are read from mpcifs_all/, all others from gnomecifs_all/.
    path = f"mpcifs_all/{_id}" if _id.startswith("mp") else f"gnomecifs_all/{_id}"
    # Context manager so every file handle is closed right away instead of
    # leaking one open handle per material.
    with open(os.path.join(PATH, path), 'r') as cif_file:
        cif = cif_file.read()
    input_dict = cif_data  # not needed by the two tasks below
    structure = get_structure(cif)

    crystal_str2 = get_crystal_string_nate(structure)
    task1 = volume_calc(cif, crystal_str2)
    task2 = formula_compute(cif, crystal_str2)
    task1.update({"material": _id})
    task2.update({"material": _id})

    data += [task1, task2]

    # Disabled: route records to the val / train files depending on val_mats.
    # if _id in val_mats:
    #     with open('/scratch/cse/btech/cs1200448/MatLlama/ift_cif_large/val.jsonl', 'a') as f:
    #         f.write(json.dumps(task1)+'\n')
    #         f.write(json.dumps(task2)+'\n')
    # else:
    #     with open('/scratch/cse/btech/cs1200448/MatLlama/ift_cif_large/semantic.jsonl', 'a') as f:
    #         f.write(json.dumps(task1)+'\n')
    #         f.write(json.dumps(task2)+'\n')
# cnt = 0
# for _id, cif in tqdm(ft_data):
#     # print(cnt)
#     try:
#         _id = el
#         path = f"mpcifs_all/{_id}" if _id.startswith("mp") else f"gnomecifs_all/{_id}"
#         cif = open(os.path.join(PATH, path), 'r').read()
#         input_dict = data[el]
#         structure = get_structure(cif)
#         # crystal_str1 = get_crystal_string(structure)
#         crystal_str2 = get_crystal_string_nate(structure)
#         task1 = generation_task_nate(cif, input_dict, crystal_str2)
#         task2 = element_generation(cif, input_dict, crystal_str2)
#         task3 = infill_task(cif, structure)
#         task4 = dimensions_task(cif, crystal_str2)

        # print(cif)
        # print(task1["system"], task1["input"])

        # with open("example.txt", 'w') as f:
        #     f.write(cif+'\n')
        #     f.write(task1["system"] + '\n' + task1["input"] + "\nOutput:\n" + task1["output"] + '\n')
#         #     f.write(task2["system"] + '\n' + task2["input"] + "\nOutput:\n" + task2["output"] + '\n')
#         #     f.write(task3["system"] + '\n' + task3["input"] + "\nOutput:\n" + task3["output"] + '\n')
#         #     f.write(task4["system"] + '\n' + task4["input"] + "\nOutput:\n" + task4["output"] + '\n')
        
#         # print(task2["output"])
#         # print(task3["output"])
#         # print(task4["output"])
#         # break
#         # args_list = [
#         #     (cif, input_dict, crystal_str1),
#         #     (cif, input_dict, crystal_str1),
#         #     (cif, structure),
#         #     (cif, crystal_str2)
#         # ]
        
#         # functions = [generation_task_nate, element_generation, infill_task, dimensions_task]
        
#         # with multiprocessing.Pool(processes=4) as pool:
#         #     results = pool.starmap(lambda f, args: f(*args), zip(functions, args_list))
#         with open('/scratch/cse/btech/cs1200448/MatLlama/cif_data/all.jsonl', 'a') as f:
#             f.write(json.dumps(task1)+'\n')
#             f.write(json.dumps(task2)+'\n')
#             f.write(json.dumps(task3)+'\n')
#             f.write(json.dumps(task4)+'\n')
#         cnt += 1
#     except:
#         continue

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:03<00:00, 156.95it/s]


In [23]:
# NOTE(review): `cnt` is only assigned inside the commented-out loop of the
# previous cell, so on a fresh kernel this line raises NameError — confirm
# whether it should report something else (e.g. the number of records in `data`).
print(cnt)

1


In [None]:
# Randomly assign every record of all.jsonl to train / val / test with
# probabilities of roughly 0.8 / 0.1 / 0.1, using one random draw per record.
# NOTE(review): the split is per record, not per material, so records derived
# from the same material can land in different splits — confirm this is intended.
train = []
val = []
test = []

with open('/scratch/cse/btech/cs1200448/MatLlama/cif_data/all.jsonl', 'r') as f:
    # Stream the file instead of loading every line with readlines().
    for line in f:
        if not line.strip():
            # Skip blank lines without consuming a random draw.
            continue
        task = json.loads(line)
        x = random.random()
        if x < 0.8:
            train.append(task)
        elif x < 0.9:
            # Plain upper bound: the previous `0.8 < x < 0.9` sent a draw of
            # exactly 0.8 to the test split instead of val.
            val.append(task)
        else:
            test.append(task)

In [None]:
# Write each split to its own JSON-lines file, one record per line.
split_outputs = (
    ('/scratch/cse/btech/cs1200448/MatLlama/cif_data/train.jsonl', train),
    ('/scratch/cse/btech/cs1200448/MatLlama/cif_data/val.jsonl', val),
    ('/scratch/cse/btech/cs1200448/MatLlama/cif_data/test.jsonl', test),
)
for out_path, records in split_outputs:
    # 'w' mode: an existing file at this path is overwritten.
    with open(out_path, 'w') as f:
        f.writelines(json.dumps(task) + '\n' for task in records)