In [1]:
from jarvis.core.atoms import Atoms
from jarvis.io.vasp.inputs import Poscar
from jarvis.db.figshare import data
import os
import pandas as pd
import json


In [1]:
# Structure file generator
# Writes one POSCAR-format file per entry of the JARVIS dft_3d dataset into
# `struc_path` (skipping entries whose file is already present under either
# naming scheme), then saves a header-less CSV listing "<jid>.vasp,<jid>".
struc_path = "/data/yll6162/alignntl_dft_3d/jid"
# Create the target directory first so the file writes below have somewhere to go.
os.makedirs(struc_path, exist_ok=True)
dft_3d = data(dataset='dft_3d')
jids_file = []
jids = []
for i in dft_3d:
    jid = i['jid']
    jids.append(jid)
    jids_file.append(jid+'.vasp')
    filename1 = os.path.join(struc_path, 'POSCAR-'+jid+'.vasp')
    filename2 = os.path.join(struc_path, jid+'.vasp')
    if (not os.path.exists(filename1)) and (not os.path.exists(filename2)):
        # Build the structure objects only when a file actually has to be
        # written; on re-runs every entry is skipped and nothing is constructed.
        atoms = Atoms.from_dict(i['atoms'])
        poscar = Poscar(atoms)
        poscar.write_file(filename2)

# NOTE: this dict must not be called `data` -- that name is the dataset loader
# imported from jarvis.db.figshare, and rebinding it makes any later
# `data(dataset=...)` call fail with "'dict' object is not callable".
id_prop_full = {'jid_file': jids_file, 'jid': jids}
df = pd.DataFrame(id_prop_full)
csv_path = os.path.join(struc_path, "id_prop_full.csv")
#save id_prop csv (no header row, no index column)
df.to_csv(csv_path, header=False, index=False)



In [5]:
#Dataset split
# For every target property, read the existing chemnlp train/val/test id lists,
# keep only the ids that are present in all three embedding sources, and save
# the resulting split as dataset_split_<prop>.json.
# props = ["formation_energy_peratom"]
props = ["slme", "spillage", "magmom_outcar", "mbj_bandgap", "Tc_supercon"]
split_json_dir = "/data/yll6162/alignntl_dft_3d/dataset"
# Make sure the output directory exists before any split JSON is written.
os.makedirs(split_json_dir, exist_ok=True)
#Follow Chemnlp + robo split for now
split_keys = ('id_train', 'id_val', 'id_test')


def _strip_vasp_suffix(name, suffix='.vasp'):
    """Return `name` without a trailing `suffix`; other names are returned unchanged.

    `name.rstrip('.vasp')` is deliberately not used: rstrip treats its argument
    as a set of characters and removes any trailing run of '.', 'v', 'a', 's',
    'p', which would also eat into an id that happens to end in one of them.
    """
    if suffix and name.endswith(suffix):
        return name[:-len(suffix)]
    return name


# Construct the largest common dataset across all sources

#chemnlp bert embeddings
df_chemnlp_bert = pd.read_csv("/data/yll6162/alignntl_dft_3d/embeddings/embeddings_bert-base-uncased_chemnlp_75973.csv", index_col = 0)


# robo bert embeddings
df_robo_bert = pd.read_csv("/data/yll6162/alignntl_dft_3d/embeddings/embeddings_bert-base-uncased_robo_75966_err_fixed.csv", index_col = 0)


# ALIGNN embeddings (ids are read from the 'id' column, not from the index)
df_alignn_embeddings = pd.read_csv("/data/yll6162/alignntl_dft_3d/jid/x+y+z/data0.csv", index_col=0)

chem_set =set(df_chemnlp_bert.index)
# robo_set =set(df_robo_bert.iloc[:, 0])
robo_set =set(df_robo_bert.index)
alignn_set =set(df_alignn_embeddings['id'])
# Ids available in every one of the three sources.
intersection = chem_set.intersection(robo_set, alignn_set)
# jids = list(intersection)
# jids_file = [jid+".vasp" for jid in jids]
# data_intersec = {'jid_file': jids_file, 'jid': jids}
# df = pd.DataFrame(data_intersec)
# csv_path = os.path.join(struc_path, "id_prop_intersec.csv")
# df.to_csv(csv_path, index=False)

for prop in props:
    # Start from an empty split for each property; the key order here fixes
    # both the print order and the key order in the JSON file.
    split = {key: [] for key in split_keys}
    for mode in ['train', 'test', 'val']:
        file_path = f"/scratch/yll6162/CrossPropertyTL/llm_data/dataset_bert-base-uncased_chemnlp_prop_{prop}_{mode}.csv"
        df = pd.read_csv(file_path)
        split[f"id_{mode}"] = df['ids'].tolist()
    for key in split:
        # Drop the ".vasp" extension once per id, then keep the common ids only.
        bare_ids = (_strip_vasp_suffix(val) for val in split[key])
        split[key] = [jid for jid in bare_ids if jid in intersection]
        print(f"{prop} {key}: {len(split[key])}")

    split_json_path = os.path.join(split_json_dir, f"dataset_split_{prop}.json")
    with open(split_json_path, 'w') as json_file:
        json.dump(split, json_file)

slme id_train: 7810
slme id_val: 977
slme id_test: 977
spillage id_train: 9041
spillage id_val: 1131
spillage id_test: 1131
magmom_outcar id_train: 59370
magmom_outcar id_val: 7422
magmom_outcar id_test: 7422
mbj_bandgap id_train: 15642
mbj_bandgap id_val: 1957
mbj_bandgap id_test: 1957
Tc_supercon id_train: 842
Tc_supercon id_val: 106
Tc_supercon id_test: 106


In [4]:
#id prop csv (for running ALIGNN) based on the data split
# For each property, writes <struc_path>/<prop>/id_prop.csv (no header, no
# index) with one "<jid>.vasp,<value>" row per entry that has a value for the
# property and belongs to the saved split. Rows are grouped train, then val,
# then test.
props =  ["slme", "spillage", "magmom_outcar", "mbj_bandgap", "Tc_supercon"]
split_json_dir = "/data/yll6162/alignntl_dft_3d/dataset"
struc_path = "/data/yll6162/alignntl_dft_3d/jid"
dft_3d = data(dataset='dft_3d')

for prop in props:
    split_json_path = os.path.join(split_json_dir, f"dataset_split_{prop}.json")
    prop_folder = os.path.join(struc_path, prop)
    os.makedirs(prop_folder , exist_ok=True)
    with open(split_json_path, 'r') as split_file:
        # Load the JSON data into a dictionary
        ids_dict = json.load(split_file)

    # The split stores bare ids; the CSV uses structure file names.
    print(prop)
    ids_train = [jid + '.vasp' for jid in ids_dict['id_train']]
    print(len(ids_train))
    ids_val = [jid + '.vasp' for jid in ids_dict['id_val']]
    print(len(ids_val))
    ids_test = [jid + '.vasp' for jid in ids_dict['id_test']]
    print(len(ids_test))

    # Collect (file name, value) for every entry that has this property.
    # Only the id and the property value are needed here, so no structure
    # objects are built per entry.
    jids_file = []
    values = []
    for i in dft_3d:
        if i[prop]!='na':
            jid = i['jid']
            value = i[prop]
            values.append(value)
            jids_file.append(jid+'.vasp')

    data_id = {'jid_file': jids_file, 'prop': values}
    df = pd.DataFrame(data_id)
    # Select each partition separately, then stack them in train/val/test order.
    df_train = df[df.jid_file.isin(ids_train)].copy()
    df_val = df[df.jid_file.isin(ids_val)].copy()
    df_test = df[df.jid_file.isin(ids_test)].copy()
    df_sorted = pd.concat([df_train, df_val, df_test])

    csv_path = os.path.join(prop_folder, "id_prop.csv")

    df_sorted.to_csv(csv_path, header=False, index=False)

# Display the table of the last property processed.
df_sorted

Obtaining 3D dataset 76k ...
Reference:https://www.nature.com/articles/s41524-020-00440-1
Other versions:https://doi.org/10.6084/m9.figshare.6815699
Loading the zipfile...
Loading completed.
slme
7810
977
977
spillage
9041
1131
1131
magmom_outcar
59370
7422
7422
mbj_bandgap
15642
1957
1957
Tc_supercon
842
106
106


Unnamed: 0,jid_file,prop
1,JVASP-15345.vasp,0.433734
2,JVASP-42657.vasp,2.631345
3,JVASP-16080.vasp,22.900096
5,JVASP-91700.vasp,19.406985
6,JVASP-65030.vasp,2.912094
...,...,...
1025,JVASP-139418.vasp,3.048069
1026,JVASP-136706.vasp,0.128532
1038,JVASP-136725.vasp,0.000008
1039,JVASP-138900.vasp,0.830195


In [10]:
# Load the saved split for the formation-energy target and show its test ids.
# NOTE(review): the split cell's active `props` list does not include this
# property, so this JSON was presumably written by an earlier run -- confirm.
prop = "formation_energy_peratom"
split_json_dir = "/data/yll6162/alignntl_dft_3d/dataset"
split_json_path = os.path.join(split_json_dir, "dataset_split_" + prop + ".json")

with open(split_json_path, 'r') as split_file:
    ids_dict = json.loads(split_file.read())
ids_dict["id_test"]

['JVASP-134028',
 'JVASP-13104',
 'JVASP-140322',
 'JVASP-128314',
 'JVASP-157831',
 'JVASP-118127',
 'JVASP-123304',
 'JVASP-100467',
 'JVASP-54561',
 'JVASP-123917',
 'JVASP-108024',
 'JVASP-93329',
 'JVASP-153428',
 'JVASP-94922',
 'JVASP-97581',
 'JVASP-64284',
 'JVASP-38787',
 'JVASP-143676',
 'JVASP-151900',
 'JVASP-8169',
 'JVASP-58235',
 'JVASP-16252',
 'JVASP-118009',
 'JVASP-63245',
 'JVASP-48368',
 'JVASP-153402',
 'JVASP-118350',
 'JVASP-61265',
 'JVASP-74498',
 'JVASP-13968',
 'JVASP-128477',
 'JVASP-142072',
 'JVASP-99213',
 'JVASP-8102',
 'JVASP-79601',
 'JVASP-122156',
 'JVASP-128020',
 'JVASP-91708',
 'JVASP-108749',
 'JVASP-99690',
 'JVASP-145278',
 'JVASP-33802',
 'JVASP-134441',
 'JVASP-142084',
 'JVASP-115074',
 'JVASP-63929',
 'JVASP-113662',
 'JVASP-101814',
 'JVASP-121145',
 'JVASP-156387',
 'JVASP-98745',
 'JVASP-117759',
 'JVASP-38517',
 'JVASP-100800',
 'JVASP-155393',
 'JVASP-18644',
 'JVASP-121893',
 'JVASP-61682',
 'JVASP-124043',
 'JVASP-9729',
 'JVASP-15