In [2]:
import os
from glob import glob
import json
import pickle
import numpy as np
import pandas as pd

model_path = "Models/PROFiT-Net"

target_dirs = ["dielectric_constant", "exp_band_gap", "exp_formation_enthalpy","hse06_band_gap", "pbe_+u_band_gap", "pbe_+u_formation_enthalpy"]
# check target_dirs is exist
for target_dir in target_dirs:
    if not os.path.exists(os.path.join(model_path ,target_dir, "dataset")):
        # os.mkdir(target_dir)
        print("No such directory: ", target_dir)
    else:
        print("Directory exists: ", target_dir)

Directory exists:  dielectric_constant
Directory exists:  exp_band_gap
Directory exists:  exp_formation_enthalpy
Directory exists:  hse06_band_gap
Directory exists:  pbe_+u_band_gap
Directory exists:  pbe_+u_formation_enthalpy


In [3]:
import pandas as pd

def merge_multiheader(file_path):
    # Read the first two rows as header
    df = pd.read_csv(file_path, header=[0, 1])
    
    # Merge headers: if the second header row is empty or NaN, use the value from the first header row
    new_columns = []
    for first, second in df.columns:
        new_columns.append(second if pd.notna(second) and second != "" else first)
    
    # Update the DataFrame with the merged headers
    df.columns = new_columns
    
    # Drop the initial multiheader row structure
    df.columns.name = None
    
    return df


In [4]:
Total_data = {}
for target in target_dirs:
    if target == "dielectric_constant":
        target_data = os.path.join(model_path, target, "dataset", "raw_data.pickle")
        with open(target_data, "rb") as f:
            data = pd.read_pickle(f)
            Total_data[target] = data
    elif target == "exp_band_gap":
        target_data = os.path.join(model_path, target, "dataset", "ct9b00322_si_002.csv")
        data = merge_multiheader(target_data)
        # data = pd.read_csv(target_data)
        # data = merge_headers(data)
        Total_data[target] = data
    elif target == "exp_formation_enthalpy":
        target_data = os.path.join(model_path, target, "dataset", "experimental_dataset.csv")
        data = pd.read_csv(target_data)
        Total_data[target] = data
    elif target == "hse06_band_gap":
        # get all file names in this folder
        target_data_path = os.path.join(model_path, target, "dataset")
        all_files = glob(target_data_path + "/*.json")
        data = []
        for filename in all_files:
            with open(filename) as f:
                data.append(json.load(f))
        Total_data[target] = data
    elif target == "pbe_+u_band_gap":
        target_data = os.path.join(model_path, target, "dataset", "mp-ids-46744.csv")
        data = pd.read_csv(target_data, header=None)
        Total_data[target] = data
    elif target == "pbe_+u_formation_enthalpy":
        # Totally same file.
        target_data = os.path.join(model_path, target, "dataset", "mp-ids-46744.csv")
        data = pd.read_csv(target_data, header=None)
        Total_data[target] = data
    # else:
    #     continue
    
    

In [5]:
######## targe_dirs[0]: dielectric_constant ########
Total_data[target_dirs[0]]
# concatenate Total_data[target_dirs[0]][2] to Total_data[target_dirs[0]][0]
temp_df_1 = pd.concat([Total_data[target_dirs[0]][0], Total_data[target_dirs[0]][2]], axis=1)
temp_df_2 = pd.concat([Total_data[target_dirs[0]][1], Total_data[target_dirs[0]][3]], axis=1)

# merge temp_df_1 and temp_df_2
temp_df = pd.concat([temp_df_1, temp_df_2], axis=0)
dielectric_constant_df = temp_df.sort_index()
dielectric_constant_df.to_csv(os.path.join("Models/PROFiT-Net/dielectric_constant/dataset/","dielectric_constant.csv"), index=False)

### ion의 종류에 대해서만 one-hot encoding으로 표기된 형태이기 때문에, 이에 맞는 mp-id를 찾아줘야함.

In [6]:
######## targe_dirs[1]: exp_band_gap ########
Total_data[target_dirs[1]]
temp_df = Total_data[target_dirs[1]]
# read the second column
prop_1_mp = list(temp_df.iloc[:, 1])

In [7]:
######## targe_dirs[2]: exp_formation_enthalpy ########
Total_data[target_dirs[2]]
temp_df = Total_data[target_dirs[2]]
# read the first column
prop_2_mp = list(temp_df.iloc[:, 0])

In [8]:
######## targe_dirs[3]: hse06_band_gap ########
Total_data[target_dirs[3]]
temp_jsons = Total_data[target_dirs[3]]
prop_3_icsd = []
for i in temp_jsons:
    prop_3_icsd.append(i["ICSD_number"])

In [9]:
######## targe_dirs[4]: pbe_+u_band_gap ########
Total_data[target_dirs[4]]
temp_df = Total_data[target_dirs[4]]
# read the first column
prop_4_mp = list(temp_df.iloc[:, 0])

In [10]:
######## targe_dirs[5]: pbe_+u_formation_enthalpy ########
Total_data[target_dirs[5]]
temp_df = Total_data[target_dirs[5]]
# read the first column
prop_5_mp = list(temp_df.iloc[:, 0])

In [11]:
ids = {
    "dielectric_constant": {"DB_source": "Materials Project", "mp-id": None},
    "exp_band_gap": {"DB_source": "Materials Project", "mp-id": prop_1_mp},
    "exp_formation_enthalpy": {"DB_source": "Materials Project", "mp-id": prop_2_mp},
    "hse06_band_gap": {"DB_source": "ICSD", "ICSD_number": prop_3_icsd},
    "pbe_+u_band_gap": {"DB_source": "Materials Project", "mp-id": prop_4_mp},
    "pbe_+u_formation_enthalpy": {"DB_source": "Materials Project", "mp-id": prop_5_mp},
}

In [12]:
for prop in ids:
    if prop == "dielectric_constant":
        continue
    print(prop)
    for key in ids[prop]:
        if key == "DB_source":
            continue
        else:
            print(key, ":", ids[prop][key][:3])
            print(len(ids[prop][key]))


exp_band_gap
mp-id : ['mp-557056', 'mp-9900', 'mp-19318']
472
exp_formation_enthalpy
mp-id : ['mp-625994', 'mp-669466', 'mp-567638']
1143
hse06_band_gap
ICSD_number : ['407646', '155159', '300020']
10481
pbe_+u_band_gap
mp-id : ['mp-754118', 'mp-978908', 'mp-633688']
46744
pbe_+u_formation_enthalpy
mp-id : ['mp-754118', 'mp-978908', 'mp-633688']
46744


In [13]:
structures_dir = "all_structures"
# get all filenames in this folder
all_files = glob(structures_dir + "/*.cif")

with open("original_current.json", "r", encoding="utf-8-sig") as f:
    original_current = json.load(f)

originals = list(original_current.keys())
originals = set(originals)

In [14]:
"mvc-6390" in originals

True

In [15]:
len(originals)

10103

In [16]:
for i in originals:
    if i == "mvc-6390":
        print("found")

found


In [17]:
legacy_unmatch = {
    "mp-22743":'mp-542705',
    'mp-540857':'mp-17646',
    'mp-24951':'mp-19793',
    'mp-640827':'mp-559944',
    'mp-541337':'mp-20445',
    'mp-565817':'mp-556817',
    'mp-541004':'mp-16867',
    'mp-579927':'mp-566385',
    'mp-566082':'mp-559579',
    'mp-565220':'mp-557339',
    'mp-541597':'mp-21083',
    'mp-561879':'mp-541580',
    'mp-19700':'mp-18715',
    'mp-25036':'mp-22518',
    'mp-25738':'mp-22245',
    'mp-566968':'mp-558044',
    'mp-566140':'mp-560124',
    'mp-704223':'mp-21545'
}

In [18]:
legacy_unmatch_keys=list(legacy_unmatch.keys())

In [22]:
from tqdm import tqdm

for prop in ids:
    if prop == "dielectric_constant":
        continue
    # if prop == "exp_formation_enthalpy":
    #     pass
    # else:
    #     continue

    if prop == "hse06_band_gap":
        pass
    else:
        continue

    print(prop)
    for key in ids[prop]:
        if key == "DB_source":
            continue
        else:
            structure_name_list = ids[prop][key]
        for structure_name in tqdm(structure_name_list):
            if structure_name in legacy_unmatch_keys:
                original = structure_name.replace("-", "_")
                legacy_mp_id = legacy_unmatch[structure_name].replace("-", "_")
                command = "cp " + os.path.join(structures_dir, legacy_mp_id + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", original + ".cif")

            elif structure_name in originals:
                original = structure_name.replace("-", "_")
                current = original_current[structure_name].replace("-", "_")
                command = "cp " + os.path.join(structures_dir, current + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", original + ".cif")

            else:
                if "mp" in structure_name:
                    structure_name = structure_name.replace("-", "_")
                    command = "cp " + os.path.join(structures_dir, structure_name + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", structure_name + ".cif")
                elif "_" not in structure_name:
                    structure_name = "icsd_" + structure_name
                    command = "cp " + os.path.join(structures_dir, structure_name + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", structure_name + ".cif")

            os.system(command)

hse06_band_gap


  0%|          | 0/10481 [00:00<?, ?it/s]

100%|██████████| 10481/10481 [12:08<00:00, 14.39it/s]


In [23]:
from tqdm import tqdm

for prop in ids:
    if prop == "dielectric_constant":
        continue
    # if prop == "exp_formation_enthalpy":
    #     pass
    # else:
    #     continue

    print(prop)
    for key in ids[prop]:
        if key == "DB_source":
            continue
        else:
            structure_name_list = ids[prop][key]
        for structure_name in tqdm(structure_name_list):
            if structure_name in legacy_unmatch_keys:
                original = structure_name.replace("-", "_")
                legacy_mp_id = legacy_unmatch[structure_name].replace("-", "_")
                command = "cp " + os.path.join(structures_dir, legacy_mp_id + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", original + ".cif")

            # elif structure_name in originals:
            #     original = structure_name.replace("-", "_")
            #     current = original_current[structure_name].replace("-", "_")
            #     command = "cp " + os.path.join(structures_dir, current + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", original + ".cif")

            # else:
            #     if "mp" in structure_name:
            #         structure_name = structure_name.replace("-", "_")
            #         command = "cp " + os.path.join(structures_dir, structure_name + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", structure_name + ".cif")
            #     elif "icsd" in structure_name:
            #         structure_name = "icsd_" + structure_name
            #         command = "cp " + os.path.join(structures_dir, structure_name + ".cif") + " " + os.path.join("Models/PROFiT-Net", prop, "structure_cif", structure_name + ".cif")
            else:
                continue

            os.system(command)

exp_band_gap


100%|██████████| 472/472 [00:00<00:00, 1281366.66it/s]


exp_formation_enthalpy


100%|██████████| 1143/1143 [00:00<00:00, 1421734.72it/s]


hse06_band_gap


100%|██████████| 10481/10481 [00:00<00:00, 1039894.50it/s]


pbe_+u_band_gap


100%|██████████| 46744/46744 [00:00<00:00, 60473.86it/s]


pbe_+u_formation_enthalpy


100%|██████████| 46744/46744 [00:00<00:00, 72857.03it/s] 


In [23]:
with open("ids.json", "r", encoding="utf-8-sig") as f:
    ids = json.load(f)

In [25]:
with open("mp_ids_from_legacy.json", "r", encoding="utf-8-sig") as f:
    mp_ids_from_legacy = json.load(f)
with open("mp_ids_from_mp.json", "r", encoding="utf-8-sig") as f:
    mp_ids_from_mp = json.load(f)

In [60]:
current_mp_ids = []
for i in mp_ids_from_mp:
    current_mp_ids.append(i["material_id"])

current_mp_ids_set = set(current_mp_ids)

In [61]:
legacy_mp_ids = []
for i in mp_ids_from_legacy:
    legacy_mp_ids.append(i["material_id"])

legacy_mp_ids_set = set(legacy_mp_ids)

In [50]:
legacy_task_dicts = {}
for i in mp_ids_from_legacy:
    legacy_task_dicts.update({i["material_id"]: i["task_ids"]})

In [32]:
# for i in mp_ids_from_mp[0].keys():
#     if "gap" in i:
#         print(i)
mp_ids_from_mp[0].keys()



In [41]:
mp_ids_from_mp[1]["band_gap"]
mp_ids_from_mp[1]["material_id"]

'mp-29354'

In [96]:
mp_ids_from_mp[5432]

{'builder_meta': {'emmet_version': '0.72.20',
  'pymatgen_version': '2023.11.12',
  'run_id': None,
  'database_version': '2023.11.1',
  'build_date': '2023-11-22 19:43:59.090000',
  'license': 'BY-C'},
 'nsites': 32,
 'elements': ['Cl', 'K', 'Pb'],
 'nelements': 3,
 'composition': {'K': 4.0, 'Pb': 8.0, 'Cl': 20.0},
 'composition_reduced': {'K': 1.0, 'Pb': 2.0, 'Cl': 5.0},
 'formula_pretty': 'KPb2Cl5',
 'formula_anonymous': 'AB2C5',
 'chemsys': 'Cl-K-Pb',
 'volume': 890.5713125473281,
 'density': 4.704427760789219,
 'density_atomic': 27.830353517104005,
 'symmetry': {'crystal_system': 'Monoclinic',
  'symbol': 'P2_1/c',
  'number': 14,
  'point_group': '2/m',
  'symprec': 0.1,
  'version': '2.0.2'},
 'property_name': 'summary',
 'material_id': 'mp-607267',
 'deprecated': False,
 'deprecation_reasons': None,
 'last_updated': '2023-11-22 19:43:59.090000',
 'origins': [{'name': 'structure',
   'task_id': 'mp-2214311',
   'last_updated': '2022-04-30 01:47:38.230000'},
  {'name': 'energy',


In [85]:
import tqdm
##### allocate target of pbe_+u_band_gap and pbe_+u_formation_enthalpy #####
id = ids["pbe_+u_band_gap"]["mp-id"]
target_4 = []
target_5 = []
c = 0
c_t = 0
l = 0
l_t = 0
# for i in tqdm(id):
for i in id:
    # c mp에서 찾아보기
    found_in_current_mp = False
    if i in current_mp_ids_set:
        c += 1
        target_4.append(mp_ids_from_mp[current_mp_ids.index(i)]["band_gap"])
        target_5.append(mp_ids_from_mp[current_mp_ids.index(i)]["formation_energy_per_atom"])
        print("current")
        found_in_current_mp = True
        continue
    
    found_in_current_mp_task = False
    if not found_in_current_mp:
        c_t += 1
        for current in mp_ids_from_mp:
            if i in current["task_ids"]:
                target_4.append(current["band_gap"])
                target_5.append(current["formation_energy_per_atom"])
                found_in_current_mp_task = True
                print("current_task")
                break
        if found_in_current_mp_task:
            continue
        else:
            pass        
    
    # legacy에서 찾아보기
    if i in legacy_mp_ids:
        l += 1
        target_4.append(mp_ids_from_legacy[legacy_mp_ids.index(i)]["band_gap"])
        target_5.append(mp_ids_from_legacy[legacy_mp_ids.index(i)]["formation_energy_per_atom"])
        print("legacy")
        continue

    else:
        l_t += 1
        for legacy in  mp_ids_from_legacy:
            if i in legacy["task_ids"]:
                target_4.append(legacy["band_gap"])
                target_5.append(legacy["formation_energy_per_atom"])
                print("legacy_task")
                break
        print("you fucked up")
        continue
        

current
current
current_task
current
current_task
current
current
current
current
current
current
current
current
current
current
legacy
current
current
current
current_task
current
current
current
current
current
current
current_task
current
legacy
current
current
current
current
current
current
current
current
current_task
current
current_task
current
current
current_task
current
current
current
current
current
current
current_task
current
current
current
current
current
current_task
current_task
current
current_task
current
current
current
current
current
current
current
current
current_task
current
current
current
current_task
current
current
current_task
current_task
current
current
current
current
current
current
current
current
current
current
current
current
current_task
current
current
current
current
current
current_task
current_task
current
current
current_task
current
current_task
current
current_task
current
current
current
current
current_task
current
current
current
curr

In [86]:
c, c_t,l, l_t

(36074, 10670, 563, 18)

In [87]:
len(ids["pbe_+u_band_gap"]["mp-id"]), len(target_4), len(target_5)

(46744, 46744, 46744)

In [88]:
ids["pbe_+u_band_gap"]["target"] = target_4
ids["pbe_+u_formation_enthalpy"]["target"] = target_5

In [92]:
for prop in ids:
    if prop == "dielectric_constant":
        continue
    if "mp-id" in ids[prop].keys():
        ids[prop]["file_name"] = [i.replace("-","_")+".cif" for i in ids[prop]["mp-id"]]
    if "ICSD_number" in ids[prop].keys():
        ids[prop]["file_name"] = ["icsd_"+i+".cif" for i in ids[prop]["ICSD_number"]]
    # file_name = ids[prop][]

In [94]:
ids["hse06_band_gap"]
ids["pbe_+u_band_gap"]

{'DB_source': 'Materials Project',
 'mp-id': ['mp-754118',
  'mp-978908',
  'mp-633688',
  'mp-3799',
  'mp-761650',
  'mp-12487',
  'mp-541594',
  'mp-775132',
  'mp-9712',
  'mp-754249',
  'mp-758115',
  'mp-755009',
  'mp-569482',
  'mp-32736',
  'mp-8675',
  'mp-980063',
  'mp-755364',
  'mp-19113',
  'mp-753497',
  'mp-31808',
  'mp-772777',
  'mp-558875',
  'mp-561117',
  'mp-849412',
  'mp-2387',
  'mp-772260',
  'mp-540352',
  'mp-541044',
  'mp-568944',
  'mp-557769',
  'mp-866076',
  'mp-999072',
  'mp-571420',
  'mp-30055',
  'mp-569945',
  'mp-28435',
  'mp-977189',
  'mp-772343',
  'mp-974981',
  'mp-769707',
  'mp-15257',
  'mp-29796',
  'mp-773411',
  'mp-1019110',
  'mp-23339',
  'mp-21001',
  'mp-766236',
  'mp-975198',
  'mp-866056',
  'mp-849977',
  'mp-5477',
  'mp-41732',
  'mp-675028',
  'mp-560768',
  'mp-734019',
  'mp-781076',
  'mp-31793',
  'mp-3075',
  'mp-32041',
  'mp-34144',
  'mp-10114',
  'mp-2908',
  'mp-771075',
  'mp-4341',
  'mp-29579',
  'mp-21248'

In [95]:
import json
with open("ids_target_updated.json", "w", encoding='utf-8-sig') as f:
    json.dump(ids, f, indent = 4)

In [99]:
for i in ids:
    if i == "dielectric_constant":
        continue

    print(len(ids[i]["file_name"]))

472
1143
10481
46744
46744
