In [24]:
import os
from glob import glob
import json
import pickle
import numpy as np
import pandas as pd

model_path = "Models/PROFiT-Net"

target_dirs = ["dielectric_constant", "exp_band_gap", "exp_formation_enthalpy","hse06_band_gap", "pbe_+u_band_gap", "pbe_+u_formation_enthalpy"]
# check target_dirs is exist
for target_dir in target_dirs:
    if not os.path.exists(os.path.join(model_path ,target_dir, "dataset")):
        # os.mkdir(target_dir)
        print("No such directory: ", target_dir)
    else:
        print("Directory exists: ", target_dir)

Directory exists:  dielectric_constant
Directory exists:  exp_band_gap
Directory exists:  exp_formation_enthalpy
Directory exists:  hse06_band_gap
Directory exists:  pbe_+u_band_gap
Directory exists:  pbe_+u_formation_enthalpy


In [25]:
import pandas as pd

def merge_multiheader(file_path):
    # Read the first two rows as header
    df = pd.read_csv(file_path, header=[0, 1])
    
    # Merge headers: if the second header row is empty or NaN, use the value from the first header row
    new_columns = []
    for first, second in df.columns:
        new_columns.append(second if pd.notna(second) and second != "" else first)
    
    # Update the DataFrame with the merged headers
    df.columns = new_columns
    
    # Drop the initial multiheader row structure
    df.columns.name = None
    
    return df


In [3]:
Total_data = {}
for target in target_dirs:
    if target == "dielectric_constant":
        target_data = os.path.join(model_path, target, "dataset", "raw_data.pickle")
        with open(target_data, "rb") as f:
            data = pd.read_pickle(f)
            Total_data[target] = data
    elif target == "exp_band_gap":
        target_data = os.path.join(model_path, target, "dataset", "ct9b00322_si_002.csv")
        data = merge_multiheader(target_data)
        # data = pd.read_csv(target_data)
        # data = merge_headers(data)
        Total_data[target] = data
    elif target == "exp_formation_enthalpy":
        target_data = os.path.join(model_path, target, "dataset", "experimental_dataset.csv")
        data = pd.read_csv(target_data)
        Total_data[target] = data
    elif target == "hse06_band_gap":
        # get all file names in this folder
        target_data_path = os.path.join(model_path, target, "dataset")
        all_files = glob(target_data_path + "/*.json")
        data = []
        for filename in all_files:
            with open(filename) as f:
                data.append(json.load(f))
        Total_data[target] = data
    elif target == "pbe_+u_band_gap":
        target_data = os.path.join(model_path, target, "dataset", "mp-ids-46744.csv")
        data = pd.read_csv(target_data, header=None)
        Total_data[target] = data
    elif target == "pbe_+u_formation_enthalpy":
        # Totally same file.
        target_data = os.path.join(model_path, target, "dataset", "mp-ids-46744.csv")
        data = pd.read_csv(target_data, header=None)
        Total_data[target] = data
    # else:
    #     continue
    

In [26]:
######## targe_dirs[0]: dielectric_constant ########
Total_data[target_dirs[0]]
# concatenate Total_data[target_dirs[0]][2] to Total_data[target_dirs[0]][0]
temp_df_1 = pd.concat([Total_data[target_dirs[0]][0], Total_data[target_dirs[0]][2]], axis=1)
temp_df_2 = pd.concat([Total_data[target_dirs[0]][1], Total_data[target_dirs[0]][3]], axis=1)

# merge temp_df_1 and temp_df_2
temp_df = pd.concat([temp_df_1, temp_df_2], axis=0)
dielectric_constant_df = temp_df.sort_index()
dielectric_constant_df.to_csv(os.path.join("dielectric_constant.csv"), index=False)

### ion의 종류에 대해서만 one-hot encoding으로 표기된 형태이기 때문에, 이에 맞는 mp-id를 찾아줘야함.

In [35]:
######## targe_dirs[1]: exp_band_gap ########
Total_data[target_dirs[1]]
temp_df = Total_data[target_dirs[1]]
# read the second column
prop_1_mp = list(temp_df.iloc[:, 1])
target_1 = list(temp_df["Experimental"])

In [40]:
######## targe_dirs[2]: exp_formation_enthalpy ########
Total_data[target_dirs[2]]
temp_df = Total_data[target_dirs[2]]
# read the first column
prop_2_mp = list(temp_df.iloc[:, 0])
target_2 = list(temp_df["exp"])

In [43]:
######## targe_dirs[3]: hse06_band_gap ########
Total_data[target_dirs[3]]
temp_jsons = Total_data[target_dirs[3]]
prop_3_icsd = []
for i in temp_jsons:
    prop_3_icsd.append(i["ICSD_number"])
target_3 = []
for i in temp_jsons:
    target_3.append(i["Band_gap_HSE"])

In [46]:
######## targe_dirs[4]: pbe_+u_band_gap ########
Total_data[target_dirs[4]]
temp_df = Total_data[target_dirs[4]]
# read the first column
prop_4_mp = list(temp_df.iloc[:, 0])

In [47]:
Total_data.keys()

dict_keys(['dielectric_constant', 'exp_band_gap', 'exp_formation_enthalpy', 'hse06_band_gap', 'pbe_+u_band_gap', 'pbe_+u_formation_enthalpy'])

In [48]:
######## targe_dirs[5]: pbe_+u_formation_enthalpy ########
Total_data[target_dirs[5]]
temp_df = Total_data[target_dirs[5]]
# read the first column
prop_5_mp = list(temp_df.iloc[:, 0])

In [59]:
ids = {
    "dielectric_constant": {"DB_source": "Materials Project", "mp-id": None, "target_name": "dielectric_constant", "target": None},
    "exp_band_gap": {"DB_source": "Materials Project", "mp-id": prop_1_mp, "target_name": "exp_band_gap", "target": target_1},
    "exp_formation_enthalpy": {"DB_source": "Materials Project", "mp-id": prop_2_mp, "target_name": "exp_formation_enthalpy", "target": target_2},
    "hse06_band_gap": {"DB_source": "ICSD", "ICSD_number": prop_3_icsd, "target_name": "hse06_band_gap", "target": target_3},
    "pbe_+u_band_gap": {"DB_source": "Materials Project", "mp-id": prop_4_mp, "target_name": "pbe_+u_band_gap", "target": None},
    "pbe_+u_formation_enthalpy": {"DB_source": "Materials Project", "mp-id": prop_5_mp, "target_name": "pbe_+u_formation_enthalpy", "target": None},
}

In [60]:
import json
with open("ids.json", "w", encoding='utf-8-sig') as f:
    json.dump(ids, f, indent = 4)

In [14]:
ids.keys()

dict_keys(['dielectric_constant', 'exp_band_gap', 'exp_formation_enthalpy', 'hse06_band_gap', 'pbe_+u_band_gap', 'pbe_+u_formation_enthalpy'])

In [15]:
ids['exp_band_gap']["mp-id"]

['mp-557056',
 'mp-9900',
 'mp-19318',
 'mp-353',
 'mp-5495',
 'mp-610517',
 'mp-568936',
 'mp-4431',
 'mp-4515',
 'mp-554648',
 'mp-11794',
 'mp-5782',
 'mp-14091',
 'mp-14092',
 'mp-556434',
 'mp-569126',
 'mp-23231',
 'mp-22922',
 'mp-7592',
 'mp-5342',
 'mp-5518',
 'mp-4899',
 'mp-22894',
 'mp-22925',
 'mp-22660',
 'mp-19833',
 'mp-20554',
 'mp-22386',
 'mp-13383',
 'mp-1143',
 'mp-2172',
 'mp-661',
 'mp-1550',
 'mp-3955',
 'mp-2624',
 'mp-23155',
 'mp-641',
 'mp-909',
 'mp-484',
 'mp-23218',
 'mp-160',
 'mp-28395',
 'mp-864638',
 'mp-1078405',
 'mp-1079630',
 'mp-6325',
 'mp-1095255',
 'mp-7394',
 'mp-555166',
 'mp-861618',
 'mp-552806',
 'mp-23199',
 'mp-17947',
 'mp-5970',
 'mp-12364',
 'mp-13287',
 'mp-1029',
 'mp-2139',
 'mp-28007',
 'mp-1342',
 'mp-1500',
 'mp-866301',
 'mp-1253',
 'mp-1477',
 'mp-3163',
 'mp-1000',
 'mp-19049',
 'mp-570198',
 'mp-548469',
 'mp-2542',
 'mp-1541',
 'mp-252',
 'mp-30200',
 'mp-23262',
 'mp-23195',
 'mp-22856',
 'mp-541837',
 'mp-34202',
 'mp-23

In [16]:
mp_ids = []
for key, value in ids.items():
    if key == 'dielectric_constant' or key == 'hse06_band_gap' or key == 'pbe_+u_formation_enthalpy':
        continue
    print(len(ids[key]["mp-id"]))
    mp_ids.extend(ids[key]["mp-id"])

472
1143
46744


In [20]:
pbe = ids['pbe_+u_band_gap']["mp-id"]
print(len(pbe))
pbe_unique = list(set(pbe))
print(len(pbe_unique))

46744
46744


In [17]:
print(len(mp_ids))
mp_ids = list(set(mp_ids))
print(len(mp_ids))

48359
47143


In [18]:
mp_ids

['mp-21742',
 'mp-25634',
 'mp-767416',
 'mp-554553',
 'mp-768018',
 'mp-232',
 'mp-570609',
 'mp-989196',
 'mp-851262',
 'mp-851024',
 'mp-27172',
 'mp-984755',
 'mp-554294',
 'mp-9794',
 'mp-763046',
 'mp-689817',
 'mp-862865',
 'mp-772424',
 'mp-5338',
 'mp-3897',
 'mp-761422',
 'mp-626552',
 'mp-973410',
 'mp-23166',
 'mp-504613',
 'mp-510452',
 'mp-772632',
 'mp-22793',
 'mp-761931',
 'mp-4393',
 'mp-561414',
 'mp-770671',
 'mp-1005831',
 'mp-752754',
 'mp-971705',
 'mp-769026',
 'mp-17942',
 'mp-585265',
 'mp-23417',
 'mp-542752',
 'mp-667327',
 'mp-775968',
 'mp-758052',
 'mp-763238',
 'mp-557016',
 'mp-21122',
 'mp-971757',
 'mp-559959',
 'mp-10245',
 'mp-676725',
 'mp-31080',
 'mp-14288',
 'mp-16964',
 'mp-973250',
 'mp-571513',
 'mp-772524',
 'mp-627442',
 'mp-763169',
 'mp-761610',
 'mp-774194',
 'mp-11152',
 'mp-565849',
 'mp-30879',
 'mp-21229',
 'mp-10226',
 'mp-28897',
 'mp-28315',
 'mp-631567',
 'mp-6621',
 'mp-1011834',
 'mp-778079',
 'mp-756272',
 'mp-998758',
 'mp-73

In [23]:
import json


# mp_ids 리스트를 mp_ids 파일에 저장
with open("mp_ids.json", "w", encoding="utf-8-sig") as f:
    json.dump(mp_ids, f, indent=4)

# 2. 리스트 불러오기
with open("mp_ids.json", "r", encoding="utf-8-sig") as f:
    loaded_mp_ids = json.load(f)

# 불러온 리스트 출력하기
print(loaded_mp_ids)


['mp-20145', 'mp-14928', 'mp-4360', 'mp-770061', 'mp-15559', 'mp-776639', 'mp-770643', 'mp-545847', 'mp-555838', 'mp-733443', 'mp-1019602', 'mp-1018081', 'mp-765966', 'mp-17651', 'mp-1018093', 'mp-763161', 'mp-672256', 'mp-774233', 'mp-540276', 'mp-550514', 'mp-641367', 'mp-20326', 'mp-15431', 'mp-1007821', 'mp-766207', 'mp-780633', 'mp-762777', 'mp-10130', 'mp-531265', 'mp-646967', 'mp-705031', 'mp-755491', 'mp-6281', 'mp-558325', 'mp-505302', 'mp-622110', 'mp-5984', 'mp-765742', 'mp-850377', 'mp-776741', 'mp-558538', 'mp-542448', 'mp-26594', 'mp-29392', 'mp-504900', 'mp-779951', 'mp-18376', 'mp-6255', 'mp-542188', 'mp-972077', 'mp-773335', 'mp-754553', 'mp-754971', 'mp-753009', 'mp-570216', 'mp-673116', 'mp-1020027', 'mp-5082', 'mp-12146', 'mp-555874', 'mp-1016886', 'mp-27176', 'mp-549487', 'mp-761395', 'mp-12049', 'mp-764782', 'mp-851038', 'mp-1008394', 'mp-30385', 'mp-721617', 'mp-763389', 'mp-504554', 'mp-6455', 'mp-776233', 'mp-978304', 'mp-1008820', 'mp-764957', 'mp-1877', 'mp-3