In [2]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

### Generate the `id_prop` CSV (optional)


In [3]:
# id_prop dataset
# Builds the label table (CIF file name, site, target energy) and writes it to
# exactly one CSV. The "_first_row" suffix is added only when the metal filter
# is active, so a filtered table is never stored under the unfiltered name.
prop = "oxo"        # key into prop_map: "oxo" or "h"
run_idx = 1
first_row = False   # True -> keep only the metals listed in the filter below
prop_map = {"oxo":"Oxo Formation Energy", "h": "Hydrogen Affinity Energy"}
df_label = pd.read_csv("/scratch/yll6162/MOF-oxo/labels/Combined_Energies_Removed_Infeasible_Out.csv", index_col = 0)
if first_row:
    df_label = df_label[df_label.Metal.isin(['Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn'])]
# assign() returns a new frame, so the column is never set on a filtered slice
# of the original table.
df_label = df_label.assign(sample=df_label.MOF + ".cif")
df_label = df_label[["sample", "Site", prop_map[prop]]]

id_prop_path = f"/scratch/yll6162/MOF-oxo/MOFs_oms/id_prop_{prop}_{run_idx}"
if first_row:
    id_prop_path += "_first_row"
df_label.to_csv(id_prop_path + ".csv", index=False)

### Select RAC features


In [9]:
# Load the RAC feature table, one-hot encode the 'Metal' column, mean-impute
# missing feature values, add the CIF file name, and save the cleaned table.
tl = False  # RAC-only features unless a later cell switches this on

folder_path = './mof_features'
df_fea = pd.read_csv(os.path.join(folder_path, 'Features_RACS.csv'), index_col=0)
# Use a positional index so the frames built below line up row-for-row.
df_fea = df_fea.reset_index(drop=True)

# Not used in this cell. It belongs to a disabled option that label-encoded the
# string descriptors 'HOMO_character', 'HOMO_element', 'LUMO_character' and
# 'LUMO_element' when those columns are present.
label_encoder = LabelEncoder()

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; fall back to the old spelling only on releases that predate the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded_feature = encoder.fit_transform(df_fea[['Metal']])
metal_onehot = pd.DataFrame(
    encoded_feature,
    columns=encoder.get_feature_names_out(['Metal']),
    index=df_fea.index,
)
df_fea = pd.concat([df_fea.drop(columns=['Metal']), metal_onehot], axis=1)

imputer = SimpleImputer(strategy='mean')
# NOTE(review): assumes the first two columns are the identifier columns
# ('MOF Name' and 'Metal_index') and every later column is numeric - confirm
# against Features_RACS.csv.
id_cols = df_fea.columns[:2]
feature_cols = df_fea.columns[2:]
imputed = pd.DataFrame(
    imputer.fit_transform(df_fea[feature_cols].values),
    columns=feature_cols,
    index=df_fea.index,
)
# Rebuild the frame in one concat rather than assigning the imputed columns one
# by one; the per-column assignment is the likely cause of the warning that was
# echoed for the 'sample' insert below.
df_fea = pd.concat([df_fea[id_cols], imputed], axis=1)
df_fea['sample'] = df_fea['MOF Name'] + '.cif'
filename = 'racs_all_clean.csv'
df_fea.to_csv(os.path.join(folder_path, filename))


  df_fea['sample'] = df_fea['MOF Name'] + '.cif'


### Combine with ALIGNN Embeddings (Optional)


In [10]:
# Attach ALIGNN embeddings to the feature table and flag the notebook as being
# in embedding mode, which the dataset-writing cell uses when naming its output.
tl = True

# Embedding run to attach. Other runs that have been used here:
#   'mof_dband_embed', 'mof_form_e_embed', 'mof_ehull_embed', 'mof_opt_bandgap_embed'
run = 'mof_bandgap_embed'

# Earlier location: f"/scratch/yll6162/ALIGNNTL/examples/{run}/x+y+z/data0.csv"
embed_filepath = f"/data/yll6162/mof/{run}/x+y+z/data0.csv"

# Keep one embedding row per 'id' so the left join cannot duplicate feature rows.
df_embed = pd.read_csv(embed_filepath).drop_duplicates(subset=['id'])

df_fea_all = pd.merge(
    df_fea,
    df_embed,
    how='left',
    left_on="MOF Name",
    right_on="id",
)
# Drop the join key and bookkeeping columns if present; errors='ignore' lets
# the drop succeed when some of them are absent.
df_fea_all = df_fea_all.drop(columns=['oxo_1', 'id', 'full'], errors='ignore')
df_fea = df_fea_all


### Feature Scaling

In [11]:
# Standardize the RAC descriptor columns of df_fea in place.
# StandardScaler is imported in the notebook's import cell.
racs_col = [col for col in df_fea.columns if col.startswith("racs_")]
# Column names '0'..'767'; presumably the ALIGNN embedding columns (confirm).
# They are only needed by the disabled option of scaling the embeddings with a
# second StandardScaler, so the embeddings are currently left unscaled.
embed_cols = [str(i) for i in range(768)]

if not racs_col:
    # Fail with a message that names the real problem, not the scaler's
    # generic zero-feature error.
    raise ValueError("df_fea has no columns starting with 'racs_'; nothing to scale.")

scaler_A = StandardScaler()
df_fea[racs_col] = scaler_A.fit_transform(df_fea[racs_col].values)

### Generate Train Dataset

In [12]:
# Join the labels with the feature table on (CIF file, metal site) and write
# the training CSV. Output columns: features, then the target, then 'ids'.
# prop = 'Oxo Formation Energy'
first_row = True    # True -> keep only the metals listed in the filter below
prop = 'Hydrogen Affinity Energy'   # label column used as the target
output_dir = './data/'
query_idx = 1
# Earlier label source: "./labels/qmofruns_2_suffled.csv"
df_qmof = pd.read_csv("/scratch/yll6162/MOF-oxo/labels/Combined_Energies_Removed_Infeasible_Out.csv", index_col = 0)
if first_row:
    df_qmof = df_qmof[df_qmof.Metal.isin(['Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn'])]
# assign() returns a new frame, so the column is never set on a filtered slice.
df_qmof = df_qmof.assign(sample=df_qmof.MOF + '.cif')[['sample', 'Site', prop]]

# Make sure the join key exists on the feature side; assign() also hands back
# a fresh copy of the frame before the merge.
df_fea = df_fea.assign(sample=df_fea['MOF Name'] + '.cif')

# NOTE(review): 'Site' and 'Metal_index' must share a dtype for rows to match,
# and the inner join silently drops unmatched rows - confirm the row count.
df_sample = df_qmof.merge(df_fea, how='inner', left_on=['sample', 'Site'], right_on = ['sample','Metal_index'])

# Build the id and target columns first, then assemble the final frame with a
# single concat; inserting 'ids' into the wide merged frame is what the echoed
# warning line after this cell points at.
sample_ids = (df_sample['sample'] + '_' + df_sample['Site'].astype(str)).rename('ids')
target = df_sample[prop]
df_sample = df_sample.drop(columns = ['sample', 'Site', 'MOF Name', 'Metal_index', prop])
df_sample = pd.concat([df_sample, target, sample_ids], axis=1)

# One write path: only the feature tag in the file name depends on `tl`.
feature_tag = run if tl else "racs"
filename = f"query_{query_idx}_{feature_tag}_{prop}"
if first_row:
    filename += "_first_row"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories
df_sample.to_csv(os.path.join(output_dir, filename + ".csv"), index=False)

  df_sample['ids'] = df_sample['sample'] + '_' + df_sample['Site'].astype(str)
