In [6]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split



### Isotherms Dataset

In [13]:
# Load the three isotherm CSV parts and stack them into a single frame.
iso_dir = '/data/yll6162/mof_cnn/PSED_data'
df_iso1, df_iso2, df_iso3 = (
    pd.read_csv(f'{iso_dir}/extracted_cm3_per_cm3_values_part{part}.csv')
    for part in (1, 2, 3)
)
df_iso = pd.concat([df_iso1, df_iso2, df_iso3], ignore_index=True)

# Tag each row with its source database based on the project-name prefix.
# The vectorised .str accessor with na=False treats a missing project name as
# "not qmof" instead of raising inside a row-wise lambda.
is_qmof = df_iso['project'].str.startswith('qmof', na=False)
df_iso['database'] = np.where(is_qmof, 'qmof', 'ToBaCCo')

# Exclude rows that have no Xe value (NaN).
df_iso = df_iso.dropna(subset=['Xe_cm3_per_cm3_value'])

# The output directory is otherwise only created by a later cell, so make sure
# it exists before writing; without this the notebook fails on a fresh machine.
all_csv_path = '/data/yll6162/mof_cnn/data_mix_13k/all.csv'
os.makedirs(os.path.dirname(all_csv_path), exist_ok=True)
df_iso.to_csv(all_csv_path, index=False)
df_iso

Unnamed: 0,project,Kr_cm3_per_cm3_value,Kr_cm3_per_cm3_error,Xe_cm3_per_cm3_value,Xe_cm3_per_cm3_error,database
0,f1_toc_35_1x1x1,21.968105,0.145037,45.545074,0.669707,ToBaCCo
1,f1_jus_68_1x1x1,19.520688,0.558390,39.775483,0.330554,ToBaCCo
2,f4_msw_53_1x1x1,6.545588,0.048030,6.104163,0.051518,ToBaCCo
3,f5_hea_216_1x1x1,5.038593,0.087883,4.116906,0.043364,ToBaCCo
4,f5_ssb_269_1x1x1,8.775253,0.167659,12.015802,0.294389,ToBaCCo
...,...,...,...,...,...,...
15942,qmof-9fd01d7,5.715072,0.099135,5.179809,0.061983,qmof
15943,qmof-e05a715,3.364124,0.068859,2.239603,0.029000,qmof
15944,qmof-92f453c,11.135041,0.912316,17.929968,0.925652,qmof
15945,qmof-424ce8c,21.451271,0.139641,36.696615,0.183825,qmof


### Energy Grid Dataset

In [1]:
# List the contents of the PSED_data directory (shell command run via the IPython `!` escape).
!ls  /data/yll6162/mof_cnn/PSED_data/

converted_10bar_5k.csv			extracted_cm3_per_cm3_values_part3.csv
converted_10bar_qmof.csv		New_Parsed
converted_8k_10bar.csv			New_Parsed_new_new.tar.gz
extracted_cm3_per_cm3_values_part1.csv	New_Parsed.tar.gz
extracted_cm3_per_cm3_values_part2.csv


#### 1. Data Integrity Check

In [3]:
# Integrity check: classify every MOF folder by the state of its energy grid file.
#   absent_files   - ASCI_Grids exists but the grid file is missing
#   empty_files    - the grid file has zero lines (also counted as abnormal)
#   abnormal_files - the grid file has an unexpected number of lines
#   normal_files   - the grid file has exactly the expected number of lines

# Define the top-level directory
base_dir = "/data/yll6162/mof_cnn/PSED_data/New_Parsed_new_new/New_Parsed/" ## REPLACE WITH YOUR DIRECTORY

# Define the relative subdirectory and file names to check
subdir = "ASCI_Grids"
file_names = ["energy_grid.txt"]

# A complete grid file holds one line per point of a 41 x 41 x 41 grid.
grid_points_per_axis = 41
expected_line_count = grid_points_per_axis ** 3  # 68921

abnormal_files = []
empty_files = []
absent_files = []
normal_files = []

# os.listdir returns entries in arbitrary order; sorting makes the resulting
# lists (and any seeded split built from them) identical on every run/machine.
for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)

    # Only directories can hold a grid; skip stray files.
    if not os.path.isdir(folder_path):
        continue

    asci_grids_path = os.path.join(folder_path, subdir)
    if not os.path.exists(asci_grids_path):
        print(f"ASCI_Grids subdirectory does not exist in folder: {folder}")
        continue

    for file_name in file_names:
        file_path = os.path.join(asci_grids_path, file_name)

        if not os.path.exists(file_path):
            absent_files.append(folder)
            print(f"{file_name} does not exist in folder: {folder}")
            continue

        try:
            # Count lines by streaming so the whole file is never held in memory.
            with open(file_path, 'r') as file:
                line_count = sum(1 for _ in file)
        except (OSError, UnicodeError) as e:
            print(f"Error reading {file_name} in folder {folder}: {e}")
            continue

        if line_count == expected_line_count:
            normal_files.append(folder)
            continue

        # Wrong line count: record it (empty files are additionally tracked on
        # their own) and stop checking further files for this folder.
        if line_count == 0:
            empty_files.append(folder)
        abnormal_files.append(folder)
        break

print(f"Absent energy_grid.txt files: {len(absent_files)}")
print(f"Empty energy_grid.txt files: {len(empty_files)}")
print(f"Abnormal energy_grid.txt files: {len(abnormal_files)}")
print(f"Normal energy_grid.txt files: {len(normal_files)}")


energy_grid.txt does not exist in folder: f1_tpt_5_1x1x1
energy_grid.txt does not exist in folder: f1_tpt_26_1x1x1
energy_grid.txt does not exist in folder: f6_wml_28_1x1x1
energy_grid.txt does not exist in folder: f4_ukk_13_1x1x1
energy_grid.txt does not exist in folder: f5_tpt_5_1x1x1
energy_grid.txt does not exist in folder: f7_tpt_89_1x1x1
energy_grid.txt does not exist in folder: f5_tpt_145_1x1x1
Absent energy_grid.txt files: 7
Empty energy_grid.txt files: 0
Abnormal energy_grid.txt files: 1
Normal energy_grid.txt files: 13977


In [8]:
# Keep only the MOFs that have a row in the isotherm table.
# A set gives constant-time membership tests; scanning the `project` column
# once per MOF would cost one full pass over the table for every folder.
iso_projects = set(df_iso['project'])

filterd_mofs = [mof_file for mof_file in normal_files if mof_file in iso_projects]
# MOFs with a valid energy grid but no isotherm row, kept for inspection.
unmatched_mofs = [mof_file for mof_file in normal_files if mof_file not in iso_projects]

print(f"Filtered mofs: {len(filterd_mofs)}")
print(f"Unmatched mofs: {len(unmatched_mofs)}")

Filtered mofs: 13966
Unmatched mofs: 11


#### 2. Dataset Split: Train and Test Set

In [9]:
# Split the filtered MOF names into train/test sets and write one manifest
# (clean.json) per subset containing the names, the grid size and the count.
test_size = 0.1
split_seed = 42  # fixed seed so the split is reproducible
output_dir = '/data/yll6162/mof_cnn/data_mix_13k'

# Pass the configured test_size rather than repeating the literal, so that
# changing the variable above actually changes the split.
train_files, test_files = train_test_split(
    filterd_mofs, test_size=test_size, random_state=split_seed
)
print(f"Training files: {len(train_files)}")
print(f"Testing files: {len(test_files)}")

datset_mof = {'train': train_files, 'test': test_files}

grid = 41

for subset in ['train', 'test']:
    dir_path = os.path.join(output_dir, subset)
    os.makedirs(dir_path, exist_ok=True)

    subset_names = datset_mof[subset]
    subset_data = {
        'name': subset_names,
        'grid': grid,
        'size': len(subset_names),
    }
    with open(f'{dir_path}/clean.json', 'w') as f:
        json.dump(subset_data, f)
    print(f"saved {subset} data to {dir_path}/clean.json")


Training files: 12569
Testing files: 1397
saved train data to /data/yll6162/mof_cnn/data_mix_13k/train/clean.json
saved test data to /data/yll6162/mof_cnn/data_mix_13k/test/clean.json


#### 3. Filter and Merge Energy Grid to Structured Dataset

In [12]:
def load_energy_grid(path, grid_size, cap):
    """Read one energy grid text file into a capped cubic float array.

    Each line of the file holds one value. Lines containing only ``?`` are
    replaced by ``cap``, and every value greater than ``cap`` is lowered to
    ``cap``.

    Parameters
    ----------
    path : str
        Path of the energy grid text file.
    grid_size : int
        Number of points per axis; the file must hold ``grid_size ** 3`` lines.
    cap : float
        Upper bound applied to the values, also used in place of ``?``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(grid_size, grid_size, grid_size)``.

    Raises
    ------
    ValueError
        If a line is neither ``?`` nor a number, or the line count does not
        match ``grid_size ** 3``.
    AssertionError
        If the parsed grid contains NaN.
    """
    with open(path, "r") as file:
        # Strip whitespace so the '?' placeholder is recognised regardless of
        # the line ending (or a missing newline on the last line).
        tokens = np.array([line.strip() for line in file], dtype=object)

    values = np.where(tokens == '?', cap, tokens).astype(float)
    capped = np.minimum(values, cap)
    grid_values = capped.reshape((grid_size, grid_size, grid_size))
    assert not np.isnan(grid_values).any(), f"NaN values in energy grid: {path}"
    return grid_values


pos_cap = 0
source_dir = "/data/yll6162/mof_cnn/PSED_data/New_Parsed_new_new/New_Parsed"
grid = 41
data_splits = [train_files, test_files]  # not used in this cell

for subset in ['train', 'test']:
    dir_path = os.path.join(output_dir, subset)
    subset_names = datset_mof[subset]

    # Fill a preallocated array in place; collecting per-file arrays in a list
    # and stacking them afterwards would hold two full copies in memory.
    dataset_array = np.empty((len(subset_names), grid, grid, grid), dtype=float)
    for position, mof_file in enumerate(subset_names):
        grid_path = f"{source_dir}/{mof_file}/ASCI_Grids/energy_grid.txt"
        dataset_array[position] = load_energy_grid(grid_path, grid, pos_cap)

    np.save(f"{dir_path}/clean.npy", dataset_array)
    print(f"saved {subset} data to {dir_path}/clean.npy")

saved train data to /data/yll6162/mof_cnn/data_mix_13k/train/clean.npy
saved test data to /data/yll6162/mof_cnn/data_mix_13k/test/clean.npy
