### Batch-process the JSON files to obtain the MOF–guest molecule information cards

In [2]:
import os
import json
import csv

# Input folder and output locations.
data_folder = 'destination_folder_running'
output_csv = 'output.csv'
output_csv_to_llm = 'output_to_llm.csv'
error_log_file = 'error_log.txt'

# Column names of the full output CSV, in the same order as the data rows
# appended later in this script.
# NOTE: every entry needs its trailing comma. Adjacent string literals are
# concatenated by Python, so a missing comma silently merges two column
# names into one and shifts every following column.
full_csv_header = [
    'Materials Name',
    'Molecule Name',
    'Can Pass Sieving',
    'Pore Size',
    'Structure Type',
    'Material Category',
    'Sieving Mechanism',
    'Molecular Size',     # guest-molecule field
    'Mechanism',          # guest-molecule field
    'File Name',
    "Test Conditions",
    "Threshold Pressure",
    "Additional Info",
]

# Column names of the reduced CSV that is handed to the LLM.
llm_csv_header = [
    'Materials Name',
    'Molecule Name',
    'Can Pass Sieving',
    'Sieving Mechanism',
    'Mechanism',
    "Test Conditions",
    "Threshold Pressure",
]

# Create (or truncate) both output files and write their header rows.
with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(full_csv_header)

with open(output_csv_to_llm, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(llm_csv_header)

# Create (or truncate) the error log so each run starts with an empty log.
with open(error_log_file, mode='w', encoding='utf-8') as f:
    pass

# Guest molecules whose MOF records are also exported to the LLM-facing CSV.
# Matching is exact and case-sensitive. A frozenset is built once, instead of
# rebuilding and linearly scanning a list literal for every guest.
LLM_MOLECULE_NAMES = frozenset({
    "xenon",
    "oxygen",
    "carbon dioxide",
    "benzene",
    "carbon monoxide",
    "acetylene",
    "ethylene",
    "ethane",
    "propyne",
    "propylene",
    "propane",
    "methane",
    "xylene",
    "trans-piperylene",
    "trans-butene",
    "trans-3-hexene",
    "trans-2-hexene",
    "toluene",
    "styrene",
    "pentane",
    "octane",
    "nonane",
    "hexane",
    "heptane",
    "ammonia",
    "butane",
    "nitrous oxide",
    "krypton",
    "isoprene",
    "isopentane",
    "isobutene",
    "isobutane",
    "ethylbenzene",
    "cyclooctene",
    "cyclohexene",
    "cyclohexane",
    "cis-piperylene",
    "cis-2-butene",
    "carbon tetrafluoride",
    "butadiene",
    "1-chloropropane",
    "perfluoropropane",
    "3-methylpentane",
    "3,3-dimethylpentane",
    "2-methylpentane",
    "2,2-dimethylbutane",
    "2,3-dimethylbutane",
    "2,2-dimethylhexane",
    "2,2,4-trimethylpentane",
    "1-pentene",
    "1-octene",
    "1-hexene",
    "1-butene",
    "trans-2-butene",
})


def _log_error(message):
    """Append *message* verbatim to the error log file."""
    with open(error_log_file, mode='a', encoding='utf-8') as log:
        log.write(message)


def _iter_guest_rows(item, filename):
    """Yield ``(full_row, llm_row)`` for every guest molecule of one entry.

    Args:
        item: One element of the top-level JSON list (expected to be a dict).
        filename: Name of the JSON file the entry came from; stored in the
            'File Name' column of the full row.

    Yields:
        A pair of lists. ``full_row`` matches the column order of the full
        CSV. ``llm_row`` matches the reduced CSV, or is ``None`` when the
        record does not qualify (material category is not "MOFs" or the
        molecule is not in ``LLM_MOLECULE_NAMES``).

    Raises:
        ValueError: If 'Guest Molecules' is present but is not a list.
        AttributeError: If *item*, a guest entry, or a nested section is
            not a JSON object.
    """
    # NOTE: an earlier revision read this value from the 'MOF Name' key.
    materials_name = item.get('Materials Name')
    material_category = item.get('Material Category')
    sieving_mechanism = item.get('Sieving Mechanism')

    # `or {}` also covers an explicit JSON null, which `.get(key, {})`
    # would pass through as None.
    pore_features = item.get('Pore/Structural Features') or {}
    pore_size = pore_features.get('Pore Size')
    structure_type = pore_features.get('Structure Type')

    guest_molecules = item.get('Guest Molecules', [])
    if not isinstance(guest_molecules, list):
        raise ValueError("Guest Molecules 不是列表")

    for guest in guest_molecules:
        molecule_name = guest.get('Molecule Name')
        molecular_size = guest.get('Molecular Size')
        can_pass_sieving = guest.get('Can Pass Sieving')
        mechanism = guest.get('Mechanism')

        # Missing optional details become empty CSV cells instead of raising
        # KeyError, which used to discard this guest and every remaining
        # guest of the same entry.
        other_info = guest.get("Other Information") or {}
        test_conditions = other_info.get("Test Conditions")
        threshold_pressure = other_info.get("Threshold Pressure")
        additional_info = other_info.get("Additional Info")

        full_row = [
            materials_name,
            molecule_name,
            can_pass_sieving,
            pore_size,
            structure_type,
            material_category,
            sieving_mechanism,
            molecular_size,
            mechanism,
            filename,
            test_conditions,
            threshold_pressure,
            additional_info,
        ]

        llm_row = None
        # The isinstance guard keeps unhashable names (e.g. a JSON list)
        # from raising TypeError in the set lookup; they simply don't match.
        if (
            material_category == "MOFs"
            and isinstance(molecule_name, str)
            and molecule_name in LLM_MOLECULE_NAMES
        ):
            llm_row = [
                materials_name,
                molecule_name,
                can_pass_sieving,
                sieving_mechanism,
                mechanism,
                test_conditions,
                threshold_pressure,
            ]

        yield full_row, llm_row


# Process every JSON file in the data folder. Sorting makes the order of the
# output rows reproducible; os.listdir() returns names in arbitrary order.
json_filenames = sorted(
    name for name in os.listdir(data_folder) if name.endswith('.json')
)

# Open both CSV files once in append mode (header rows are expected to be
# there already) rather than reopening them for every single row.
with open(output_csv, mode='a', newline='', encoding='utf-8') as full_file, \
        open(output_csv_to_llm, mode='a', newline='', encoding='utf-8') as llm_file:
    full_writer = csv.writer(full_file)
    llm_writer = csv.writer(llm_file)

    for filename in json_filenames:
        file_path = os.path.join(data_folder, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = json.load(f)

            # Reject null or any other non-list top-level value.
            if not isinstance(content, list):
                raise ValueError("JSON 内容不是列表")
        except Exception as e:
            # Whole-file failure: log it and move on to the next file.
            _log_error(f"【{filename}】无法读取或解析: {str(e)}\n")
            continue

        for item in content:
            try:
                for full_row, llm_row in _iter_guest_rows(item, filename):
                    full_writer.writerow(full_row)
                    if llm_row is not None:
                        llm_writer.writerow(llm_row)
            except Exception as inner_error:
                # Single-entry failure: log it and continue with the next
                # entry. Rows already written for this entry are kept.
                _log_error(f"【{filename}】处理条目时出错: {str(inner_error)}\n")