### Use the Dify workflow API as the LLM processing method. The Google Gemini API can also be called directly.

In [1]:
import requests
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

def extract_json_from_text(text):
    """
    Extract and parse JSON data from ``text``.

    Candidates are tried in this order:

    1. The content of the first code fence tagged ``json`` (tag matched
       case-insensitively), if one exists.
    2. Otherwise the whole text, parsed as-is.
    3. Otherwise the content of the first untagged code fence. This is only
       a fallback, so text that already parses as a whole is never
       reinterpreted.

    Args:
        text: The raw string to parse (typically LLM output).

    Returns:
        The decoded Python object, or ``None`` when ``text`` is not a string
        or no candidate is valid JSON. A message is printed in both failure
        cases.
    """
    # re.search would raise TypeError on None / non-string input; treat it as
    # a parse failure instead so batch callers are not aborted.
    if not isinstance(text, str):
        print(f"JSON 解析错误: expected str, got {type(text).__name__}")
        return None

    tagged = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL | re.IGNORECASE)
    if tagged:
        candidates = [tagged.group(1)]
    else:
        # No json-tagged fence: assume the whole text is JSON first, and only
        # then look inside an untagged fence.
        candidates = [text]
        untagged = re.search(r'```\s*(.*?)\s*```', text, re.DOTALL)
        if untagged:
            candidates.append(untagged.group(1))

    first_error = None
    for json_str in candidates:
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            # Report the error of the primary candidate, not of the fallback.
            if first_error is None:
                first_error = e

    print(f"JSON 解析错误: {first_error}")
    return None
        
def upload_file(file_path, user, timeout=None):
    """
    Upload a local file to the file-upload endpoint of the workflow server.

    Args:
        file_path: Path of the local file to upload.
        user: User identifier sent in the ``user`` form field.
        timeout: Optional timeout (seconds) forwarded to ``requests.post``.
            ``None`` (the default) keeps the previous behavior of waiting
            indefinitely.

    Returns:
        The ``id`` field of the JSON response when the server answers with
        HTTP 201; ``None`` on any other status code or if an exception is
        raised while reading the file or performing the request.
    """
    import mimetypes

    upload_url = "http://172.19.7.73/v1/files/upload"
    headers = {
        # NOTE(review): the token is blank in this notebook — supply the real
        # value before running.
        "Authorization": "",
    }

    try:
        print("上传文件中...")
        # Send only the base name (not the local path) as the multipart
        # filename, and a MIME type that matches the file's extension
        # instead of a hard-coded 'text/plain'.
        filename = os.path.basename(file_path)
        mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"

        with open(file_path, 'rb') as file:
            files = {
                'file': (filename, file, mime_type)
            }
            data = {
                "user": user,
                "type": "PDF"  # file type declared to the server
            }
            response = requests.post(
                upload_url, headers=headers, files=files, data=data, timeout=timeout
            )

        if response.status_code == 201:  # 201 means the file was created
            print("文件上传成功")
            return response.json().get("id")  # ID of the uploaded file
        print(f"文件上传失败，状态码: {response.status_code}")
        return None
    except Exception as e:
        # Best-effort contract: report the problem and let the caller skip
        # this file.
        print(f"发生错误: {str(e)}")
        return None

def run_workflow(file_id, user, response_mode="blocking"):
    """
    Run the workflow on a previously uploaded file.

    Args:
        file_id: ID of the uploaded file, passed as the ``fy_file`` input.
        user: User identifier sent with the request.
        response_mode: Value of the ``response_mode`` request field
            (defaults to ``"blocking"``).

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        Otherwise a dict ``{"status": "error", "message": ...}`` describing
        either the unexpected status code or the exception that was raised.
    """
    endpoint = "http://172.19.7.73/v1/workflows/run"
    request_headers = {"Authorization": "", "Content-Type": "application/json"}

    # The uploaded file is referenced by ID rather than sent again.
    uploaded_document = {
        "transfer_method": "local_file",
        "upload_file_id": file_id,
        "type": "document",
    }
    payload = {
        "inputs": {"fy_file": uploaded_document},
        "response_mode": response_mode,
        "user": user,
    }

    try:
        print("运行工作流...")
        reply = requests.post(endpoint, headers=request_headers, json=payload)

        # Guard clause: anything other than 200 is reported as an error dict.
        if reply.status_code != 200:
            print(f"工作流执行失败，状态码: {reply.status_code}")
            return {"status": "error", "message": f"Failed to execute workflow, status code: {reply.status_code}"}

        print("工作流执行成功")
        return reply.json()
    except Exception as err:
        print(f"发生错误: {str(err)}")
        return {"status": "error", "message": str(err)}


In [None]:
# Batch-process every PDF in the working folder: upload it, run the workflow,
# save the raw workflow response, then save the JSON extracted from its text
# output.
source_dir = "./destination_folder_running"
user = ""

for file in os.listdir(source_dir):
    if not file.endswith(".pdf"):
        continue

    file_path = f"{source_dir}/{file}"
    stem = file[:-4]  # file name without the ".pdf" suffix

    # Upload the file
    file_id = upload_file(file_path, user)
    print(file_id)
    if not file_id:
        print("文件上传失败，无法执行工作流")
        continue

    # Upload succeeded, so run the workflow on it
    result = run_workflow(file_id, user)

    # Always keep the raw response, including error responses, for inspection.
    with open(f"{source_dir}/workflow_{stem}.json", 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # run_workflow returns {"status": "error", ...} (no "data" key) on
    # failure; indexing result['data']['outputs']['text'] directly would
    # raise KeyError and abort the remaining files.
    outputs = (result.get("data") or {}).get("outputs")
    if not isinstance(outputs, dict) or "text" not in outputs:
        print(f"工作流未返回 text 输出，跳过解析: {file}")
        continue

    raw_text = outputs["text"]
    parsed = extract_json_from_text(raw_text) if raw_text is not None else None
    with open(f"{source_dir}/{stem}.json", 'w', encoding='utf-8') as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)


### Based on the Gemini API

In [3]:
# System instruction sent to the model: describes the JSON schema to extract
# for each MOF and gives one worked example (input text + expected output).
# The expected-output example must itself be valid JSON and use only the
# "Material Category" values the instructions allow.
prompt = """🧠 Optimized Final Prompt: Standardized Extraction of Size-Sieving MOF Data (Fully English)

You are an expert materials science AI assistant specializing in the analysis of scientific literature on Metal-Organic Frameworks (MOFs). Your task is to meticulously extract and standardize information about MOFs demonstrating size-sieving capabilities, as reported in scientific texts.

The output must be a strictly formatted JSON array. Each object in the array corresponds to one MOF. If the input text describes one or multiple MOFs, all MOFs should be represented as distinct objects within this single JSON array. If any specific piece of information for a field is not present in the provided text or is ambiguous, use the string "Not Reported".

Absolutely no explanations, markdown, or any text outside the JSON array itself should be returned.

✅ Fields to Extract for Each MOF:

Each MOF object must have the following structure. All fields are mandatory.

JSON

[
  {
    "MOF Name": "string", // e.g., ZIF-8, UiO-66, MOF-X. If a common name and a chemical formula are given, prefer the common name.
    "Material Category": "string", // Strictly use one of: MOFs, MOFs membrane, MOF composite, or Others
    "Pore/Structural Features": {
      "Pore Size": "string", // Report the effective pore diameter or aperture size relevant to sieving, including units (e.g., "0.3 nm", "3.4 Å"). If a range is given, report the range (e.g., "0.3-0.35 nm").
      "Structure Type": "string" // Describe the dimensionality and nature of the pores/channels (e.g., "One-dimensional channels", "Two-dimensional interlayer pores", "Three-dimensional interconnected cages", "Ultramicroporous").
    },
    "Topology": "string", // e.g., pcu, zeo, nbo. If not specified, use "Not Reported".
    "Metal Nodes": ["string"], // List the metal ions or clusters, e.g., ["Zn2+"], ["Zr6O4(OH)4"]. If generic (e.g., 'M'), use that.
    "Ligands": ["string"], // List the organic linkers, e.g., ["2-methylimidazole"], ["benzene-1,4-dicarboxylate"]. If generic (e.g., 'L'), use that.
    "Guest Molecules": [ // List each guest molecule evaluated for sieving properties separately.
      {
        "Molecule Name": "string", // Chemical name or formula, e.g., "CO2", "Methane", "G1".
        "Molecular Size": "string", // Kinetic diameter or other relevant molecular dimension, including units (e.g., "0.33 nm", "3.8 Å").
        "Can Pass Sieving": "string", // Strictly use one of: "Yes", "No", "Temperature-dependent", "Pressure-dependent", "Concentration-dependent", "Conditional (see Additional Info)", "Not Reported".
        "Mechanism": "string", // Brief explanation for this specific molecule's passage or rejection, e.g., "Kinetic sieving", "Size exclusion", "Shape selectivity", "Blocked by smaller co-adsorbed molecule", "Passes due to framework flexibility".
        "Other Information": {
          "Test Conditions": "string", // Relevant experimental conditions for this molecule's behavior, e.g., "Temperature: 298 K", "Pressure: 1 bar", "Partial pressure: 0.1 atm", "Temperature < 100°C".
          "Threshold Pressure": "string", // Pressure at which gate-opening occurs or passage behavior changes significantly for this molecule, if applicable, including units (e.g., "1.5 MPa", "10 bar"). Use "Not Applicable" if no gate-opening or pressure threshold is mentioned for this molecule.
          "Additional Info": "string" // Any other relevant details for this specific guest molecule's sieving behavior. Explain "Conditional" `Can Pass Sieving` here.
        }
      }
    ],
    "Sieving Mechanism": "string", // Overall sieving mechanism of the MOF, e.g., "Molecular sieving based on pore aperture", "Shape-selective diffusion", "Gate-opening effect controlled by guest-framework interactions", "Ultramicropore-filling".
    "Experimental Validation": {
      "Method": "string", // Technique used to confirm sieving, e.g., "Breakthrough experiments", "Membrane permeation tests", "Single-crystal X-ray diffraction with guest loading", "GCMC simulations", "Mixed-gas adsorption".
      "Performance Metrics": "string" // Key results or quantitative data, e.g., "Selectivity CO2/N2: 50", "Permeance H2: 10^-7 mol m-2 s-1 Pa-1", "Uptake capacity: G1 > G2 at STP", "Separation factor: X".
    },
    "Other Information": {
      "Additional Info": "string" // Any other relevant general information about the MOF's sieving properties not covered elsewhere, e.g., "Flexible framework", "Hydrophobic pores", "Studied as a membrane", "Post-synthetic modification applied".
    }
  }
]
⚠️ Critical Instructions & Reminders:

Single JSON Array Output: The entire response must be a single, valid JSON array. If multiple MOFs are described, they should be separate objects within this one array. No markdown, comments, or other text.
Mandatory Fields: All specified fields in the JSON structure must be present for each MOF object. Use "Not Reported" if the information is unavailable in the source text.
Multiple MOFs/Guests: If the input text describes multiple MOFs, create a separate JSON object for each MOF within the main array. Similarly, list each distinct guest molecule as a separate object within that MOF's Guest Molecules array.
Units: Always include units for physical quantities like size, pressure, and temperature if they are reported in the text (e.g., "nm", "Å", "MPa", "bar", "K", "°C").
Specific Values for Can Pass Sieving: Adhere strictly to the allowed string values: "Yes", "No", "Temperature-dependent", "Pressure-dependent", "Concentration-dependent", "Conditional (see Additional Info)", or "Not Reported". If behavior is complex (e.g., dependent on both T and P), use "Conditional (see Additional Info)" and detail in the Additional Info sub-field for that guest.
Threshold Pressure: Use "Not Applicable" for Guest Molecules > Other Information > Threshold Pressure if no specific gate-opening or pressure threshold is mentioned for that particular molecule's passage, even if the MOF is generally flexible. Only report a value if a threshold is explicitly tied to a guest's passage.
Descriptive Strings: For fields like "Structure Type" or "Mechanism", aim for concise but descriptive strings based on the text.
Metal Nodes/Ligands: If the text uses generic identifiers (e.g., "M" for metal, "L" for ligand) and not specific chemical names/formulas, use those generic identifiers.
🔍 Example Input (Scientific Text describing MULTIPLE MOFs):

“The flexible soft-solid MOF-X membrane demonstrates two-dimensional straight-through interlayer pores (~0.3 nm), enabling efficient G1/G2 separation. G1 molecules (kinetic diameter ~0.289 nm) pass through the pores at temperatures below 100°C, while G2 (0.33 nm) is effectively rejected except at pressures above 1.5 MPa, which triggers a slight framework expansion. The MOF-X is built from M-clusters and L1/L2 linkers. Gas separation experiments confirmed high G1/G2 selectivity that varies with conditions. No specific topology was identified for MOF-X.

A second material, ZIF-Y, possesses rigid three-dimensional cages with 0.34 nm apertures. ZIF-Y efficiently separates H2 (0.289 nm) from N2 (0.364 nm) at ambient temperature (298 K) and 1 atm. H2 passes readily, while N2 is excluded. ZIF-Y is composed of Zn ions and imidazole-derived linkers. Its topology is SOD. Breakthrough tests were used for validation, showing a H2/N2 separation factor of 25.”

📤 Expected Output (Strict JSON Only, showing MULTIPLE MOFs in the array):

JSON

[
  {
    "MOF Name": "MOF-X",
    "Material Category": "MOFs membrane",
    "Pore/Structural Features": {
      "Pore Size": "~0.3 nm",
      "Structure Type": "Two-dimensional straight-through interlayer pores"
    },
    "Topology": "Not Reported",
    "Metal Nodes": ["M-clusters"],
    "Ligands": ["L1", "L2"],
    "Guest Molecules": [
      {
        "Molecule Name": "G1",
        "Molecular Size": "~0.289 nm",
        "Can Pass Sieving": "Temperature-dependent",
        "Mechanism": "Size exclusion, passes when thermally permitted",
        "Other Information": {
          "Test Conditions": "Temperatures < 100°C",
          "Threshold Pressure": "Not Applicable",
          "Additional Info": "Passes through pores at lower temperatures."
        }
      },
      {
        "Molecule Name": "G2",
        "Molecular Size": "0.33 nm",
        "Can Pass Sieving": "Pressure-dependent",
        "Mechanism": "Size exclusion at low pressure, passage upon framework expansion at high pressure",
        "Other Information": {
          "Test Conditions": "Rejected except at pressures > 1.5 MPa",
          "Threshold Pressure": "1.5 MPa",
          "Additional Info": "Passage linked to framework expansion triggered by pressure."
        }
      }
    ],
    "Sieving Mechanism": "Size-based exclusion with pressure-induced gate-opening/framework expansion",
    "Experimental Validation": {
      "Method": "Gas separation experiments",
      "Performance Metrics": "High G1/G2 selectivity, varies with conditions"
    },
    "Other Information": {
      "Additional Info": "Flexible soft-solid MOF studied as a membrane."
    }
  },
  {
    "MOF Name": "ZIF-Y",
    "Material Category": "MOFs",
    "Pore/Structural Features": {
      "Pore Size": "0.34 nm",
      "Structure Type": "Rigid three-dimensional cages"
    },
    "Topology": "SOD",
    "Metal Nodes": ["Zn ions"],
    "Ligands": ["imidazole-derived linkers"],
    "Guest Molecules": [
      {
        "Molecule Name": "H2",
        "Molecular Size": "0.289 nm",
        "Can Pass Sieving": "Yes",
        "Mechanism": "Size exclusion, smaller molecule passes readily",
        "Other Information": {
          "Test Conditions": "Ambient temperature (298 K), 1 atm",
          "Threshold Pressure": "Not Applicable",
          "Additional Info": "Readily passes through apertures."
        }
      },
      {
        "Molecule Name": "N2",
        "Molecular Size": "0.364 nm",
        "Can Pass Sieving": "No",
        "Mechanism": "Size exclusion, larger molecule is blocked",
        "Other Information": {
          "Test Conditions": "Ambient temperature (298 K), 1 atm",
          "Threshold Pressure": "Not Applicable",
          "Additional Info": "Effectively excluded by pore size."
        }
      }
    ],
    "Sieving Mechanism": "Molecular sieving based on rigid pore apertures",
    "Experimental Validation": {
      "Method": "Breakthrough tests",
      "Performance Metrics": "H2/N2 separation factor: 25"
    },
    "Other Information": {
      "Additional Info": "Rigid framework."
    }
  }
]
"""

In [4]:
import os
from google import genai
from google.genai import types
import json
from tqdm import tqdm

# Send every extracted-text file in the input folder to Gemini with `prompt`
# as the system instruction, and save the JSON parsed from each reply.
input_dir = "F:\\Working\\AI-FY-C5\\NewDownload\\pdfoutput"
output_dir = "./destination_folder_running"

# Read the key from the environment instead of keeping it in the notebook.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))

for file in tqdm(os.listdir(input_dir)):
    input_path = os.path.join(input_dir, file)
    if not os.path.isfile(input_path):
        continue  # os.listdir also returns sub-directories

    # NOTE(review): assumes the extracted text files are UTF-8 — confirm.
    # Without an explicit encoding the platform default is used, which
    # differs between machines.
    with open(input_path, mode="r", encoding="utf-8") as src:
        content = src.read()

    # The input file is already closed here; only `content` is needed.
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        config=types.GenerateContentConfig(system_instruction=prompt),
        contents=content,
    )
    responsetext = response.text

    # Guard against an empty reply: the parser expects a string.
    parsed = extract_json_from_text(responsetext) if responsetext is not None else None

    # splitext handles extensions of any length (file[:-4] assumed three letters).
    stem = os.path.splitext(file)[0]
    with open(os.path.join(output_dir, f"{stem}.json"), 'w', encoding='utf-8') as out:
        json.dump(parsed, out, ensure_ascii=False, indent=4)


100%|██████████| 17/17 [06:27<00:00, 22.79s/it]
