In [1]:
Ml-flow , evaluation 

NameError: name 'Ml' is not defined

# Импорты

In [1]:
import pandas as pd 
import numpy as np
from math import exp
from PyPDF2 import PdfReader
from pprint import pprint

In [2]:
import os
import json
import logging
from openai import OpenAI
from IPython.display import display, HTML

In [3]:
from dataclasses import dataclass
from pydantic import BaseModel, Field, ConfigDict, ValidationError
from typing import Optional, List, Dict, Any, Tuple
from types import SimpleNamespace
from enum import Enum

# Классификация текста

In [4]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_analysis_with_llm_classifier/patent_analysis.log'),
            logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

In [5]:
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [6]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4o-mini",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=None,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    
    system_message = {
        "role": "system",
        "content": """You are a highly experienced research chemist with a PhD and 20+ years of experience in chemistry and patent analysis. 
Your expertise includes:
- Publishing numerous research papers
- Reading thousands of chemical patents
- Leading laboratory research teams
- Writing detailed experimental procedures
- Analyzing chemical documentation

You are specifically trained to identify experimental data in chemical patents with extremely high accuracy."""
    }
    
    # Insert system message at the beginning of messages list
    messages.insert(0, system_message)

    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

In [7]:
def analyze_file_agent(file_path: str) -> Tuple[str, str]:

    results=[]
    file_name = os.path.basename(file_path)
    absolute_path = os.path.abspath(file_path)
    CLASSIFICATION_PROMPT = """
You are analyzing a chemical patent about olefin polymerization catalysts. Check if this page contains EXPERIMENTAL DATA.

PATENT SECTIONS:
1. EXPERIMENTAL Sections (answer YES):
   - "EXAMPLES"
   - "COMPARATIVE EXAMPLES" 
   - "WORKING EXAMPLES"
   - "PREPARATION EXAMPLES"
   - "POLYMERIZATION EXAMPLES"
   - "TABLES"

2. NON-EXPERIMENTAL Sections (answer NO):
   - "TECHNICAL FIELD"
   - "BACKGROUND ART"
   - "CLAIMS"
   - "ABSTRACT"

ANSWER YES IF YOU SEE:
1. ANY part of experimental procedures (including continued from previous page):
   - Beginning: "The catalyst was prepared by mixing..."
   - Middle: "...was stirred for 2h at 80°C..."
   - End: "...yielding 5.2g of product"
   - Table parts (even incomplete)
   - Continued analytical data

2. Detailed procedures, like:
   "The catalyst was prepared by mixing..."
   "Polymerization was carried out..."
   "To a flask was added..."
   "The mixture was stirred at..."
   "Table №1"

ANSWER NO IF PAGE ONLY HAS:
- General descriptions without procedures
- Lists of compounds without quantities
- Patent claims
- Theoretical explanations

CRITICAL RULES:
1. Even PARTIAL experimental data makes the page experimental (YES)
2. Beginning, middle, or end of procedures count as experimental
3. Continued tables from previous pages count as experimental
4. Look for procedure descriptions with measurements first!

Answer ONLY "YES" or "NO"

Analyze this text: {text}"""

    with open(file_path, "rb") as f:
        reader = PdfReader(f).pages
        text = []
        for page in reader:
            text.append(page.extract_text())
    
    for i, page in enumerate(text):
        print(f"\nHeadline: {page[:200]}")
        
        # Check if page is empty or contains only whitespace
        if not page.strip():
            # Create mock response for empty pages
            top_two_logprobs = [
                SimpleNamespace(token="NO", logprob=0.0),  # log(1.0) = 0.0 for 100% probability
                SimpleNamespace(token="YES", logprob=float('-inf'))  # -inf for 0% probability
            ]
        else:
            API_RESPONSE = get_completion(
                [{"role": "user", "content": CLASSIFICATION_PROMPT.format(text=page)}],
                model="gpt-4o-mini",
                logprobs=True,
                top_logprobs=2,
                max_tokens=1,
                temperature=0,
            )
            top_two_logprobs = API_RESPONSE.choices[0].logprobs.content[0].top_logprobs

        html_content = ""
        for j, logprob in enumerate(top_two_logprobs, start=1):
            html_content += (
                f"<span style='color: cyan'>Output token {j}:</span> {logprob.token}, "
                f"<span style='color: darkorange'>logprobs:</span> {logprob.logprob}, "
                f"<span style='color: magenta'>linear probability:</span> {np.round(np.exp(logprob.logprob)*100,2)}%<br>"
            )
        display(HTML(html_content))
        print("\n")
        
        results.append({
                        'file_name': file_name,
                        'file_path': absolute_path,
                        'page_number': i+1,
                        '1st_token': top_two_logprobs[0].token,
                        '1st_logprob': np.round(np.exp(top_two_logprobs[0].logprob)*100,2),
                        '2nd_token': top_two_logprobs[1].token,
                        '2nd_logprob': np.round(np.exp(top_two_logprobs[1].logprob)*100,2),
                        'text': page
                    })

    df = pd.DataFrame(results)
    return df

In [8]:
df = analyze_file_agent("/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US9352308.pdf")


Headline: (12) United States Patent 
 Braganca et al. US0093.52308B2 
 US 9,352,308 B2 
 May 31, 2016 (10) Patent No.: 
 (45) Date of Patent: 
 (54) SOLID CATALYST COMPONENT FOR 
 POLYMERIZATION AND 
 COPOLYMER


2025-03-30 12:04:32,557 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: U.S. Patent May 31, 2016 Sheet 1 of 6 US 9,352,308 B2 
 FIGURE 1 
 Organometallic 
 compound solution Silica-o- IMPREGNATION 
 (a) 
 Magnesium compound 
 REACTON REMOVAL Supernatant Titanium compound 


2025-03-30 12:04:33,171 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: U.S. Patent May 31, 2016 Sheet 2 of 6 US 9,352,308 B2 
 FIGURE 2 
   


2025-03-30 12:04:33,739 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 U.S. Patent 
 9/ | 
 eoueoeye   


2025-03-30 12:04:34,254 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: U.S. Patent May 31, 2016 Sheet 4 of 6 US 9,352,308 B2 
 FIGURE 4 
 .. 
 O 
 44 4, 48 SO SO. 54 SO 58 
 Energy (ew 


2025-03-30 12:04:34,766 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: U.S. Patent May 31, 2016 Sheet 5 of 6 US 9,352,308 B2 
 FIGURES 
 9. - 
 O 
 95 . ... 3 498 Energy (W) 
 4.94 EO 498 SO SO SO4 SOSO SO.8 SO 
 Energy (ew)   


2025-03-30 12:04:35,274 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: U.S. Patent May 31, 2016 Sheet 6 of 6 US 9,352,308 B2 
 FIGURE 6 
 1966 968 9FO 9. 94 496 498 98) 
 Energy (EW) 


2025-03-30 12:04:35,788 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 1. 
 SOLID CATALYST COMPONENT FOR 
 POLYMERIZATION AND 
 COPOLYMERIZATION OF ETHYLENE AND 
 PROCESS FOR OBTAINING THE SAME 
 This application is a Continuation-In-Part of application


2025-03-30 12:04:36,720 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 3 
 groups 1, 2, 12 or 13 of the periodic table. The process for 
 producing the catalyst of the present invention comprises the 
 steps of: 
 (a) impregnating activated silica parti


2025-03-30 12:04:37,345 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 5 
 over a period from 1 to 20 hours. The amount of remaining 
 OH on the silica surface after this treatment ranges from 0.1 to 2 mmoles OH per g of silica, preferably between 0.5 a


2025-03-30 12:04:38,042 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 7 
 Other silicon compounds useful as reducing agents in the 
 practice of this invention are: silanes (SiH, in which m is a number equal to or higher than 1), alkyl-silanes or aryl 


2025-03-30 12:04:38,653 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 The amount of organometallic compound of the groups 1, 
 2, 12 or 13 that remains fixed on the solid catalyst component may reach up to 5% by weight, expressed as the metal con 
 ten


2025-03-30 12:04:39,271 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 11 
 anhydrous MgCl, and 330 ml (0.969 moles) of Ti(OBu). 
 This mixture was allowed to stir at 300 rpm and heated to 150° 
 C. for about 12 hours in order to have the solids complet


2025-03-30 12:04:39,783 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 13 
 anhydrous MgCl2 and 39.6 ml (0.116 moles) of Ti(OBu). 
 This mixture was allowed to stir at 300 rpm and heated to 150° 
 C. for about 12 hours in order to have the solids comple


2025-03-30 12:04:40,294 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 15 
 addition, stirring was continued for 3.5 hours at a temperature of 60°C. The temperature of the mixture was then brought to 
 65° C. and kept for additional 2 hours. After cooli


2025-03-30 12:04:41,206 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 17 
 C. for about 12 hours in order to have the solids completely 
 dissolved, thereby a clear liquid product was obtained. This 
 resulting liquid was cooled down to 40°C. and under


2025-03-30 12:04:41,819 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 19 
 MIE 0.91 g/10 min 
 MIFAMIE 26 
 Fraction Soluble in Xylene 9.2% 
 Comonomer content 8.7% Polymer Density 0.917 g/cm Bulk Density 0.36 g/cm 
 Pilot Plant Tests 
 A pilot plant c


2025-03-30 12:04:42,269 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"





Headline: US 9,352,308 B2 
 21 
 FIG.3 and an X-Ray Absorption Near Edge Structure spectra 
 as shown in one or more of FIGS. 4, 5 and 6. 2. The catalyst composition of claim 1, wherein the product of the homop


2025-03-30 12:04:42,750 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"






In [9]:
df_filtered = df[df['1st_token'] == 'YES']
exp_text = ' '.join(df_filtered['text'].tolist())
exp_text

'US 9,352,308 B2 \n 5 \n over a period from 1 to 20 hours. The amount of remaining \n OH on the silica surface after this treatment ranges from 0.1 to 2 mmoles OH per g of silica, preferably between 0.5 and \n 1.5 mmoles OH per g of silica. The impregnation is preferably carried out by Suspending \n 10 to 20 parts by weight of silica for each 100 parts by volume of a solution of an organometallic compound of the groups 1, \n 2, 12 or 13, in aliphatic hydrocarbons, and maintaining the \n Solution with stirring at a temperature which ranges from \n room temperature to the boiling point of the solution of the organometallic compound of the groups 1, 2, 12 or 13, in aliphatic hydrocarbons, preferably at room temperature, over \n a period from 30 to 120 minutes, preferably between 50 and \n 60 minutes. \n The organometallic compounds of groups 1, 2, 12 or 13 of \n the periodic table suitable for use in step (a) are alkyl com pounds and alkyl halide compounds of metals belonging to \n these 

# Извлечение страниц с экспериментальными данными

In [37]:
class TableData(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    caption: Optional[str] = Field(
        default=None,
        description="Table caption or description"
    )

    headers: List[str] = Field(
        default_factory=list,
        description="Column headers of the table"
    )
    rows: List[List[str]] = Field(
        default_factory=list,
        description="Table data rows"
    )
    reference: List[str] = Field(
        default_factory=list,
        description="List of referenced experiment IDs"
    )

class ExperimentInfo(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    id: str = Field(description="Unique experiment identifier (e.g., 'Example 1')")
    text_of_example: str = Field(description="Exact text quote from the patent")
    type: str = Field(
        description="Type of experiment",
        # Using Field with allowed values instead of pattern
        json_schema_extra={"enum": ["catalyst_synthesis", "polymerization", "table"]}
    )
    reference: List[str] = Field(
        default_factory=list,
        description="List of referenced experiment IDs"
    )

class PatentInfo(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    experiments: List[ExperimentInfo] = Field(
        default_factory=list,
        description="List of experiments found in the patent"
    )
    table_data: List[TableData] = Field(
        default=None,
        description="Structured table data if type is 'table'"
    )

In [29]:
def extract_information(text: str):

    client = OpenAI()
    text = '\n'.join(line for line in text.split('\n') if len(line.strip()) > 5)
    # Define the system and human message templates
    system_template = """You are a chemistry assistant specialized in analyzing patent documents. 
    Focus on identifying experimental procedures, their types, and cross-references between examples. 
    Always return valid JSON that matches the specified schema.
    Important: Be thorough and meticulous in your analysis. Make sure to:
    - Carefully read and process every detail in the text
    - Don't skip any experimental examples
    - Double-check all cross-references between examples
    - Verify that all identified information is included in the output
    - Ensure complete coverage of the input text"""
    
    human_template = f"""As a chemistry assistant specializing in catalysis and polyolefins, 
    analyze the following patent text and extract structured information in JSON format.

    For each experimental example, provide:
    1. Type of experiment (one of):
       - catalyst_synthesis (for catalyst preparation procedures)
       - polymerization (for polymerization reactions)
       - table (for tabulated results or comparative data)
    
    2. Original text of the example (exact quote)
    
    3. References to other experiments (if any):
       - Look for phrases like "as in Example 1", "according to Example 2", etc.
       - Include experiment numbers that are referenced

    In case of tables, provide the exact text of the table or comparative data.

    Text to analyze: {text}
    """
   
    completion = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_template},
        {"role": "user", "content": human_template}
    ],
    response_format=PatentInfo,
    )
    
    return completion

In [56]:
patent_info = extract_information(exp_text)
patent_info

2025-03-30 12:26:44,856 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


ParsedChatCompletion[PatentInfo](id='chatcmpl-BGfRoiGbLAAVaK5DnGwHkFKzsEeeX', choices=[ParsedChoice[PatentInfo](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[PatentInfo](content='{"experiments":[{"id":"Example 1","text_of_example":"In a 5 liter flask fitted with a mechanical stirrer and previously purged with nitrogen were fed 44 g (0.462 moles) of anhydrous MgCl, and 330 ml (0.969 moles) of Ti(OBu). This mixture was allowed to stir at 300 rpm and heated to 150° C. for about 12 hours in order to have the solids completely dissolved, thereby a clear liquid product was obtained. This resulting liquid was cooled down to 40°C. and under gently stirring at 150 rpm, it was diluted with 3200 ml of anhydrous hexane. Into this solution kept at 40° C. and under the same stirring, 250g of the silica support were added. This silica was previously dehydrated and treated with 19 ml (0.139 moles) of triethylaluminum diluted in anhydrous hexane, for 50 minutes and a

In [92]:
patent_info.choices[0].message.parsed.table_data[0].reference

['Example 1', 'Example 2', 'Example 3']

**Таблицы извлекаются верно** однако ссылки на эксперименты при этом извлекаются не верно. 

# Извлечение экспериментальных данных из структурированного текста

In [93]:
@dataclass
class ExperimentParser:
    def __init__(self, patent_info: PatentInfo):
        self.experiments = patent_info.experiments
        self.table_data = patent_info.table_data
        
    def get_catalyst_synthesis_experiments(self) -> List[ExperimentInfo]:
        """Get all experiments related to catalyst synthesis"""
        return [exp for exp in self.experiments 
                if exp.type == 'catalyst_synthesis']
    
    def get_related_experiments(self, experiment_id: str) -> List[ExperimentInfo]:
        """Get all experiments that reference the given experiment ID"""
        return [exp for exp in self.experiments 
                if experiment_id in (exp.reference or [])]
    
    def get_synthesis_with_related(self) -> Dict[str, Tuple[ExperimentInfo, List[ExperimentInfo]]]:
        """
        Get all catalyst synthesis experiments and their related experiments
        Returns a dictionary where keys are synthesis experiment IDs and values are tuples of
        (synthesis experiment, list of related experiments)
        """
        result = {}
        synthesis_experiments = self.get_catalyst_synthesis_experiments()
        
        for synthesis in synthesis_experiments:
            related = self.get_related_experiments(synthesis.id)
            result[synthesis.id] = (synthesis, related)
            
        return result

    def get_synthesis_and_related_texts(self, 
                                      synthesis_and_related: Dict[str, Tuple[ExperimentInfo, List[ExperimentInfo]]]
                                      ) -> Dict[str, Tuple[str, List[Tuple[str, str]]]]:
        """
        Extract text from synthesis experiments and their related experiments
        Returns a dictionary where keys are synthesis IDs and values are tuples of
        (synthesis text, list of (text, experiment_type) tuples for related experiments)
        """
        result = {}
        
        for synthesis_id, (synthesis, related) in synthesis_and_related.items():
            synthesis_text = synthesis.text_of_example
            related_texts = [(exp.type, exp.text_of_example) for exp in related]
            result[synthesis_id] = (synthesis_text, related_texts)
        
        return result
    
    def get_related_table_data(self, experiment_id: str) -> List[TableData]:
        """
        Get all table data entries that reference the given experiment ID
        Returns a list of table data dictionaries that reference the experiment
        """
        return [table for table in self.table_data
                if experiment_id in (table.reference or [])]

    def get_synthesis_with_related_and_tables(self) -> Dict[str, Tuple[ExperimentInfo, List[ExperimentInfo], List[TableData]]]:
        """
        Get all catalyst synthesis experiments, their related experiments, and associated table data
        Returns a dictionary where keys are synthesis experiment IDs and values are tuples of
        (synthesis experiment, list of related experiments, list of related table data)
        """
        result = {}
        synthesis_experiments = self.get_catalyst_synthesis_experiments()
        
        for synthesis in synthesis_experiments:
            related = self.get_related_experiments(synthesis.id)
            table_data = self.get_related_table_data(synthesis.id)
            result[synthesis.id] = (synthesis, related, table_data)
            
        return result

    def get_synthesis_related_and_table_texts(self, 
                                            synthesis_data: Dict[str, Tuple[ExperimentInfo, List[ExperimentInfo], List[TableData]]]
                                            ) -> Dict[str, Tuple[str, List[Tuple[str, str]], List[TableData]]]:
        """
        Extract text from synthesis experiments, related experiments, and table data
        Returns a dictionary where keys are synthesis IDs and values are tuples of
        (synthesis text, list of (text, experiment_type) tuples for related experiments, table data)
        """
        result = {}
        
        for synthesis_id, (synthesis, related, tables) in synthesis_data.items():
            synthesis_text = synthesis.text_of_example
            related_texts = [(exp.type, exp.text_of_example) for exp in related]
            result[synthesis_id] = (synthesis_text, related_texts, tables)
        
        return result

In [None]:
parser = ExperimentParser(patent_info.choices[0].message.parsed)


synthesis_data = parser.get_synthesis_with_related_and_tables()


synthesis_texts = parser.get_synthesis_related_and_table_texts(synthesis_data)

In [None]:
class UnitValue(BaseModel):
    value: Optional[float] = Field(None, description="Numerical value")
    unit: Optional[str] = Field(None, description="Unit of measurement")

class ReactionType(str, Enum):
    SLURRY = "slurry"
    SOLUTION = "solution"
    GAS_PHASE = "gas phase"

class MonomerType(str, Enum):
    ETHYLENE = "ethylene"
    PROPYLENE = "propylene"
    BUTENE = "butene"
    HEXENE = "hexene"

class PolymerProperties(BaseModel):
    mw: Optional[UnitValue] = Field(None, description="Weight average molecular weight")
    mn: Optional[UnitValue] = Field(None, description="Number average molecular weight")
    pdi: Optional[float] = Field(None, description="Polydispersity index")
    density: Optional[UnitValue] = Field(None, description="Density")
    tm: Optional[UnitValue] = Field(None, description="Melting temperature")
    bulk_density: Optional[UnitValue] = Field(None, description="Bulk density")
    melt_index: Optional[UnitValue] = Field(None, description="Melt index")
    melt_flow_rate: Optional[UnitValue] = Field(None, description="Melt flow rate")

class PolytestResult(BaseModel):
    reaction_type: Optional[ReactionType] = Field(None, description="Type of reaction in the polytest")
    monomer_type: Optional[MonomerType] = Field(None, description="Type of monomer used in the polytest")
    comonomer_type: Optional[MonomerType] = Field(None, description="Type of comonomer if used")
    reactor_volume: Optional[UnitValue] = Field(None, description="Volume of reactor")
    temperature: Optional[UnitValue] = Field(None, description="Temperature of polytest")
    monomer: Optional[UnitValue] = Field(None, description="Pressure of monomer")
    hydrogen: Optional[UnitValue] = Field(None, description="Amount of hydrogen used")
    catalyst_amount: Optional[UnitValue] = Field(None, description="Amount of catalyst")
    cocatalyst_amount: Optional[UnitValue] = Field(None, description="Amount of cocatalyst")
    reaction_time: Optional[UnitValue] = Field(None, description="Reaction time")
    activity: Optional[UnitValue] = Field(None, description="Catalyst activity")
    polymer_properties: Optional[PolymerProperties] = Field(None, description="Polymer properties")

class Component(BaseModel):
    name: Optional[str] = Field(None, description="Name of the component")
    amount: Optional[UnitValue] = Field(None, description="Amount of component")

class SynthesisConditions(BaseModel):
    temperature: Optional[UnitValue] = Field(None, description="Temperature in °C, if applicable")
    time: Optional[UnitValue] = Field(None, description="Duration, if applicable")
    pressure: Optional[UnitValue] = Field(None, description="Pressure in MPa if applicable")

class SynthesisStep(BaseModel):
    step_number: Optional[int] = Field(None, description="Sequential number of the synthesis step")
    description: Optional[str] = Field(None, description="Description of the step")
    components_added: Optional[List[Component]] = Field(None, description="Components added in this step")
    conditions: Optional[SynthesisConditions] = Field(None, description="Conditions during this step")

class CatalystSynthesis(BaseModel):
    id: str = Field(description="Unique identifier for the catalyst synthesis")
    synthesis_steps: Optional[List[SynthesisStep]] = Field(None, description="Ordered list of synthesis steps")
    polytest_results: Optional[List[PolytestResult]] = Field(None, description="Results of polytests if applicable")
    catalyst_composition: Optional[List[Component]] = Field(None, description="Composition of final product")

In [None]:
def extract_catalyst_data(text: str, model: str = "gpt-4o-mini") -> Optional[CatalystSynthesis]:
    """
    Extract catalyst synthesis data from text using GPT-4 and validate with Pydantic model.
    
    Args:
        text (str): Input text containing catalyst synthesis description
        model (str): OpenAI model to use
        
    Returns:
        Optional[CatalystSynthesis]: Validated catalyst synthesis data or None if extraction fails
    """
    
    prompt = f"""You are a specialized assistant for extracting catalyst synthesis data from scientific text.
    Analyze the following text and extract information about catalyst synthesis and polymerization testing.

    Extract only factual information present in the text. Use null for missing values.
    Do not make assumptions or add information not explicitly stated.
    
    Text to analyze:
    {text}
    """

    client = OpenAI()
    
    try:
        response = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {"role": "system", "content": "You are a specialized assistant for extracting catalyst synthesis data from scientific text."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,
            response_format=CatalystSynthesis
        )

        return response
        
    except ValidationError as e:
        print(f"Validation error: {e}")
        return None
    except Exception as e:
        print(f"Error during extraction: {e}")
        return None

In [None]:
result = extract_catalyst_data(str(get_synthesis_and_related_texts['Example 1']))

In [None]:
result = result.choices[0].message.parsed
if result:
    print(json.dumps(result.model_dump(), indent=2))

In [None]:
result

In [None]:
def flatten_unit_value(prefix: str, unit_value: Optional[UnitValue]) -> Dict[str, Optional[float | str]]:
    """
    Convert UnitValue object to a flat dictionary
    Args:
        prefix: Column name prefix
        unit_value: UnitValue object to flatten
    Returns:
        Dictionary with value and unit fields
    """
    if unit_value is None:
        return {f"{prefix}_value": None, f"{prefix}_unit": None}
    return {
        f"{prefix}_value": unit_value.value,
        f"{prefix}_unit": unit_value.unit
    }

def flatten_polymer_properties(props: Optional[PolymerProperties]) -> Dict[str, Optional[float | str]]:
    """
    Convert PolymerProperties object to a flat dictionary
    Args:
        props: PolymerProperties object to flatten
    Returns:
        Dictionary with all polymer properties
    """
    if props is None:
        return {}
    
    result = {}
    unit_value_properties = [
        'mw', 'mn', 'density', 'tm', 'bulk_density', 
        'melt_index', 'melt_flow_rate'
    ]
    
    # Handle properties with UnitValue
    for prop in unit_value_properties:
        value = getattr(props, prop)
        if isinstance(value, UnitValue):
            result.update(flatten_unit_value(prop, value))
    
    # Handle simple properties
    if props.pdi is not None:
        result["pdi"] = props.pdi
    
    return result

def polytest_to_dataframe(synthesis: CatalystSynthesis) -> pd.DataFrame:
    """
    Convert polytest results to DataFrame
    Args:
        synthesis: CatalystSynthesis object containing polytest results
    Returns:
        DataFrame with polytest results
    """
    if not synthesis.polytest_results:
        return pd.DataFrame()
    
    rows = []
    for polytest in synthesis.polytest_results:
        # Handle enum fields
        row = {
            "reaction_type": polytest.reaction_type.value if polytest.reaction_type else None,
            "monomer_type": polytest.monomer_type.value if polytest.monomer_type else None,
            "comonomer_type": polytest.comonomer_type.value if polytest.comonomer_type else None,
        }
        
        # Handle UnitValue fields
        unit_value_fields = [
            "reactor_volume", "temperature", "monomer", 
            "hydrogen", "catalyst_amount", "cocatalyst_amount",
            "reaction_time", "activity"
        ]
        
        for field in unit_value_fields:
            unit_value = getattr(polytest, field)
            row.update(flatten_unit_value(field, unit_value))
        
        # Add polymer properties
        row.update(flatten_polymer_properties(polytest.polymer_properties))
        rows.append(row)
    
    return pd.DataFrame(rows)

def synthesis_steps_to_dataframe(synthesis: CatalystSynthesis) -> pd.DataFrame:
    """
    Convert synthesis steps to DataFrame
    Args:
        synthesis: CatalystSynthesis object containing synthesis steps
    Returns:
        DataFrame with synthesis steps
    """
    if not synthesis.synthesis_steps:
        return pd.DataFrame()
    
    rows = []
    for step in synthesis.synthesis_steps:
        base_row = {
            "step_number": step.step_number,
            "description": step.description
        }
        
        # Handle conditions
        if step.conditions:
            for condition in ["temperature", "time", "pressure"]:
                value = getattr(step.conditions, condition)
                if value:
                    base_row.update(flatten_unit_value(condition, value))
        
        # Handle components
        if step.components_added:
            for component in step.components_added:
                row = base_row.copy()
                row["component_name"] = component.name
                if component.amount:
                    row.update(flatten_unit_value("component_amount", component.amount))
                rows.append(row)
        else:
            rows.append(base_row)
    
    return pd.DataFrame(rows)

def process_catalyst_synthesis(synthesis: CatalystSynthesis) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process a CatalystSynthesis instance and return synthesis steps and polytest results
    Args:
        synthesis: CatalystSynthesis object to process
    Returns:
        Tuple of (synthesis_steps_df, polytest_results_df)
    """
    polytest_df = polytest_to_dataframe(synthesis)
    synthesis_df = synthesis_steps_to_dataframe(synthesis)
    
    return synthesis_df, polytest_df


In [None]:
synthesis_steps_df, polytest_results_df = process_catalyst_synthesis(result)

print("Synthesis Steps:")

synthesis_steps_df

In [None]:
polytest_results_df

# Сравнительный анализ влияния предварительной фильтрации

In [None]:
res_withour_filter = "/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_analysis_with_initial_text/results/extraction_results.csv"
res_withour_filter = pd.read_csv(res_withour_filter)
res_withour_filter = res_withour_filter.iloc[15].llm_extraction_results
pprint(res_withour_filter)

## Исходные данные в markdown формате

In [100]:
file_path = "/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_analysis_with_docling_preproceccing/results/US9352308.md"
def extract_text_from_md(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content
md_file_res = extract_information(extract_text_from_md(file_path))

2025-03-30 22:18:06,767 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [113]:
md_file_res.choices[0].message.parsed.experiments

[ExperimentInfo(id='Example 1', text_of_example='In a 5 liter flask fitted with a mechanical stirrer and previously purged with nitrogen were fed 44 g (0.462 moles) of anhydrous MgCl, and 330 ml (0.969 moles) of Ti(OBu). This mixture was allowed to stir at 300 rpm and heated to 150°C. for about 12 hours in order to have the solids completely dissolved, thereby a clear liquid product was obtained. This resulting liquid was cooled down to 40°C. and under gently stirring at 150 rpm, it was diluted with 3200 ml of anhydrous hexane. Into this solution kept at 40° C. and under the same stirring, 250g of the silica support were added. This silica was previously dehydrated and treated with 19 ml (0.139 moles) of triethylaluminum diluted in anhydrous hexane, for 50 minutes and at room temperature. Once the addition of the silica is completed, the mixture was heated to 60° C. and kept at this temperature for 1 hour. Into this mixture, kept at 60° C. and under gently stirring, a solution consisti

In [110]:
md_file_res.choices[0].message.parsed.table_data

[TableData(caption=None, headers=[], rows=[], reference=[]),
 TableData(caption='Table 1', headers=['EXAMPLE', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23'], rows=[['Catalyst used', 'Ex. 1', 'Ex. 2', 'Ex. 3', 'Ex. 4', 'Ex. S', 'Ex. 6', 'Ex. 7', 'Ex. 8', 'Ex. 9', 'Ex. 10', 'Ex. 10'], ['Ti(%)', '7.0', '1.4', '1.2', '1.5', '1.7', '1.8', '2.1', '2.0', '1.9', '2.0', '2.0'], ['Mg (%)', '2.0', 'O.S', 'O.3', 'O.3', '2.4', '2.7', '1.3', '1.5', '2.9', '1.5', '1.5'], ['Temperature (C.)', '88', '88', '88', '88', '88', '88', '88', '88', '88', '88', '75'], ['Ethylene partial pressure (bar)', '3', '11.8', '9', '10.9', '5', '7', '7', '7', '7', '7', '7'], ['Total pressure (bar)', '21', '22', '21', '21', '21', '21', '21', '21', '21', '21', '21'], ['Residence Time (h)', '4.43', '3.71', '5.32', '3.2O', '4.86', '3.92', '3.18', '2.89', '3.91', '3.70', '3.15'], ['Hafethylene (mole/mole)', 'O.10', 'O.098', 'O.O86', 'O.096', 'O.096', 'O.118', 'O.098', 'O.12', 'O.13', 'O.12', 'O.17'], ['Bute

In [112]:

parser = ExperimentParser(md_file_res.choices[0].message.parsed)
parser.get_synthesis_with_related_and_tables()

{'Example 1': (ExperimentInfo(id='Example 1', text_of_example='In a 5 liter flask fitted with a mechanical stirrer and previously purged with nitrogen were fed 44 g (0.462 moles) of anhydrous MgCl, and 330 ml (0.969 moles) of Ti(OBu). This mixture was allowed to stir at 300 rpm and heated to 150°C. for about 12 hours in order to have the solids completely dissolved, thereby a clear liquid product was obtained. This resulting liquid was cooled down to 40°C. and under gently stirring at 150 rpm, it was diluted with 3200 ml of anhydrous hexane. Into this solution kept at 40° C. and under the same stirring, 250g of the silica support were added. This silica was previously dehydrated and treated with 19 ml (0.139 moles) of triethylaluminum diluted in anhydrous hexane, for 50 minutes and at room temperature. Once the addition of the silica is completed, the mixture was heated to 60° C. and kept at this temperature for 1 hour. Into this mixture, kept at 60° C. and under gently stirring, a sol

## Исходные данные в html формате 

In [114]:
file_path = "/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_analysis_with_docling_preproceccing/results/US9352308.html"
def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content
html_file_res = extract_information(extract_text_from_html(file_path))

2025-03-30 22:58:43,375 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [120]:
html_file_res.choices[0].message.parsed.experiments

[ExperimentInfo(id='Example 1', text_of_example='In a 5 liter flask fitted with a mechanical stirrer and previously purged with nitrogen were fed 44 g (0.462 moles) of anhydrous MgCl, and 330 ml (0.969 moles) of Ti(OBu). This mixture was allowed to stir at 300 rpm and heated to 150° C. for about 12 hours in order to have the solids completely dissolved, thereby a clear liquid product was obtained. This resulting liquid was cooled down to 40°C. and under gently stirring at 150 rpm, it was diluted with 3200 ml of anhydrous hexane. Into this solution kept at 40° C. and under the same stirring, 250g of the silica support were added. This silica was previously dehydrated and treated with 19 ml (0.139 moles) of triethylaluminum diluted in anhydrous hexane, for 50 minutes and at room temperature. Once the addition of the silica is completed, the mixture was heated to 60° C. and kept at this temperature for 1 hour. Into this mixture, kept at 60° C. and under gently stirring, a solution consist

In [117]:
html_file_res.choices[0].message.parsed.table_data

[TableData(caption='TABLE 1', headers=['EXAMPLE', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23'], rows=[['Catalyst used', 'Ex. 1', 'Ex. 2', 'Ex. 3', 'Ex. 4', 'Ex. S', 'Ex. 6', 'Ex. 7', 'Ex. 8', 'Ex. 9', 'Ex. 10', 'Ex. 10'], ['Ti(%)', '7.0', '1.4', '1.2', '1.5', '1.7', '1.8', '2.1', '2.0', '1.9', '2.0', '2.0'], ['Mg (%)', '2.0', 'O.S', 'O.3', 'O.3', '2.4', '2.7', '1.3', '1.5', '2.9', '1.5', '1.5'], ['Temperature (C.)', '88', '88', '88', '88', '88', '88', '88', '88', '88', '88', '75'], ['Ethylene partial pressure (bar)', '3', '11.8', '9', '10.9', '5', '7', '7', '7', '7', '7', '7'], ['Total pressure (bar)', '21', '22', '21', '21', '21', '21', '21', '21', '21', '21', '21'], ['Residence Time (h)', '4.43', '3.71', '5.32', '3.2O', '4.86', '3.92', '3.18', '2.89', '3.91', '3.70', '3.15'], ['Hafethylene (mole/mole)', 'O.10', 'O.098', 'O.O86', 'O.096', 'O.096', 'O.118', 'O.098', 'O.12', 'O.13', 'O.12', 'O.17'], ['Butenefethylene (mole/mole)', 'O.39', 'O41', 'O46', '0.44', 'O.35

## Вывод 
Не все таблицы были извлечены. Часть данных потеряна. Не зависимо от форматов LLM плохо могут терять часть табличных данных. 