# Загрузка исходных данных для анализа

In [None]:
import pandas as pd
import os

# Directory that holds the input CSV files.
# NOTE(review): judging by the folder name each file presumably holds page-sized
# chunks of a patent — confirm against the step that produced them.
directory = '/home/anatoly_kayda/Desktop/mipt/kayda_thesis/concept_developtment/output_chunk_is_page'

# os.listdir() returns entries in arbitrary order. The order of the files decides
# the order in which the text chunks are joined below, so sort the names
# (lexicographically) to make the result reproducible between runs.
csv_files = sorted(os.listdir(directory))

# Read every file, using its first column as the index, and stack the frames.
df = pd.concat(
    [pd.read_csv(os.path.join(directory, file), index_col=0) for file in csv_files]
)

# Keep only the rows whose '1st_token' value is 'YES'.
df = df[df['1st_token'] == 'YES']

# Collapse to one row per file_name: keep the first file path and join all of
# the remaining text chunks with a single space.
df_grouped = df.groupby('file_name').agg({
    'file_path': 'first',
    'text': lambda x: ' '.join(x),
}).reset_index()

# Select only the required columns
df_grouped = df_grouped[['file_name', 'file_path', 'text']]
df_grouped

# Обработка данных с помощью GPT-4o

In [None]:
from pydantic import BaseModel, ValidationError, Field
from typing import List, Optional
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Load environment variables from the .env file.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

In [None]:
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, ConfigDict

class TableData(BaseModel):
    # Structured form of a single table: header cells, body rows and an
    # optional caption. Keys that are not declared here are rejected
    # (extra='forbid').
    model_config = ConfigDict(extra='forbid')

    # Header cells; an empty list when none are given.
    headers: List[str] = Field(description="Column headers of the table", default_factory=list)
    # Body rows, each a list of cell strings; an empty list when none are given.
    rows: List[List[str]] = Field(description="Table data rows", default_factory=list)
    # Caption text, or None when the table has no caption.
    caption: Optional[str] = Field(description="Table caption or description", default=None)

class ExperimentInfo(BaseModel):
    # One experimental example extracted from a patent. Keys that are not
    # declared here are rejected (extra='forbid').
    model_config = ConfigDict(extra='forbid')

    # Identifier of the example as it is written in the patent.
    id: str = Field(description="Unique experiment identifier (e.g., 'Example 1')")
    # Verbatim text of the example.
    text_of_example: str = Field(description="Exact text quote from the patent")
    # Category of the example. Literal makes pydantic itself reject any value
    # outside the three categories (a plain `str` with an enum placed only in
    # json_schema_extra was never validated) and still publishes the allowed
    # values as an enum in the generated JSON schema.
    type: Literal["catalyst_synthesis", "polymerization", "table"] = Field(
        description="Type of experiment",
    )
    # IDs of the other examples this one refers to; empty when there are none.
    reference: List[str] = Field(
        default_factory=list,
        description="List of referenced experiment IDs"
    )
    # Parsed table contents; None when no table data is provided.
    table_data: Optional[TableData] = Field(
        default=None,
        description="Structured table data if type is 'table'"
    )

class PatentInfo(BaseModel):
    # Top-level extraction result for one patent: the list of experiments.
    # Keys that are not declared here are rejected (extra='forbid').
    model_config = ConfigDict(extra='forbid')

    # Extracted experiments; an empty list when none are given.
    experiments: List[ExperimentInfo] = Field(description="List of experiments found in the patent", default_factory=list)

In [None]:
def extract_information(text: str):
    """Ask GPT-4o to extract the experiments described in a patent text.

    Lines of ``text`` whose stripped length is 5 characters or fewer are
    dropped before the text is embedded in the user prompt. The request uses
    ``PatentInfo`` as the response format.

    Args:
        text: Raw patent text.

    Returns:
        The completion object returned by
        ``client.beta.chat.completions.parse``, unchanged.
    """
    api = OpenAI()

    # Drop very short lines before the text is sent to the model.
    kept_lines = [line for line in text.split('\n') if len(line.strip()) > 5]
    filtered_text = '\n'.join(kept_lines)

    system_prompt = """You are a chemistry assistant specialized in analyzing patent documents. 
    Focus on identifying experimental procedures, their types, and cross-references between examples. 
    Always return valid JSON that matches the specified schema.
    Important: Be thorough and meticulous in your analysis. Make sure to:
    - Carefully read and process every detail in the text
    - Don't skip any experimental examples
    - Double-check all cross-references between examples
    - Verify that all identified information is included in the output
    - Ensure complete coverage of the input text"""

    user_prompt = f"""As a chemistry assistant specializing in catalysis and polyolefins, 
    analyze the following patent text and extract structured information in JSON format.

    For each experimental example, provide:
    1. Type of experiment (one of):
       - catalyst_synthesis (for catalyst preparation procedures)
       - polymerization (for polymerization reactions)
       - table (for tabulated results or comparative data)
    
    2. Original text of the example (exact quote)
    
    3. References to other experiments (if any):
       - Look for phrases like "as in Example 1", "according to Example 2", etc.
       - Include experiment numbers that are referenced

    In case of tables, provide the exact text of the table or comparative data.

    Text to analyze: {filtered_text}
    """

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    return api.beta.chat.completions.parse(
        model="gpt-4o",
        messages=conversation,
        response_format=PatentInfo,
    )

In [None]:
# Create the output folder up front: to_csv() does not create missing
# directories, and failing after the API calls would throw the results away.
os.makedirs('results', exist_ok=True)

# Call the model once per document and keep the whole completion object.
df_grouped['llm_completion'] = df_grouped['text'].apply(extract_information)

# Pull the parsed result out of every completion.
df_grouped['llm_completion.choices[0].message.parsed'] = df_grouped['llm_completion'].apply(
    lambda completion: completion.choices[0].message.parsed
)

# NOTE: both columns hold Python objects, so the CSV stores their string
# representation rather than a re-loadable structure.
df_grouped.to_csv('results/llm_extracted_experiment_data.csv')

# Проверка качества извлечения текста

In [3]:
import pandas as pd 
# Reload the saved extraction results; the first CSV column becomes the index.
results_path = 'results/llm_extracted_experiment_data.csv'
df_grouped = pd.read_csv(results_path, index_col=0)
df_grouped

Unnamed: 0,file_name,file_path,text,llm_completion,llm_completion.choices[0].message.parsed
0,US20060046928.pdf,/home/anatoly_kayda/Desktop/mipt/data/novolen/...,US 2006/0046928A1 \n from about 0.5 wt % to ab...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
1,US20060166812.pdf,/home/anatoly_kayda/Desktop/mipt/data/braskem/...,US 2006/0166812 A1 \n 0048 (e) optionally reac...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
2,US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,"4,148,754 7 \n ous phase in a fluidized bed. T...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='EXAMPLE 1', te..."
3,US4374753.pdf,/home/anatoly_kayda/Desktop/mipt/data/chemplex...,"4,374,753 7 \n atmospheric to 10,000 psi gauge...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
4,US4481301.pdf,/home/anatoly_kayda/Desktop/mipt/data/mobiloil...,"4,481,301 17 \n is then opened to deliver the ...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
5,US4843132.pdf,/home/anatoly_kayda/Desktop/mipt/data/novolen/...,"4,843,132 3 \n where R is an aryl, alkylaryl o...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
6,US4849389.pdf,/home/anatoly_kayda/Desktop/mipt/data/mobiloil...,"4,849,389 5 \n into the solvent, preferably wh...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
7,US6248831.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,"US 6,248,831 B1 \n the cocatalyst Solution. Op...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."
8,US6617405.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,"US 6,617.405 B1 \n 7 \n bottom of the residenc...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,experiments=[ExperimentInfo(id='Example 1 to 4...
9,US9352308.pdf,/home/anatoly_kayda/Desktop/mipt/data/braskem/...,"US 9,352,308 B2 \n 5 \n over a period from 1 t...",ParsedChatCompletion[PatentInfo](id='chatcmpl-...,"experiments=[ExperimentInfo(id='Example 1', te..."


In [11]:
# Add a 'human_feedback' column, set to None in every row, to hold the reviewer's answers.
df_grouped['human_feedback'] = None

In [21]:
import os
import tempfile
import subprocess
import textwrap

def format_text_block(text, width=80):
    """Wrap every line of ``text`` to at most ``width`` columns.

    The text is split on newlines; each non-blank line is stripped and
    re-wrapped on its own, while blank (or whitespace-only) lines are kept as
    empty lines. The pieces are joined back with newlines.
    """
    def wrap_paragraph(paragraph):
        stripped = paragraph.strip()
        # Whitespace-only paragraphs collapse to an empty separator line.
        return textwrap.fill(stripped, width=width) if stripped else ''

    return '\n'.join(wrap_paragraph(piece) for piece in text.split('\n'))

def _parse_feedback(feedback):
    """Turn a 'found/total' answer into two ints; raise ValueError if it is malformed."""
    found, total = feedback.split('/')
    return int(found), int(total)


def _build_review_page(position, total_docs, row, formatted_text, found_so_far, actual_so_far):
    """Render the text shown to the reviewer for one document."""
    return f"""
{'='*80}
Document {position}/{total_docs}
{'='*80}

File Name: {row['file_name']}
File Path: {row['file_path']}

{'='*80}
Extracted Information:
{'='*80}

{formatted_text}

{'='*80}
Current Statistics:
{'='*80}

- Documents Processed: {position}/{total_docs}
- Total Experiments Found: {found_so_far}
- Total Actual Experiments: {actual_so_far}

{'='*80}
Instructions:
- Review the extracted information above
- Close this file when done reviewing
- Return to terminal to enter your feedback
{'='*80}
"""


def human_feedback(df_grouped):
    """Collect a manual 'found/total' score for every row of ``df_grouped``.

    For each row the extracted information is written to a temporary text
    file and opened with ``code -w``, which blocks until the file is closed.
    The reviewer then types an answer such as ``3/5`` and it is stored in the
    ``human_feedback`` column of that row.

    Args:
        df_grouped: DataFrame with the columns ``file_name``, ``file_path``
            and ``llm_completion.choices[0].message.parsed``. It is modified
            in place; the ``human_feedback`` column is created if missing.

    Returns:
        The same DataFrame, with ``human_feedback`` filled in.

    Raises:
        FileNotFoundError: if the ``code`` executable cannot be started.
    """
    if "human_feedback" not in df_grouped.columns:
        df_grouped["human_feedback"] = None
    # Rows are read by position, so they are written by position as well;
    # writing by label only hits the same row when the index is 0..n-1.
    feedback_col = df_grouped.columns.get_loc("human_feedback")

    total_docs = len(df_grouped)
    found_so_far = 0
    actual_so_far = 0

    for position in range(total_docs):
        row = df_grouped.iloc[position]
        text = str(row['llm_completion.choices[0].message.parsed'])
        page = _build_review_page(
            position + 1,
            total_docs,
            row,
            format_text_block(text),
            found_so_far,
            actual_so_far,
        )

        # Explicit UTF-8 so that non-ASCII characters in the extracted text do
        # not depend on the locale's default encoding.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', delete=False, encoding='utf-8'
        ) as f:
            f.write(page)
            temp_file = f.name

        try:
            subprocess.run(['code', '-w', temp_file])
        finally:
            # Remove the temporary file even when the editor could not be run.
            os.unlink(temp_file)

        print("\nPlease enter your feedback:")
        print("Format: 'found/total' (e.g. 3/5)")
        # Ask again on a malformed answer instead of raising and losing the
        # feedback collected so far.
        while True:
            feedback = input("Your answer: ").strip()
            try:
                found, actual = _parse_feedback(feedback)
            except ValueError:
                print("Could not read that answer, please use 'found/total' (e.g. 3/5)")
            else:
                break

        df_grouped.iloc[position, feedback_col] = feedback
        found_so_far += found
        actual_so_far += actual

    return df_grouped

# Run the interactive review and keep the DataFrame returned by human_feedback.
df_grouped = human_feedback(df_grouped)


Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)

Please enter your feedback:
Format: 'found/total' (e.g. 3/5)


In [22]:
# Reviewer answers, one "found/total" string per document.
human_feedback_list = df_grouped['human_feedback'].tolist()

# Lists to store extracted and actual experiment counts
extracted_experiments = []
actual_experiments = []

# Parse feedback strings in format "found/total"
for feedback in human_feedback_list:
    extracted_count, actual_count = feedback.split('/')
    extracted_experiments.append(int(extracted_count))
    actual_experiments.append(int(actual_count))

# Calculate metrics
total_extracted = sum(extracted_experiments)
total_actual = sum(actual_experiments)

# Guard against an empty table or all-zero totals, which would otherwise
# raise ZeroDivisionError; the ratio is reported as NaN in that case.
# NOTE(review): found/total looks like a recall-style ratio rather than
# precision — confirm the intended name before renaming anything.
total_precision = total_extracted / total_actual if total_actual else float('nan')
print(f"Total Precision: {total_precision:.2f}")

Total Precision: 0.74


In [23]:
# Save the table, including the human_feedback column, to a separate CSV file.
# NOTE(review): the file name contains 'cheked' (sic); left unchanged in case
# other code reads this exact path.
df_grouped.to_csv('results/llm_extracted_experiment_data_human_cheked.csv')

Низкая степень извлечения экспериментальных данных. Видимо, в модель приходит слишком много контекста. Надо придумать подход поумнее. Однако же продолжим исследования.

# TMP

In [None]:
# def extract_information_2(text):
#     First, break down the text into smaller chunks and analyze step by step
#     initial_prompt = f"""
#     Analyze the following patent text in steps:
#     1. First, identify all numbered examples (like "Example 1:", "Example 2:") and their boundaries.
#     2. For each example, classify it ONLY as one of these types:
#        - catalyst_synthesis (if it describes catalyst preparation)
#        - polymerization (if it describes polymerization reaction)
#        - table (if it contains tabulated data)
#     3. Look for direct references between examples (like "prepared as in Example 1")

#     Text to analyze: {text}

#     Respond with a simple list format:
#     EXAMPLE 1:
#     Type: [type]
#     References: [list of referenced examples]
#     Text: [exact text of the example]
#     ---
#     """
    
#     # Get initial structure
#     initial_response = openai.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=[
#             {"role": "system", "content": "You are a chemistry patent analyzer. Be precise and concise."},
#             {"role": "user", "content": initial_prompt}
#         ],
#         max_tokens=10000,
#         temperature=0
#     )
    
#     # Now process each example in detail
#     structured_prompt = f"""
#     Based on the identified examples, convert the following information into JSON format:

#     {initial_response.choices[0].message.content}

#     Return ONLY valid JSON matching this exact structure:
#     {{
#         "experiments": [
#             {{
#                 "id": "Example X",
#                 "type": "one_of_three_types",
#                 "text_of_example": "exact_text",
#                 "reference": ["Example Y"],
#                 "table_data": {{}}  // only for tables
#             }}
#         ]
#     }}
#     """
    
#     final_response = openai.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=[
#             {"role": "system", "content": "You are a JSON formatter. Return only valid JSON."},
#             {"role": "user", "content": structured_prompt}
#         ],
#         max_tokens=2000,
#         temperature=0
#     )
    
#     return final_response.choices[0].message.content.strip()

In [None]:
# import openai
# from dotenv import load_dotenv
# import os
# from pydantic import BaseModel, ValidationError, Field
# from typing import List, Optional
# import pandas as pd

# # Load environment variables from .env file
# load_dotenv()

# # Define your OpenAI API key
# openai.api_key = os.getenv('OPENAI_API_KEY')

# class TableData(BaseModel):
#     headers: List[str] = Field(description="Column headers of the table")
#     rows: List[List[str]] = Field(description="Table data rows")
#     caption: Optional[str] = Field(description="Table caption or description")


# class ExperimentInfo(BaseModel):
#     id: str = Field(description="Unique experiment identifier (e.g., 'Example 1')")
#     text_of_example: str = Field(description="Exact text quote from the patent")
#     type: str = Field(
#         description="Type of experiment", 
#         pattern="^(catalyst_synthesis|polymerization|table)$"
#     )
#     reference: Optional[List[str]] = Field(
#         default_factory=list,
#         description="List of referenced experiment IDs"
#     )

#     table_data: Optional[TableData] = Field(
#         default=None,
#         description="Structured table data if type is 'table'"
#     )

# class PatentInfo(BaseModel):
#     experiments: List[ExperimentInfo] = Field(
#         default_factory=list,
#         description="List of experiments found in the patent"
#     )

# def extract_information(text):
#     prompt = f"""
#     As a chemistry assistant specializing in catalysis and polyolefins, analyze the following patent text and extract structured information in JSON format.

#     For each experimental example, provide:
#     1. Type of experiment (one of):
#        - catalyst_synthesis (for catalyst preparation procedures)
#        - polymerization (for polymerization reactions)
#        - table (for tabulated results or comparative data)
    
#     2. Original text of the example (exact quote)
    
#     3. References to other experiments (if any):
#        - Look for phrases like "as in Example 1", "according to Example 2", etc.
#        - Include experiment numbers that are referenced

#     In case of tables, provide the exact text of the table or comparative data.

#         Respond in the following JSON format:
#     {{
#         "experiments": [
#             {{
#                 "id": "Example 1",
#                 "type": "catalyst_synthesis/polymerization/table",
#                 "text_of_example": "exact text of example or table from patent",
#                 "reference": ["Example 2", "Example 3"],
#                 "table_data": {{
#                     "headers": ["Column1", "Column2", ...],
#                     "rows": [
#                         ["value1", "value2", ...],
#                         ["value1", "value2", ...]
#                     ],
#                     "caption": "table description if available"
#                 }}  // only for table type
#             }}
#         ]
#     }}    

    

#     Text to analyze: {text}
#     """
    
#     response = openai.chat.completions.create(
#         model="gpt-4o-mini",
#         messages=[
#             {"role": "system", "content": """You are a chemistry assistant specialized in analyzing patent documents. 
#              Focus on identifying experimental procedures, their types, and cross-references between examples. 
#              Always return valid JSON that matches the specified schema."""},
#             {"role": "user", "content": prompt}
#         ],
#         max_tokens=10000,
#         temperature=0
#     )
    
#     return response.choices[0].message.content.strip()

In [None]:
# # Define models (same as before)
# class TableData(BaseModel):
#     headers: List[str] = Field(description="Column headers of the table")
#     rows: List[List[str]] = Field(description="Table data rows")
#     caption: Optional[str] = Field(description="Table caption or description")

# class ExperimentInfo(BaseModel):
#     id: str = Field(description="Unique experiment identifier (e.g., 'Example 1')")
#     text_of_example: str = Field(description="Exact text quote from the patent")
#     type: str = Field(
#         description="Type of experiment", 
#         pattern="^(catalyst_synthesis|polymerization|table)$"
#     )
#     reference: Optional[List[str]] = Field(
#         default_factory=list,
#         description="List of referenced experiment IDs"
#     )
#     table_data: Optional[TableData] = Field(
#         default=None,
#         description="Structured table data if type is 'table'"
#     )

# class PatentInfo(BaseModel):
#     experiments: List[ExperimentInfo] = Field(
#         default_factory=list,
#         description="List of experiments found in the patent"
#     )

# def extract_information(text: str) -> PatentInfo:
#     # Initialize the language model
#     llm = ChatOpenAI(
#         model="gpt-4-0125-preview",
#         temperature=0,
        
#     )

#     # Create output parser
#     parser = PydanticOutputParser(pydantic_object=PatentInfo)
    
#     # Define the system and human message templates
#     system_template = """You are a chemistry assistant specialized in analyzing patent documents. 
#     Focus on identifying experimental procedures, their types, and cross-references between examples. 
#     Always return valid JSON that matches the specified schema."""
    
#     human_template = """As a chemistry assistant specializing in catalysis and polyolefins, analyze the following patent text and extract structured information in JSON format.

#     For each experimental example, provide:
#     1. Type of experiment (one of):
#        - catalyst_synthesis (for catalyst preparation procedures)
#        - polymerization (for polymerization reactions)
#        - table (for tabulated results or comparative data)
    
#     2. Original text of the example (exact quote)
    
#     3. References to other experiments (if any):
#        - Look for phrases like "as in Example 1", "according to Example 2", etc.
#        - Include experiment numbers that are referenced

#     In case of tables, provide the exact text of the table or comparative data.

#     {format_instructions}

#     Text to analyze: {input_text}
#     """
    
#     # Create the chat prompt
#     prompt = ChatPromptTemplate.from_messages([
#         ("system", system_template),
#         ("human", human_template)
#     ])
    
#     # Format the prompt with the input text and parser instructions
#     messages = prompt.format_messages(
#         input_text=text,
#         format_instructions=parser.get_format_instructions()
#     )
#     structure_llm = llm.with_structured_output(PatentInfo)
#     # Get the response from the model
#     response = llm.invoke(messages)
    
#     # Parse the response into the Pydantic model
#     try:
#         parsed_response = parser.parse(response.content)
#         return parsed_response
#     except ValidationError as e:
#         print(f"Error parsing response: {e}")
#         raise