# Импорт библиотек

In [1]:
import os
import json
import pandas as pd
import numpy as np
import logging
from PyPDF2 import PdfReader
from pydantic import BaseModel, Field, ConfigDict
from typing import List, Optional


In [2]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('experimentation.log'), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

In [3]:
from openai import OpenAI
import tiktoken
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Загрузка файла

In [4]:
file_path ="/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4849389.pdf"
def load_text_from_pdf(file_path: str) -> str:
    """
    Load text from a PDF file.
    Args:
        file_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF.
    """
    file_path = str(file_path)
    try:    
        with open(file_path, "rb") as f:
            pdf = PdfReader(f)
            text = ""
            for page in pdf.pages:
                text += page.extract_text()
        return text
    except Exception as e:
        logger.error(f"Error reading PDF file {file_path}: {e}")
        return ""
text = load_text_from_pdf(file_path)

In [5]:
def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """
    Count the number of tokens in a text string.
    
    Args:
        text (str): The input text to count tokens for
        model (str): The model name to use for tokenization (default: "gpt-4")
    
    Returns:
        int: Number of tokens
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return len(tokens)

count_tokens(text, model="gpt-4o-mini")

15436

# Агент для излечения данных из полного исходного текста

In [6]:
class TableData(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    headers: List[str] = Field(
        default_factory=list,
        description="Column headers of the table"
    )
    rows: List[List[str]] = Field(
        default_factory=list,
        description="Table data rows"
    )
    caption: Optional[str] = Field(
        default=None,
        description="Table caption or description"
    )

class ExperimentInfo(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    id: str = Field(description="Unique experiment identifier (e.g., 'Example 1')")
    text_of_example: str = Field(description="Exact text quote from the patent")
    type: str = Field(
        description="Type of experiment",
        # Using Field with allowed values instead of pattern
        json_schema_extra={"enum": ["catalyst_synthesis", "polymerization", "table"]}
    )
    reference: List[str] = Field(
        default_factory=list,
        description="List of referenced experiment IDs"
    )
    table_data: Optional[TableData] = Field(
        default=None,
        description="Structured table data if type is 'table'"
    )

class PatentInfo(BaseModel):
    model_config = ConfigDict(extra='forbid')
    
    experiments: List[ExperimentInfo] = Field(
        default_factory=list,
        description="List of experiments found in the patent"
    )

In [7]:
def extract_info_agent(text: str, model:str) -> PatentInfo:

    client = OpenAI()
    text = '\n'.join(line for line in text.split('\n') if len(line.strip()) > 5)
    # Define the system and human message templates
    system_template = """You are a chemistry assistant specialized in analyzing patent documents. 
    Focus on identifying experimental procedures, their types, and cross-references between examples. 
    Always return valid JSON that matches the specified schema.
    Important: Be thorough and meticulous in your analysis. Make sure to:
    - Carefully read and process every detail in the text
    - Don't skip any experimental examples
    - Double-check all cross-references between examples
    - Verify that all identified information is included in the output
    - Ensure complete coverage of the input text"""
    
    human_template = f"""As a chemistry assistant specializing in catalysis and polyolefins, 
    analyze the following patent text and extract structured information in JSON format.

    For each experimental example, provide:
    1. Type of experiment (one of):
       - catalyst_synthesis (for catalyst preparation procedures)
       - polymerization (for polymerization reactions)
       - table (for tabulated results or comparative data)
    
    2. Original text of the example (exact quote)
    
    3. References to other experiments (if any):
       - Look for phrases like "as in Example 1", "according to Example 2", etc.
       - Include experiment numbers that are referenced

    In case of tables, provide the exact text of the table or comparative data.

    Text to analyze: {text}
    """
   
    completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system", "content": system_template},
        {"role": "user", "content": human_template}
    ],
    response_format=PatentInfo,
    )
    
    return completion

In [8]:
import json
from pprint import pprint

def pretty_print_patent_info(patent_info,  width=80):
    """
    Format and print patent information in a readable way.
    Args: patent_info: Response from the OpenAI API containing patent analysis
    """
    try:
        # Convert string to JSON if needed
        if isinstance(patent_info.choices[0].message.content, str):
            content = json.loads(patent_info.choices[0].message.content)
        else:
            content = patent_info.choices[0].message.content
            
        # Print with nice formatting
        print("\n=== Patent Analysis Results ===\n")
        
        # Format with JSON indentation
        formatted_json = json.dumps(content, indent=2, ensure_ascii=False)
        print(formatted_json)
        
        print("\n=== End of Analysis ===")
        
    except Exception as e:
        print(f"Error formatting patent info: {str(e)}")

## Агент на основе gpt-4o-mini

In [9]:
gpt_4o_mini_results = extract_info_agent(text=text, model="gpt-4o-mini")
pretty_print_patent_info(gpt_4o_mini_results)

2025-03-29 23:04:41,523 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



=== Patent Analysis Results ===

{
  "experiments": [
    {
      "id": "Example 1",
      "text_of_example": "20.0 grams of Davison grade 955 silica which had been heated to 600 C. for about 16 hours under a dry nitrogen purge was slurried in about 200 (milliters) mls of dry hexane contained in a 4-neck 500 ml round bottom flask kept under a continuous nitrogen purge and fitted with an overhead stirrer and a reflux condenser. The slurry was heated to and maintained are reflux, and 52.4 mls of dibutylmagnesium (DBM), 0.77 Molar solution in heptane, was added dropwise to the slurry (about 15 minutes) and the reflux continued for one hour. Next, 4.15 mls of absolute ethanol diluted in about 40 mls of dry hexane was added dropwise to the slurry (about 15 minutes) and reflux continued for an additional hour. Finally, 2.4 mils of TiCl4 diluted in about 40 mls of dry hexane was added dropwise, the reflux was continued for an additional hour, and solvents were removed by distillation and dri

📊 Анализ результатов

❌ **Проблемы**
- Утрачены оригинальные тексты экспериментов
- Значительная часть информации не сохранилась при извлечении
- Недостаточная валидация данных
- Отсутствие контроля качества

✅ **Успехи**
- Некоторые таблицы извлечены корректно
- Часть экспериментальных данных успешно обработана
- Структура данных соответствует требованиям
- Базовый функционал работает стабильно

🔄 **План улучшений**
- попробовать более мощную LLM 

## Агент на основе gpt-4o

In [10]:
gpt_4o_results = extract_info_agent(text=text, model="gpt-4o")
pretty_print_patent_info(gpt_4o_results, width=80)

2025-03-29 23:05:20,767 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



=== Patent Analysis Results ===

{
  "experiments": [
    {
      "id": "Example 1",
      "text_of_example": "20.0 grams of Davison grade 955 silica which had been heated to 600 C. for about 16 hours under a dry nitrogen purge was slurried in about 200 (milliters) mls of dry hexane contained in a 4-neck 500 ml round bottom flask kept under a continuous nitrogen purge and fitted with an overhead stirrer and a reflux condenser. The slurry was heated to and maintained are reflux, and 52.4 mls of dibutylmagnesium (DBM), 0.77 Molar solution in heptane, was added dropwise to the slurry (about 15 minutes) and the reflux continued for one hour. Next, 4.15 mls of absolute ethanol diluted in about 40 mls of dry hexane was added dropwise to the slurry (about 15 minutes) and reflux continued for an additional hour. Finally, 2.4 mils of TiCl4 diluted in about 40 mls of dry hexane was added dropwise, the reflux was continued for an additional hour, and solvents were removed by distillation and dri

📊 Анализ результатов

❌ **Проблемы**
- Не извлечены таблицы. Таблицы извлекаются не стабильно. То извлекаются, то нет. 

✅ **Успехи**
- Модель отрабатывает лучше. Однако на сколько хорошо отрабытвает модель на большой выборке не ясно.

🔄 **План улучшений**
- Надо улучшить предобработку. Попробовать уменьшить количество данных на вход и сделать фильтрацию входного текста

## Тестирвоание агентов на выборке из 10 патентов 

In [11]:
from pathlib import Path
from datetime import datetime
import json
import textwrap
import subprocess
import tempfile

In [12]:
input_directory = "/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments"
input_path = Path(input_directory)
pdf_files = [f for f in input_path.glob('*.pdf') if f.is_file()]
pdf_files

[PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US6617405.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4481301.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US20060046928.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4849389.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4843132.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US9352308.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US6248831.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4374753.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US20060166812.pdf'),
 PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_f

In [13]:
def analyze_pdf_files(pdf_files:str , models: list) -> list:
    """
    Analyze a list of PDF files and extract information using the specified model.
    
    Args:
        pdf_files (list): List of PDF file paths to analyze
        model (str): Model name to use for analysis
    
    Returns:
        list: List of extracted information from each PDF file
    """
    results = []
    for model in models:
        logger.info(f"Analyzing {len(pdf_files)} PDF files using model: {model}")
        logger.info(f"Using model: {model}")
        logger.info(f"Number of PDF files: {len(pdf_files)}")
        logger.info(f"PDF files: {pdf_files}")
        # Initialize an empty list to store results
        # Iterate through each PDF file
        for file_path in pdf_files:
            text = load_text_from_pdf(file_path)
            patent_info = extract_info_agent(text=text, model=model)
            results.append({'model': model,
                            'text': text,
                            'file_path': str(file_path),
                            'llm_extraction_results':patent_info})
    return results

In [14]:
results = analyze_pdf_files(pdf_files=pdf_files, models=["gpt-4o-mini", "gpt-4o"])
results = pd.DataFrame(results)
results.to_csv("/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_analysis_with_initial_text/results/extraction_results.csv", index=False)
results

2025-03-29 23:05:20,892 - INFO - Analyzing 10 PDF files using model: gpt-4o-mini
2025-03-29 23:05:20,893 - INFO - Using model: gpt-4o-mini
2025-03-29 23:05:20,895 - INFO - Number of PDF files: 10
2025-03-29 23:05:20,896 - INFO - PDF files: [PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US6617405.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4481301.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US20060046928.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4849389.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US4843132.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US9352308.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mipt/mipt_thesis/patent_files_for_experiments/US6248831.pdf'), PosixPath('/home/anatoly_kayda/Desktop/mi

Unnamed: 0,model,text,file_path,llm_extraction_results
0,gpt-4o-mini,USOO6617405B1 \n (12) United States Patent (10...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
1,gpt-4o-mini,United States Patent (19) \n Nowlin et al. \n ...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
2,gpt-4o-mini,(19) United States US 20060046928A1 \n (12) Pa...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
3,gpt-4o-mini,United States Patent (19) \n Nowlin et al. \n ...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
4,gpt-4o-mini,United States Patent (19) \n Werner et al. \n ...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
5,gpt-4o-mini,(12) United States Patent \n Braganca et al. U...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
6,gpt-4o-mini,USOO6248831B1 \n (12) United States Patent (10...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
7,gpt-4o-mini,United States Patent (19) \n Pullukat et al. \...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
8,gpt-4o-mini,(19) United States US 2006O166812A1 \n (12) Pa...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...
9,gpt-4o-mini,United States Patent (19) \n Strobel et al. \n...,/home/anatoly_kayda/Desktop/mipt/mipt_thesis/p...,ParsedChatCompletion[PatentInfo](id='chatcmpl-...


In [16]:
def display_formatted_results(data: pd.DataFrame):
    """
    Display model results with formatted text wrapping and clear output between entries.
    
    Args:
        data (pd.DataFrame): DataFrame containing model results
        width (int): Maximum line width for text output
    """
    data['how_many_exps_was_extracted'] = None
    data['how_many_exps_was_extracted_correctly'] = None
    data['how_many_tables_were_found'] = None
    data['how_many_tables_were_found_correctly'] = None
    data['how_many_links_were_found'] = None
    data['how_many_links_were_found_correctly'] = None
   
    
    for idx, row in data.iterrows(): 
        
        # Wrap and display the model's output
        wrapped_text = textwrap.fill(
            row['llm_extraction_results'],
            width=80,
            initial_indent="  ",
            subsequent_indent="  "
        )

        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            content = f"""  {'='*80}
                                Model: {row['model']}
                                Patent: {Path(row['file_path']).stem}

                                {'='*80}
                                Extracted Information:
                                {'='*80}

                                {wrapped_text}

                                {'='*80}
                                Current Statistics:
                                {'='*80}

                                - Documents Processed: {idx+1}/{len(data)}

                                {'='*80}
                                Instructions:
                                - Review the extracted information above
                                - Close this file when done reviewing
                                - Return to terminal to enter your feedback
                                {'='*80}
                                """
        

            f.write(content)
            temp_file = f.name
        
        subprocess.run(['code', '-w', temp_file])
        # Get user feedback
        feedback = input("\nHow_many_exps_was_extracted (correct/total): ")
        data.at[idx, 'how_many_exps_was_extracted'] = feedback

        feedback = input("\nHow_many_exps_was_extracted_correctly (correct/total): ")
        data.at[idx, 'how_many_exps_was_extracted_correctly'] = feedback

        feedback = input("\nHow_many_tables_were_found (correct/total): ")
        data.at[idx, 'how_many_tables_were_found'] = feedback

        feedback = input("\nHow_many_tables_were_found_correctly (correct/total): ")
        data.at[idx, 'how_many_tables_were_found_correctly'] = feedback

        feedback = input("\nHow_many_links_were_found (correct/total): ")
        data.at[idx, 'how_many_links_were_found'] = feedback

        feedback = input("\nHow_many_links_were_found_correctly (correct/total): ")
        data.at[idx, 'how_many_links_were_found_correctly'] = feedback
        os.unlink(temp_file)
    return data

📊 Анализ результатов

❌ **Проблемы**
- GPT-4o-mini справляется с задачей весь плохо. Часто теряются данные, плохо извлекаются таблицы. 
- GPT-4o отрабатывает значительно лучше, но не идельно много текста теряется, текст извлакается нестабильно. Надо подумать о том, как улучшить фильтрацию

✅ **Успехи**
- В целом кажется, что данные можно научиться извлекать. Однако, необходимо доработать предобработку. 

🔄 **План улучшений**
- Проработать простую фильтрацию текста
- Проработать использование библиотке docling
