# Imports

In [6]:
import os
import json
import logging
from pathlib import Path
from PyPDF2 import PdfReader
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import pandas as pd

In [7]:
from dotenv import load_dotenv, find_dotenv

def load_env():
    """Locate a .env file with find_dotenv() and load it with load_dotenv()."""
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)


load_env()

# Patent preprocessing

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import BooleanOutputParser
from typing import List

# Logging setup: INFO-and-above records go to 'patent_analysis.log' and to the
# console stream, each prefixed with a timestamp and the level name.
logging.basicConfig(
    handlers=[
        logging.FileHandler('patent_analysis.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the functions in this notebook.
logger = logging.getLogger(__name__)


In [9]:
# def has_experimental_data(chunk: str) -> bool:
#     """Analyze if text contains experimental data using GPT-4"""
#     try:
#         if not chunk.strip():
#             return False
            
#         # Initialize model outside of function to avoid recreation
#         model = ChatOpenAI(
#             model="gpt-4o-mini",  # Use full GPT-4 model for better accuracy
#             temperature=0,   # Keep temperature at 0 for consistent results
#             max_retries=2    # Add retries for reliability
#         )
            
#         prompt = ChatPromptTemplate.from_messages([
#             ("system", """You are a chemical patent analyzer specialized in identifying experimental data.
# TASK: Determine if the text contains SPECIFIC experimental data.
# RESPOND ONLY WITH 'YES' or 'NO' - no other text allowed.

# EXPERIMENTAL DATA MUST INCLUDE AT LEAST ONE OF:
# 1. NUMERICAL VALUES with UNITS:
#    - Measurements (e.g., "5.2g", "75°C", "2 hours", "10 mL")
#    - Concentrations (e.g., "0.1M", "5 wt%")
#    - Reaction parameters (e.g., "heated at 80°C")

# 2. DETAILED PROCEDURES:
#    - Step-by-step synthesis descriptions
#    - Specific reaction conditions
#    - Laboratory methods with concrete steps

# 3. EXPERIMENTAL RESULTS:
#    - Yield values (e.g., "85% yield")
#    - Analysis data (e.g., "mp: 120-122°C")
#    - Measured properties with values

# KEY RULES:
# - Must contain SPECIFIC NUMBERS with UNITS
# - General descriptions = NO
# - Theoretical explanations = NO
# - Claims without data = NO
# - Future possibilities = NO
# - Ranges without specific examples = NO

# Quick check: If you can't find ANY NUMERICAL VALUES or SPECIFIC MEASUREMENTS, answer NO."""),
#             ("human", "TEXT TO ANALYZE:\n{chunk}")
#         ])

#         output_parser = BooleanOutputParser()
#         chain = prompt | model | output_parser
        
#         return chain.invoke({"chunk": chunk})
        
#     except Exception as e:
#         logger.error(f"Chunk analysis failed: {str(e)}")
#         return False

In [10]:
# def process_pdf(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> List[dict]:
#     """
#     Process PDF file and analyze chunks for experimental data.
#     """
#     results = []
    
#     # Get file information
#     file_name = os.path.basename(file_path)
#     absolute_path = os.path.abspath(file_path)
    
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         length_function=len,
#         separators=["\n\n", "\n", " ", ""]
#     )
    
#     try:
#         with open(file_path, "rb") as f:
#             reader = PdfReader(f)
#             for page_num, page in enumerate(reader.pages, start=1):
#                 logger.info(f"Processing page {page_num}")
#                 text = page.extract_text() or ""
#                 chunks = text_splitter.split_text(text)
                
#                 for i, chunk in enumerate(chunks):
#                     logger.info(f"Analyzing chunk {i+1} of page {page_num}")
#                     has_exp_data = has_experimental_data(chunk)
#                     results.append({
#                         'file_name': file_name,
#                         'file_path': absolute_path,
#                         'page_number': page_num,
#                         'chunk_number': i+1,
#                         'has_experimental_data': has_exp_data,
#                         'text': chunk
#                     })
                    
#     except Exception as e:
#         logger.error(f"PDF processing failed: {str(e)}")
#     results = pd.DataFrame(results)
#     results.to_csv(f"output/{file_name}_parsed.csv")

#     return results

In [11]:
# class ExperimentalDataAnalysis(BaseModel):
#     contains_experimental: bool = Field(description="Whether the text contains experimental data")
#     confidence: float = Field(description="Confidence score between 0 and 1")
#     reason: str = Field(description="Brief explanation for the decision with examples")

# def analyze_experimental_data(chunk: str) -> Dict[str, Any]:
#     """
#     Analyze text for experimental data with structured output and confidence scoring
#     """
#     try:
#         if not chunk.strip():
#             return {"contains_experimental": False, "confidence": 1.0, "reason": "Empty text"}

#         model = ChatOpenAI(
#             model="gpt-4o-mini",
#             temperature=1,
#             request_timeout=60
#         ).with_structured_output(ExperimentalDataAnalysis)

#         prompt = ChatPromptTemplate.from_messages([
#             ("system", 
#             """You are a precise analyzer of chemical patents focused on identifying 
#              experimental data.

#             DEFINITION OF EXPERIMENTAL DATA:
#             Experimental data MUST contain specific numerical measurements or detailed procedural steps from actual experiments.

#             REQUIRED ELEMENTS (at least one):
#             1. Specific numerical measurements with units:
#             - Quantities (e.g., "5.2g", "10 mL")
#             - Temperature values (e.g., "75°C")
#             - Time durations (e.g., "2 hours")
#             - Pressure readings (e.g., "5 atm")
#             - Concentrations (e.g., "0.1M")
#             - Tables
#             2. Detailed experimental procedures:
#             - Step-by-step synthesis methods
#             - Specific reaction conditions with values
#             - Actual laboratory protocols

#             3. Concrete experimental results:
#             - Measured yields with values
#             - Analysis data with numbers
#             - Test results with specific values

#             NOT experimental data:
#             - Generic descriptions without numbers
#             - Theoretical explanations
#             - Patent claims without data
#             - Potential applications
#             - Broad ranges without specific examples

#             FORMAT YOUR RESPONSE AS:
#             {{
#                 "contains_experimental": true/false,
#                 "confidence": <float between 0-1>,
#                 "reason": "brief explanation"
#             }}"""),
#             ("human", "Analyze this text:\n{chunk}")
#         ])

#         chain = prompt | model 
#         result = chain.invoke({"chunk": chunk})
        
#         # Only consider it experimental if confidence is high enough
#         if result.confidence < 0.8:
#             result.contains_experimental = False
            
#         return {
#             "has_experimental_data": result.contains_experimental,
#             "confidence": result.confidence,
#             "reason": result.reason
#         }
#     except Exception as e:
#         logger.error(f"Analysis failed: {str(e)}")
#         return {
#             "has_experimental_data": False,
#             "confidence": 0.0,
#             "reason": f"Error: {str(e)}"
#         }

# # Update your process_pdf function to use the new analyzer
# def process_pdf(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> pd.DataFrame:
#     results = []
#     file_name = os.path.basename(file_path)
#     absolute_path = os.path.abspath(file_path)
    
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         length_function=len,
#         separators=["\n\n", "\n", " ", ""]
#     )
    
#     try:
#         with open(file_path, "rb") as f:
#             reader = PdfReader(f)
#             for page_num, page in enumerate(reader.pages, start=1):
#                 text = page.extract_text() or ""
#                 chunks = text_splitter.split_text(text)
                
#                 for i, chunk in enumerate(chunks):
#                     analysis = analyze_experimental_data(chunk)
#                     results.append({
#                         'file_name': file_name,
#                         'file_path': absolute_path,
#                         'page_number': page_num,
#                         'chunk_number': i+1,
#                         'has_experimental_data': analysis['has_experimental_data'],
#                         'confidence': analysis['confidence'],
#                         'reason': analysis['reason'],
#                         'text': chunk
#                     })
                    
#     except Exception as e:
#         logger.error(f"PDF processing failed: {str(e)}")
    
#     df = pd.DataFrame(results)
#     df.to_csv(f"output/{file_name}_parsed.csv")
#     return df


In [26]:
from typing import Any, Dict

# Structured-output schema that the chat model is asked to fill in for one piece
# of text (it is passed to `with_structured_output` in analyze_experimental_data).
# NOTE(review): documented with `#` comments on purpose -- a class docstring on a
# pydantic model presumably becomes part of the schema sent to the model; confirm
# before adding one.
class ExperimentalDataAnalysis(BaseModel):
    # The model's yes/no verdict for the analysed text.
    contains_experimental: bool = Field(description="Whether the text contains experimental data")
    # Score reported by the model itself; the 0-1 range is requested in the
    # description/prompt but is not validated here.
    confidence: float = Field(description="Confidence score between 0 and 1")
    # Free-text justification produced by the model.
    reason: str = Field(description="Brief explanation for the decision with examples")

def analyze_experimental_data(chunk: str) -> Dict[str, Any]:
    """
    Ask the chat model whether a piece of patent text contains experimental data.

    Args:
        chunk: Text to analyse (in this notebook: the extracted text of one PDF page).

    Returns:
        A dict that always has the same three keys:
        - "has_experimental_data": bool verdict,
        - "confidence": float score reported by the model,
        - "reason": short explanation (or an error description).

        Blank input short-circuits to a negative result without calling the
        model. Any exception raised while calling the model is logged and
        converted into a negative result with confidence 0.0, so this function
        does not raise for model/network errors.
    """
    # Nothing to analyse: skip the API call. The keys must match the regular
    # result below, otherwise callers indexing "has_experimental_data" fail on
    # blank pages.
    if not chunk.strip():
        return {"has_experimental_data": False, "confidence": 1.0, "reason": "Empty text"}

    try:
        # NOTE(review): logprobs=True is requested, but the structured result
        # used below exposes no log-probabilities -- confirm whether it is needed.
        model = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0,
            request_timeout=60,
            logprobs=True
        ).with_structured_output(ExperimentalDataAnalysis)

        prompt = ChatPromptTemplate.from_messages([
            ("system", 
            """You are a precise analyzer of chemical patents that evaluates how experimental the content is.
            
            SCORING SYSTEM (Calculate confidence based on these criteria):

            EXPERIMENTAL:
            - Complete experimental procedure with multiple specific measurements
              Example: 
              "5.2g of compound A was dissolved in 10 mL methanol, heated at 75°C for 2 hours, filtered, and dried."
            - Presence of experimental tables with specific data

            NON-EXPERIMENTAL:
            - Theoretical descriptions
            - Patent claims
            - General statements without any measurements

            FORMAT RESPONSE AS:
            {{
                "contains_experimental": true/false,
                "confidence": <score 0-1>,
                "reason": "Explain score calculation with specific examples from text"
            }}"""),
            ("human", "Analyze this text:\n{chunk}")
        ])

        chain = prompt | model 
        result = chain.invoke({"chunk": chunk})
        
        return {
            "has_experimental_data": result.contains_experimental,
            "confidence": result.confidence,
            "reason": result.reason
        }
    except Exception as e:
        # Best-effort by design: one failed request must not stop a whole run,
        # so the failure is logged and reported as a negative result.
        logger.error(f"Analysis failed: {str(e)}")
        return {
            "has_experimental_data": False,
            "confidence": 0.0,
            "reason": f"Error: {str(e)}"
        }

def process_pdf(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> pd.DataFrame:
    """
    Analyse every page of a PDF for experimental data and save the results as CSV.

    Each page's extracted text is sent as a whole to analyze_experimental_data;
    one result row is produced per page (chunk_number is always 1).

    Args:
        file_path: Path to the PDF file.
        chunk_size: Kept for backward compatibility; currently unused because
            pages are analysed whole instead of being split into chunks.
        chunk_overlap: Kept for backward compatibility; currently unused.

    Returns:
        DataFrame with columns file_name, file_path, page_number, chunk_number,
        has_experimental_data, confidence, reason and text. The same frame is
        written to "output/<file_name>_parsed.csv". If reading the PDF fails,
        the error is logged and the rows collected so far (possibly none) are
        returned and written.
    """
    file_name = os.path.basename(file_path)
    absolute_path = os.path.abspath(file_path)
    results = []

    try:
        with open(file_path, "rb") as f:
            reader = PdfReader(f)
            for page_num, page in enumerate(reader.pages, start=1):
                # extract_text() may yield nothing for image-only pages.
                text = page.extract_text() or ""
                analysis = analyze_experimental_data(text)
                # Accept either verdict key so that a differently-keyed result
                # for one page cannot abort the rest of the document.
                has_exp_data = analysis.get(
                    'has_experimental_data',
                    analysis.get('contains_experimental', False),
                )
                results.append({
                    'file_name': file_name,
                    'file_path': absolute_path,
                    'page_number': page_num,
                    'chunk_number': 1,
                    'has_experimental_data': has_exp_data,
                    'confidence': analysis.get('confidence', 0.0),
                    'reason': analysis.get('reason', ''),
                    'text': text
                })
    except Exception as e:
        # Best-effort: keep whatever pages were processed before the failure.
        logger.error(f"PDF processing failed: {str(e)}")

    df = pd.DataFrame(results)
    # to_csv does not create missing directories.
    os.makedirs("output", exist_ok=True)
    df.to_csv(f"output/{file_name}_parsed.csv")
    return df

# Run the pipeline on a single patent and display the per-page results.
# NOTE: chunk_size / chunk_overlap do not change the result here, because
# process_pdf sends each page's full text to the analyzer without splitting it.
patent_path = '/home/anatoly_kayda/Desktop/mipt/kayda_thesis/US5461018.pdf'
df = process_pdf(patent_path, chunk_size=2000, chunk_overlap=0)
df

2025-03-16 18:06:19,994 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:22,527 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:24,853 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:28,908 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:31,756 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:36,190 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 18:06:39,259 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Unnamed: 0,file_name,file_path,page_number,chunk_number,has_experimental_data,confidence,reason,text
0,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,1,1,False,0.1,The text primarily consists of patent informat...,United States Patent 19 \n Ala-Huikku et al. I...
1,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,2,1,False,0.1,The text primarily discusses theoretical aspec...,"5,461.018 \n 1 \n PROCATALYST COMPOST ON FOR T..."
2,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,3,1,False,0.2,The text primarily consists of theoretical des...,"5,461.018 \n 3 \n uncontrolled precipitation a..."
3,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,4,1,True,0.9,The text contains a detailed experimental proc...,"5,461,018 \n S \n carrier agent, i.e. before i..."
4,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,5,1,True,0.95,The text contains detailed experimental proced...,"5,461.018 \n 7 \n D. Treating the Precatalyst ..."
5,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,6,1,True,0.95,The text contains detailed experimental proced...,"5,461.018 \n 9 \n Example 8 \n A. Preparation ..."
6,US5461018.pdf,/home/anatoly_kayda/Desktop/mipt/kayda_thesis/...,7,1,True,0.9,The text contains detailed experimental proced...,"5,461.018 \n 11 \n 7.89 ml of a 10% by weight ..."


# Patent analysis

In [13]:
# Root directory that is searched recursively for the patent PDF files.
patent_data = '/home/anatoly_kayda/Desktop/mipt/data'
# Text file listing the selected patents, one name per line, without the '.pdf' suffix.
patent_list = '/home/anatoly_kayda/Desktop/mipt/kayda_thesis/best_10_patents.txt'

In [14]:
# Read the selected patent names (one per line) and turn them into PDF file
# names. Blank lines and surrounding whitespace are ignored so they cannot
# produce bogus names such as '.pdf'.
with open(patent_list) as f:
    best_10_patents = [line.strip() + '.pdf' for line in f if line.strip()]

# Walk the data directory once and map each patent name (file name without
# extension) to the full path of its PDF. A set gives O(1) membership tests;
# if the same file name occurs in several directories, the last one found wins.
wanted_files = set(best_10_patents)
best_10_patents_paths = {}
for dirpath, _dirnames, filenames in os.walk(patent_data):
    for filename in filenames:
        if filename in wanted_files:
            patent_name = os.path.splitext(filename)[0]
            best_10_patents_paths[patent_name] = os.path.join(dirpath, filename)
best_10_patents_paths

{'US6617405': '/home/anatoly_kayda/Desktop/mipt/data/unioncarbide/US/US6617405.pdf',
 'US6248831': '/home/anatoly_kayda/Desktop/mipt/data/unioncarbide/US/US6248831.pdf',
 'US20060046928': '/home/anatoly_kayda/Desktop/mipt/data/novolen/US/US20060046928.pdf',
 'US4843132': '/home/anatoly_kayda/Desktop/mipt/data/novolen/US/US4843132.pdf',
 'US4148754': '/home/anatoly_kayda/Desktop/mipt/data/hoehst/US/US4148754.pdf',
 'US4481301': '/home/anatoly_kayda/Desktop/mipt/data/mobiloil/US/US4481301.pdf',
 'US4849389': '/home/anatoly_kayda/Desktop/mipt/data/mobiloil/US/US4849389.pdf',
 'US4374753': '/home/anatoly_kayda/Desktop/mipt/data/chemplex/US/US4374753.pdf',
 'US9352308': '/home/anatoly_kayda/Desktop/mipt/data/braskem/US/US9352308.pdf',
 'US20060166812': '/home/anatoly_kayda/Desktop/mipt/data/braskem/US/US20060166812.pdf'}

In [15]:
# Analyse every selected patent; each call writes its own CSV under output/.
# Only the paths are needed, so iterate over the dict values. The default chunk
# settings are used: process_pdf analyses whole pages, and chunk_size=0 is not
# a meaningful size for a text splitter.
for patent_path in best_10_patents_paths.values():
    process_pdf(patent_path)

2025-03-16 14:45:29,359 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:31,710 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:34,884 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:37,649 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:40,823 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:44,021 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:46,659 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:49,525 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-16 14:45:52,190 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "

# Human verification of results

In [16]:
import pandas as pd
import os
# Directory holding the per-patent CSV files produced by the analysis step.
preprocessed_csv_files = '/home/anatoly_kayda/Desktop/mipt/kayda_thesis/output'
# Combine all per-patent CSVs into one frame. Only '*.csv' entries are read
# (the directory may also contain other files or sub-directories), and they are
# read in sorted order so the row order is reproducible between runs.
df = pd.concat([
    pd.read_csv(os.path.join(preprocessed_csv_files, file), index_col=0)
    for file in sorted(os.listdir(preprocessed_csv_files))
    if file.endswith('.csv')
])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, US4148754.pdf to US6617405.pdf
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   file_path              109 non-null    object 
 1   page_number            109 non-null    int64  
 2   has_experimental_data  109 non-null    bool   
 3   confidence             109 non-null    float64
 4   reason                 109 non-null    object 
 5   logprobs               109 non-null    object 
 6   text                   109 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 6.1+ KB


In [17]:
# Display the combined per-page analysis results loaded from the CSV files.
df

Unnamed: 0_level_0,file_path,page_number,has_experimental_data,confidence,reason,logprobs,text
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,1,True,0.891125,The text describes a process for preparing a c...,"[np.float64(0.3091042453358316), np.float64(0....",United States Patent (19) \n Strobel et al. \n...
US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,2,True,1.124298,The text contains detailed descriptions of exp...,"[np.float64(0.46443908991413724), np.float64(0...","4,148,754 1. \n PROCESS FOR THE PREPARATION OF..."
US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,3,True,1.129386,The text contains detailed descriptions of che...,"[np.float64(0.45951198501345897), np.float64(0...","4,148,754 3 Suitable halogen-containing magnes..."
US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,4,True,0.987647,The text contains detailed descriptions of che...,"[np.float64(0.4382026634673881), np.float64(0....","4,148,754 \n wherein R2 and R2 is hydrogen, an..."
US4148754.pdf,/home/anatoly_kayda/Desktop/mipt/data/hoehst/U...,5,True,1.129606,The text contains detailed experimental proced...,"[np.float64(0.48283137373023016), np.float64(0...","4,148,754 7 \n ous phase in a fluidized bed. T..."
...,...,...,...,...,...,...,...
US6617405.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,6,True,1.107003,The text contains detailed descriptions of exp...,"[np.float64(0.46443908991413724), np.float64(0...","US 6,617.405 B1 \n 9 \n in the range of about ..."
US6617405.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,7,True,1.137841,The text contains detailed descriptions of exp...,"[np.float64(0.44773368144782066), np.float64(0...","US 6,617.405 B1 \n 11 \n fashion over a larger..."
US6617405.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,8,True,1.095444,"The text contains detailed experimental data, ...","[np.float64(0.3295836866004329), np.float64(0....","US 6,617.405 B1 \n 13 \n TABLE II \n RESIN PRO..."
US6617405.pdf,/home/anatoly_kayda/Desktop/mipt/data/unioncar...,9,True,1.074867,"The text contains detailed experimental data, ...","[np.float64(0.2995732273553991), np.float64(0....","US 6,617.405 B1 \n TABLE IV-continued TABLE VI..."


In [18]:
# Display the mapping from patent name to PDF path.
best_10_patents_paths

{'US6617405': '/home/anatoly_kayda/Desktop/mipt/data/unioncarbide/US/US6617405.pdf',
 'US6248831': '/home/anatoly_kayda/Desktop/mipt/data/unioncarbide/US/US6248831.pdf',
 'US20060046928': '/home/anatoly_kayda/Desktop/mipt/data/novolen/US/US20060046928.pdf',
 'US4843132': '/home/anatoly_kayda/Desktop/mipt/data/novolen/US/US4843132.pdf',
 'US4148754': '/home/anatoly_kayda/Desktop/mipt/data/hoehst/US/US4148754.pdf',
 'US4481301': '/home/anatoly_kayda/Desktop/mipt/data/mobiloil/US/US4481301.pdf',
 'US4849389': '/home/anatoly_kayda/Desktop/mipt/data/mobiloil/US/US4849389.pdf',
 'US4374753': '/home/anatoly_kayda/Desktop/mipt/data/chemplex/US/US4374753.pdf',
 'US9352308': '/home/anatoly_kayda/Desktop/mipt/data/braskem/US/US9352308.pdf',
 'US20060166812': '/home/anatoly_kayda/Desktop/mipt/data/braskem/US/US20060166812.pdf'}

In [19]:
# Load the CSV of human labels and print a summary of its columns.
# NOTE(review): presumably this is the 'patent_analysis_results.csv' written by
# the (commented-out) labelling widget later in this notebook -- confirm.
human_analysis_results = pd.read_csv('/home/anatoly_kayda/Desktop/mipt/kayda_thesis/patent_analysis_results.csv')
human_analysis_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patent_number    131 non-null    object
 1   patent_path      131 non-null    object
 2   page_number      131 non-null    int64 
 3   is_exp_by_human  131 non-null    bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 3.3+ KB


In [20]:
# import fitz  # PyMuPDF
# import os
# import pandas as pd
# from IPython.display import display, Image
# import ipywidgets as widgets
# from IPython.display import clear_output
# import io

# # Initialize empty list to store data
# patent_data = []
# current_patent = None
# current_page = None
# best_10_patents_paths = list(best_10_patents_paths.values())


# def save_response(response):
#     """Save user response and move to next page"""
#     global current_patent, current_page
    
#     patent_data.append({
#         'patent_number': os.path.basename(current_patent).split('.')[0],
#         'patent_path': current_patent,
#         'page_number': current_page + 1,
#         'is_exp_by_human': response
#     })
    
#     # Move to next page or patent
#     display_next_page()

# def display_next_page():
#     """Display next page or finish if all patents processed"""
#     global current_patent, current_page
    
#     clear_output(wait=True)
    
#     if current_patent is None:
#         current_patent = best_10_patents_paths[0]
#         current_page = 0
#     else:
#         doc = fitz.open(current_patent)
#         if current_page + 1 < len(doc):
#             current_page += 1
#         else:
#             current_patent_idx = best_10_patents_paths.index(current_patent)
#             if current_patent_idx + 1 < len(best_10_patents_paths):
#                 current_patent = best_10_patents_paths[current_patent_idx + 1]
#                 current_page = 0
#             else:
#                 # All patents processed
#                 df = pd.DataFrame(patent_data)
#                 df.to_csv('patent_analysis_results.csv', index=False)
#                 print("All patents processed! Results saved to patent_analysis_results.csv")
#                 return

#     # Display current page
#     doc = fitz.open(current_patent)
#     page = doc[current_page]
#     pix = page.get_pixmap()
#     img_data = pix.tobytes()
    
#     print(f"Patent: {os.path.basename(current_patent)}")
#     print(f"Page: {current_page + 1}/{len(doc)}")
    
#     # Display the page image
#     img = widgets.Image(
#         value=img_data,
#         format='png',
#         width=800
#     )
#     display(img)
    
#     # Create and display buttons
#     true_button = widgets.Button(description='Contains Experiment (True)')
#     false_button = widgets.Button(description='No Experiment (False)')
    
#     true_button.on_click(lambda b: save_response(True))
#     false_button.on_click(lambda b: save_response(False))
    
#     button_box = widgets.HBox([true_button, false_button])
#     display(button_box)
    
#     doc.close()

# # Start the process
# display_next_page()