In [35]:
# Imports
import dotenv
from IPython.display import Markdown, display

# Load variables from a local .env file (e.g. MISTRAL_API_KEY) into the environment.
dotenv.load_dotenv()

True

# Extract the text from the pdf

To extract the text from the pdf, we can use any implementation of the `PdfExtractorInterface` class and pass the pdf data as bytes to its `forward` method.

```python
class PdfExtractorImplementation(PdfExtractorInterface):
    def forward(self, input: bytes) -> str:
        return "Hello, world!"

pdf_extractor = PdfExtractorImplementation()
text = pdf_extractor.forward(pdf_data)
```

These implementations are available in the `llm_synthesis.transformers.pdf_extraction` module.
- `DoclingPDFExtractor`
- `MistralPDFExtractor`







In [36]:
# Paper selection used by the cells below.
# paper_name is the PDF file name WITHOUT the trailing ".pdf"
# (e.g. paper_name = "Adeosun_2024_Direct"); it is used to build the
# input and output file paths.
paper_name = "Adeosun_2024_Direct"
# Identifier passed as `id` when the Paper object is created.
paper_id = "001"
# Supplementary-information text passed as `si_text`; left empty here.
suppl_text = ""

## DoclingPDFExtractor

Extract Data locally with Docling

In [None]:
from llm_synthesis.transformers.pdf_extraction import DoclingPDFExtractor

# Read the raw PDF bytes; the extractor is given bytes, not a file path.
pdf_path = "../data/pdf_papers/" + paper_name + ".pdf"
with open(pdf_path, "rb") as pdf_file:
    pdf_data = pdf_file.read()

# Options for the local Docling run, collected in one place for easy tuning.
docling_options = {
    "pipeline": "standard",
    "table_mode": "accurate",
    "add_page_images": False,
    "use_gpu": True,
    "scale": 2.0,
    "format": "markdown",
}
pdf_extractor = DoclingPDFExtractor(**docling_options)

# Run the extraction and keep the result under a Docling-specific name.
docling_extracted_text = pdf_extractor.forward(pdf_data)

In [None]:
# Render the Docling output as formatted markdown for a visual check.
docling_markdown = Markdown(docling_extracted_text)
display(docling_markdown)

Docling version: Save markdown and create paper object

In [None]:
# Persist the Docling markdown so that later cells can reload it from disk.
docling_md_path = "../data/txt_papers/docling/" + paper_name + ".md"
with open(docling_md_path, "w+", errors="replace") as md_file:
    md_file.write(docling_extracted_text)

In [None]:
# Reload the saved Docling markdown and wrap it in a Paper object.
from llm_synthesis.models.paper import Paper

# Open read-only. Mode "w+" truncates the file before reading, so read()
# would return an empty string and the saved markdown would be wiped.
with open("../data/txt_papers/docling/" + paper_name + ".md", "r") as f:
    docling_extracted_text = f.read()

paper = Paper(
    id=paper_id,
    name=paper_name,
    publication_text=docling_extracted_text,
    si_text=suppl_text,
)

## MistralPDFExtractor

Extract Data with Mistral.

NB: You need to have a mistral account and a valid API key.

In [44]:
import dotenv

from llm_synthesis.transformers.pdf_extraction import MistralPDFExtractor

# Read the raw PDF bytes that are handed to the extractor.
pdf_path = "../data/pdf_papers/" + paper_name + ".pdf"
with open(pdf_path, "rb") as pdf_file:
    pdf_data = pdf_file.read()

# The API key can be passed as an argument or set in the MISTRAL_API_KEY
# environment variable (use a .env file).
pdf_extractor = MistralPDFExtractor(structured=False)

# Run the extraction and keep the result under a Mistral-specific name.
mistral_extracted_text = pdf_extractor.forward(pdf_data)

d:\Start\Study\_Master\LeMaterial\lematerial-llm-synthesis\.venv\Lib\site-packages\mistralai\models\documenturlchunk.py:38: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for n, f in self.model_fields.items():
d:\Start\Study\_Master\LeMaterial\lematerial-llm-synthesis\.venv\Lib\site-packages\mistralai\models\ocrrequest.py:94: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for n, f in self.model_fields.items():


In [None]:
# Render the Mistral output as formatted markdown for a visual check.
mistral_markdown = Markdown(mistral_extracted_text)
display(mistral_markdown)

Mistral version: Save markdown and create paper object

In [46]:
# Persist the Mistral markdown so that later cells can reload it from disk.
mistral_md_path = "../data/txt_papers/mistral/" + paper_name + ".md"
with open(mistral_md_path, "w+", errors="replace") as md_file:
    md_file.write(mistral_extracted_text)

In [None]:
# Reload the saved Mistral markdown and wrap it in a Paper object.
from llm_synthesis.models.paper import Paper

# Open read-only. Mode "w+" truncates the file before reading, so read()
# would return an empty string and the saved markdown would be wiped.
with open("../data/txt_papers/mistral/" + paper_name + ".md", "r") as f:
    mistral_extracted_text = f.read()

paper = Paper(
    id=paper_id,
    name=paper_name,
    publication_text=mistral_extracted_text,
    si_text=suppl_text,
)

# Extract text from a markdown text

These functions are available in the `llm_synthesis.transformers.text_extraction` module, from which you can extract any arbitrary text from the publication text.

We currently use dspy to extract text from a markdown text.

Here are a few examples of how to use it.

In [None]:
from llm_synthesis.transformers.material_extraction import (
    DspyTextExtractor,
    make_dspy_text_extractor_signature,
)
from llm_synthesis.utils.dspy_utils import get_llm_from_name
from llm_synthesis.utils.markdown_utils import remove_figs

# Describe the extraction task: the instruction given to the model, what the
# input is, and what the output field is called.
# Alternative, more detailed instruction kept for reference:
# instructions="Extract all paragraphs describing the synthesis procedure from the markdown publication text. Look for relevant section titles (e.g., “Catalyst Preparation,” “Pd nanoparticles”). Mention all synthesis steps, do not summarize, shorten, or modify the paragraph!",
signature_options = {
    "signature_name": "ExtractSynthesisParagraph",
    "instructions": "Extract the synthesis paragraph from the markdown publication text.",
    "input_description": "The markdown publication text to extract the synthesis paragraph from.",
    "output_name": "synthesis_paragraph",
    "output_description": "The extracted synthesis paragraph.",
}
signature = make_dspy_text_extractor_signature(**signature_options)

# Strip the figures from the publication text before it is sent to the model.
paper.publication_text = remove_figs(paper.publication_text)

# Language model used for the extraction.
# Alternative model: get_llm_from_name("gemini-2.0-flash", {"temperature": 0.0})
llm_settings = {"temperature": 0.0}
lm = get_llm_from_name("mistral-small", llm_settings)

# Combine signature and model into an extractor, then run it on the paper.
text_extractor = DspyTextExtractor(signature, lm)
synthesis_paragraph = text_extractor.forward(paper.publication_text)

From here on, the extraction is based on the mistral result.
To use the docling result instead, in each of the following cells uncomment the `open(...)` line that points to the `docling` folder and comment out the `mistral` line directly below it.

In [119]:
# Save the extracted synthesis paragraph so that later cells can reload it.
# errors="replace" matches the other save cells: characters the default
# encoding cannot represent are replaced instead of raising an error.
# To save a docling-based result instead, swap the two `with open` lines:
# with open("../data/txt_papers/docling/test_" + paper_name + ".md", "w+", errors="replace") as f:
with open("../data/txt_papers/mistral/test_" + paper_name + ".md", "w+", errors="replace") as f:
    f.write(synthesis_paragraph)

# Keep this expression last: Jupyter only displays a cell's final expression.
synthesis_paragraph

The goal of the signature is to give the LLM context on what its actual task is.

In [120]:
# Same extractor class, different task: only the signature changes.
first_paragraph_options = {
    "signature_name": "ExtractFirstParagraph",
    "instructions": "Extract the first paragraph from the markdown publication text.",
    "input_description": "The markdown publication text to extract the first paragraph from.",
    "output_name": "first_paragraph",
    "output_description": "The extracted first paragraph.",
}
signature = make_dspy_text_extractor_signature(**first_paragraph_options)

# Reuse the language model created earlier with the new signature.
text_extractor = DspyTextExtractor(signature, lm)

# Leave the call as the last expression so its result is displayed.
text_extractor.forward(paper.publication_text)

'This is the first paragraph of the markdown publication text. It provides an introduction to the topic and sets the stage for the rest of the article.'

# Extract Structured Data from the synthesis paragraph

These functions are available in the `llm_synthesis.transformers.structured_data_extraction` module, from which you can extract any arbitrary structured data from the publication text.

We currently use dspy to extract structured data from a markdown text.

Here are a few examples of how to use it.

In [125]:
from llm_synthesis.transformers.synthesis_extraction import (
    DspySynthesisExtractor,
    make_dspy_synthesis_extractor_signature,
)
from llm_synthesis.utils.dspy_utils import get_llm_from_name

# Let's make a signature for the structured data extraction
# NOTE(review): the output recorded for this cell is
# "TypeError: make_dspy_synthesis_extractor_signature() got an unexpected
# keyword argument 'input_description'". The keyword names the function
# accepts are not visible in this notebook -- confirm them against the
# function definition before relying on this call.
signature = make_dspy_synthesis_extractor_signature(
    signature_name="ExtractStructuredSynthesis",
    instructions="Extract the structured synthesis from the synthesis paragraph.",
    input_description="The synthesis paragraph to extract the structured synthesis from.",
    output_name="structured_synthesis",
    output_description="The extracted structured synthesis.",
)

lm = get_llm_from_name("gemini-2.0-flash", {"temperature": 0.0})
# Let's make a structured data extractor
structured_data_extractor = DspySynthesisExtractor(signature, lm)

# Load the synthesis paragraph that was saved earlier under the "test_"
# prefix (the file without the prefix holds the full paper markdown).
# with open("../data/txt_papers/docling/test_" + paper_name + ".md") as f:
with open("../data/txt_papers/mistral/test_" + paper_name + ".md") as f:
    synthesis_paragraph = f.read()

# Let's extract the structured data
structured_data = structured_data_extractor.forward(synthesis_paragraph)

structured_data

TypeError: make_dspy_synthesis_extractor_signature() got an unexpected keyword argument 'input_description'

In [None]:
from llm_synthesis.utils.markdown_utils import remove_figs

# Load the full extracted paper markdown. The "test_" file only holds the
# previously extracted synthesis paragraph, whereas this cell runs the
# structured extraction on the whole publication text.
# with open("../data/txt_papers/docling/" + paper_name + ".md") as f:
with open("../data/txt_papers/mistral/" + paper_name + ".md") as f:
    publication_text = f.read()

# Strip the figures before the text is sent to the model.
publication_text = remove_figs(publication_text)

# Let's extract the structured data
structured_data_from_publication_text = structured_data_extractor.forward(
    publication_text
)

structured_data_from_publication_text

# Extract figures from the publication text

These functions are available in the `llm_synthesis.transformers.figure_extraction` module, from which you can extract any arbitrary figures from the publication text.

We currently expect the pdf_parser to embed figures in the markdown text to be able to extract figures from a markdown text.

In [None]:
!pip install matplotlib 

In [None]:
from llm_synthesis.transformers.figure_extraction import (
    FigureExtractorMarkdown,
)

figure_extractor = FigureExtractorMarkdown()

# Read the full extracted paper markdown, which is where the PDF parser embeds
# the figures. The "test_" file holds the synthesis paragraph that was
# extracted after remove_figs(), so it has no embedded figures to find.
# with open("../data/txt_papers/docling/" + paper_name + ".md", errors="replace") as f:
with open("../data/txt_papers/mistral/" + paper_name + ".md", errors="replace") as f:
    publication_text = f.read()

figures = figure_extractor.forward(publication_text)

print(f"{len(figures)} figures was found in this paper.")

In [None]:
import base64
import io

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

separator = "=" * 80

# Show every extracted figure together with its classification.
for figure_number, figure in enumerate(figures, start=1):
    print(separator)

    # Human-readable label for the quantitative flag
    if figure.quantitative:
        is_quantitative = "Quantitative"
    else:
        is_quantitative = "Not Quantitative"

    # Summary line for this figure
    print(
        f"Figure {figure_number}: {is_quantitative} - Figure Class: {figure.figure_class}"
    )

    # base64 text -> raw bytes -> PIL image -> NumPy array for imshow
    decoded_bytes = base64.b64decode(figure.base64_data)
    image = Image.open(io.BytesIO(decoded_bytes))
    image_array = np.array(image)

    # Draw the image without axes and title it with its classification
    plt.imshow(image_array)
    plt.axis("off")
    plt.title(f"{is_quantitative}: {figure.figure_class}")
    plt.show()

In [None]:
# Let's show the first figure

import base64

# Import under an alias so this does not shadow the PIL `Image` class that
# the plotting cell imports under the same name.
from IPython.display import Image as IPythonImage

# Decode the base64 string to raw image bytes; leaving the object as the
# cell's last expression makes Jupyter render it.
IPythonImage(base64.b64decode(figures[0].base64_data))

In [None]:
# Print the metadata fields stored alongside the first extracted figure.
first_figure = figures[0]
for field_name in (
    "alt_text",
    "context_before",
    "context_after",
    "figure_reference",
    "position",
):
    print(field_name + ": ", getattr(first_figure, field_name))

# Extract figure descriptions from the publication text

These functions are available in the `llm_synthesis.transformers.figure_description` module, from which you can get figure descriptions from the publication text and figure info.

The current implementation uses dspy to get figure descriptions from the publication text and figure info.



In [None]:
from llm_synthesis.models.figure import FigureInfoWithPaper
from llm_synthesis.transformers.figure_description import (
    DspyFigureDescriptionExtractor,
    make_dspy_figure_description_extractor_signature,
)
from llm_synthesis.utils.dspy_utils import get_llm_from_name
from llm_synthesis.utils.markdown_utils import remove_figs

# Let's make a signature for the figure description extraction
signature = make_dspy_figure_description_extractor_signature(
    signature_name="DspyFigureDescriptionExtractorSignature",
    instructions="Extract the figure description from the figure.",
    publication_text_description="The publication text to extract the figure description from.",
    si_text_description="The supporting information text to extract the figure description from.",
    figure_base64_description="The base64 encoded image of the figure to extract the description from.",
    caption_context_description="The text context surrounding the figure position including the figure caption and nearby paragraphs that reference this figure.",
    figure_position_info_description="The information about the figure's position in the document (e.g., 'Figure 2', 'Fig. 3a', 'Scheme 1') to help with contextual understanding.",
    figure_description_description="The extracted figure description.",
)

lm = get_llm_from_name("gpt-4o-mini", {"temperature": 0.0})

# Read the full extracted paper markdown as the publication text. The "test_"
# file only holds the previously extracted synthesis paragraph.
# with open("../data/txt_papers/docling/" + paper_name + ".md", errors="replace") as f:
with open("../data/txt_papers/mistral/" + paper_name + ".md", errors="replace") as f:
    publication_text = f.read()

# Strip the embedded figures; the figure itself is passed separately below.
publication_text = remove_figs(publication_text)

# Describe the third extracted figure (index 2); this needs at least three
# entries in `figures`.
figure_info_with_paper = FigureInfoWithPaper(
    **figures[2].__dict__,
    paper_text=publication_text,
    si_text="",
)

figure_description_extractor = DspyFigureDescriptionExtractor(signature, lm)

figure_description = figure_description_extractor.forward(
    figure_info_with_paper
)

figure_description