In [None]:
# Imports
from pathlib import Path

import dotenv
from IPython.display import Markdown, display

# Load variables from a local .env file (e.g. API keys) into the environment
dotenv.load_dotenv()

# Extract the text from the pdf


To extract the text from the pdf, we can use any implementation of the `PdfExtractorInterface` class and pass the pdf data as bytes to its `forward` method.

```python
class PdfExtractorImplementation(PdfExtractorInterface):
    def forward(self, input: bytes) -> str:
        return "Hello, world!"

pdf_extractor = PdfExtractorImplementation()
text = pdf_extractor.forward(pdf_data)
```

These implementations are available in the `llm_synthesis.transformers.pdf_extraction` module.

- `DoclingPDFExtractor`
- `MistralPDFExtractor`


## DoclingPDFExtractor

Extract Data locally with Docling


In [None]:
from llm_synthesis.transformers.pdf_extraction import DoclingPDFExtractor

# Gather the Docling settings in one place; the output is requested as markdown.
docling_options = dict(
    pipeline="standard",
    table_mode="accurate",
    add_page_images=False,
    use_gpu=True,
    scale=2.0,
    format="markdown",
)
pdf_extractor = DoclingPDFExtractor(**docling_options)

# The extractor takes the PDF as raw bytes, so the file is read in binary mode.
with open("../data/pdf_papers/test.pdf", "rb") as f:
    pdf_data = f.read()

docling_extracted_text = pdf_extractor.forward(pdf_data)

In [None]:
# Render the Docling output as formatted Markdown in the cell output
display(Markdown(docling_extracted_text))

In [None]:
# Let's save that for later
docling_output_path = Path("../data/txt_papers/docling/test.md")
# open() does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
docling_output_path.parent.mkdir(parents=True, exist_ok=True)

# Plain "w" is enough: the file is only written here, never read back.
with open(docling_output_path, "w") as f:
    f.write(docling_extracted_text)

## MistralPDFExtractor

Extract Data with Mistral.

NB: You need a Mistral account and a valid API key.


In [None]:
import dotenv

from llm_synthesis.transformers.pdf_extraction import MistralPDFExtractor

# You can set the api key as an argument or in the environment variable
# MISTRAL_API_KEY (use a .env file)
pdf_extractor = MistralPDFExtractor(structured=False)

# Read the test PDF as raw bytes for the extractor.
with open("../data/pdf_papers/test.pdf", "rb") as f:
    pdf_data = f.read()

mistral_extracted_text = pdf_extractor.forward(pdf_data)

In [None]:
# Render the Mistral output as formatted Markdown in the cell output
display(Markdown(mistral_extracted_text))

In [None]:
# Let's save that for later
mistral_output_path = Path("../data/txt_papers/mistral/test.md")
# open() does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
mistral_output_path.parent.mkdir(parents=True, exist_ok=True)

# Plain "w" is enough: the file is only written here, never read back.
with open(mistral_output_path, "w") as f:
    f.write(mistral_extracted_text)

In [None]:
# Wrap the Mistral output in a Paper object
from llm_synthesis.models.paper import Paper

# si_text is left empty for this example.
paper = Paper(
    id="test",
    name="test",
    publication_text=mistral_extracted_text,
    si_text="",
)
paper.publication_text

# Extract text from a markdown text


These functions are available in the `llm_synthesis.transformers.text_extraction` module, from which you can extract arbitrary text from the publication text.

We currently use dspy to extract text from a markdown text.

Here are a few examples of how to use it.


In [None]:
from llm_synthesis.utils.dspy_utils import get_llm_from_name
from llm_synthesis.utils.markdown_utils import remove_figs

# Strip the figures from the Mistral output first, then rebuild the paper
# object around the figure-free text.
text_without_figs = remove_figs(mistral_extracted_text)
paper = Paper(
    id="test",
    name="test",
    publication_text=text_without_figs,
    si_text="",
)
paper.publication_text

The goal of the signature is to give the LLM context about what its actual task is.


# Extract Structured Data from the publication

These functions are available in the `llm_synthesis.transformers.synthesis_extraction` module, from which you can extract arbitrary structured data from the publication text.

We currently use dspy to extract structured data from a markdown text.

Here are a few examples of how to use it.


In [None]:
from llm_synthesis.transformers.synthesis_extraction import (
    DspyStructuredSynthesisExtractor,
    make_dspy_structured_synthesis_extractor_signature,
)

# Let's make a signature for the structured data extraction
signature = make_dspy_structured_synthesis_extractor_signature(
    signature_name="ExtractStructuredSynthesis",
    instructions="Extract the structured synthesis from the publication text.",
    input_description="The publication text to extract the structured synthesis from.",
    output_name="structured_synthesis",
    output_description="The extracted structured synthesis.",
)

lm = get_llm_from_name("gemini-2.0-flash", {"temperature": 0.0})
# Let's make a structured data extractor
structured_data_extractor = DspyStructuredSynthesisExtractor(signature, lm)

# The extractor receives the main text followed by the supporting information.
# Named `synthesis_input` rather than `input` so the builtin `input()` is not
# shadowed for the rest of the notebook session.
synthesis_input = paper.publication_text + paper.si_text

# Let's extract the structured data
structured_data = structured_data_extractor.forward(synthesis_input)

structured_data

In [None]:
# save as json model dump
import json

# mode="json" asks pydantic to convert every field to a JSON-compatible type
# (e.g. enums, datetimes) so json.dump does not trip over a value it cannot
# serialize. Assumes structured_data is a pydantic v2 model, as its
# model_dump() method suggests.
with open(
    "../data/test.json",
    "w",
) as f:
    json.dump(structured_data.model_dump(mode="json"), f, indent=2)

In [None]:
# Dump the model once and reuse the resulting dict, instead of calling
# model_dump() again for every key.
structured_data_dict = structured_data.model_dump()

# Print each top-level field name followed by its value
for key, value in structured_data_dict.items():
    print(key)
    print(value)

In [None]:
# Print a readable summary of every extracted step, one block per step
separator = "-" * 100

for step in structured_data.steps:
    # One line per material used in this step
    material_lines = [
        f"Material: {material.name} {material.amount} {material.unit}"
        for material in step.materials
    ]
    step_lines = [
        f"Action: {step.action}",
        f"Description: {step.description}",
        *material_lines,
        f"Conditions: {step.conditions}",
        separator,
    ]
    print("\n".join(step_lines))

In [None]:
from llm_synthesis.utils.markdown_utils import remove_figs

# Reload the saved Mistral markdown and drop its figures in one step
with open("../data/txt_papers/mistral/test.md") as f:
    publication_text = remove_figs(f.read())

# Run the structured data extractor on the figure-free text
structured_data_from_publication_text = structured_data_extractor.forward(
    publication_text
)

structured_data_from_publication_text

# Extract figures from the publication text

These functions are available in the `llm_synthesis.transformers.figure_extraction` module, from which you can extract the figures from the publication text.

To extract figures from a markdown text, we currently expect the PDF parser to have embedded the figures in that markdown text.


In [None]:
from llm_synthesis.transformers.figure_extraction import (
    FigureExtractorMarkdown,
)

# Load the saved Mistral markdown as-is (it is not passed through remove_figs here)
with open("../data/txt_papers/mistral/test.md") as f:
    publication_text = f.read()

# Extract the figures from the markdown text
figure_extractor = FigureExtractorMarkdown()
figures = figure_extractor.forward(publication_text)

figures

In [None]:
# Show the first extracted figure

import base64

from IPython.display import Image

# Decode the base64 payload into raw image bytes, then let IPython render them
first_figure_bytes = base64.b64decode(figures[0].base64_data)
Image(first_figure_bytes)

In [None]:
# Print the metadata stored alongside the first extracted figure
first_figure = figures[0]
figure_fields = (
    "alt_text",
    "context_before",
    "context_after",
    "figure_reference",
    "position",
)

for field in figure_fields:
    print(f"{field}: ", getattr(first_figure, field))

# Extract figure descriptions from the publication text

These functions are available in the `llm_synthesis.transformers.figure_description` module, from which you can get figure descriptions from the publication text and figure info.

The current implementation uses dspy to get figure descriptions from the publication text and figure info.


In [None]:
from llm_synthesis.models.figure import FigureInfoWithPaper
from llm_synthesis.transformers.figure_description import (
    DspyFigureDescriptionExtractor,
    make_dspy_figure_description_extractor_signature,
)
from llm_synthesis.utils.dspy_utils import get_llm_from_name
from llm_synthesis.utils.markdown_utils import remove_figs

# Describe the task and each of its input/output fields for the LLM
signature = make_dspy_figure_description_extractor_signature(
    signature_name="DspyFigureDescriptionExtractorSignature",
    instructions="Extract the figure description from the figure.",
    publication_text_description="The publication text to extract the figure description from.",
    si_text_description="The supporting information text to extract the figure description from.",
    figure_base64_description="The base64 encoded image of the figure to extract the description from.",
    caption_context_description="The text context surrounding the figure position including the figure caption and nearby paragraphs that reference this figure.",
    figure_position_info_description="The information about the figure's position in the document (e.g., 'Figure 2', 'Fig. 3a', 'Scheme 1') to help with contextual understanding.",
    figure_description_description="The extracted figure description.",
)

# Build the extractor from the signature and the chosen model
lm = get_llm_from_name("gpt-4o-mini", {"temperature": 0.0})
figure_description_extractor = DspyFigureDescriptionExtractor(signature, lm)

# The paper text is passed along without its embedded figures
with open("../data/txt_papers/mistral/test.md") as f:
    publication_text = remove_figs(f.read())

# Combine the third extracted figure (index 2) with the paper text.
# NOTE(review): splatting __dict__ assumes the figure object's instance
# attributes map one-to-one onto FigureInfoWithPaper fields — confirm.
figure_info_with_paper = FigureInfoWithPaper(
    **figures[2].__dict__,
    paper_text=publication_text,
    si_text="",
)

figure_description = figure_description_extractor.forward(
    figure_info_with_paper
)

figure_description