# Local files

For this tutorial, you need a directory containing PDF files, for instance `../data/pdf_papers/`. The cells below list the PDF files in that directory and extract the text of the first one:

### Extract with Docling

In [None]:
from llm_synthesis.services.storage.local_file_storage import LocalFileStorage
from llm_synthesis.services.pdf_extraction.docling_pdf_extractor import (
    DoclingPDFExtractor,
)

# Storage helper used by the cells below to list, read, and write files.
storage = LocalFileStorage()
# Extractor used by the cells below to turn PDF bytes into Markdown text.
pdf_extractor = DoclingPDFExtractor()

In [None]:
# Placeholder: replace with the directory that holds your PDF files.
base_path = "<path_to_your_pdf_files>"

In [None]:
# List the files under `base_path`.
# NOTE(review): "pdf" is presumably an extension filter — confirm against
# LocalFileStorage.list_files.
pdf_files = storage.list_files(base_path, "pdf")

Extract one file

In [None]:
# Fail early with a clear message instead of an opaque IndexError when the
# listing is empty (for example when `base_path` is still the placeholder).
if len(pdf_files) == 0:
    raise FileNotFoundError(f"No PDF files found under {base_path!r}")

pdf_file = pdf_files[0]  # Process only the first PDF file in the listing
pdf_content = storage.read_bytes(pdf_file)  # Raw content of that PDF
extracted_text = pdf_extractor.extract_to_markdown(pdf_content)

Save result locally

In [None]:
# Placeholder: replace with the path where the extracted text should be saved.
txt_file = "<path_to_save_extracted_text>"

In [None]:
# Persist the text produced by the extractor at the chosen path.
storage.write_text(txt_file, extracted_text)

### Extract with Mistral

In [None]:
import os

from llm_synthesis.services.pdf_extraction.mistral_pdf_extractor import (
    MistralPDFExtractor,
)

# Read the API key from the environment rather than pasting it into the
# notebook, where it could be committed or shared by accident.
mistral_api_key = os.environ.get("MISTRAL_API_KEY")
if not mistral_api_key:
    raise RuntimeError(
        "Set the MISTRAL_API_KEY environment variable before running this cell."
    )

mistral_pdf_extractor = MistralPDFExtractor(mistral_api_key=mistral_api_key)

# Reuses `pdf_content`, the PDF read in the Docling section above.
extracted_text_mistral = mistral_pdf_extractor.extract_to_markdown(pdf_content)

# NOTE(review): this writes to the same `txt_file` as the Docling cell, so it
# presumably replaces that output — point `txt_file` at a different path first
# if you want to keep both results.
storage.write_text(
    txt_file,
    extracted_text_mistral,
)

# Files on Google Cloud Storage

For this tutorial, you need to have a list of PDF files in a Google Cloud Storage bucket, for instance `gs://<bucket-name>/pdf_papers/`.

In [None]:
from llm_synthesis.services.storage.gcs_file_storage import GCSFileStorage

# Storage helper used by the cells below to list, read, and write `gs://` files.
gcs_storage = GCSFileStorage()

In [None]:
# Placeholder: replace with the `gs://` folder that holds your PDF files.
pdf_file_folder_uri = "gs://your_bucket_name/path/to/pdf_files"

In [None]:
# List the files under `pdf_file_folder_uri`.
# Note: this rebinds `pdf_files`, the same name used in the local-files section.
pdf_files = gcs_storage.list_files(pdf_file_folder_uri, "pdf")

Extract the first PDF and write the resulting text to a Google Cloud Storage bucket

In [None]:
# Placeholder: replace with the `gs://` URI where the extracted text should be saved.
txt_file_uri = "gs://your_bucket_name/path/to/save_extracted_text"

In [None]:
from llm_synthesis.services.pdf_extraction.docling_pdf_extractor import (
    DoclingPDFExtractor,
)

# Re-created here so this section can run without the local-files section.
pdf_extractor = DoclingPDFExtractor()

# Fail early with a clear message instead of an opaque IndexError when the
# listing is empty (for example when the folder URI is still the placeholder).
if len(pdf_files) == 0:
    raise FileNotFoundError(
        f"No PDF files found under {pdf_file_folder_uri!r}"
    )

# Read the first PDF file from GCS
gcs_pdf_content = gcs_storage.read_bytes(pdf_files[0])
extracted_text_gcs = pdf_extractor.extract_to_markdown(gcs_pdf_content)

# Write the extracted text to the configured `gs://` destination.
gcs_storage.write_text(
    txt_file_uri,
    extracted_text_gcs,
)