In [None]:
"""
Step 1: Extract all Patient Information Leaflet (PIL) PDF files from the source ZIP archive into a local folder for further processing.
"""

import zipfile
import os

zip_path = r"\Dataa\medicine_notices_fixed.zip"
extract_to = r"\Dataa\unzipped_pdfs"

# The destination folder must exist before anything is unpacked into it.
os.makedirs(extract_to, exist_ok=True)

# Only archive members whose names end in ".pdf" are unpacked; everything
# else in the ZIP is left alone.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    pdf_files = [member for member in zip_ref.namelist() if member.endswith(".pdf")]
    zip_ref.extractall(path=extract_to, members=pdf_files)


In [None]:
## Confirming how many files there are in the 'unzipped_pdfs' folder

unzipped_path = r"\Dataa\unzipped_pdfs"
# Keep only the directory entries whose names end in ".pdf".
pdfs = list(filter(lambda entry: entry.endswith(".pdf"), os.listdir(unzipped_path)))
print(f"Final raw leaflets PDF count: {len(pdfs)}")


Final raw leaflets PDF count: 1921


In [None]:
"""
Step 2 : Classifying the extracted PDFs into two categories: 
1. PDFs with selectable text (copied to /dataa/selectable_pdfs)
2. Scanned PDFs requiring OCR (processed with OCRmyPDF and saved to /dataa/ocr_pdfs)
"""

import os
import shutil
import fitz  

# All three folders are rooted at \Dataa so that this cell writes to the same
# locations the later verification cells read from. A relative "Dataa\..."
# path would instead resolve against the current working directory.
raw_dir = r"\Dataa\unzipped_pdfs"
ocr_dir = r"\Dataa\ocr_pdfs"
selectable_dir = r"\Dataa\selectable_pdfs"

# Create both output folders up front; existing folders are left as they are.
os.makedirs(ocr_dir, exist_ok=True)
os.makedirs(selectable_dir, exist_ok=True)

# Work list: every entry of raw_dir whose name ends in ".pdf".
pdf_files = [f for f in os.listdir(raw_dir) if f.endswith(".pdf")]
print(f"Total PDFs found: {len(pdf_files)}")

# Defining the function to determine if a PDF has selectable text
def has_selectable_text(pdf_path, threshold=100):
    """Tell whether a PDF already contains an extractable text layer.

    The text of every page is concatenated and stripped of surrounding
    whitespace; the PDF counts as "selectable" when more than `threshold`
    characters remain.

    Args:
        pdf_path: Path of the PDF file to inspect.
        threshold: Minimum number of characters (exclusive) required.

    Returns:
        True if the extracted text is longer than `threshold`, otherwise
        False. Any error raised while opening or reading the file is printed
        and also reported as False.
    """
    try:
        # The context manager closes the document (and its file handle) on
        # every exit path, so calling this in a loop does not leak handles.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
        return len(text.strip()) > threshold
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

# Imported here so that this cell stays runnable on its own.
import subprocess

# Processing each PDF: files with a text layer are copied as-is, the others
# are sent through OCRmyPDF.
for idx, filename in enumerate(pdf_files, 1):
    print(f"[{idx}/{len(pdf_files)}] Processing: {filename}")
    input_path = os.path.join(raw_dir, filename)
    ocr_path = os.path.join(ocr_dir, filename)
    selectable_path = os.path.join(selectable_dir, filename)

    if has_selectable_text(input_path):
        print("  → Selectable text detected → Copied to selectable_pdfs")
        shutil.copy2(input_path, selectable_path)
    else:
        print("  → No text detected → Running OCR")
        # An argument list (no shell) passes each path through verbatim, so
        # file names containing quotes or other shell metacharacters cannot
        # break or alter the command.
        ocr_cmd = [
            "ocrmypdf", input_path, ocr_path,
            "--force-ocr", "--rotate-pages", "-l", "fra+eng",
        ]
        try:
            completed = subprocess.run(ocr_cmd)
        except OSError as e:
            # Raised when the ocrmypdf executable cannot be launched at all.
            print(f"  → OCR could not be started for {filename}: {e}")
        else:
            # Report failures instead of silently leaving the file out of
            # both output folders.
            if completed.returncode != 0:
                print(f"  → OCR failed for {filename} (exit code {completed.returncode})")

print("Classification complete.")


Total PDFs found: 1924
[1/1924] Processing: ABASAGLAR 100UI_ML _3_64MG_ML_ SOL_ INJ_ EN STYLO PREREMPLI KWIKPEN  B_05 STYLOS PREREMLIS DE 3 ML.pdf
  → Selectable text detected → Copied to selectable_pdfs
[2/1924] Processing: ABILIFY 10MG COMP_ B_28.pdf
  → Selectable text detected → Copied to selectable_pdfs
[3/1924] Processing: ABILIFY 15MG COMP_ B_28.pdf
  → Selectable text detected → Copied to selectable_pdfs
[4/1924] Processing: ABUFENE 400MG COMP_ B_30.pdf
  → Selectable text detected → Copied to selectable_pdfs
[5/1924] Processing: ACARBOSE VIATRIS 50 mg, comprimé_B_99.pdf
  → Selectable text detected → Copied to selectable_pdfs
[6/1924] Processing: ACEBUTOLOL SANDOZ 200MG COMP_ ENRO_ B_30.pdf
  → Selectable text detected → Copied to selectable_pdfs
[7/1924] Processing: ACICLOVIR MYLAN 250MG_FL_ DE PDRE_ PDRE_ SOL_ INJ_ IV B_05 FL_ DE PDRE_.pdf
  → Selectable text detected → Copied to selectable_pdfs
[8/1924] Processing: ACICLOVIR MYLAN 500MG_FL_ DE PDRE_ PDRE_ SOL_ INJ_ IV B_05 

In [None]:
## Some files were not processed, so their names are listed here to find out where the issue is

raw_dir = r"\Dataa\unzipped_pdfs"
ocr_dir = r"\Dataa\ocr_pdfs"
selectable_dir = r"\Dataa\selectable_pdfs"

# Names found in each folder (only the raw folder is filtered on ".pdf")
all_files = {entry for entry in os.listdir(raw_dir) if entry.endswith(".pdf")}
ocr_files = set(os.listdir(ocr_dir))
selectable_files = set(os.listdir(selectable_dir))

# Everything that reached one of the two output folders
processed_files = ocr_files | selectable_files

# Raw PDFs that reached neither output folder, in alphabetical order
missing_files = sorted(all_files - processed_files)

print(f"Total missing files: {len(missing_files)}\n")
print("Missing file names:")
for missing_name in missing_files:
    print(missing_name)


❌ Total missing files: 13

Missing file names:
AMITRAL 100MG COMP_ DISPERS_  B_30.pdf
AMITRAL 25MG COMP_ DISPERS_ B_30.pdf
AMITRAL 5MG COMP_ DISPERS_  B_30.pdf
AMODEX-Gé 250MG_5ML PDRE_ P_ SUSP_ BUV_ FL_60ML.pdf
COLOSTOP 200MG COMP B_30.pdf
GLUCOPHAGE  1000MG COMP_ PELLI_ SEC_ B_30.pdf
KYNCEPT 10 MG COMP ORODISPERSIBLE B_30.pdf
KYNCEPT 5 MG COMP ORODISPERSIBLE B_30.pdf
LEVOCARB 250MG_25MG__ COMP_ SEC_ B_100.pdf
RISPAL 1MG COMP_ PELLI_ SEC_ B_60.pdf
RISPAL 2MG COMP_ PELLI_ SEC_ B_60.pdf
RISPAL 4MG COMP_ PELLI_ SEC_ B_30.pdf
VITAMINE B12 RAZES  500 µG_ML _OU 1000µG_2ML_ SOL_INJ_ ET BUV B_5AMP.pdf


In [None]:
## OCRmyPDF failed on the missing files, so a manual OCR fallback is applied to them in this cell:

"""
This code applies manual OCR to a list of failed PDFs by converting each page to an image,
running Tesseract OCR, and saving the output as a searchable PDF with the original filename.
"""

from pdf2image import convert_from_path
from PyPDF2 import PdfMerger
import pytesseract
from PIL import Image
import tempfile


missing_dir = r"\Dataa\unzipped_pdfs"
output_dir = r"\Dataa\ocr_pdfs"
os.makedirs(output_dir, exist_ok=True)

# Same Tesseract language set as the OCRmyPDF pass ("-l fra+eng"), so the
# fallback does not OCR these leaflets with Tesseract's default language only.
OCR_LANGUAGES = "fra+eng"

# Processing each missing file: rasterise the pages, OCR every page into a
# one-page searchable PDF, then merge those pages under the original name.
for filename in missing_files:
    input_path = os.path.join(missing_dir, filename)
    output_path = os.path.join(output_dir, filename)

    print(f"OCR-ing: {filename}")

    try:
        # Converting PDF pages to images
        images = convert_from_path(input_path, dpi=300)

        # Per-page PDFs live in a temporary folder that is removed when the
        # block exits, whether or not the merge succeeded.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_pdf_paths = []

            for i, image in enumerate(images):
                temp_pdf_path = os.path.join(temp_dir, f"{i}.pdf")
                pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                    image, lang=OCR_LANGUAGES, extension='pdf'
                )

                with open(temp_pdf_path, 'wb') as f:
                    f.write(pdf_bytes)

                temp_pdf_paths.append(temp_pdf_path)

            merger = PdfMerger()
            try:
                # Pages are appended in their original order.
                for page_pdf_path in temp_pdf_paths:
                    merger.append(page_pdf_path)
                merger.write(output_path)
            finally:
                # Release the merger's open file handles even when appending
                # or writing fails.
                merger.close()

        print(f"Saved to {output_path}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Failed to OCR {filename}: {e}")


In [None]:
## Confirming how many files there are in the 'ocr_pdfs' folder after the correction:

path = r"\Dataa\ocr_pdfs"
# Collect the entries whose names end in ".pdf".
pdfs = []
for entry in os.listdir(path):
    if entry.endswith(".pdf"):
        pdfs.append(entry)
print(f"Leaflets pdf count: {len(pdfs)}")


Leaflets pdf count: 171


In [None]:
## Confirming how many PDFs there are in the 'selectable_pdfs' folder after the correction:

path = r"\Dataa\selectable_pdfs"
# Directory entries whose names end in ".pdf"
pdfs = [entry for entry in os.listdir(path) if entry.endswith(".pdf")]
pdf_count = len(pdfs)
print(f"leaflets pdf count: {pdf_count}")

## Adding the two folder counts together now gives 1921 medication leaflet PDFs

leaflets pdf count: 1750


In [None]:
"""
Step 3: Converting OCR-processed and selectable PDFs into clean Markdown (.md) files using Docling.
Outputs are saved in: /extracted_txt_docling
"""

import os
from docling.document_converter import DocumentConverter

# Defining input and output directories
input_dirs = [
    r"\Dataa\ocr_pdfs",
    r"\Dataa\selectable_pdfs"
]
output_dir = r"\Dataa\extracted_txt_docling"
os.makedirs(output_dir, exist_ok=True)

# Initializing the Docling converter once; it is reused for every file
converter = DocumentConverter()

# Processing all PDFs in both input directories
for input_dir in input_dirs:
    print(f"scanning directory: {input_dir}")
    for filename in os.listdir(input_dir):
        # Case-insensitive match, so ".PDF" files are converted too
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_dir, filename)
        # Swap only the final extension for ".md". splitext works whatever
        # the extension's case and never touches ".pdf" inside the name.
        stem, _ = os.path.splitext(filename)
        txt_path = os.path.join(output_dir, stem + ".md")

        print(f"Converting: {filename}")
        try:
            result = converter.convert(pdf_path)
            doc_text = result.document.export_to_markdown()

            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(doc_text)

            print(f"Saved: {txt_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error processing {filename}: {e}")
               




In [None]:
## Confirming the final number of .md files that will be used as the input data for the RAG systems:

final_files = r"\Dataa\extracted_txt_docling"
# Directory entries whose names end in ".md"
md_path = list(filter(lambda entry: entry.endswith(".md"), os.listdir(final_files)))
print(f"Final leaflet files count: {len(md_path)}")


Final leaflet files count: 1921
