### Attention

`pytesseract` needs a tesseract installation to work. `pdf2image` requires a poppler installation. Here are starting links for the two installations:

**Tesseract**: https://github.com/UB-Mannheim/tesseract/wiki
**pdf2image (poppler)**: https://pdf2image.readthedocs.io/en/latest/installation.html

Keep in mind that on Windows systems you need to add the Tesseract and poppler installation directories to your PATH.

# Setup

In [1]:
from pdf2image import convert_from_path
from PIL import Image, ImageFilter
import pytesseract
import os

In [2]:
def preprocess_image(image, threshold=128):
    """
    Apply image processing techniques to enhance OCR accuracy.

    The image is converted to grayscale, binarized with a fixed global
    threshold and finally passed through a median filter.

    Args:
        image (PIL.Image): Original image.
        threshold (int): Gray level cut-off used for binarization. Pixels
            darker than this value become black, all others white.
            Defaults to 128.

    Returns:
        PIL.Image: Preprocessed (binarized and median-filtered) image.
    """
    # Convert to grayscale
    image = image.convert('L')
    # Binarize with a fixed global threshold (this is NOT adaptive: the same
    # cut-off is applied to every pixel regardless of its neighbourhood)
    image = image.point(lambda x: 0 if x < threshold else 255, '1')
    # Remove isolated noise pixels with a median filter (Pillow's default
    # window size is used)
    image = image.filter(ImageFilter.MedianFilter())
    return image

def merge_images(images):
    """
    Preprocess a list of images and merge them vertically into one image.

    Every image is first run through `preprocess_image`. The results are
    pasted top to bottom, left-aligned, onto a white grayscale canvas that is
    as wide as the widest image and as tall as all images together.

    Args:
        images (list): List of PIL Image objects.

    Returns:
        PIL.Image: A single merged image.

    Raises:
        ValueError: If `images` is empty.
    """
    preprocessed_images = [preprocess_image(img) for img in images]
    if not preprocessed_images:
        # Without this guard the unpacking below fails with an unhelpful
        # "not enough values to unpack" message.
        raise ValueError('merge_images() requires at least one image')

    widths, heights = zip(*(im.size for im in preprocessed_images))
    max_width = max(widths)
    total_height = sum(heights)

    # Areas not covered by a (narrower) page stay white
    merged_image = Image.new('L', (max_width, total_height), 'white')
    y_offset = 0
    for im in preprocessed_images:
        merged_image.paste(im, (0, y_offset))
        y_offset += im.size[1]
    return merged_image


def extract_text_from_pdf(pdf_path, output_txt_path, dpi, lang='deu'):
    """
    Extract text from a PDF file by rendering each page to an image and
    running OCR on it page by page.

    Args:
        pdf_path (str): Path to the PDF file.
        output_txt_path (str): Path to the output text file to save the
            extracted text (written as UTF-8).
        dpi (int): Resolution used to render the PDF pages. If rendering
            fails, one retry is made at `dpi - 100`.
        lang (str): Tesseract language code passed to the OCR engine.
            Defaults to 'deu'.

    Raises:
        Exception: Whatever the PDF conversion raised, if the conversion
            fails and no lower positive DPI is left to retry with, or if the
            retry fails as well.
    """
    # Convert PDF to images (one per page)
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
    except Exception:
        # Retry once at a lower resolution. `Exception` (instead of a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        fallback_dpi = dpi - 100
        if fallback_dpi <= 0:
            # A non-positive DPI is meaningless; surface the original error
            raise
        images = convert_from_path(pdf_path, dpi=fallback_dpi)

    # Configuring tesseract to use a specific PSM and OEM
    # (per the Tesseract docs: OEM 3 = default engine, PSM 3 = fully
    # automatic page segmentation without orientation/script detection)
    custom_oem_psm_config = '--oem 3 --psm 3'

    # Use pytesseract to extract text, one page at a time
    page_texts = [
        pytesseract.image_to_string(image, lang=lang, config=custom_oem_psm_config)
        for image in images
    ]

    # Save the extracted text to a file
    with open(output_txt_path, 'w', encoding='utf-8') as file:
        file.write(''.join(page_texts))


# Parameters

In [3]:
# Paths
# Folder whose entries are listed and read as input PDFs
PDF_DIRECTORY = 'data/pdfs'
# Folder that receives one .txt file per processed PDF
TEXT_DIRECTORY = 'data/texts'

# DPI for PDF to image transformation (passed to extract_text_from_pdf)
dpi = 300

In [4]:
# Create the output folder if it doesn't exist yet.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and also creates missing parent folders.
os.makedirs(TEXT_DIRECTORY, exist_ok=True)

# Extract Text

In [5]:
# List all pdfs to read (sorted so that the processing order is reproducible)
pdf_names = sorted(os.listdir(PDF_DIRECTORY))

# For each pdf create a txt file
for pdf_name in pdf_names:

    # Skip anything that is not a PDF (hidden files, sub-folders, ...) instead
    # of blindly chopping off the last four characters of its name
    base_name, extension = os.path.splitext(pdf_name)
    if extension.lower() != '.pdf':
        continue

    in_dir = f'{PDF_DIRECTORY}/{pdf_name}'
    file_name = base_name + '.txt'
    txt_out = TEXT_DIRECTORY + '/' + file_name

    print(file_name)

    # Only run OCR for PDFs without an existing text file. A direct existence
    # check avoids re-listing the whole output folder on every iteration.
    if not os.path.exists(txt_out):
        extract_text_from_pdf(in_dir, txt_out, dpi)

2001-01-16-3_0_Postulat_Susi_Tapernoux__Erhebungen_über_die_Versorgung_der_einzelnen_Wohngebiete_in_der_Stadt_St.Ga.txt
2001-01-16-3_1_Beilage___09.01.2001__1.txt
2001-01-16-3_2_Parlamentarischer_Vorstoss___24.10.2000__2.txt
2001-01-16-4_0_Parlamentarischer_Vorstoss___21.11.2000__0.txt
2001-01-16-4_1_Postulat_Walter_Brunner__Familienergänzende_Betreuung_von_Kindern_im_Vorschulalter__Ausbau_des_Angeb.txt
2001-01-16-5_0_Parlamentarischer_Vorstoss___24.10.2000__0.txt
2001-01-16-5_1_Interpellation_Andreas_Frank__Zwängerei_um_Mobilfunkantennen_Gesuch__schriftlich___19.12.2000__1.txt
2001-02-13-2_0_Parlamentarischer_Vorstoss___24.10.2000__0.txt
2001-02-13-2_1_Interpellation_Albert_Nufer__Eigentumswohnungen_auf_dem_alten_Lagerhaus__schriftlich___23.01.2001__1.txt
2001-02-13-3_0_Interpellation_Andreas_Frank__Gefährdetes_Kulturhaus___23.01.2001__0.txt
2001-02-13-3_1_Parlamentarischer_Vorstoss___24.10.2000__1.txt
2001-02-13-4_0_Interpellation_Mirjam_Köchli__Englisch_Lehrmittel__schriftlich___09.



2021-08-24-16_0_Interpellation_Liegenschaften__und_Baukommission__Bevölkerungsvorstoss_Begegnungszone_Primelweg__sch.txt
2021-08-24-16_1_Ablauf_Einrichtung_Begegnungszone_16.06.2021__1.txt
2021-08-24-16_2_Formular_Antrag_Begegnungszone_16.06.2021__2.txt
2021-08-24-16_3_Interpellation__Bevölkerungsvorstoss_Begegnungszone_Primelweg_20.05.2021__3.txt
2021-08-24-17_0_Einfache_Anfrage_Stefan_Grob__Wie_nachhaltig_geht_die_Stadt_mit_Asphalt_um___Beantwortung___21.05.20.txt
2021-08-24-17_1_Einfache_Anfrage_Stefan_Grob__Wie_nachhaltig_geht_die_Stadt_mit_Asphalt_um____18.03.2021__1.txt
2021-08-24-18_0_Einfache_Anfrage_Andreas_Dudli_Abwendung_eines_Finanzdebakels___Wann_reagiert_der_Stadtrat_mit_einem.txt
2021-08-24-18_1_Einfache_Anfrage_„Abwendung_eines_Finanzdebakels_–_Wann_reagiert_der_Stadtrat_mit_einem_Entlastungsp.txt
2021-08-24-19_0_Einfache_Anfrage_Karl_Schimke__Campus_Platztor__Kommt_die_«Kulturachse»_auch___Beantwortung___09.06..txt
2021-08-24-19_1_Einfache_Anfrage__Campus_Platztor__Kom