In [7]:
from pathlib import Path, PureWindowsPath
import os

import pytesseract as pt
from pdf2image import convert_from_path
import cv2
import itertools
from PIL import Image
import openpyxl
import pandas as pd

In [2]:
### Directory that contains the election programs as .pdf files.
### Replace the empty string with the full path of that directory before running the cells below:
### they read the .pdf files from it and write the per-PDF image sub-directories and .txt files into it.
input_dir = Path("")

In [3]:
def create_img_files_from_pdf(input_dir, dpi=500):
    """Render every page of every PDF in ``input_dir`` to a PNG file.

    For each ``<name>.pdf`` a sub-directory ``input_dir/<name>`` is created
    (if missing) and each page is stored there as ``<name>_Page_<index>.png``
    with a zero-based page index. Page images that already exist are left
    untouched, so the function can be re-run safely.

    Parameters
    ----------
    input_dir : str or pathlib.Path
        Directory that contains the ``.pdf`` files.
    dpi : int, optional
        Resolution passed to ``convert_from_path`` (default 500).

    Returns
    -------
    None
        Results are written to disk; progress is printed.
    """
    ### Accept plain strings as well as Path objects
    input_dir = Path(input_dir)

    ### Only regular files with a .pdf suffix (any letter case); sorted so the
    ### processing order does not depend on the OS directory listing order
    pdf_files = sorted(
        file for file in os.listdir(input_dir)
        if Path(file).suffix.lower() == ".pdf" and (input_dir/file).is_file()
    )

    for pdf_counter, file in enumerate(pdf_files, start=1):
        print(f"Processing PDF file {pdf_counter}: {file}")

        pdf_file_path = input_dir/file
        ### .stem removes only the final suffix, so "Party.v2.pdf" keeps the
        ### name "Party.v2" (splitting on the first "." would truncate it)
        fn = Path(file).stem

        ### Create a sub_dir inside the "input" dir for each .pdf election program document
        #############################################
        sub_dir_path = input_dir/fn
        if not sub_dir_path.exists():
            sub_dir_path.mkdir(parents=True)
            print(f"New SUB_DIR created: {sub_dir_path}")
        else:
            print(f"SUB_DIR already exists: {sub_dir_path}")

        ### Get each page image and save into the sub_dir of the .pdf election program document
        #############################################
        ### NOTE(review): all pages of a PDF are rendered before the first one is
        ### saved; presumably memory-heavy for long documents at high dpi -- confirm
        pages = convert_from_path(pdf_file_path, dpi=dpi)
        for page_index, page in enumerate(pages):
            img_path = sub_dir_path/f"{fn}_Page_{page_index}.png"
            if img_path.exists():
                print("File already exists --> SKIP")
                continue
            print(f"Saving {fn} PDF page {page_index}")
            page.save(img_path, "PNG")
    print("PROCESSING FINISHED!")

### Convert every .pdf file found in input_dir into one .png image per page
create_img_files_from_pdf(input_dir)

Processing PDF file 1: AFD_Wahlprogramm_2021.pdf
New SUB_DIR created: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\input\AFD_Wahlprogramm_2021
Saving AFD_Wahlprogramm_2021 PDF page 0
Saving AFD_Wahlprogramm_2021 PDF page 1
Saving AFD_Wahlprogramm_2021 PDF page 2
Saving AFD_Wahlprogramm_2021 PDF page 3
Saving AFD_Wahlprogramm_2021 PDF page 4
Saving AFD_Wahlprogramm_2021 PDF page 5
Saving AFD_Wahlprogramm_2021 PDF page 6
Saving AFD_Wahlprogramm_2021 PDF page 7
Saving AFD_Wahlprogramm_2021 PDF page 8
Saving AFD_Wahlprogramm_2021 PDF page 9
Saving AFD_Wahlprogramm_2021 PDF page 10
Saving AFD_Wahlprogramm_2021 PDF page 11
Saving AFD_Wahlprogramm_2021 PDF page 12
Saving AFD_Wahlprogramm_2021 PDF page 13
Saving AFD_Wahlprogramm_2021 PDF page 14
Saving AFD_Wahlprogramm_2021 PDF page 15
Saving AFD_Wahlprogramm_2021 PDF page 16
Saving AFD_Wahlprogramm_2021 PDF page 17
Saving AFD_Wahlprogramm_2021 PDF page 18
Saving AFD_Wahlprogramm_2021 PDF page 19
Saving AFD_Wahlprogramm

In [4]:
def get_txt_files_from_sub_dirs(input_dir=None):
    """OCR the page images of each sub-directory into one ``.txt`` file.

    Every sub-directory ``input_dir/<name>`` is scanned for images named
    ``<prefix>_<number>.png``. The images are processed in ascending numeric
    order (0, 1, 2, ..., 10 instead of the lexical 1, 10, 2) with pytesseract
    and the German language package, and the recognised text is written to
    ``input_dir/<name>.txt``.

    The ``.txt`` file is rewritten on every call, so running the cell again
    does not duplicate the text. Sub-directories without page images are
    skipped and files that are not numbered ``.png`` images are ignored.

    Parameters
    ----------
    input_dir : str or pathlib.Path, optional
        Directory that contains the per-document image sub-directories.
        When omitted, the notebook-level variable ``input_dir`` is used.

    Returns
    -------
    None
        Results are written to disk; progress is printed.

    Raises
    ------
    KeyError
        If ``input_dir`` is omitted and no notebook-level ``input_dir`` exists.
    """
    if input_dir is None:
        ### Look the notebook-level variable up at call time; a default bound in
        ### the signature would keep a stale path after the config cell is re-run
        input_dir = globals()["input_dir"]
    input_dir = Path(input_dir)

    ### Only real directories count as sub_dirs (not every entry without ".pdf"/".txt")
    sub_dir_list = sorted(
        entry for entry in os.listdir(input_dir) if (input_dir/entry).is_dir()
    )
    print(f"List of Sub_DIRs: \n{sub_dir_list}")

    for sub_dir in sub_dir_list:
        sub_dir_path = input_dir/sub_dir

        ### Collect (page number, filename) pairs so the pages can be sorted
        ### numerically; a plain filename sort would give 1, 10, 2, 3
        numbered_pages = []
        for img_file in os.listdir(sub_dir_path):
            stem, ext = os.path.splitext(img_file)
            _, sep, page_no = stem.rpartition("_")
            if ext.lower() == ".png" and sep and page_no.isdecimal():
                numbered_pages.append((int(page_no), img_file))
        numbered_pages.sort()

        if not numbered_pages:
            print(f"No page images found --> SKIP: {sub_dir_path}")
            continue

        ### .txt filepath; mode "w" replaces the output of an earlier run
        txt_file = input_dir/f"{sub_dir}.txt"
        with open(txt_file, "w", encoding="utf-8") as output_file:
            for _, img_file in numbered_pages:
                print(f"Extracting text from image: {img_file}")

                ### Recognize the text as string in image using pytesseract
                ### German language package used "deu"
                ### The with-block closes the image file after the OCR call
                with Image.open(sub_dir_path/img_file) as page_img:
                    text = pt.image_to_string(page_img, lang="deu")
                output_file.write(text)
        print(f"Save text into .txt file: {txt_file}")
    print("Text Extraction FINISHED!")

### Run the OCR text extraction using the function's default input directory
get_txt_files_from_sub_dirs()

List of Sub_DIRs: 
['AFD_Wahlprogramm_2021', 'CDU-CSU_Wahlrprogramm_2021', 'DIE_GRUENEN_Wahlprogramm_2021', 'DIE_LINKE_Wahlprogramm_2021', 'FDP_Wahlprogramm_2021', 'SPD_Wahlprogramm_2021']
Extracting text from image: AFD_Wahlprogramm_2021_Page_0.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_1.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_2.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_3.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_4.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_5.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_6.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_7.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_8.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_9.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_10.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_11.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_12.png
Extracting text