In [1]:
from pathlib import Path
import os
from pdf2image import convert_from_path

import cv2
import pytesseract as pt
from PIL import Image
import pandas as pd

In [2]:
### NOTE(review): the original author recorded "ca. 190 min" here --> presumably the runtime of the
### PDF-to-image / OCR steps below; confirm which step this refers to.
### Full path of the input directory containing the election programs as .pdf files.
### The pipeline functions below take this value as the default of their `input_dir` argument.
### NOTE(review): machine-specific absolute path --> has to be adjusted before running elsewhere.
input_dir = Path("C:/DHBW WWI DS(A) Studium/S5/Natural Language Processing/NLP_Projekt/NLP/input/")

### Erstelle einzelne Bilder aus PDF

In [None]:
def pdf_2_images(input_dir=input_dir):
    """Render every page of every PDF in ``input_dir`` to a PNG image.

    For each ``<name>.pdf`` a sub directory ``input_dir/<name>`` is created (if it
    does not exist yet) and every page is stored there as
    ``<name>_Page_<page_index>.png`` with a zero-based page index. Image files
    that already exist are not overwritten.

    Parameters
    ----------
    input_dir : str or pathlib.Path
        Directory that contains the PDF files.

    Returns
    -------
    None
        The function only writes files and prints progress messages.
    """
    input_dir = Path(input_dir)

    ### Collect only real files with a .pdf extension (case-insensitive, so "X.PDF" is processed, too)
    ### sorted() --> deterministic processing order independent of the file system
    pdf_paths = sorted(
        entry for entry in input_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

    for pdf_counter, pdf_file_path in enumerate(pdf_paths, start=1):
        print(f"Processing PDF file {pdf_counter}: {pdf_file_path.name}")

        ### .stem strips only the extension --> "a.v1.pdf" and "a.v2.pdf" no longer collapse into the same name "a"
        fn = pdf_file_path.stem

        ### Create a sub directory inside the "input" directory for each PDF
        sub_dir_path = input_dir/fn
        if not sub_dir_path.exists():
            sub_dir_path.mkdir()
            print(f"New SUB_DIR created: {sub_dir_path}")
        else:
            print(f"SUB_DIR already exists: {sub_dir_path}")

        ### Generate one image file in sub_dir for each page of the PDF file
        ### NOTE: all pages are rendered (dpi=500) before the existence check below, even if every image already exists
        pages = convert_from_path(pdf_file_path, dpi=500)
        for page_index, page in enumerate(pages):
            ### Name schema of the image filenames
            img_path = sub_dir_path/f"{fn}_Page_{page_index}.png"
            if img_path.exists():
                print("File already exists --> SKIP")
                continue
            print(f"Saving {fn} PDF page {page_index}")
            page.save(img_path, "PNG")
    print("PROCESSING FINISHED!")

pdf_2_images(input_dir=input_dir)

### Erstelle eine .txt Datei mit ausgelesenem Text

In [3]:
### Raw string extraction as .txt file of the image files combined per PDF/sub_dir
def get_txt_from_images(input_dir=input_dir):
    """OCR the page images of every PDF sub directory into one text file per PDF.

    Every directory inside ``input_dir`` (except the ``*_BBX_Image`` folders that
    ``create_bbx_images`` creates) is treated as the image folder of one PDF.
    Its ``*_<page_no>.png`` files are read in numeric page order with
    pytesseract (German language pack "deu") and the recognised text is written
    to ``input_dir/<sub_dir>.txt``.

    The text file is rewritten on every call, so running the function twice
    does not duplicate the document text.

    Parameters
    ----------
    input_dir : str or pathlib.Path
        Directory that contains the per-PDF image sub directories.

    Returns
    -------
    None
        The function only writes files and prints progress messages.
    """
    input_dir = Path(input_dir)

    ### Only real directories are page folders --> stray files (.png, .json, ...) can no longer crash the listing
    sub_dir_list = sorted(
        entry.name for entry in input_dir.iterdir()
        if entry.is_dir() and not entry.name.endswith("_BBX_Image")
    )
    print(f"List of Sub_DIRs: \n{sub_dir_list}")

    for sub_dir in sub_dir_list:
        sub_dir_path = input_dir/sub_dir

        ### Alphabetical iteration would lead to the incorrect order 1, 10, 2, 3
        ### --> parse the page number from the filename (..._<page_no>.png) and sort numerically
        numbered_pages = []
        for img_path in sub_dir_path.glob("*.png"):
            page_no = img_path.stem.rsplit("_", 1)[-1]
            if page_no.isdecimal():
                numbered_pages.append((int(page_no), img_path))
        numbered_pages.sort()

        if not numbered_pages:
            print(f"No page images found --> SKIP: {sub_dir_path}")
            continue

        ### .txt filepath; mode "w" so that a re-run replaces the file instead of appending a second copy
        txt_file = input_dir/f"{sub_dir}.txt"
        with open(txt_file, "w", encoding="utf-8") as output_file:
            for _, img_path in numbered_pages:
                print(f"Extracting text from image: {img_path.name}")

                ### Recognize the text as string in the image using pytesseract
                ### German language package used "deu"; the context manager closes the image file handle
                with Image.open(img_path) as page_image:
                    text = pt.image_to_string(page_image, lang="deu")
                output_file.write(text)
        print(f"Save text into .txt file: {txt_file}")

    print("Text Extraction FINISHED!")

get_txt_from_images(input_dir=input_dir)

List of Sub_DIRs: 
['AFD_Wahlprogramm_2021', 'CDU-CSU_Wahlprogramm_2021', 'DIE_GRUENEN_Wahlprogramm_2021', 'DIE_LINKE_Wahlprogramm_2021', 'FDP_Wahlprogramm_2021', 'SPD_Wahlprogramm_2021']
Extracting text from image: AFD_Wahlprogramm_2021_Page_0.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_1.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_2.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_3.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_4.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_5.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_6.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_7.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_8.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_9.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_10.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_11.png
Extracting text from image: AFD_Wahlprogramm_2021_Page_12.png
Extracting text 

## Erstelle eine bereinigte .csv Datei mit ausgelesenem Text

In [4]:
### DF cleaning
def process_df(df, min_conf=70):
    """Reduce a pytesseract word frame to confident words and add spacing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the columns ``file``, ``page_num``,
        ``line_num``, ``word_num``, ``left``, ``top``, ``width``, ``height``,
        ``conf`` and ``text``. Other columns (e.g. ``level``, ``block_num``,
        ``par_num``) are dropped.
    min_conf : float, default 70
        Only rows with ``conf`` strictly greater than this value are kept.
        NOTE(review): the original comment mentioned 80 while the code used 70;
        whether 70 is the optimal threshold is unknown.

    Returns
    -------
    pandas.DataFrame
        Copy with a fresh 0..n-1 index and two additional columns:
        ``right`` (= ``left`` + ``width``, right border of the word) and
        ``delta`` (gap between ``right`` and the ``left`` of the following row,
        0 if the following row does not start to the right of this word and 0
        for the last row). ``delta`` is computed against the next row of the
        frame, it does not compare ``line_num`` or ``page_num``.
    """
    ### Explicitly select the pytesseract columns that are needed --> smaller DF
    col_lst = ['file', 'page_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']
    dfc = df[col_lst].copy()

    ### Remove all word objects whose OCR confidence is not above the threshold
    dfc = dfc[dfc["conf"] > min_conf].reset_index(drop=True)

    ### [right] = [left] + [width] --> right border of a word object
    right = dfc["left"] + dfc["width"]

    ### [delta] = [left] of the next row - [right]; negative gaps (next word starts further left,
    ### i.e. it cannot follow in the same line) are mapped to 0
    next_left = dfc["left"].shift(-1, fill_value=0)
    delta = (next_left - right).clip(lower=0)
    ### The last row has no successor --> delta = 0 (guard keeps an empty frame from raising)
    if not delta.empty:
        delta.iloc[-1] = 0

    dfc.insert(6, 'right', right)
    dfc.insert(7, 'delta', delta)

    return dfc

##############################################################################
##############################################################################
##############################################################################

### Dataframe extraction as .csv file of the image files combined per PDF/sub_dir
def get_df_from_images(input_dir=input_dir):
    """OCR all page images into word-level data frames and store them as CSV files.

    Every directory inside ``input_dir`` (except the ``*_BBX_Image`` folders that
    ``create_bbx_images`` creates) is treated as the image folder of one PDF.
    Its ``*_<page_no>.png`` files are processed in numeric page order with
    ``pytesseract.image_to_data``; the per-page frames are combined, cleaned
    with ``process_df`` and written to ``input_dir/<sub_dir>.csv``. All cleaned
    frames together are written to ``input_dir/MASTER.csv``.

    Parameters
    ----------
    input_dir : str or pathlib.Path
        Directory that contains the per-PDF image sub directories.

    Returns
    -------
    pandas.DataFrame
        The combined frame of all sub directories with a continuous index
        (an empty frame if no page image was found).

    Raises
    ------
    OSError
        If a page image exists but cannot be decoded by OpenCV.
    """
    input_dir = Path(input_dir)

    ### Only real directories are page folders --> stray files can no longer crash the listing
    sub_dir_list = sorted(
        entry.name for entry in input_dir.iterdir()
        if entry.is_dir() and not entry.name.endswith("_BBX_Image")
    )
    print(f"List of Sub_DIRs: \n{sub_dir_list}")

    ### Collect the cleaned frame of every PDF and concatenate once at the end (no repeated copying)
    party_frames = []

    for sub_dir in sub_dir_list:
        sub_dir_path = input_dir/sub_dir

        ### Alphabetical iteration would lead to the incorrect order 1, 10, 2, 3
        ### --> parse the page number from the filename (..._<page_no>.png) and sort numerically
        numbered_pages = []
        for img_path in sub_dir_path.glob("*.png"):
            page_no = img_path.stem.rsplit("_", 1)[-1]
            if page_no.isdecimal():
                numbered_pages.append((int(page_no), img_path))
        numbered_pages.sort()

        if not numbered_pages:
            print(f"No page images found --> SKIP: {sub_dir_path}")
            continue

        page_frames = []
        for img_no, img_fpath in numbered_pages:
            print(f"Extracting DF from image: {img_fpath}")

            ### cv2.imread does not raise on failure, it returns None --> fail with a clear message
            img_file = cv2.imread(str(img_fpath), 1)
            if img_file is None:
                raise OSError(f"Image could not be read: {img_fpath}")

            ### Create the pytesseract DF with coordinates for each image
            sub_df = pt.image_to_data(
                ### German language package used "deu"
                img_file, lang="deu",
                ### Pytesseract page segmentation mode (--psm 6) -> assume a single uniform block of text.
                ### https://ai-facets.org/tesseract-ocr-best-practices/
                config=r'--oem 1 --psm 6',
                output_type=pt.Output.DATAFRAME
            )
            ### REMOVE the pytesseract page_num (always 1) and ADD the real page number at first position
            sub_df = sub_df.drop(columns=["page_num"])
            sub_df.insert(0, "page_num", img_no)
            page_frames.append(sub_df)

        comb_df = pd.concat(page_frames, ignore_index=True)

        ### ADD the sub_dir name and insert at first position
        comb_df.insert(0, "file", sub_dir)

        ### Clean the DF with process_df() based on the OCR confidence values
        comb_df = process_df(comb_df)

        ### Save separated DF for each sub_dir
        csv_file = input_dir/f"{sub_dir}.csv"
        comb_df.to_csv(csv_file, index=False)
        print(f"Save DF into .csv file: {csv_file}")

        party_frames.append(comb_df)

    ### CONCATENATE the combined party DFs into one MASTER_DF (pd.concat raises on an empty list)
    master_df = pd.concat(party_frames, ignore_index=True) if party_frames else pd.DataFrame()

    ### SAVE and return the full Master_DF with all parties
    master_df.to_csv(input_dir/"MASTER.csv", index=False)
    print("PROCESSING FINISHED!")
    return master_df

get_df_from_images(input_dir=input_dir)

List of Sub_DIRs: 
['AFD_Wahlprogramm_2021', 'CDU-CSU_Wahlprogramm_2021', 'DIE_GRUENEN_Wahlprogramm_2021', 'DIE_LINKE_Wahlprogramm_2021', 'FDP_Wahlprogramm_2021', 'SPD_Wahlprogramm_2021']
Extracting DF from image: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021\AFD_Wahlprogramm_2021_Page_0.png
Extracting DF from image: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021\AFD_Wahlprogramm_2021_Page_1.png
Extracting DF from image: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021\AFD_Wahlprogramm_2021_Page_2.png
Extracting DF from image: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021\AFD_Wahlprogramm_2021_Page_3.png
Extracting DF from image: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021\AFD_Wahlprogramm_2021_Page_4.png
Extracting D

Unnamed: 0,file,page_num,line_num,word_num,left,top,right,delta,width,height,conf,text
0,AFD_Wahlprogramm_2021,0,1,1,338,1717,1431,55,1093,188,96.505013,Deutschland.
1,AFD_Wahlprogramm_2021,0,1,2,1486,1721,1884,87,398,138,94.446686,Aber
2,AFD_Wahlprogramm_2021,0,1,3,1971,1669,2594,0,623,355,94.446686,normal.
3,AFD_Wahlprogramm_2021,0,1,4,1936,1721,2597,0,661,138,95.646454,normal.
4,AFD_Wahlprogramm_2021,0,2,1,333,2144,703,22,370,71,95.806519,Programm
...,...,...,...,...,...,...,...,...,...,...,...,...
25203,SPD_Wahlprogramm_2021,64,32,2,912,5600,985,2575,73,29,95.978912,2021
25204,SPD_Wahlprogramm_2021,64,32,3,3560,5600,3651,16,91,29,96.100380,SEITE
25205,SPD_Wahlprogramm_2021,64,32,4,3667,5612,3678,17,11,17,94.163429,>
25206,SPD_Wahlprogramm_2021,64,32,5,3695,5600,3738,0,43,30,96.162041,65


In [3]:
### get_df_from_images() writes the combined frame as "MASTER.csv" into input_dir
### --> read exactly that file (same spelling, works on case-sensitive file systems, too)
df = pd.read_csv(input_dir/"MASTER.csv")

def create_bbx_images(input_dir=input_dir, df=df):
    """Draw the OCR word bounding boxes onto the page images and save the result.

    For every value of ``df["file"]`` a directory ``input_dir/<file>_BBX_Image``
    is created (if missing). For every page of that file the image
    ``input_dir/<file>/<file>_Page_<page>.png`` is loaded, a black rectangle is
    drawn around each word (box extended by 5 px on every side) and the image is
    stored as ``<file>_BBX_Page_<page>.png`` inside the BBX directory.

    Pages whose image cannot be read or written are reported and skipped.

    Parameters
    ----------
    input_dir : str or pathlib.Path
        Directory that contains the per-PDF image sub directories.
    df : pandas.DataFrame
        Word frame with the columns ``file``, ``page_num``, ``left``, ``top``,
        ``width`` and ``height``.

    Returns
    -------
    None
        The function only writes files and prints progress messages.
    """
    input_dir = Path(input_dir)
    ### Padding in pixels by which each word box is extended on every side
    pad = 5

    ### 1) Create list of the different files/parties and iterate over it
    file_lst = df["file"].unique().tolist()
    for file in file_lst:
        print(f"Processing file {file}")

        ### 2) Create directory inside "input" directory to save the BBX images
        bbx_path = input_dir/f"{file}_BBX_Image"
        if not bbx_path.exists():
            bbx_path.mkdir()
            print(f"New SUB_DIR created: {bbx_path}")
        else:
            print(f"SUB_DIR already exists: {bbx_path}")

        ### 3) Create Sub_DF with values of only one PDF file/party
        sub_df = df[df["file"] == file]

        ### 4) Iterate over the different pages in the file
        page_lst = sub_df["page_num"].unique().tolist()
        for page in page_lst:

            ### 5) Grab the respective image file; the path is joined with "/" --> separator is never missing
            img_path = input_dir/file/f"{file}_Page_{page}.png"
            img = cv2.imread(str(img_path), 1)
            ### cv2.imread returns None instead of raising if the file is missing/unreadable
            if img is None:
                print(f"Image could not be read --> SKIP: {img_path}")
                continue

            ### 6) Create SubSub_DF with the values for each page
            subsub_df = sub_df[sub_df["page_num"] == page]

            ### 7) Draw one slightly extended rectangle per word --> BBX Images
            boxes = subsub_df[["left", "top", "width", "height"]].itertuples(index=False, name=None)
            for left, top, width, height in boxes:
                ### Plain Python ints: OpenCV does not accept every numpy scalar type as coordinate
                left, top, width, height = int(left), int(top), int(width), int(height)
                cv2.rectangle(
                    img,
                    (left - pad, top - pad),
                    (left + width + pad, top + height + pad),
                    (0, 0, 0),
                    10,
                )

            ### 8) Save the BBX image inside the BBX directory; cv2.imwrite reports failure via its return value
            bbx_file = bbx_path/f"{file}_BBX_Page_{page}.png"
            if not cv2.imwrite(str(bbx_file), img):
                print(f"BBX image could not be written --> SKIP: {bbx_file}")

    print("FINISHED! BBX Images completely created!")


create_bbx_images(input_dir=input_dir, df=df)

Processing file AFD_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\AFD_Wahlprogramm_2021_BBX_Image
Processing file CDU-CSU_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\CDU-CSU_Wahlprogramm_2021_BBX_Image
Processing file DIE_GRUENEN_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\DIE_GRUENEN_Wahlprogramm_2021_BBX_Image
Processing file DIE_LINKE_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\DIE_LINKE_Wahlprogramm_2021_BBX_Image
Processing file FDP_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural Language Processing\NLP_Projekt\NLP\input\FDP_Wahlprogramm_2021_BBX_Image
Processing file SPD_Wahlprogramm_2021
SUB_DIR already exists: C:\DHBW WWI DS(A) Studium\S5\Natural 

## Nicht funktionaler Code zum Zusammenfügen der einzelnen Textobjekte mit "Two Column Text Dateien" (Linke, FDP)

In [None]:
# testdf = pd.read_csv("C:\\DHBW WWI DS(A) Studium\\S5\\Natural Language Processing\\NLP_Projekt\\input\\AFD_Wahlprogramm_2021_fragmented.csv")

def merge_sentences_from_df(df):
    """Merge consecutive word rows into one row per sentence.

    The frame is sorted by ``page_num``, ``line_num`` and ``left`` and re-indexed
    0..n-1. Words are then concatenated (separated by one blank) until a word
    contains ".", "!" or "?"; the last row always closes the running sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Word frame with at least the columns ``page_num``, ``line_num``,
        ``left`` and ``text``. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per sentence. Each row is the row of the LAST word of the
        sentence (same index label and remaining columns) where ``text`` holds
        the whole sentence and ``left`` / ``line_num`` are taken from the FIRST
        word of the sentence. Missing ``text`` values are skipped when joining.

    Notes
    -----
    The terminator characters are searched anywhere inside a word, so
    abbreviations or numbers such as "3.5" also close a sentence.
    """
    ### Sort the DF --> 1) page, 2) line, 3) left coordinate (sort_values returns a new frame)
    dfc = df.sort_values(["page_num", "line_num", "left"], ascending=[True, True, True]).reset_index(drop=True)
    if dfc.empty:
        return dfc

    ### Missing text (e.g. empty cells after pd.read_csv) must not crash the merge
    is_missing = dfc["text"].isna()
    words = dfc["text"].astype(str)

    ### A word with ".", "!" or "?" closes a sentence; the last row closes whatever is still open
    ends_sentence = words.str.contains(r"[.!?]", regex=True, na=False) & ~is_missing
    ends_sentence.iloc[-1] = True

    ### A sentence starts at the first row and directly after every closing row
    starts_sentence = ends_sentence.shift(1, fill_value=True)
    sentence_id = starts_sentence.cumsum()

    ### Join the words of each sentence in one pass instead of dropping row by row
    sentences = (
        words.where(~is_missing)
        .groupby(sentence_id)
        .agg(lambda group: " ".join(group.dropna()))
    )

    ### Keep the closing row of each sentence and take over the data of its first word object
    merged = dfc.loc[ends_sentence].copy()
    merged["text"] = sentences.to_numpy()
    merged["left"] = dfc.loc[starts_sentence, "left"].to_numpy()
    merged["line_num"] = dfc.loc[starts_sentence, "line_num"].to_numpy()

    return merged

# testdf_merged = merge_sentences_from_df(testdf)
# testdf_merged

In [None]:
# # test = tdf1[19:100].reset_index(drop=True)
# test = tdf1[0:50].reset_index(drop=True)

# ####
# ####
# #### FIRST ITERATION
# ####
# ####
# test1 = test.copy()
# for index, row in test1.iterrows():
#     if index == test1.index[-1]:
#         break
#     else:
#         if (test1.at[index, 'delta'] <= 50 and test1.at[index, 'delta'] >= 10) and ("." not in test1.at[index, "text"]) and ("!" not in test1.at[index, "text"]) and ("?" not in test1.at[index, "text"]):
#             ### Merge Wortobjekte
#             test1.loc[index+1, 'text'] = test1.loc[index, 'text'] + ' ' + test1.loc[index+1, 'text']
#             ### Übernahme die Daten des ersten Wortobjektes --> NICHT [delta] !!!!
#             test1.loc[index+1, 'left'] = test1.loc[index, 'left']
#             test1.loc[index+1, 'line_num'] = test1.loc[index, 'line_num']
#             # test3.loc[index+1, 'word_num'] = test3.loc[index, 'word_num']
#             # test3.loc[index+1, 'top'] = test3.loc[index, 'top']
#             # test3.loc[index+1, 'right'] = test3.loc[index, 'right']
#             # test3.loc[index+1, 'width'] = test3.loc[index, 'width']
#             # test3.loc[index+1, 'height'] = test3.loc[index, 'height']
#             # test3.loc[index+1, 'conf'] = test3.loc[index, 'conf']
#             test1 = test1.drop(index)
#         # elif (test3.at[index, 'delta'] == 0) and ("." not in test.at[index, "text"]) and ("!" not in test.at[index, "text"]) and ("?" not in test.at[index, "text"]):
#         else:
#             continue

# test1 = test1.sort_values(["page_num", "line_num", "left"], ascending=[True, True, True]).reset_index(drop=True)
# test1

In [None]:
# test2 = test1.copy()


# for index, row in test2.iterrows():
#     if index == test2.index[-1]:
#         break
#     else:
#         # if ("." not in test2.at[index, "text"]) and ("!" not in test2.at[index, "text"]) and ("?" not in test2.at[index, "text"]):
#         #     continue

#         ### FIRST COLUMN in TEXT --> [delta] > 50 bedeutet, dass es ein nachfolgendes Wortobjekt in derselben Zeile
#         if (test2.at[index, "delta"] > 50):
#             nxt_line_num = test2.at[index, "line_num"] + 1
#             ### Sub DF mit allen Wortobjekten der nächsten Zeile
#             sub_df = test2[test2["line_num"] == nxt_line_num]
#             ### Iteriere über das Sub DF
#             for sub_index, sub_row in sub_df.iterrows():
#                 if sub_df.at[sub_index, "left"] < test2.at[index, "left"]:
                    
#                     ### ?????
#                     test2.loc[sub_index, 'text'] = test2.loc[index, 'text'] + ' ' + test1.loc[sub_index, 'text']
#                     break

#                 else:
#                     continue
#             test2 = test2.drop(index)


#         elif (test2.at[index, "delta"] == 0):
#             nxt_line_num = test2.at[index, "line_num"] + 1
#             ### Sub DF mit allen Wortobjekten der nächsten Zeile
#             sub_df = test2[test2["line_num"] == nxt_line_num]
#             ### Iteriere über das Sub DF
#             for sub_index, sub_row in sub_df.iterrows():
#                 if (sub_df.at[sub_index, "left"] < test2.at[index, "left"]):
#                 # if (sub_df.at[sub_index, "left"] >= test2.at[index, "left"]-20) and (sub_df.at[sub_index, "left"] <= test2.at[index, "left"]+20):
                    
#                     ### ?????
#                     test2.loc[sub_index, 'text'] = test2.loc[index, 'text'] + ' ' + test1.loc[sub_index, 'text']
#                     break

#                 else:
#                     continue
#             test2 = test2.drop(index)

#         else:
#             continue
        

# test2 = test2.sort_values(["page_num", "line_num", "left"], ascending=[True, True, True]).reset_index(drop=True)
# test2