In this file, we:

- Split our dataset into a train/test set and a validation set
- Based on text analysis we create features for all projects

**Result: agg_data5**

In [1]:
import fitz  # PyMuPDF
import numpy as np
import os
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import CountVectorizer

# Load the project-level table from agg_data4.csv.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index written out by an earlier to_csv call — confirm whether it is needed.
agg_data = pd.read_csv("agg_data4.csv")
# Preview the first rows (rendered as the cell output in the notebook).
agg_data.head()

Unnamed: 0,pdf,project_name,country,multiple_phases,number_countries,eval_year,number_sectors,number_subsectors,mean_investment,number_funding_source,...,outcome_partially,outcome_not,cluster,invest_plan_vs_actual,adaptive_management,wbgi_gee,wdi_expedus,wdi_gdpind,gpi_ss,bci_bcistd
0,%C3%84gypten_Qena_2022_D.pdf,Kommunale Wasserver- und Abwasserentsorgung Qena,Egypt,0,1,2022,1,1,12670000.0,1,...,33.333333,0.0,3,0.186092,1,-0.447547,0.0,32.711431,2.497,0.0
1,%C3%84gypten_Umwelt_2022_D.pdf,Förderung von Umweltschutzmaßnahmen der privat...,Egypt,0,1,2022,1,1,25780000.0,1,...,0.0,0.0,3,0.55144,1,-0.447547,0.0,32.711431,2.497,0.0
2,%C3%84thiopien_Gavi_2022_D.pdf,Impfprogrammförderung in Äthiopien in Zusammen...,Ethiopia,0,1,2022,1,1,10000000.0,1,...,0.0,33.333333,3,,0,-0.745265,0.0,22.722007,2.94,0.0
3,Afrika_TCX_2012_D.pdf,Lokalwährungsfonds TCX,Afrika (regional),0,1,2012,1,1,90000000.0,1,...,0.0,0.0,4,,0,-0.657388,17.500307,25.828145,2.727727,4.263196
4,Albanien_Elbasan_2011.pdf,Wirtschaftsförderung durch Ausbau kommunaler I...,Albania,0,1,2011,1,1,10600000.0,1,...,16.666667,33.333333,2,0.645367,1,-0.202177,0.0,24.484478,2.148,4.303499


First, we extract the text from each PDF and store it in a new column of agg_data.

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page joined in page order, or an empty string
        if the file cannot be opened or read (the error is printed).
    """
    try:
        # The context manager closes the document even when a page fails to
        # load, so file handles are not leaked while looping over many PDFs.
        with fitz.open(pdf_path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        # Deliberate best effort: one unreadable report must not abort the
        # extraction for the whole table.
        print(f"Error reading {pdf_path}: {e}")
        return ""

# Folder holding the PDF reports, resolved against the working directory
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, 'pdf_reports')

# One extracted text per row, located through the file name in the 'pdf' column
agg_data['pdf_text'] = agg_data['pdf'].map(
    lambda file_name: extract_text_from_pdf(os.path.join(folder_path, file_name))
)

Error reading C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84gypten_Qena_2022_D.pdf: no such file: 'C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84gypten_Qena_2022_D.pdf'
Error reading C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84gypten_Umwelt_2022_D.pdf: no such file: 'C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84gypten_Umwelt_2022_D.pdf'
Error reading C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84thiopien_Gavi_2022_D.pdf: no such file: 'C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\%C3%84thiopien_Gavi_2022_D.pdf'
Error reading C:\Users\mariu\OneDrive\Uni\Master BAE\Github_Term_Paper_Project\TermPaperDEVAL\pdf_reports\Afrika_TCX_2012_D.pdf: no such file: 'C:\Users\mariu\OneDrive\Uni\M

Now, we clean the texts in the following way.

In [3]:
def preprocess_text(text):
    """Normalize raw PDF text for counting words and phrases.

    The text is lowercased, digits are dropped, words hyphenated across a
    line break are re-joined, ASCII punctuation is stripped and every run
    of whitespace (line breaks included) is collapsed into one space.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        The cleaned text with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # A hyphen directly before a line break marks a word split by the page
    # layout; drop both so the two halves form one word again.
    text = text.replace('-\n', '')
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remaining line breaks are word boundaries. Deleting them outright would
    # fuse the last word of a line with the first word of the next one, so
    # all whitespace runs are turned into a single separating space instead.
    return ' '.join(text.split())

# Replace the raw extracted text with its cleaned form, row by row
agg_data['pdf_text'] = agg_data['pdf_text'].map(preprocess_text)

Next, we extract other features:
- word_count: total number of words in pdf
- character_count: total number of characters in pdf
- word_density: average length of words used

In [4]:
def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

def character_count(text):
    """Return the length of ``text`` in characters, whitespace included."""
    n_characters = len(text)
    return n_characters

def word_density(text):
    """Return the mean token length of ``text``, or 0 when it has no tokens."""
    tokens = text.split()
    if not tokens:
        return 0
    total_length = sum(map(len, tokens))
    return total_length / len(tokens)

# Derive each length-based feature from the cleaned text, one column per function
for feature_name, feature_fn in (
    ('word_count', word_count),
    ('character_count', character_count),
    ('word_density', word_density),
):
    agg_data[feature_name] = agg_data['pdf_text'].apply(feature_fn)

Before moving on with feature engineering, we split the complete dataset into a combined training/testing set and a validation set. We use the training/testing set for feature engineering and the validation set for final model evaluation. Since we have a limited number of observations, we use only 10% for validation purposes:

In [5]:
# Hold out a reproducible 10% random sample of the rows for validation
validation_data = agg_data.sample(frac=0.1, random_state=123)
# Every row that was not drawn stays in the train/test pool
train_test_data = agg_data.drop(index=validation_data.index)
train_test_data.shape, validation_data.shape

((665, 34), (74, 34))

We create several functions for text extraction from the pdf files.

In [6]:
# Folder with the PDF reports, resolved against the current working directory.
# folder_path is read as a global by read_files below.
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, 'pdf_reports')

def stopwords(text, remove_stopwords=True):
    """Optionally strip German stopwords from ``text``.

    Args:
        text: Text to filter.
        remove_stopwords: When true, drop every whitespace-separated token
            whose lowercase form is in NLTK's German stopword list and
            re-join the remaining tokens with single spaces. When false,
            return ``text`` unchanged.

    Returns:
        The filtered (or untouched) text.
    """
    if not remove_stopwords:
        return text

    # This function has the same name as the module-level
    # ``from nltk.corpus import stopwords`` import and replaces it as soon as
    # it is defined, so ``stopwords.words(...)`` would look up an attribute
    # on this function and fail. Import the corpus here under its own name.
    from nltk.corpus import stopwords as nltk_stopwords

    stopwords_german = set(nltk_stopwords.words('german'))
    kept_words = [word for word in text.split() if word.lower() not in stopwords_german]

    # Join the kept words back into a single string
    return ' '.join(kept_words)

def extract_text(pdf_path):
    """Read a PDF with PdfReader and return the text of all its pages in order."""
    pages = PdfReader(pdf_path).pages
    return "".join(page.extract_text() for page in pages)

def read_files(agg_data, score, remove_stopwords=True):
    """Extract the report text of every project whose rating is in ``score``.

    Args:
        agg_data: Table with an "overall_rating" and a "pdf" column.
        score: Collection of ratings to keep.
        remove_stopwords: Passed on to ``stopwords`` for each extracted text.

    Returns:
        A dict mapping each selected PDF file name to its processed text.
        Files are looked up inside the global ``folder_path``.
    """
    selected_files = agg_data.loc[agg_data["overall_rating"].isin(score), "pdf"]
    return {
        file_name: stopwords(
            extract_text(os.path.join(folder_path, file_name)),
            remove_stopwords,
        )
        for file_name in selected_files
    }

def most_used_words(arrays, k, zero_threshold):
    """Rank vocabulary columns by their mean count across documents.

    Args:
        arrays: Sequence of equally long count vectors, one per document.
        k: Number of top-ranked column indices to return.
        zero_threshold: Columns whose share of zero counts is below this
            value get their mean replaced by NaN, which moves them to the
            end of the ranking.

    Returns:
        A tuple ``(highest_idx, mean_values)``: the indices of the first
        ``k`` columns in descending order of mean count, and the per-column
        means (NaN for the columns masked via ``zero_threshold``).
    """
    # One row per document, one column per vocabulary entry
    count_matrix = np.vstack(arrays)
    mean_values = count_matrix.mean(axis=0)

    # Share of documents in which each column has a count of zero
    zero_ratio = (count_matrix == 0).sum(axis=0) / count_matrix.shape[0]
    mean_values[zero_ratio < zero_threshold] = np.nan

    # Negating the means makes the ascending argsort rank largest first;
    # NaN entries are placed last by argsort.
    highest_idx = np.argsort(-mean_values)[:k]
    return highest_idx, mean_values

We try to identify token-based features by looking for similarities in word use for different rating groups:

In [7]:
# Collect the report texts of all train/test projects rated 1 or 2 (stopwords kept)
result = read_files(train_test_data, [1,2], remove_stopwords=False)
texts = list(result.values())

# Bag of word trigrams
vectorizer = CountVectorizer(ngram_range=(3, 3))
X = vectorizer.fit_transform(texts)

# One dense count vector per report, the input format of most_used_words
arrays = list(X.toarray())

# Indices of the 250 top-ranked trigrams and the mean count of every trigram
idx_mean, most = most_used_words(arrays, 250, 0.01)

feature_names = vectorizer.get_feature_names_out()

for line in (
    "Most Frequently Used Words:",
    feature_names[idx_mean],
    " ",
    "Frequency of Most Used Words:",
    most[idx_mean],
):
    print(line)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mariu\\OneDrive\\Uni\\Master BAE\\Github_Term_Paper_Project\\TermPaperDEVAL\\pdf_reports\\Afrika_TCX_2012_D.pdf'

## Rating-specific results
**Features:**

Rating 1/2:
- deutlich, gut, erreicht, erfolgreich, verbesserungen, erfüllt, umsetzung, zutreffend, verbesserung der, beitrag zur, mit hoher wahrscheinlichkeit, deutlich über den

Rating 3/4:
- gute, teilweise, allerdings, positiv, positive, positiver, verbesserung der, wirksamkeit des vorhabens, bisher positive, positive entwicklungspolitische wirksamkeit, unter den erwartungen, die negativen ergebnisse, dominieren die

Rating 5/6:
- unzureichend, geringen, mängel, nicht erfolgreich, nicht erfolgreiche, überwiegend nicht, auch nicht, überwiegend nicht erfolgreich, vorhaben ist nutzlos

In [None]:
# Cleaned report texts, one per row of agg_data
texts = list(agg_data["pdf_text"])

# (feature column, phrase counted in each text). Column names transliterate
# umlauts (ue, ae) while the phrases keep them, so the pairs are spelled out
# explicitly. Their order is the order in which the columns are appended.
phrase_columns = (
    # Rating 1/2
    ('count_deutlich', "deutlich"),
    ('count_gut', "gut"),
    ('count_erreicht', "erreicht"),
    ('count_erfolgreich', "erfolgreich"),
    ('count_verbesserungen', "verbesserungen"),
    ('count_erfuellt', "erfüllt"),
    ('count_umsetzung', "umsetzung"),
    ('count_zutreffend', "zutreffend"),
    ('count_verbesserung_der', "verbesserung der"),
    ('count_beitrag_zur', "beitrag zur"),
    ('count_mit_hoher_wahrscheinlichkeit', "mit hoher wahrscheinlichkeit"),
    ('count_deutlich_ueber_den', "deutlich über den"),
    # Rating 3/4
    ('count_gute', "gute"),
    ('count_teilweise', "teilweise"),
    ('count_allerdings', "allerdings"),
    ('count_positiv', "positiv"),
    ('count_positive', "positive"),
    ('count_positiver', "positiver"),
    ('count_wirksamkeit_des_vorhabens', "wirksamkeit des vorhabens"),
    ('count_bisher_positive', "bisher positive"),
    ('count_positive_entwicklungspolitische_wirksamkeit', "positive entwicklungspolitische wirksamkeit"),
    ('count_unter_den_erwartungen', "unter den erwartungen"),
    ('count_die_negativen_ergebnisse', "die negativen ergebnisse"),
    ('count_dominieren_die', "dominieren die"),
    # Rating 5/6
    ('count_unzureichend', "unzureichend"),
    ('count_geringen', "geringen"),
    ('count_maengel', "mängel"),
    ('count_nicht_erfolgreich', "nicht erfolgreich"),
    ('count_nicht_erfolgreiche', "nicht erfolgreiche"),
    ('count_ueberwiegend_nicht', "überwiegend nicht"),
    ('count_auch_nicht', "auch nicht"),
    ('count_ueberwiegend_nicht_erfolgreich', "überwiegend nicht erfolgreich"),
    ('count_vorhaben_ist_nutzlos', "vorhaben ist nutzlos"),
)

# str.count tallies non-overlapping substring matches, so a phrase is also
# counted inside longer words (e.g. "gut" is found within "gute").
for column_name, phrase in phrase_columns:
    agg_data[column_name] = [text.count(phrase) for text in texts]

agg_data.head()

In [None]:
# Write the table, including the columns added above, to agg_data5.csv;
# index=False keeps the row index out of the file.
agg_data.to_csv("agg_data5.csv", index=False)