In [1]:
import pandas as pd
import os
from openai import OpenAI
from dotenv import load_dotenv
import wandb

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=api_key)

df_original = pd.read_csv('data/combined_old.csv')

df_enhanced = pd.read_csv('data/combined_new.csv')

In [2]:
df_enhanced

Unnamed: 0,ID_Key,ID,Company,Year,Presence,Index
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,
...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,
3962,,140.0,Highlight Communications AG,2015,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX


In [3]:
df_original

Unnamed: 0,ID_Key,ID,Company,Year,Presence,Index
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,
...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,
3962,,140.0,Highlight Communications AG,2015,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX


In [4]:
# First, add suffixes to each column in both dataframes except for the index
df_original_suffixed = df_original.add_suffix('_original')
df_enhanced_suffixed = df_enhanced.add_suffix('_enhanced')

# Join both dataframes based on their index
merged_df = pd.concat([df_original_suffixed, df_enhanced_suffixed], axis=1)

# drop the ID_Key_enhanced, ID_enhanced, Company_enhanced, and Index_enhanced columns
merged_df = merged_df.drop(columns=['ID_Key_enhanced', 'ID_enhanced', 'Company_enhanced', 'Year_enhanced', 'Index_enhanced'])
merged_df

Unnamed: 0,ID_Key_original,ID_original,Company_original,Year_original,Presence_original,Index_original,Presence_enhanced
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,,
...,...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,,
3962,,140.0,Highlight Communications AG,2015,,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX,64.30
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX,82.93


## Creating a large test set

This one includes the rows that are have been provided by the Kleinanlegerschutzverbund (?)

## Creating the test set

Only the rows Linus has checked manually

In [5]:
# drop all rows where merged_df['Presence_enhanced'] isna. Effectively, I only need ID_Key_original and Presence_enhanced, but the other columns are useful for debugging
test_set_large = merged_df.dropna(subset=['Presence_enhanced'])
test_set_large.to_csv('data/231215_test_set_large.csv', index=False)

In [6]:
# Filter out the rows where 'Presence_original' is NaN and 'Presence_enhanced' is not NaN
test_set_small = merged_df[merged_df['Presence_original'].isna() & merged_df['Presence_enhanced'].notna()]

# The result_df will contain the desired rows with distinct column suffixes
test_set_small.to_csv('data/231215_test_set_small.csv', index=False)

## Finding the correct file

Finding the file path to the PDF the data has been extracted from

### calculating pricing

Creating a function to calculate the cost of all this, to be used later

In [7]:
def calculate_cost(input_tokens, output_tokens, input_price_per_1000=0.01, output_price_per_1000=0.03):
    # Calculate the cost for input tokens
    input_cost = (input_tokens / 1000) * input_price_per_1000

    # Calculate the cost for output tokens
    output_cost = (output_tokens / 1000) * output_price_per_1000

    # Total cost
    total_cost = input_cost + output_cost

    return total_cost

In [8]:
import pdfplumber
from tqdm import tqdm
import ast

directory = "data/Praesenzen_hv-info"

# Initialize Weights & Biases
wandb.init(project="hv-praesenzen")

# Define W&B Table to store results
columns = ["ID_Key_original", "Year_original", "Presence_enhanced", "Presence_predicted", "correct", "error", "price", "file_path", "comment"]
table = wandb.Table(columns=columns)

for index, (df_index, row) in enumerate(tqdm(test_set_small.iterrows(), total=test_set_small.shape[0])):

    if index < 73:
        continue

    if index == 75:
        break

    error = False

    # total cost pf processing the document
    cost_total = 0

    # add a comment collection variable for the case some pages dont output a number
    comment_collection = ""

    prediction_correct = False

    id_value = str(int(row['ID_Key_original']))
    year_value = str(int(row['Year_original']))

    # Initialize variable to store the found directory path
    found_directory_path = None

    # Find subdirectory
    for subdirectory in os.listdir(directory):
        subdirectory_path = os.path.join(directory, subdirectory)
        if os.path.isdir(subdirectory_path) and subdirectory.endswith(id_value):
            found_directory_path = subdirectory_path
            break

    # add "ASM" subdirectory to the path
    if found_directory_path:
        found_directory_path = os.path.join(found_directory_path, "ASM")

    # If a matching subdirectory is found, search for the correct file
    if found_directory_path:
        for file in os.listdir(found_directory_path):
            file_path = os.path.join(found_directory_path, file)
            if os.path.isfile(file_path) and file.endswith(year_value[-2:] + ".pdf"):
                # Found the file, you can add your code here to handle the file
                print(f"Found file: {file_path}")
                break
    else:
        # Handle the case where no matching subdirectory is found
        print(f"No subdirectory found for ID {id_value}")

    with pdfplumber.open(file_path) as pdf:
        
        # List to store all texts
        all_texts = []
 
        # Iterate through each page
        for page_number, page in enumerate(pdf.pages, start=1):
            # Extract text from the page
            page_text = page.extract_text()
            if page_text == "":
                test_set_small.at[index, 'error'] = "document could not be read"
                error = True
                break
            # Store the text
            all_texts.append((page_number, page_text))

        if error:
            table.add_data(id_value, year_value, row['Presence_enhanced'], 0, None, error, cost_total, file_path, comment)
            
            error = False
            print("There was an error. Not evaluating this document.")
            continue

    highest_percentage_list = []

    for page in all_texts:

        # comment variable to store the comment of the page in case the model does not output a number
        comment = ""

        system_prompt = "Du bist ein hilfreicher Assistent, der Berichte von Hauptversammlungen auswertet."
        # user_prompt = "Im folgenden erhältst du einen Bericht einer Hauptversammlung. Wenn das Dokument einen einleitenden Text enthält, in dem eine Angabe in Prozent zum in Präsenz auf der Hauptversammlung vertretenen Grundkapital gemacht wird, antworte mir ausschließlich mit dieser Zahl in eckigen Klammern ohne Prozentzeichen. Verwende Punkt statt Komma. Wenn du im Fließtext keine Zahl finden kannst oder dir nicht sicher bist, antworte mit [0]." + page[1]

        # # Call the GPT-4 chat_completion model
        # response = client.chat.completions.create(model="gpt-4-1106-preview",  # Specify the model, e.g., "gpt-4"
        # messages=[
        #     {"role": "system", "content": system_prompt},
        #     {"role": "user", "content": user_prompt}
        # ],
        # temperature=0.2)

        user_prompt = "Im folgenden erhältst du einen Bericht einer Hauptversammlung. Antworte ausschließlich mit einer Liste im Format [zahl_1, zahl_2, zahl_3], die ausschließlich alle die genannten Prozentzahlen enthält, die sich auf den Prozentsatz des auf der Hauptversammlung vertretenen GRUNDKAPITALS beziehen. Prüfe den ganzen Text auf solche Zahlen, auch außerhalb von Tabellen. Verwechsle die gesuchte Information auf keinen Fall mit Zahlen, die sich auf JA- oder Nein-Stimmen beziehen. Verwende einen Punkt statt Komma. WICHTIG: Verwechsle die gesuchten Zahlen nicht mit solchen aus den JA- und NEIN-Spalten. Wenn du dir nicht absolut sicher bist, antworte mit [0]. Ich gebe dir ein Trinkgeld von $20. Bericht der Hauptversammlung: " + page[1]

        # try:
        #     percentage_intro_object = ast.literal_eval(response.choices[0].message.content)
        #     percentage_intro = percentage_intro_object[0]
        #     print("percentage_intro: ", percentage_intro)
        # except SyntaxError:
        #     percentage_intro = 0
        #     comment = "Page " + str(page[0]) + ": " + response.choices[0].message.content
        #     comment_collection += comment
        #     error = True

        # Call the GPT-4 chat_completion model
        response = client.chat.completions.create(model="gpt-4-1106-preview",  # Specify the model, e.g., "gpt-4"
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.2)

        # Print the response
        print(response.choices[0].message.content)

        # get highest value within page
        try:
            percentage_list = ast.literal_eval(response.choices[0].message.content)
            highest_percentage_of_page = max(percentage_list)
        except SyntaxError:
            highest_percentage_of_page = 0
            comment = "Page " + str(page[0]) + ": " + response.choices[0].message.content
            comment_collection += comment
            error = True

        # get highest value of all pages
        try:
            # highest_percentage_list.append(percentage_intro)
            highest_percentage_list.append(highest_percentage_of_page)
            highest_percentage = max(highest_percentage_list)
            # round highest percentage to 2 decimal places
            highest_percentage = round(highest_percentage, 2)
        except ValueError:
            highest_percentage = 0
            comment = "Page " + str(page[0]) + ": " + response.choices[0].message.content
            comment_collection += comment
            error = True

        print("highest_percentage: ", highest_percentage)

        # calculate cost
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        cost_page = calculate_cost(input_tokens, output_tokens)

        cost_total += cost_page

    test_set_small.at[index, 'Presence_predicted'] = highest_percentage

    if row['Presence_enhanced'] == highest_percentage:
        prediction_correct = True
    else:
        # Check for another row with the same ID_Key_original and Year_original
        same_id_year_rows = test_set_small[(test_set_small['ID_Key_original'] == row['ID_Key_original']) & 
                            (test_set_small['Year_original'] == row['Year_original'])]

        # Check if any of those rows have Presence_enhanced equal to highest_percentage
        if any(same_id_year_rows['Presence_enhanced'] == highest_percentage):
            prediction_correct = True
            comment_collection += "Der ermittelte Wert stammt aus dem anderen Bericht diesen Jahres und ist dort korrekt ermittelt."
        else:
            prediction_correct = False

    table.add_data(id_value, year_value, row['Presence_enhanced'], highest_percentage, prediction_correct, error, cost_total, file_path, comment_collection)

print("system_prompt: ", system_prompt)
print("user_prompt: ", user_prompt)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelixringe[0m ([33mfuels[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/182 [00:00<?, ?it/s]

Found file: data/Praesenzen_hv-info/BASF SE-11193/ASM/HV-Beschluss zur ordentlichen Hauptversammlung am 03.05.19.pdf
[51.2, 51.06, 50.34]
highest_percentage:  51.2
[50.34, 90]
highest_percentage:  90
[50.99, 98.02, 1.98, 33, 50.55]
highest_percentage:  98.02
[49.82, 50.54]
highest_percentage:  98.02
[50.95, 49.82]
highest_percentage:  98.02
[50.52, 50.93]
highest_percentage:  98.02


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set_small.at[index, 'Presence_predicted'] = highest_percentage
 41%|████      | 74/182 [00:11<00:17,  6.29it/s]

[0]
highest_percentage:  98.02
Found file: data/Praesenzen_hv-info/BASF SE-11193/ASM/HV-Beschluss zur ordentlichen Hauptversammlung am 18.06.20.pdf
[49.51, 49.37, 48.83]
highest_percentage:  49.51
[48.83, 49.12, 49.15]
highest_percentage:  49.51
[49.11, 49.19]
highest_percentage:  49.51


 41%|████      | 75/182 [00:21<00:30,  3.50it/s]

[49.10, 98.25, 1.75, 49.10]
highest_percentage:  98.25
system_prompt:  Du bist ein hilfreicher Assistent, der Berichte von Hauptversammlungen auswertet.
user_prompt:  Im folgenden erhältst du einen Bericht einer Hauptversammlung. Basierend auf den folgenden Informationen, antworte ausschließlich mit einer Liste im Format [zahl_1, zahl_2, zahl_3], die ausschließlich alle die genannten Prozentzahlen enthält, die sich auf den Prozentsatz des auf der Hauptversammlung in Präsenz vertretenen GRUNDKAPITALS beziehen. Prüfe den ganzen Text auf solche Zahlen, auch außerhalb von Tabellen. Verwechsle die gesuchte Information auf keinen Fall mit Zahlen, die sich auf JA- oder Nein-Stimmen beziehen. Verwende einen Punkt statt Komma. Wenn du dir nicht absolut sicher bist, antworte mit [0]. Bericht der Hauptversammlung: 450.935.9 49,10 % 443.037.9 98,25 % 7.898.08 1,75 % 3.768.717 angenom
96 08 8 men
Abgegebene gültige Stimmen*
450.935.996
10. Beschlussfassung über die Bestätigung der Aufsichtsratsverg




In [9]:
# Log the table to Weights & Biases
wandb.log({"extraction_results": table})

In [9]:
print(error)

False


In [9]:
test_set_small

Unnamed: 0,ID_Key_original,ID_original,Company_original,Year_original,Presence_original,Index_original,Presence_enhanced,Presence_predicted
8,1215.0,105.0,ADLER Real Estate AG (2015),2010,,,74.33,74.33
9,1215.0,105.0,ADLER Real Estate AG (2015),2011,,,73.70,73.70
10,1215.0,105.0,ADLER Real Estate AG (2015),2012,,,70.65,70.65
11,1215.0,105.0,ADLER Real Estate AG (2015),2013,,,68.26,68.26
12,1215.0,105.0,ADLER Real Estate AG (2015),2014,,,68.39,65.70
...,...,...,...,...,...,...,...,...
1975,14884.0,,Bilfinger SE,2019,,,55.90,
1976,14884.0,,Bilfinger SE,2020,,,47.88,
1977,14884.0,,Bilfinger SE,2021,,,54.04,
1978,14884.0,,Bilfinger SE,2022,,,53.71,


## Read contents of PDF

In [13]:
import pdfplumber

# # Now, full_text contains all the text extracted from the PDF
# print(file_path)

file_path = 'data/Praesenzen_hv-info/Amadeus Fire AG-14519/ASM/HV-Beschluss zur ordentlichen Hauptversammlung am 24.05.18.pdf'

with pdfplumber.open(file_path) as pdf:
    # List to store all DataFrames
    dataframes = []
    
    # List to store all texts
    all_texts = []

    # Iterate through each page
    for page_number, page in enumerate(pdf.pages, start=1):
        # Extract text from the page
        page_text = page.extract_text()
        # Store the text
        all_texts.append((page_number, page_text))

        # Extract tables from the page
        tables = page.extract_tables()

        # # Process each table
        # for table_number, table in enumerate(tables, start=1):
        #     # Convert table to DataFrame
        #     df = pd.DataFrame(table[1:], columns=table[0])

        #     # Store the DataFrame for later use
        #     dataframes.append(df)




In [10]:
all_texts[0][1]

'Abstimmungsergebnis - Übersichtsliste\nADLER Real Estate Aktiengesellschaft\nOrdentliche Hauptversammlung am 31. August 2022\nGültig In % des\nJA-Stimmen NEIN-Stimmen Beschluss-\nTagesordnungspunkt abgegebene Grund- JA-Stimmen NEIN-Stimmen\nin % in % vorschlag\nStimmen kapitals\nGegenantrag von der Whitebox Multi-Strategy\nAntrag Partners LP, MAN FUNDS VI PLC sowie ATLAS\n106.544.064 97,37 365.815 0,3434 106.178.249 99,6566 abgelehnt\nG MACRO MASTER FUND LTD zu TOP 6 auf Absetzung\ndes TOP 6\nVorschlag der Verwaltung sowie Gegenantrag der\n2 Adler Group S.A. zu TOP 2 auf Vertagung der 106.520.316 97,35 105.920.212 99,4366 600.104 0,5634 angenommen\nEntlastung des Vorstands\nAntrag Gegenantrag der SdK zu TOP 2 auf Bestellung eines\n106.595.566 97,42 415.868 0,3902 106.179.698 99,6098 abgelehnt\nE Sonderprüfers gem. § 142 Abs. 1 Satz 1 AktG\nVorschlag der Verwaltung sowie Gegenantrag der\n3 Adler Group S.A. zu TOP 3 auf Vertagung der 106.504.476 97,34 105.920.622 99,4518 583.854 0,5482 

In [10]:
df

Unnamed: 0,TOP 1,Änderung des Beschlusses über die Verwendung des Bilanzgewinns für das\nGeschäftsjahr 2022 zur Ausschüttung einer weiteren Dividende,Unnamed: 3,(angenommen)


In [11]:
df.to_csv('data/231215_gpt_test.csv', index=False)