In [1]:
import pandas as pd
import os
import openai
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

df_original = pd.read_csv('data/combined_old.csv')

df_enhanced = pd.read_csv('data/combined_new.csv')

In [2]:
df_enhanced

Unnamed: 0,ID_Key,ID,Company,Year,Presence,Index
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,
...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,
3962,,140.0,Highlight Communications AG,2015,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX


In [3]:
df_original

Unnamed: 0,ID_Key,ID,Company,Year,Presence,Index
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,
...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,
3962,,140.0,Highlight Communications AG,2015,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX


In [4]:
# First, add suffixes to each column in both dataframes except for the index
df_original_suffixed = df_original.add_suffix('_original')
df_enhanced_suffixed = df_enhanced.add_suffix('_enhanced')

# Join both dataframes based on their index
merged_df = pd.concat([df_original_suffixed, df_enhanced_suffixed], axis=1)

# drop the ID_Key_enhanced, ID_enhanced, Company_enhanced, and Index_enhanced columns
merged_df = merged_df.drop(columns=['ID_Key_enhanced', 'ID_enhanced', 'Company_enhanced', 'Year_enhanced', 'Index_enhanced'])
merged_df

Unnamed: 0,ID_Key_original,ID_original,Company_original,Year_original,Presence_original,Index_original,Presence_enhanced
0,1215.0,105.0,ADLER Real Estate AG (2015),2002,,,
1,1215.0,105.0,ADLER Real Estate AG (2015),2003,,,
2,1215.0,105.0,ADLER Real Estate AG (2015),2004,,,
3,1215.0,105.0,ADLER Real Estate AG (2015),2005,,,
4,1215.0,105.0,ADLER Real Estate AG (2015),2006,,,
...,...,...,...,...,...,...,...
3961,,56.0,GAGFAH S.A.,2015,,,
3962,,140.0,Highlight Communications AG,2015,,,
3963,,208.0,QIAGEN N.V.,2015,64.30,TecDAX,64.30
3964,,89.0,RTL Group S.A.,2015,82.93,MDAX,82.93


## Creating a large test set

This one includes the rows that are have been provided by the Kleinanlegerschutzverbund (?)

## Creating the test set

Only the rows Linus has checked manually

In [5]:
# drop all rows where merged_df['Presence_enhanced'] isna. Effectively, I only need ID_Key_original and Presence_enhanced, but the other columns are useful for debugging
test_set_large = merged_df.dropna(subset=['Presence_enhanced'])
test_set_large.to_csv('data/231215_test_set_large.csv', index=False)

In [6]:
# Filter out the rows where 'Presence_original' is NaN and 'Presence_enhanced' is not NaN
test_set_small = merged_df[merged_df['Presence_original'].isna() & merged_df['Presence_enhanced'].notna()]

# The result_df will contain the desired rows with distinct column suffixes
test_set_small.to_csv('data/231215_test_set_small.csv', index=False)

## Finding the correct file

Finding the file path to the PDF the data has been extracted from

In [7]:
import pdfplumber


directory = "data/Praesenzen_hv-info"

for index, row in test_set_small.iterrows():
    id_value = str(int(row['ID_Key_original']))
    year_value = str(int(row['Year_original']))

    # Initialize variable to store the found directory path
    found_directory_path = None

    # Find subdirectory
    for subdirectory in os.listdir(directory):
        subdirectory_path = os.path.join(directory, subdirectory)
        if os.path.isdir(subdirectory_path) and subdirectory.endswith(id_value):
            found_directory_path = subdirectory_path
            break

    # add "ASM" subdirectory to the path
    if found_directory_path:
        found_directory_path = os.path.join(found_directory_path, "ASM")

    # If a matching subdirectory is found, search for the correct file
    if found_directory_path:
        for file in os.listdir(found_directory_path):
            file_path = os.path.join(found_directory_path, file)
            if os.path.isfile(file_path) and file.endswith(year_value[-2:] + ".pdf"):
                # Found the file, you can add your code here to handle the file
                print(f"Found file: {file_path}")
                break
    else:
        # Handle the case where no matching subdirectory is found
        print(f"No subdirectory found for ID {id_value}")

    with pdfplumber.open(file_path) as pdf:
        # List to store all DataFrames
        dataframes = []
        
        # List to store all texts
        all_texts = []

        # Iterate through each page
        for page_number, page in enumerate(pdf.pages, start=1):
            # Extract text from the page
            page_text = page.extract_text()
            # Store the text
            all_texts.append((page_number, page_text))

    for page in all_texts:
        text = page[1]
 
    # Prepare the prompt
    prompt_text = "What is the latest research on climate change?"

    # Call the GPT-4 chat_completion model
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Specify the model, e.g., "gpt-4"
        messages=[
            {"role": "system", "content": "Du bist ein hilfreicher Assistent, der Berichte von Hauptversammlungen auswertet."},
            {"role": "user", "content": "Dies ist der Bericht einer hauptversammlung. Basierend auf den folgenden Informationen, teile mir mit, wie viel Prozent des Stammkapitals mit Sicherheit auf der Hauptversammlung vertreten waren: " + prompt_text}
        ]
    )

    # Print the response
    print(response.choices[0].message['content'])


Found file: data/Praesenzen_hv-info/Adler Real Estate AG-1215/ASM/HV-Beschluss zur ordentlichen Hauptversammlung am 27.08.10.pdf


APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


## Read contents of PDF

In [13]:
import pdfplumber

# # Now, full_text contains all the text extracted from the PDF
# print(file_path)

file_path = 'data/Praesenzen_hv-info/Amadeus Fire AG-14519/ASM/HV-Beschluss zur ordentlichen Hauptversammlung am 24.05.18.pdf'

with pdfplumber.open(file_path) as pdf:
    # List to store all DataFrames
    dataframes = []
    
    # List to store all texts
    all_texts = []

    # Iterate through each page
    for page_number, page in enumerate(pdf.pages, start=1):
        # Extract text from the page
        page_text = page.extract_text()
        # Store the text
        all_texts.append((page_number, page_text))

        # Extract tables from the page
        tables = page.extract_tables()

        # # Process each table
        # for table_number, table in enumerate(tables, start=1):
        #     # Convert table to DataFrame
        #     df = pd.DataFrame(table[1:], columns=table[0])

        #     # Store the DataFrame for later use
        #     dataframes.append(df)




In [14]:
all_texts[0][1]

'27.5.2018 Hauptversammlung\nAbstimmungsergebnisse der Hauptversammlung der Amadeus FiRe\nAG am 24. Mai 2018 in Frankfurt am Main\nVom Grundkapital in Höhe von EUR 5.198.237, eingeteilt in 5.198.237 auf den Inhaber lautende Stückaktien, waren zur Abstimmung 3.110.435 Stückaktien\nanwesend. Das entspricht einer Präsenz zur Abstimmung von 59,84 Prozent des Grundkapitals.\nJa- Nein-\nAbgegebene Ja- Stimmen Nein- Stimmen\nAnteil am\ngültige Stimmen in % der Stimmen in % der Enthaltungen Ergebnis\nGrundkapital\nStimmen gesamt gültigen gesamt gültigen\nStimmen Stimmen\nTOP 2:\nBeschlussfassung\nBeschluss\nüber die 3.110.345 59,83% 3.110.030 99,99% 315 0,01% 90\nangenommen\nVerwendung des\nBilanzgewinns\nTOP 3:\nBeschlussfassung\nüber die Entlastung Beschluss\n3.091.267 59.47% 3.088.113 99,90% 3.154 0,10% 19.168\nder Mitglieder des angenommen\nVorstands für das\nGeschäftsjahr 2017\nTOP 4:\nBeschlussfassung\nüber die Entlastung Beschluss\n2.530.720 48,68% 2.485.809 98,23% 44.911 1,77% 574.515\

In [10]:
df

Unnamed: 0,TOP 1,Änderung des Beschlusses über die Verwendung des Bilanzgewinns für das\nGeschäftsjahr 2022 zur Ausschüttung einer weiteren Dividende,Unnamed: 3,(angenommen)


In [11]:
df.to_csv('data/231215_gpt_test.csv', index=False)