In [6]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import json

# Load variables from a .env file (if one is found) into the process environment,
# so that OPENAI_API_KEY can be read with os.getenv further down.
load_dotenv()

# Load the config file
def load_config(config_file):
    """
    Reads a JSON configuration file and returns its parsed content.

    Parameters:
    config_file (str): Path to the JSON configuration file.

    Returns:
    The deserialized JSON document (a dict when the top-level value is an object).

    Raises:
    FileNotFoundError: If config_file does not exist.
    json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open() falls
    # back to the platform default, which garbles non-ASCII text (e.g. umlauts
    # in prompts) on non-UTF-8 locales.
    with open(config_file, 'r', encoding='utf-8') as file:
        return json.load(file)

# Project settings: the prompts, the model name and the token prices used below
# are all read from this file.
config = load_config('../config.json')

# The API key comes from the environment; os.getenv returns None when it is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)

def calculate_cost(input_tokens, output_tokens):
    """
    Prices a single model call from its token counts.

    The per-1000-token prices are looked up in the module-level `config`
    under config['token_cost'].

    Parameters:
    input_tokens: Number of prompt tokens consumed by the call.
    output_tokens: Number of completion tokens produced by the call.

    Returns:
    The input cost plus the output cost, in the unit of the configured prices.
    """
    prices = config['token_cost']
    per_1000_in = prices['input_price_per_1000']
    per_1000_out = prices['output_price_per_1000']

    # Token counts are scaled to thousands first, then multiplied by the price.
    prompt_cost = (input_tokens / 1000) * per_1000_in
    completion_cost = (output_tokens / 1000) * per_1000_out
    return prompt_cost + completion_cost

def call_language_model(client, model, system_prompt, user_prompt, max_tokens=300):
    """
    Makes a call to the GPT language model and returns the response and the cost.

    Parameters:
    client (obj): The GPT client object; must expose chat.completions.create().
    model (str): Name of the model to query.
    system_prompt (str): The system prompt for the GPT model.
    user_prompt (str): The user prompt for the GPT model.
    max_tokens (int): Upper bound on the number of completion tokens.
        Defaults to 300, the value that used to be hard-coded.

    Returns:
    tuple: (response_text, cost). response_text is the content of the first
        choice, or an empty string when the API returned no text content.
        cost is calculate_cost() applied to the token usage reported for the call.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=max_tokens,
    )

    usage = response.usage
    cost = calculate_cost(usage.prompt_tokens, usage.completion_tokens)

    # message.content is optional in the API response. Normalise a missing value
    # to "" so callers can run substring checks on the result without a TypeError.
    content = response.choices[0].message.content
    if content is None:
        content = ""

    return content, cost



In [23]:
from tqdm.notebook import tqdm

# --- Run configuration ---------------------------------------------------------
INPUT_CSV = '../data/231225_unique_notifying_parties_cleaned.csv'
OUTPUT_CSV = '../data/240106_natural_legal_full.csv'
START_ROW = 3750        # positional resume point: rows before it are skipped
CHECKPOINT_EVERY = 10   # write OUTPUT_CSV after this many processed rows
VERBOSE = True          # echo each prompt, response and cost; set False for long runs
PREDICTION_COLUMN = 'legal_type_prediction_gpt4'

# Marker -> stored label. Checked in this order; the first marker found wins.
RESPONSE_LABELS = (("[1]", 1), ("[0]", 0), ("[2]", 2))


def parse_prediction(response):
    """Returns 1, 0 or 2 for the first marker found in response, or -1 if there is none."""
    if not response:
        # Missing or empty model output is recorded as an error.
        return -1
    for marker, label in RESPONSE_LABELS:
        if marker in response:
            return label
    return -1


# .copy() turns the slice into an independent frame, so the .loc writes below do
# not target a slice of the full frame (avoids SettingWithCopyWarning).
df = pd.read_csv(INPUT_CSV).iloc[START_ROW:].copy()

cost = 0
rows_done = 0

# NOTE(review): OUTPUT_CSV is overwritten with only the rows from START_ROW
# onwards. Confirm that results for earlier rows are stored elsewhere.

# Classify each row with the language model and store the parsed label in
# PREDICTION_COLUMN.
try:
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        # prepare user prompt
        user_prompt = config['prompts']['user_prompts']['natural_vs_legal_prompt'].format(
            input=row['notifying_party']
        )

        # Call the language model
        response, cost_of_call = call_language_model(
            openai_client,
            config['gpt_model_name'],
            config['prompts']['system_prompt'],
            user_prompt,
        )
        if VERBOSE:
            print(user_prompt)
            print(response)
            print(cost_of_call)

        df.loc[index, PREDICTION_COLUMN] = parse_prediction(response)
        cost += cost_of_call
        rows_done += 1

        # Checkpoint on the number of processed rows rather than on the index
        # label, so the cadence does not depend on where the slice starts.
        if rows_done % CHECKPOINT_EVERY == 0:
            df.to_csv(OUTPUT_CSV, index=False)
finally:
    # Flush rows processed since the last checkpoint. This also runs when the
    # loop is interrupted or an API call raises. Nothing is written if no row
    # was completed, so a failure on the first call leaves OUTPUT_CSV untouched.
    if rows_done % CHECKPOINT_EVERY != 0:
        df.to_csv(OUTPUT_CSV, index=False)
    print("Total cost of call: ${}".format(cost))

Processing:   0%|          | 0/10708 [00:00<?, ?it/s]

Evaluate the following input on whether it is a natural person or legal entity. If the input represents a natural person, respond with '[0]'. If the input represents a legal entity, respond with '[1]'. If you are unsure, respond with '[2]'. Input: Oldehaver Beteiligungsgesellschaft mbH
[1]
0.00095
Evaluate the following input on whether it is a natural person or legal entity. If the input represents a natural person, respond with '[0]'. If the input represents a legal entity, respond with '[1]'. If you are unsure, respond with '[2]'. Input: Schönherr, Dr. Dipl.-Ing. Regine
[0]
0.00095
Evaluate the following input on whether it is a natural person or legal entity. If the input represents a natural person, respond with '[0]'. If the input represents a legal entity, respond with '[1]'. If you are unsure, respond with '[2]'. Input: Dr. Schönherr Beteiligungs GmbH
[1]
0.00093
Evaluate the following input on whether it is a natural person or legal entity. If the input represents a natural pe

In [21]:
# Rows where the GPT-4 prediction differs from the `legal_type` label, i.e. the
# misclassified rows. The previous comment and the name `df_correct` described
# the opposite of what the `!=` filter selects.
# Rows missing either value are left out: NaN != NaN evaluates to True, so
# unlabelled or not-yet-processed rows would otherwise be listed as mismatches.
has_both = df['legal_type'].notna() & df['legal_type_prediction_gpt4'].notna()
df_mismatch = df[has_both & (df['legal_type'] != df['legal_type_prediction_gpt4'])]
df_correct = df_mismatch  # legacy alias, kept in case other cells still use the old name
df_mismatch



Unnamed: 0,notifying_party,Matched company name,OpenCorporates URL,previous_names,legal_type,legal_type_prediction,legal_type_prediction_gpt4
393,"Ley, Margaretha (Nachlaß)",,,,0.0,0.0,1.0
413,"Oetker, Dr., August (Firma)",,,,0.0,,1.0
444,UAP,UAP,http://opencorporates.com/companies/fr/411464225,[],1.0,1.0,2.0
470,INPARSA,,,,1.0,1.0,2.0
