Since the embeddings calculations are still running, I have put this into a new notebook, but would otherwise just be appended to ```240102_create_embeddings.ipynb```. 

First, we prepare the data:

In [1]:
import pandas as pd

# load ../data/231231_final_result_with_combined_index.csv

df = pd.read_csv('../data/231231_final_result_with_combined_index.csv')

# load ../data/240103_highly_scoring_embeddings.csv

kmeans_results_df = pd.read_csv('../data/240103_highly_scoring_embeddings.csv')


FileNotFoundError: [Errno 2] No such file or directory: '../data/231231_final_result_with_combined_index.csv'

In [25]:
# Joining on 'target_id'
kmeans_results_df = kmeans_results_df.merge(df, left_on='target_id', right_on='id', how='left')
kmeans_results_df.rename(columns={'notifying_party': 'notifying_party_target', 'legal_type': 'legal_type_target'}, inplace=True)
kmeans_results_df.drop('id', axis=1, inplace=True)

# Joining on 'matched_id'
kmeans_results_df = kmeans_results_df.merge(df, left_on='matched_id', right_on='id', how='left')
kmeans_results_df.rename(columns={'notifying_party': 'notifying_party_matched', 'legal_type': 'legal_type_matched'}, inplace=True)
kmeans_results_df.drop('id', axis=1, inplace=True)

In [26]:
kmeans_results_df = kmeans_results_df[['target_id', 'notifying_party_target', 'matched_id', 'notifying_party_matched', 'distance', 'legal_type_target', 'legal_type_matched']]

In [28]:
kmeans_results_df
kmeans_results_df.to_csv('../data/240103_embeddings_top_10_combinations.csv', index=False)

In [13]:
kmeans_results_df.to_csv('../data/240104_highly_scoring_embeddings_test_set.csv', index=False)

Then, we create a function to call a language model, in this case GPT4:

In [2]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import json

load_dotenv()

# Load the config file
def load_config(config_file):
    with open(config_file, 'r') as file:
        return json.load(file)

config = load_config('../config.json')

openai_api_key = os.getenv("OPENAI_API_KEY")
together_api_key = os.getenv("TOGETHER_API_KEY")

openai_client = OpenAI(
  api_key=openai_api_key
)

together_client = OpenAI(
  api_key=together_api_key,
  base_url='https://api.together.xyz/v1',
)

def calculate_cost(input_tokens, output_tokens):
    input_price_per_1000 = config['token_cost']['input_price_per_1000']
    output_price_per_1000 = config['token_cost']['output_price_per_1000']

    # Calculate the cost for input tokens
    input_cost = (input_tokens / 1000) * input_price_per_1000

    # Calculate the cost for output tokens
    output_cost = (output_tokens / 1000) * output_price_per_1000

    # Total cost
    return input_cost + output_cost

def call_language_model(client, model, system_prompt, user_prompt):
    """
    Makes a call to the GPT language model and returns the response and the cost.

    Parameters:
    client (obj): The GPT client object.
    system_prompt (str): The system prompt for the GPT model.
    user_prompt (str): The user prompt for the GPT model.
    full_text (str): The text to be analyzed by the GPT model.

    Returns:
    tuple: A tuple containing the response from GPT and the cost of the call.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=300,
    )

    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens
    cost = calculate_cost(input_tokens, output_tokens)

    return response.choices[0].message.content, cost



In [4]:
from tqdm.notebook import tqdm
import wandb

system_prompt = "You are a helpful assistant."

user_prompt = "In the following, your will receive two spellings of legal entities. Is the difference more likely to be a typo/alternative spelling of the same entity or are the two spellings referring to separate subsidiaries, funds or companies, like e.g. a parent-child company relationship? Answer in two sentences explaining your reasoning and take a clear stance on whether they are the same entity, different entities or whether you are unsure."

test_set = pd.read_csv(config['csv_file_path'])

kmeans_results_df = test_set

# WandB
run = wandb.init(project=config['wandb_project_name'])
# Define W&B Table to store results
columns = config['wandb_table_columns_prod']
table = wandb.Table(columns=columns)
# log a row of data to the W&B table
def log_to_wandb(data):
    table.add_data(*data.values())

# Log the config.json file as an artifact
artifact = wandb.Artifact('run_configurations', type='config')
artifact.add_file('../config.json')
run.log_artifact(artifact)

# Assuming kmeans_results_df is your DataFrame
# Add new columns with default values
kmeans_results_df['embeddings_id_1_1'] = None
kmeans_results_df['error'] = None

for index, row in tqdm(kmeans_results_df.iterrows(), total=kmeans_results_df.shape[0]):
    spelling_1 = row['notifying_party_target']
    spelling_2 = row['notifying_party_matched']

    """
    Initialize the default data dictionary for each iteration. 
    This will be updated with the results of each step and then logged to W&B.
    """
    data = {
        "target_id": str(row['target_id']),
        "notifying_party_target": spelling_1,
        "matched_id": str(row['matched_id']),
        "notifying_party_matched": spelling_2,
        "legal_type_target": row['legal_type_target'],
        "legal_type_matched": row['legal_type_matched'],
        "distance": row['distance'],
        "expected_result": row['expected_result'],
        "prediction": None,
        "error": None,
        "correct": None,
        "cost": 0
    }

    content_assessment_prompt = config['prompts']['user_prompts']['spellings_assessment_prompt'].format(spelling_1=spelling_1, spelling_2=spelling_2)

    openai_response, cost = call_language_model(openai_client, config['gpt_model_name'], system_prompt, content_assessment_prompt)
    print("Content assessment: " + openai_response + "\n\n")

    # add cost on top of the current cost
    data["cost"] += cost

    response_assessment_prompt = config['prompts']['user_prompts']['response_assessment_prompt'].format(response=openai_response)
    mixtral_response, cost = call_language_model(openai_client, config['gpt_model_name'], system_prompt, response_assessment_prompt)

    data["cost"] += cost

    # add cost to a "cost" column in the dataframe
    kmeans_results_df.at[index, 'cost'] = data["cost"]

    print("Response assessment: " + mixtral_response + "\n\n ---------------- \n\n")

    # Update the DataFrame based on the response
    if "[1]" in mixtral_response:
        kmeans_results_df.at[index, 'embeddings_id_1_1'] = 1
        data["prediction"] = 1
    elif "[0]" in mixtral_response:
        kmeans_results_df.at[index, 'embeddings_id_1_1'] = 0
        data["prediction"] = 0
    elif "[2]" in mixtral_response:
        kmeans_results_df.at[index, 'error'] = "unsure"
        data.update({"error": "unsure"})
    else:
        kmeans_results_df.at[index, 'error'] = "error"
        data.update({"error": "error"})

    # check if prediction is correct
    if data["prediction"] == data["expected_result"]:
        data["correct"] = True
    else:
        data["correct"] = False

    # Save to CSV file every 10 rows
    if (index + 1) % 10 == 0:
        kmeans_results_df.to_csv('../data/240104_sampled_test_set_results.csv', index=False)

    log_to_wandb(data)

# Save the remaining rows if the DataFrame length is not a multiple of 10
if len(kmeans_results_df) % 10 != 0:
    kmeans_results_df.to_csv('../data/240104_sampled_test_set_results.csv', index=False)
    log_to_wandb(data)

wandb.log({"results": table})
wandb.finish()



  0%|          | 0/100 [00:00<?, ?it/s]

Content assessment: The two spellings reference entities that could potentially be different due to the inclusion of "Piech" in the first and "Familie" in the second, which suggests that each name might represent different formulations or parts of the family ownership structure. However, without additional context, it's not possible to definitively determine if they are separate entities, the same entity with a name variation, or simply a typographical error; thus, I am unsure.


Response assessment: [2]

 ---------------- 


Content assessment: The difference in the provided spellings suggests they are referring to different entities, not simply a typo or alternative spelling. TWENTY-FIRST CENTURY FOX, INC. appears to be the main corporation, while 21ST CENTURY FOX EUROPE, INC. implies a subsidiary or division that operates specifically within Europe, indicating a potential parent-child company relationship.


Response assessment: [0]

 ---------------- 


Content assessment: Credit S

VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



In [173]:
# # COPY OF THE ABOVE

# from tqdm.notebook import tqdm

# system_prompt = "You are a helpful assistant."

# user_prompt_1 = "In the following, your will receive two spellings of company names. If the two spellings both have the same company type (like LLC, LP, GmbH, Sarl etc.), reply with [1]. If not, reply with [0]. If you are unsure, reply with [2]."

# # user_prompt_2 = "In the following, your will receive two spellings of company names. If the two spellings refer to the same company, just spelled differently, reply with [1]. If they refer to two different companies, reply with [0]. If you are unsure, reply with [2]."

# # user_prompt_2_2 = "In the following, your will receive two spellings of company names. If the two spellings refer to the same company, just spelled differently, reply with [1]. If they refer to two different companies, reply with [0]. If you are unsure, reply with [2]. Be very strict with your answers. Pay special attention to funds that have a slightly different name, like Fund A-I vs. Fund A-II. Explain your answer."

# # user_prompt_2_3 = "In the following, your will receive two spellings of company names. so that all the spellings that belong to the same legal entity get the same ID. It is important to treat even subsidiaries or divisions as separate entities. Pay special attention to funds that have a slightly different name, like Fund A-I vs. Fund A-II. Are the following belonging to the same entity? Reply only with either '[0]' for different entities, '[1]' for the same entity and '[2]' if you are unsure"

# # user_prompt_2_4 = "In the following, your will receive two spellings of legal entities. Do the two spellings refer to the same legal entity? Explain your reasoning and reply with [1] if they do, [0] if they don’t and [2] if you are unsure. Be very strict with your answers. Treat subsidiaries/divisions as well as funds (Fund A-I vs. Fund A-II) as separate entities."

# # user_prompt_2_5 = "In the following, your will receive two spellings of legal entities. Ask yourself: Is the difference more likely to be a typo/alternative spelling of the same entity or are the two spellings referring to separate subsidiaries or companies, like e.g. a parent-child company relationship? Explain your reasoning and reply with [1] if they refer to the same entity, [0] if they don’t and [2] if you are unsure. Be very strict with your answers."

# user_prompt_2_6 = "In the following, your will receive two spellings of legal entities. Is the difference more likely to be a typo/alternative spelling of the same entity or are the two spellings referring to separate subsidiaries, funds or companies, like e.g. a parent-child company relationship? Answer in two sentences explaining your reasoning and take a clear stance on whether they are the same entity, different entities or whether you are unsure."

# test_set = pd.read_csv('../data/240105_embeddings_id_test_results_19_2_5_false_predictions.csv')

# kmeans_results_df = test_set[22:24]

# # Assuming kmeans_results_df is your DataFrame
# # Add new columns with default values
# kmeans_results_df['embeddings_id_1_1'] = None
# kmeans_results_df['error'] = None

# for index, row in tqdm(kmeans_results_df.iterrows(), total=kmeans_results_df.shape[0]):
#     spelling_1 = row['notifying_party_target']
#     spelling_2 = row['notifying_party_matched']
#     response = call_language_model(client, system_prompt, user_prompt_2_6, spelling_1, spelling_2)
#     print(response + "\n\n ---------------- \n\n")
#     # Update the DataFrame based on the response
#     if "[1]" in response:
#         kmeans_results_df.at[index, 'embeddings_id_1_1'] = 1
#     elif "[0]" in response:
#         kmeans_results_df.at[index, 'embeddings_id_1_1'] = 0
#     elif "[2]" in response:
#         kmeans_results_df.at[index, 'error'] = "unsure"
#     else:
#         kmeans_results_df.at[index, 'error'] = "error"

#     # Save to CSV file every 10 rows
#     if (index + 1) % 10 == 0:
#         kmeans_results_df.to_csv('../data/240104_embeddings_id_test_results_23_2_6.csv', index=False)

# # Save the remaining rows if the DataFrame length is not a multiple of 10
# if len(kmeans_results_df) % 10 != 0:
#     kmeans_results_df.to_csv('../data/240104_embeddings_id_test_results_23_2_6.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kmeans_results_df['embeddings_id_1_1'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kmeans_results_df['error'] = None


  0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
kmeans_results_df = pd.read_csv('../data/240104_embeddings_id_test_results_20_2_5.csv')

# add a column "prediction_correct" to the dataframe that is true if the correct column equals embddings_id_1
kmeans_results_df['prediction_correct'] = kmeans_results_df['correct'] == kmeans_results_df['embeddings_id_1_1']
# give me the percentage of correct predictions
kmeans_results_df['prediction_correct'].mean()

# # give me a dataframe with all the false predictions
false_predictions_df = kmeans_results_df[kmeans_results_df['prediction_correct'] == False]
false_predictions_df

Unnamed: 0,target_id,notifying_party_target,matched_id,notifying_party_matched,distance,legal_type_target,legal_type_matched,correct,error,embeddings_id_1_1,prediction_correct
0,9164,"OCM Real Estate Opportunities Fund IIIA, L.P.",9163,"OCM Real Estate Opportunities Fund III, L.P.",0.994375,1.0,1.0,0.0,,1.0,False
1,9693,Lone Star International Finance Holdings (Irel...,6092,Lone Star International Finance Holdings (Irel...,0.994269,1.0,1.0,0.0,,1.0,False
3,5122,Advent International GPE V-D Limited Partnership,5119,Advent International GPE V Limited Partnership,0.993964,1.0,1.0,0.0,,1.0,False
4,8871,Active Value Investors Fund Advisor N.V.,9324,Active Value Investors Fund Advisors N.V.,0.993915,1.0,1.0,0.0,,1.0,False
5,272,Swisscom Holding Deutschland GmbH,2471,Swisscom Deutschland Holding GmbH,0.99371,1.0,1.0,1.0,,0.0,False
6,2500,Rotermund Venture Capital Fond I GbR,1765,Rotermund Venture Fond I GbR,0.993702,1.0,1.0,0.0,,1.0,False
7,10832,Franklin Mutual Series Funds,6455,Franklin Mutual Series Fund,0.993622,1.0,1.0,0.0,,1.0,False
11,32,Jodexnis GmbH & Co. KG Beteiligungsgesellschaft,4357,Jodexnis GmbH & Co. Beteiligungsgesellschaft,0.993119,1.0,1.0,0.0,,1.0,False
12,10989,TTS Tooltechnic Systems Holding AG,9322,TTS Tooltechnik Systems Holding AG,0.993002,1.0,1.0,1.0,,0.0,False
15,2703,ASSA ABLOY Deutschland GmbH Berlin,577,ASSA ABLOY Holding GmbH Berlin,0.981145,1.0,1.0,,,0.0,False


In [160]:
# save to df
false_predictions_df.to_csv('../data/240105_embeddings_id_test_results_19_2_5_false_predictions.csv', index=False)

In [33]:
# drop the embeddings_id_1 and embeddings_id_1_1 columns
# kmeans_results_df.drop(['embeddings_id_1'], axis=1, inplace=True)
kmeans_results_df
kmeans_results_df.to_csv('../data/240104_embeddings_id_test_results_5.csv', index=False)

In [7]:
# df = pd.read_csv('../data/240103_highly_scoring_embeddings.csv')
df = kmeans_results_df
df

# sort by distance, descending
df.sort_values(by=['distance'], ascending=False, inplace=True)
#remove all rows where target_id == matched_id
df = df[df.target_id != df.matched_id]

df['sorted_ids'] = df.apply(lambda row: tuple(sorted([row['target_id'], row['matched_id']])), axis=1)
df = df.drop_duplicates(subset='sorted_ids').drop('sorted_ids', axis=1)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sorted_ids'] = df.apply(lambda row: tuple(sorted([row['target_id'], row['matched_id']])), axis=1)


Unnamed: 0,target_id,notifying_party_target,matched_id,notifying_party_matched,distance,legal_type_target,legal_type_matched
15590,1559,,1558,BHW Gesellschaft für Wohnungswirtschaft mbH (GfW),1.000000,,1.0
124881,12488,ZVG Verwaltungs- und Dienstleistungsgesellscha...,6601,ZVG Verwaltungs-und Dienstleistungsgesellschaf...,0.999585,1.0,1.0
123951,12395,Mühl Product & Service und Thüringer Baustoffh...,9342,Mühl Product & Service und Thüringer Baustoffh...,0.999334,1.0,1.0
57061,5706,Münchener RückversicherungsGesellschaft Aktien...,5175,Münchener Rückversicherungs-Gesellschaft Aktie...,0.999144,1.0,1.0
107371,10737,Dr. August Oetker Finanzierungs- und Beteiligu...,3457,Dr. August Oetker Finanzierungs-und Beteiligun...,0.998938,1.0,1.0
...,...,...,...,...,...,...,...
101575,10157,Lantmännen ek för,411,STORA Kopparbergs Bergslags AB,0.788096,1.0,1.0
101576,10157,Lantmännen ek för,1261,Vattenfall AB,0.787354,1.0,1.0
101577,10157,Lantmännen ek för,7136,Skandinaviska Enskilda Banken AB (publ),0.786510,1.0,1.0
101578,10157,Lantmännen ek för,7741,Furuholmen Eiendom AS,0.786336,1.0,1.0


In [28]:
# only keep records where legal_type_target and legal_type_matched are 1
test_df = df[(df.legal_type_target == 1) & (df.legal_type_matched == 1)]
test_df
test_df = test_df[200:300]
test_df

test_df.to_csv('../data/240104_test_set_3.csv', index=False)
test_df

Unnamed: 0,target_id,notifying_party_target,matched_id,notifying_party_matched,distance,legal_type_target,legal_type_matched
51234,5123,Advent International GPE V-E Limited Partnership,5105,Advent International GPE V-A Limited Partnership,0.994445,1.0,1.0
128201,12820,DEVK Deutsche Eisenbahn Versicherung Sach- und...,6016,DEVK Deutsche Eisenbahn Versicherung Sach- und...,0.994400,1.0,1.0
12151,1215,AS Industriebesitz und Beteiligungen Allianz V...,899,AS Industriebesitz und Beteiligungen Allianz V...,0.994400,1.0,1.0
51058,5105,Advent International GPE V-A Limited Partnership,5108,Advent International GPE V-H Limited Partnership,0.994395,1.0,1.0
91641,9164,"OCM Real Estate Opportunities Fund IIIA, L.P.",9163,"OCM Real Estate Opportunities Fund III, L.P.",0.994375,1.0,1.0
...,...,...,...,...,...,...,...
82751,8275,"Apax Europe VII-1, L.P.",8276,"Apax Europe VII-A, L.P.",0.992766,1.0,1.0
49351,4935,Doughty Hanson & Co. IV Limited Partnership No. 1,4938,Doughty Hanson & Co. IV Limited Partnership No. 4,0.992763,1.0,1.0
55131,5513,Zweite Medienfonds German Filmproductions GFP ...,5512,Zweite Medienfonds German Filmproductions GFP ...,0.992760,1.0,1.0
87971,8797,swisspartners Insurance Company SPC Ltd.,11859,Swisspartners Insurance Company SPC Ltd.,0.992750,1.0,1.0


In [4]:
import pandas as pd
# read ../data/240105_wandb_data.csv
df = pd.read_csv('../data/240105_wandb_data_2.csv')

cost = df['cost'].mean()
print(cost)

0.006175974025974001
