In [1]:
import pandas as pd
import os
import dotenv
import requests
import json

# Let python-dotenv populate the process environment, then pick up the OpenCorporates API token
# (the token is None when the variable is not set)
dotenv.load_dotenv()
OPENCORPORATES_API_KEY = os.environ.get("OPENCORPORATES_API_KEY")

In [5]:
# Load the unique notifying parties (the file name suggests an OpenRefine export)
df = pd.read_csv(filepath_or_buffer="../data/openrefine-unique-notifying-parties.csv")

In [3]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,notifying_party,Matched company name,OpenCorporates URL
0,ROTHENBERGER Vermögensverwaltung GbR 4xS,,
1,"Rothenberger, Dr., Helmut",,
2,"Schneider, Franz Jürgen",,
3,3i Group plc,3I GROUP PLC,http://opencorporates.com/companies/gb/01142830
4,3i Deutschland Gesellschaft für Industriebetei...,3i Deutschland Gesellschaft für Industriebetei...,http://opencorporates.com/companies/de/M1201_H...


In [4]:
# Count how many rows share each OpenCorporates URL
df_grouped = df.groupby(by='OpenCorporates URL').size()
# Show the counts, most frequent URL first
df_grouped.sort_values(ascending=False)

OpenCorporates URL
http://opencorporates.com/companies/us_de/3736888        4
http://opencorporates.com/companies/lu/B153466           4
http://opencorporates.com/companies/fr/352045454         3
http://opencorporates.com/companies/bm/26108             3
http://opencorporates.com/companies/gb/02312079          3
                                                        ..
http://opencorporates.com/companies/de/M1201_HRB42023    1
http://opencorporates.com/companies/de/M1201_HRB41496    1
http://opencorporates.com/companies/de/M1201_HRB40601    1
http://opencorporates.com/companies/de/M1201_HRB39682    1
http://opencorporates.com/companies/za/2012-045177-07    1
Length: 6986, dtype: int64

### Retrieving previous company names

In [6]:
# Template for the OpenCorporates company endpoint.
# Placeholders, in order: jurisdiction code, company number, API token.
api_base_url = (
    "https://api.opencorporates.com/v0.4/companies/"
    "{}/{}"
    "?api_token={}"
)

# Define a function to retrieve the previous names from the OpenCorporates API for a given URL
def get_previous_names(url, timeout=30):
    """Return the previous names OpenCorporates lists for one company.

    Args:
        url: OpenCorporates company URL whose last two path segments are the
            jurisdiction code and the company number, e.g.
            ``http://opencorporates.com/companies/gb/01142830``.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        A list with the ``company_name`` of every entry under
        ``previous_names`` in the API response; empty if there are none.

    Raises:
        requests.HTTPError: The API answered with an error status.
        requests.RequestException: The request could not be completed
            (connection problem, timeout, ...).
    """
    # Extract the jurisdiction code and company number from the OpenCorporates URL;
    # stripping a trailing slash keeps the last two segments meaningful
    jurisdiction_code, company_number = url.rstrip("/").split("/")[-2:]

    # Construct the OpenCorporates API URL using the jurisdiction code, company number, and the API key
    api_url = api_base_url.format(jurisdiction_code, company_number, OPENCORPORATES_API_KEY)

    response = requests.get(api_url, timeout=timeout)
    if not response.ok:
        # Raised by hand instead of response.raise_for_status(): that message embeds the full
        # request URL, API token included, and would end up in the notebook output.
        raise requests.HTTPError(
            "OpenCorporates returned HTTP {} for company {}/{}".format(
                response.status_code, jurisdiction_code, company_number
            ),
            response=response,
        )
    data = response.json()

    # `or []` also covers a "previous_names" key that is present but null
    previous_names = data["results"]["company"].get("previous_names") or []
    return [name["company_name"] for name in previous_names]

# Add a new column 'previous_names' to the dataframe, containing the previous names for each row with an
# OpenCorporates URL. Several rows can carry the same URL, so every distinct URL is requested from the
# API only once; rows sharing a URL then hold the same list object. Rows without a URL get NaN.
matched_urls = df.loc[df['OpenCorporates URL'].notnull(), 'OpenCorporates URL']
previous_names_by_url = {url: get_previous_names(url) for url in matched_urls.unique()}
df['previous_names'] = matched_urls.map(previous_names_by_url.get)

# Propagate company URLs to rows that are filed under a company's previous name.
# DataFrame.iterrows() is avoided on purpose: the frame is written to below, and pandas advises
# against modifying a frame while iterating over it.

# Names that occur as a notifying party; built once so each membership test is a set lookup
notifying_party_names = set(df['notifying_party'].dropna())

# previous name -> URL of the company that lists it as a previous name. Rows are visited in frame
# order and a later row overwrites an earlier one for the same name, which is the same outcome as
# assigning row by row.
url_by_previous_name = {}
for company_url, previous_names in zip(df['OpenCorporates URL'], df['previous_names']):
    # Rows without a URL hold NaN (not a list) in 'previous_names', so the URL is tested first
    if pd.isnull(company_url) or not previous_names:
        continue
    for prev_name in previous_names:
        if prev_name in notifying_party_names:
            url_by_previous_name[prev_name] = company_url

# Give every row whose notifying_party equals one of those previous names the URL of the company
# that lists the name (this replaces any URL such a row already had)
filed_under_previous_name = df['notifying_party'].isin(list(url_by_previous_name))
df.loc[filed_under_previous_name, 'OpenCorporates URL'] = (
    df.loc[filed_under_previous_name, 'notifying_party'].map(url_by_previous_name)
)


In [None]:
# Write the dataframe to 'openrefine-with-previous-names.csv', leaving out the row index
df.to_csv(path_or_buf="../data/openrefine-with-previous-names.csv", index=False)