Note: In this document, I match rows that do not yet have an OpenCorporates URL against the aliases (previous names) of records that have already been matched, and copy the matched record's OpenCorporates URL into each row where it is still missing.

In [1]:
import pandas as pd
import os
import dotenv
import requests
import json
from tqdm import tqdm
import ast

# Environment setup: let python-dotenv populate the process environment
# (typically from a local .env file), then look up the API key.
# The key is None when the variable is not defined.
dotenv.load_dotenv()
OPENCORPORATES_API_KEY = os.environ.get("OPENCORPORATES_API_KEY")

In [2]:
# Load the combined dataset. The cell below reads its 'previous_names',
# 'notifying_party' and 'OpenCorporates URL' columns.
df = pd.read_csv('../data/combined.csv')

In [3]:
# Propagate OpenCorporates URLs from matched records to rows that appear
# under one of the matched record's previous names (aliases) and that do
# not have a URL yet.
for _, row in df.iterrows():
    alias_url = row['OpenCorporates URL']

    # A record without a URL has nothing to propagate; copying its NaN
    # would blank out URLs on the rows that match its aliases.
    if pd.isna(alias_url):
        continue

    # Records without any recorded aliases cannot match other rows.
    if pd.isna(row['previous_names']):
        continue

    # 'previous_names' holds the string representation of a list;
    # convert it back into an actual list of alias names.
    aliases = ast.literal_eval(row['previous_names'])

    for alias in aliases:
        # Remove leading and trailing whitespace before comparing
        alias = alias.strip()

        # Rows filed under this alias that are still missing a URL.
        # Rows that already carry a URL are deliberately left untouched.
        needs_url = (
            (df['notifying_party'] == alias)
            & df['OpenCorporates URL'].isna()
        )

        if needs_url.any():
            df.loc[needs_url, 'OpenCorporates URL'] = alias_url

# Save the updated DataFrame to a CSV file
df.to_csv('../data/df_with_aliases.csv', index=False)