In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
import requests
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and dtype warnings —
# consider narrowing the filter to the specific category that was noisy.
warnings.filterwarnings("ignore")

In [9]:
# Load the firm universe from the Excel workbook; the path is relative, so the
# workbook must sit in the notebook's working directory.
df = pd.read_excel("forward_firm_universe.xlsx")

In [10]:
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,Entity_Name,LEI,ISIN
0,Shell PLC,21380068P1DRHMJ8KU70,BRRDSABDR009
1,A Finkl & Sons Co,,
2,A. Finkl & Sons Corp,,
3,AEP Generating Co,,
4,AGL Loy Yang Pty Ltd,,


In [11]:
# (rows, columns) of the loaded frame.
df.shape

(23446, 3)

In [12]:
# Lower-case the column labels; later cells address them as 'entity_name' and 'lei'.
df.columns = df.columns.str.lower()

In [13]:
# Endpoint that get_lei() queries to look up LEI records by legal name.
GLEIF_API_URL = "https://api.gleif.org/api/v1/lei-records"

In [14]:
import re

def clean_company_name(name):
    """Normalise a company name so near-identical spellings compare equal.

    Removes the legal-form tokens Co, Ltd, Corp, LLC, Company and Inc
    (whole words, any case), strips punctuation, and collapses the
    whitespace left behind into single spaces.

    Parameters
    ----------
    name : str or None
        Raw entity name. ``None`` and float NaN (a missing spreadsheet cell)
        are accepted.

    Returns
    -------
    str
        The cleaned name, or an empty string when ``name`` is missing.
    """
    # Missing cells arrive as None or float NaN; re.sub would raise TypeError on them.
    if name is None or (isinstance(name, float) and name != name):
        return ""
    name = str(name)

    # Drop common legal-form words, then any punctuation.
    name = re.sub(r"\b(Co|Ltd|Corp|LLC|Company|Inc)\b", "", name, flags=re.IGNORECASE)
    name = re.sub(r"[^\w\s]", "", name)

    # Removing tokens leaves runs of spaces ("Acme Inc Holdings" -> "Acme  Holdings");
    # squeeze them so the result equals the cleaned form of "Acme Holdings".
    return re.sub(r"\s+", " ", name).strip()

In [None]:
# # Function to find similar strings
# def find_similar(data, threshold=80):
#     similar_pairs = []
#     for i in range(len(data)):
#         for j in range(i + 1, len(data)):
#             if fuzz.ratio(data[i], data[j]) > threshold:
#                 similar_pairs.append((data[i], data[j]))
#     return similar_pairs

In [32]:
# # Find similar company names
# similar_names = find_similar(df['Entity_Name'].tolist())

# print("Similar Company Names:")
# for name_pair in similar_names:
#     print(name_pair)


In [37]:
def get_lei(company_name, timeout=10):
    """Look up a company's LEI by legal name through the GLEIF API.

    Parameters
    ----------
    company_name : str
        Legal name to search for; sent as the ``filter[entity.legalName]``
        query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    str or None
        The ``id`` of the first record returned, or ``None`` when the name is
        blank/missing, nothing matches, or the request fails.
    """
    # Missing or blank names (e.g. NaN cells) cannot match anything; skip the call.
    if not isinstance(company_name, str) or not company_name.strip():
        return None

    params = {
        "filter[entity.legalName]": company_name,  # Search by legal name
        "page[size]": 1  # Limit to 1 result
    }
    try:
        # Without a timeout a stalled connection would block a bulk lookup forever.
        response = requests.get(GLEIF_API_URL, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report and carry on so one network failure does not abort a whole .apply().
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    results = response.json().get('data', [])
    # The LEI is the id of the first (and only requested) record.
    return results[0]['id'] if results else None

In [43]:
#df['LEI'] = df['entity_name'].apply(get_lei)

In [15]:
# Add a normalised copy of each entity name (legal-form words and punctuation removed)
# to use as the de-duplication key.
df['cleaned_entity_name'] = df['entity_name'].apply(clean_company_name)

In [16]:
# Keep a single row per distinct cleaned name; `df` itself is left untouched.
df_cleaned = df.drop_duplicates(subset=['cleaned_entity_name'])

In [17]:
# Discard the existing 'lei' column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df_cleaned = df_cleaned.drop(columns=['lei'], errors='ignore')


In [21]:
# Narrow the frame to the de-duplication key and the original name, in that order.
df_cleaned = df_cleaned[['cleaned_entity_name', 'entity_name']]

In [None]:
def process_in_batches(companies, batch_size=100, wait_seconds=60):
    """Look up an LEI for every company, pausing between batches.

    Parameters
    ----------
    companies : sequence of str
        Company names; must support ``len()`` and slicing.
    batch_size : int, optional
        Number of companies processed between pauses (default 100).
    wait_seconds : float, optional
        Pause between consecutive batches, in seconds (default 60).

    Returns
    -------
    list of tuple
        ``(company, lei)`` pairs in input order, where ``lei`` is whatever
        ``get_lei_cached`` returns for that company.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # Imported locally because no visible cell imports `time`.
    import time

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # NOTE(review): get_lei_cached is not defined in any visible cell — presumably a
    # memoised wrapper around get_lei; confirm it exists before running this.
    results = []
    total = len(companies)
    for start in range(0, total, batch_size):
        batch = companies[start:start + batch_size]
        results.extend((company, get_lei_cached(company)) for company in batch)

        batch_number = start // batch_size + 1
        if start + batch_size < total:
            print(f"Processed batch {batch_number}. Waiting for next batch...")
            time.sleep(wait_seconds)
        else:
            # Last batch: nothing left to throttle, so return without sleeping.
            print(f"Processed batch {batch_number}.")
    return results


In [14]:
import os
import pandas as pd
from dotenv import load_dotenv

# Read a local .env file into the process environment so later cells can fetch
# the API key with os.getenv (presumably PEOPLEDATALABS_API_KEY is defined there).
load_dotenv()

True

In [27]:
import requests
from typing import Tuple

def get_company_info(company_names: list, timeout: float = 10) -> Tuple[list, list, list]:
    """
    Retrieves company information from the PeopleDataLabs API for a list of company names.

    NOTE(review): a later cell defines another ``get_company_info`` that takes a
    single name; whichever cell runs last wins. Consider renaming one of them.

    Parameters:
    company_names (list): A list of company legal names.
    timeout (float): Seconds to wait for each HTTP response (default 10).

    Returns:
    Tuple[list, list, list]: Three parallel lists holding the entity type, industry
    and size bucket for each company, in input order. A field missing from a
    successful response is reported as "Not Found"; a failed request (non-200
    status or network error) yields "Error" in all three lists.
    """
    entity_types = []
    industry_classifications = []
    company_sizes = []

    url = "https://api.peopledatalabs.com/v5/company/enrich"
    api_key = os.getenv("PEOPLEDATALABS_API_KEY")

    for company_name in company_names:
        # Send the name via `params` so it is URL-encoded; interpolated into the URL,
        # a name such as "A Finkl & Sons Co" would be cut at the '&'.
        # The key travels as `api_key`, the same way as the single-name lookup cell.
        params = {"name": company_name, "api_key": api_key}

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException:
            # Treat network failures like any other failed lookup instead of aborting.
            response = None

        if response is not None and response.status_code == 200:
            data = response.json()

            # Field names follow the enrichment payload ("type", "industry", "size").
            entity_types.append(data.get("type", "Not Found"))
            industry_classifications.append(data.get("industry", "Not Found"))
            company_sizes.append(data.get("size", "Not Found"))
        else:
            entity_types.append("Error")
            industry_classifications.append("Error")
            company_sizes.append("Error")

    return entity_types, industry_classifications, company_sizes

In [44]:
import requests

# Your People Data Labs API key
# NOTE(review): placeholder only — nothing in the visible cells reads PDL_API_KEY;
# get_company_info() below takes the key from the PEOPLEDATALABS_API_KEY
# environment variable instead. Never paste a real key here.
PDL_API_KEY = 'your_api_key_here'

def get_company_info(company_name, timeout=10):
    """Fetch type, industry and size for one company from People Data Labs.

    Parameters
    ----------
    company_name : str
        Name to enrich; sent as the ``name`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    dict or None
        ``{'Entity Type', 'Industry Classification', 'Company Size'}`` taken from
        the response's ``type``, ``industry`` and ``size`` fields ('N/A' when a
        field is absent), or ``None`` when the name is blank/missing or the
        request fails.
    """
    # A blank or missing name cannot be enriched; skip the network call.
    if not isinstance(company_name, str) or not company_name.strip():
        print(f"Failed to retrieve data for {company_name}")
        return None

    url = 'https://api.peopledatalabs.com/v5/company/enrich'
    params = {
        'name': company_name,
        'pretty': True,
        'api_key': os.getenv("PEOPLEDATALABS_API_KEY")
    }

    try:
        # Without a timeout a stalled connection would block the caller's loop.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        print(f"Failed to retrieve data for {company_name}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data for {company_name}")
        return None

    data = response.json()
    return {
        'Entity Type': data.get('type', 'N/A'),
        'Industry Classification': data.get('industry', 'N/A'),
        'Company Size': data.get('size', 'N/A'),
    }



In [47]:
# Demo: enrich a handful of well-known and obscure company names.
company_names = ["Microsoft", "Google", "Tesla", "Ajaokuta Steel", "Chivita", "MTN"]

# Maps each name to the dict returned by get_company_info (None on failure).
company_info_list = {}
for name in company_names:
    company_info_list[name] = get_company_info(name)

# Print one "name: info" line per company, in lookup order.
for company in company_info_list:
    info = company_info_list[company]
    print(f"{company}: {info}")


Failed to retrieve data for Chivita
Microsoft: {'Entity Type': 'public', 'Industry Classification': 'computer software', 'Company Size': '10001+'}
Google: {'Entity Type': 'private', 'Industry Classification': 'internet', 'Company Size': '10001+'}
Tesla: {'Entity Type': 'public', 'Industry Classification': 'automotive', 'Company Size': '10001+'}
Ajaokuta Steel: {'Entity Type': 'government', 'Industry Classification': 'computer & network security', 'Company Size': '1-10'}
Chivita: None
MTN: {'Entity Type': 'public', 'Industry Classification': 'telecommunications', 'Company Size': '10001+'}


In [30]:
# Build the People Data Labs SDK client with the API key from the environment.
# NOTE(review): PDLPY is not imported in any visible cell — presumably
# `from peopledatalabs import PDLPY`; confirm and add it to the imports cell,
# otherwise this fails with NameError on a fresh kernel.
client = PDLPY(
    api_key=os.getenv("PEOPLEDATALABS_API_KEY")
)

In [43]:
# Enrich a company by website through the SDK client.
result = client.company.enrichment(website="LinkedIn.com", pretty=True)

# On success show the raw payload; otherwise report the HTTP status and reason.
if not result.ok:
    failure_report = f"Status: {result.status_code}\nReason: {result.reason}"
    # f"\nMessage: {result.json()['error']['message']}"
    print(failure_report)
else:
    print(result.text)

{
  "status": 200,
  "name": "linkedin",
  "display_name": "LinkedIn",
  "size": "10001+",
  "employee_count": 32643,
  "id": "8QPs7V62lOefbTnR9UpgmwCn3lDe",
  "founded": 2003,
  "industry": "internet",
  "naics": [
    {
      "naics_code": "54161",
      "sector": "professional, scientific, and technical services",
      "sub_sector": "professional, scientific, and technical services",
      "industry_group": "management, scientific, and technical consulting services",
      "naics_industry": "management consulting services",
      "national_industry": null
    },
    {
      "naics_code": "54151",
      "sector": "professional, scientific, and technical services",
      "sub_sector": "professional, scientific, and technical services",
      "industry_group": "computer systems design and related services",
      "naics_industry": "computer systems design and related services",
      "national_industry": null
    },
    {
      "naics_code": "81299",
      "sector": "other services (e