In [21]:
import requests
import json
import tldextract

import polars as pl

from tqdm import tqdm
from threading import Thread
from typing import Callable

In [22]:
TOKEN_TCAPI = "1Y4WIaz5"
TOKEN_CORESIGNAL = "eyJhbGciOiJFZERTQSIsImtpZCI6IjY5ZjcxYTE1LTExZGEtMzE4OC1lMjFkLWEwY2FhODEzZjYxMiJ9.eyJhdWQiOiJ0aXBwYWJibGUuY29tIiwiZXhwIjoxNzQ0MjE0NzQ1LCJpYXQiOjE3MTI2NTc3OTMsImlzcyI6Imh0dHBzOi8vb3BzLmNvcmVzaWduYWwuY29tOjgzMDAvdjEvaWRlbnRpdHkvb2lkYyIsIm5hbWVzcGFjZSI6InJvb3QiLCJwcmVmZXJyZWRfdXNlcm5hbWUiOiJ0aXBwYWJibGUuY29tIiwic3ViIjoiZmEwYzRjOWMtYzIxYy1mZmRmLWMwYjktNDhhZWQ1YWY5YzE2IiwidXNlcmluZm8iOnsic2NvcGVzIjoiY2RhcGkifX0.WTz12DOayKH9Qe3n1Wbx7rk-dZI-4YS5ZI0qEDcb4YwSY0DBcccn53XEQtBFIimVXr1OHdrtyBWOiLsIyJL3Bg"
URL_TCAPI = "https://api.thecompaniesapi.com/v1"
URL_CORESIGNAL = "https://api.coresignal.com/cdapi/v1/linkedin/company"
FILE_SAMPLE = "./samples/samples_aprex.csv"
FILE_RES_TCAPI = "./results/tcapi_aprex.csv"
FILE_RES_CORESIGNAL = "./results/coresignal_aprex.csv"

In [23]:
def get(url: str, headers: dict={}, params: dict={}) -> any:
    res = requests.get(url=url, headers=headers, params=params, timeout=10)
    if res.status_code != 200:
        return {}

    try:
        data = res.content.decode()
        return json.loads(data)
    except:
        return {}
    
def tcapi_find_by_domain(company: tuple, companies: list) -> None:
    linkedin_url, name, website = company
    domain = tldextract.extract(website).registered_domain
    data = get(f"{URL_TCAPI}/companies/{domain}", params={"token": TOKEN_TCAPI})

    linkedin_id_found = data.get("socialNetworks", {}).get("linkedinIdAlpha", None)
    companies.append({
        "linkedin_url": linkedin_url, 
        "input_name": name,
        "website": website,
        "domain": domain,
        "linkedin_id_found": linkedin_id_found
    })

def tcapi_get_data_by_domain(company: tuple, companies: list) -> None:
    linkedin_url, name, website = company
    domain = tldextract.extract(website).registered_domain
    data = get(f"{URL_TCAPI}/companies/{domain}", params={"token": TOKEN_TCAPI})
    linkedin_id_found = data.get("socialNetworks", {}).get("linkedinIdAlpha", None)
    companies.append({
        "linkedin_url": linkedin_url, 
        "input_name": name,
        "website": website,
        "domain": domain,
        "linkedin_id_found": linkedin_id_found,
        "data": data,
    })

def tcapi_get_data_by_name(company: tuple, companies: list) -> None:
    linkedin_url, name, website = company
    domain = tldextract.extract(website).registered_domain
    url_like_name = "-".join(name.lower().split())
    data = get(f"{URL_TCAPI}/companies/by-name", params={"token": TOKEN_TCAPI, "name": name})
    
    linkedin_ids = [node.get("socialNetworks").get("linkedinIdAlpha") for node in data.get("companies", []) if node.get("socialNetworks")]
    if url_like_name in linkedin_ids:
        linkedin_id_found = url_like_name
    else:
        linkedin_id_found = linkedin_ids[0] if linkedin_ids else None

    companies.append({
        "linkedin_url": linkedin_url, 
        "input_name": name,
        "website": website,
        "domain": domain,
        "linkedin_id_found": linkedin_id_found,
        "data": data,
    })

def send_requests(target: Callable, df: pl.DataFrame) -> list:
    companies, threads = [], []

    for company in tqdm(df.rows(), "Curling data..."):
        process = Thread(target=target, args=[company, companies])
        process.start()
        threads.append(process)

    for process in tqdm(threads, "Joining threads..."):
        process.join()

    return companies

def compare_linkedin_ids(df: pl.DataFrame) -> pl.DataFrame:
    success = []
    for company in df.rows(named=True):
        url = company.get("linkedin_url")
        url = url.strip("/")
        linkedin_id = url.split("/")[-1].lower()
        success.append(linkedin_id == company.get("linkedin_id_found"))

    serie = pl.Series("success", success)
    return df.with_columns(serie)

In [42]:
def post(url: str, headers: dict={}, params: dict={}, data={}, data_json={}) -> any:
    res = requests.post(
        url=url,
        headers=headers,
        params=params,
        data=data,
        json=data_json
    )
    if res.status_code != 200:
        return {}

    try:
        data = res.content.decode()
        return json.loads(data)
    except:
        return {}

def coresignal_find_id(name: str|None=None, domain: str|None=None) -> str|None:
    data_json = {}
    if name:
        data_json["name"] = name
    if domain:
        data_json["website"] = domain
    data = post(
        url=f"{URL_CORESIGNAL}/search/filter",
        headers={
            "Authorization": f"Bearer {TOKEN_CORESIGNAL}",
            "Content-Type": "application/json",
            "accept": "application/json"
        },
        data_json=data_json
    )

    if data:
        return str(data[0])
    return None

def coresignal_find_by_domain(company: tuple, companies: list) -> None:
    linkedin_url, name, website = company
    domain = tldextract.extract(website).registered_domain
    coresignal_id = coresignal_find_id(domain=domain)
    linkedin_id_found = None

    if coresignal_id:
        data = get(
            f"{URL_CORESIGNAL}/collect/{coresignal_id}",
            headers={"Authorization": f"Bearer {TOKEN_CORESIGNAL}"}
        )
        linkedin_url_found = data.get("url", None)
        if linkedin_url_found:
            linkedin_id_found = linkedin_url_found.strip("/").split("/")[-1]
            
    companies.append({
        "linkedin_url": linkedin_url, 
        "input_name": name,
        "domain_found": domain,
        "coresignal_id": coresignal_id,
        "linkedin_id_found": linkedin_id_found
    })

In [25]:
df = pl.read_csv(FILE_SAMPLE)
df = df.drop_nulls()
print(df)

shape: (130, 3)
┌───────────────────────────────────┬──────────────────────────┬──────────────────────────────┐
│ linkedin_url                      ┆ input_name               ┆ website                      │
│ ---                               ┆ ---                      ┆ ---                          │
│ str                               ┆ str                      ┆ str                          │
╞═══════════════════════════════════╪══════════════════════════╪══════════════════════════════╡
│ https://www.linkedin.com/company… ┆ FAURECIA CLEAN  MOBILITY ┆ https://www.faurecia.com/    │
│                                   ┆ (groupe…                 ┆                              │
│ https://www.linkedin.com/company… ┆ PRYSMIAN GROUP           ┆ https://fr.prysmian.com/     │
│ https://www.linkedin.com/company… ┆ PETIT BATEAU             ┆ https://www.petit-bateau.fr/ │
│ https://www.linkedin.com/company… ┆ VALEO VISION             ┆ https://www.valeo.com/fr/    │
│ https://www.linkedin.c

In [142]:
tcapi_companies = send_requests(tcapi_find_by_domain, df)
df_tcapi_companies = pl.DataFrame(tcapi_companies)
df_tcapi_companies_results = compare_linkedin_ids(df_tcapi_companies)
print(df_tcapi_companies_results.head())
df_tcapi_companies_results.write_csv(FILE_RES_TCAPI)

Curling data...: 100%|██████████| 130/130 [00:00<00:00, 201.93it/s]
Joining threads...: 100%|██████████| 130/130 [00:01<00:00, 78.04it/s]

shape: (5, 6)
┌─────────────────┬─────────────────┬─────────────────┬─────────────────┬────────────────┬─────────┐
│ linkedin_url    ┆ input_name      ┆ website         ┆ domain          ┆ linkedin_id_fo ┆ success │
│ ---             ┆ ---             ┆ ---             ┆ ---             ┆ und            ┆ ---     │
│ str             ┆ str             ┆ str             ┆ str             ┆ ---            ┆ bool    │
│                 ┆                 ┆                 ┆                 ┆ str            ┆         │
╞═════════════════╪═════════════════╪═════════════════╪═════════════════╪════════════════╪═════════╡
│ https://www.lin ┆ Innothera       ┆ https://www.inn ┆ innothera.fr    ┆ null           ┆ false   │
│ kedin.com/compa ┆                 ┆ othera.fr/fr/   ┆                 ┆                ┆         │
│ ny…             ┆                 ┆                 ┆                 ┆                ┆         │
│ https://www.lin ┆ SOLOCAP-MAB     ┆ https://www.mai ┆ maisonmelanmout ┆ mai




In [146]:
tcapi_success = sum(1 for i in df_tcapi_companies_results.rows() if i[-1])
tcapi_none = sum(1 for i in df_tcapi_companies_results.rows() if i[-2] is None)
print(f"{tcapi_success=}/{df_tcapi_companies_results.shape[0]}") 
print(f"{tcapi_none=}/{df_tcapi_companies_results.shape[0]}")
print(f"success rate={tcapi_success/df_tcapi_companies_results.shape[0]*100:.2f}%")

tcapi_success=97/130
tcapi_none=12/130
success rate=74.62%


In [43]:
coresignal_companies = send_requests(coresignal_find_by_domain, df)
df_coresignal_companies = pl.DataFrame(coresignal_companies)
df_coresignal_companies_results = compare_linkedin_ids(df_coresignal_companies)
print(df_coresignal_companies_results.head())
df_coresignal_companies_results.write_csv(FILE_RES_CORESIGNAL)

Curling data...: 100%|██████████| 130/130 [00:00<00:00, 143.02it/s]
Joining threads...: 100%|██████████| 130/130 [00:01<00:00, 65.94it/s]

shape: (5, 6)
┌──────────────────┬─────────────────┬─────────────────┬───────────────┬─────────────────┬─────────┐
│ linkedin_url     ┆ input_name      ┆ domain_found    ┆ coresignal_id ┆ linkedin_id_fou ┆ success │
│ ---              ┆ ---             ┆ ---             ┆ ---           ┆ nd              ┆ ---     │
│ str              ┆ str             ┆ str             ┆ str           ┆ ---             ┆ bool    │
│                  ┆                 ┆                 ┆               ┆ str             ┆         │
╞══════════════════╪═════════════════╪═════════════════╪═══════════════╪═════════════════╪═════════╡
│ https://www.link ┆ BORLIS          ┆ borlis-solution ┆ null          ┆ null            ┆ false   │
│ edin.com/company ┆                 ┆ s.com           ┆               ┆                 ┆         │
│ …                ┆                 ┆                 ┆               ┆                 ┆         │
│ https://www.link ┆ LISAQUA         ┆ lisaqua.com     ┆ null          ┆ null




In [44]:
for i in df_coresignal_companies_results.rows():
    if i[-1]:
        print(i)

('https://www.linkedin.com/company/ykk-france/', 'YKK FRANCE', 'ykk.fr', '3102765', 'ykk-france', True)
('https://www.linkedin.com/company/fysol-sas/', 'FYSOL SAS', 'fysol.com', '90112604', 'fysol-sas', True)
('https://www.linkedin.com/company/jimmyfairly-com/', 'JIMMY FAIRLY LAB', 'jimmyfairly.com', '8099175', 'jimmyfairly-com', True)
('https://www.linkedin.com/company/ynsect/', 'YNSECT', 'ynsect.com', '3098043', 'ynsect', True)
('https://www.linkedin.com/company/sylektis/', 'SYLEKTIS', 'sylektis.com', '82805022', 'sylektis', True)
('https://www.linkedin.com/company/thuasne/', 'THUASNE', 'thuasne.com', '2423768', 'thuasne', True)
('https://www.linkedin.com/company/acome/', 'ACOME', 'acome.com', '8843336', 'acome', True)
('https://www.linkedin.com/company/diagast/', 'DIAGAST', 'diagast.com', '7015678', 'diagast', True)
('https://www.linkedin.com/company/fcagroup', 'Fiat Chrysler Automobiles', 'fcagroup.com', '10156517', 'fcagroup', True)
('https://www.linkedin.com/company/wienerbergerf

In [45]:
df_coresignal_companies_results.describe()

statistic,linkedin_url,input_name,domain_found,coresignal_id,linkedin_id_found,success
str,str,str,str,str,str,f64
"""count""","""130""","""130""","""130""","""59""","""59""",130.0
"""null_count""","""0""","""0""","""0""","""71""","""71""",0.0
"""mean""",,,,,,0.223077
"""std""",,,,,,
"""min""","""https://www.li…","""ACOME""","""acome.com""","""100201""","""acome""",0.0
"""25%""",,,,,,
"""50%""",,,,,,
"""75%""",,,,,,
"""max""","""https://www.li…","""YNSECT""","""ynsect.com""","""9439699""","""ynsect""",1.0


In [46]:
coresignal_success = sum(1 for i in df_coresignal_companies_results.rows() if i[-1])
coresignal_none = sum(1 for i in df_coresignal_companies_results.rows() if i[-2] is None)
print(f"{coresignal_success=}/{df_coresignal_companies_results.shape[0]}") 
print(f"{coresignal_none=}/{df_coresignal_companies_results.shape[0]}")
print(f"success rate={coresignal_success/df_coresignal_companies_results.shape[0]*100:.2f}%")

coresignal_success=29/130
coresignal_none=71/130
success rate=22.31%


In [47]:
71/130*100

54.61538461538461