In [1]:
import requests
import json
import os 

import polars as pl

from tqdm import tqdm

In [105]:
API_KEY = "9278o0hrad85ouh3dhl7idcd6r7mvn4f12ek2q4d5b6b5upnj48"
API = "https://societeinfo.com/app/rest/api/v2/company.json"
FILE_SAMPLE = f"./samples/siren_samples.csv"
FILE_RES = f"./results/societeinfo_siren.csv"

In [8]:
df = pl.read_csv(FILE_SAMPLE)
df = df.cast({"siren": pl.String})
print(df.describe())

shape: (9, 5)
┌────────────┬──────────────────────────────┬────────────┬───────────┬─────────────────────────────┐
│ statistic  ┆ linkedin_url                 ┆ input_name ┆ siren     ┆ website                     │
│ ---        ┆ ---                          ┆ ---        ┆ ---       ┆ ---                         │
│ str        ┆ str                          ┆ str        ┆ str       ┆ str                         │
╞════════════╪══════════════════════════════╪════════════╪═══════════╪═════════════════════════════╡
│ count      ┆ 117                          ┆ 117        ┆ 117       ┆ 117                         │
│ null_count ┆ 0                            ┆ 0          ┆ 0         ┆ 0                           │
│ mean       ┆ null                         ┆ null       ┆ null      ┆ null                        │
│ std        ┆ null                         ┆ null       ┆ null      ┆ null                        │
│ min        ┆ https://www.linkedin.com/com ┆ ACOME      ┆ 301374922 ┆ http:/

In [9]:
def send_request(siren: str) -> dict:
    res = requests.get(f"{API}/{siren}", params={"key": API_KEY}, timeout=10)
    if res.status_code != 200:
        return {}
    
    try:
        data = res.content
        return json.loads(data)
    except:
        return {}

In [97]:
companies = []
for company in df.rows()[114:]:
    linkedin_url, input_name, siren, website = company
    
    data = send_request(siren)
    linkedin_url_found = data.get("result", {}).get("web_infos", {}).get("linkedin", {}).get("url")

    linkedin_id_found = None
    if linkedin_url_found:
        linkedin_id_found = linkedin_url_found.strip("/").split("/")[-1].lower()

    companies.append({
        "linkedin_url": linkedin_url,
        "input_name": input_name,
        "siren": siren,
        "website": website,
        "linkedin_id_found": linkedin_id_found
    })

print(len(companies))

3


In [98]:
df_companies = pl.DataFrame(companies)
df_companies.describe()

statistic,linkedin_url,input_name,siren,website,linkedin_id_found
str,str,str,str,str,str
"""count""","""3""","""3""","""3""","""3""","""3"""
"""null_count""","""0""","""0""","""0""","""0""","""0"""
"""mean""",,,,,
"""std""",,,,,
"""min""","""https://www.li…","""Alsapan""","""333916385""","""http://www.als…","""alsapan-indust…"
"""25%""",,,,,
"""50%""",,,,,
"""75%""",,,,,
"""max""","""https://www.li…","""SNCF""","""552049447""","""https://www.ei…","""sncf-immobilie…"


In [99]:
def compare_linkedin_ids(df: pl.DataFrame) -> pl.DataFrame:
    success = []
    for company in df.rows(named=True):
        url = company.get("linkedin_url")
        url = url.strip("/")
        linkedin_id = url.split("/")[-1].lower()
        success.append(linkedin_id == company.get("linkedin_id_found"))

    serie = pl.Series("success", success)
    return df.with_columns(serie)

In [100]:
df_results = compare_linkedin_ids(df_companies)
df_results.tail()

linkedin_url,input_name,siren,website,linkedin_id_found,success
str,str,str,str,str,bool
"""https://www.li…","""SNCF""","""552049447""","""http://www.snc…","""sncf-immobilie…",False
"""https://www.li…","""Alsapan""","""392213146""","""http://www.als…","""alsapan-indust…",False
"""https://www.li…","""Eiffage Metal""","""333916385""","""https://www.ei…","""eiffage-metal""",True


In [101]:
for i in df.rows()[114:]:
    print(i)

('https://www.linkedin.com/company/sncf', 'SNCF', '552049447', 'http://www.sncf.com')
('https://www.linkedin.com/company/alsapan', 'Alsapan', '392213146', 'http://www.alsapan.com')
('https://www.linkedin.com/company/eiffage-metal', 'Eiffage Metal', '333916385', 'https://www.eiffagemetal.com/')


In [102]:
df_results.write_csv("./results/siren/batch_7.csv")

In [108]:
df_results = pl.read_csv(FILE_RES)
print(df_results)

shape: (117, 6)
┌───────────────────┬──────────────────┬───────────┬──────────────────┬──────────────────┬─────────┐
│ linkedin_url      ┆ input_name       ┆ siren     ┆ website          ┆ linkedin_id_foun ┆ success │
│ ---               ┆ ---              ┆ ---       ┆ ---              ┆ d                ┆ ---     │
│ str               ┆ str              ┆ i64       ┆ str              ┆ ---              ┆ bool    │
│                   ┆                  ┆           ┆                  ┆ str              ┆         │
╞═══════════════════╪══════════════════╪═══════════╪══════════════════╪══════════════════╪═════════╡
│ https://www.linke ┆ FAURECIA CLEAN   ┆ 542005376 ┆ https://www.faur ┆ faurecia         ┆ true    │
│ din.com/company…  ┆ MOBILITY         ┆           ┆ ecia.com/        ┆                  ┆         │
│                   ┆ (groupe…         ┆           ┆                  ┆                  ┆         │
│ https://www.linke ┆ PRYSMIAN GROUP   ┆ 95750311  ┆ https://fr.prysm ┆ nul

In [111]:
print(sum(1 for i in df_results.rows() if i[-1]))
print(sum(1 for i in df_results.rows() if i[-2] is None))

68
18


In [112]:
print(68/117*100)
print(18/117*100)

58.119658119658126
15.384615384615385
