In [1]:
import requests
import json
import os

import polars as pl

from tqdm import tqdm
from loader import load_db_dump
from threading import Thread

In [2]:
TOKEN = "eyJhbGciOiJFZERTQSIsImtpZCI6IjlmNTMzZWQ3LTU0NzMtMzEzNy1iMTUzLTY3NGZhYTZmOWRjZiJ9.eyJhdWQiOiJuYW1raW4uZnIiLCJleHAiOjE3NDM4ODM3OTksImlhdCI6MTcxMjMyNjg0NywiaXNzIjoiaHR0cHM6Ly9vcHMuY29yZXNpZ25hbC5jb206ODMwMC92MS9pZGVudGl0eS9vaWRjIiwibmFtZXNwYWNlIjoicm9vdCIsInByZWZlcnJlZF91c2VybmFtZSI6Im5hbWtpbi5mciIsInN1YiI6ImZhMGM0YzljLWMyMWMtZmZkZi1jMGI5LTQ4YWVkNWFmOWMxNiIsInVzZXJpbmZvIjp7InNjb3BlcyI6ImNkYXBpIn19.bZ_rb4-2DLyuAegn2faP4txfune0AgDxCNzYDlhKkUZJWnamW9D1kbWa4xECd8Tb_d6UFQihefQbWuMgKhl3Bw"
API_URL_ID = "https://api.coresignal.com/cdapi/v1/linkedin/company/search/filter"
API_URL_LINKEDIN = "https://api.coresignal.com/cdapi/v1/linkedin/company/collect"
NUMBER_OF_SAMPLES = 250
SAMPLE_FILE = f"./samples/tcapi_sample_{NUMBER_OF_SAMPLES}.parquet"
RESULTS_FILE = f"./results/coresignal_results_{NUMBER_OF_SAMPLES}.parquet"

In [4]:
if os.path.isfile(SAMPLE_FILE):
    print("Getting samples from file...")
    df_sample = pl.read_parquet(SAMPLE_FILE)
else:
    print("Extracting samples...")
    df = load_db_dump()
    df_sample = df.select(["linkedin_url", "input_name"]).sample(NUMBER_OF_SAMPLES)
    print("Saving samples")
    df_sample.write_parquet(SAMPLE_FILE)

Getting samples from file...


In [5]:
print(df_sample)

shape: (250, 2)
┌───────────────────────────────────┬─────────────────────┐
│ linkedin_url                      ┆ input_name          │
│ ---                               ┆ ---                 │
│ str                               ┆ str                 │
╞═══════════════════════════════════╪═════════════════════╡
│ https://www.linkedin.com/company… ┆ GROUPE ALLEVIO      │
│ https://www.linkedin.com/company… ┆ ECA ROBOTICS Toulon │
│ https://www.linkedin.com/company… ┆ MER MEC SPA         │
│ https://www.linkedin.com/company… ┆ LBA LEBRONZE ALLOYS │
│ https://www.linkedin.com/company… ┆ Europastry          │
│ …                                 ┆ …                   │
│ https://www.linkedin.com/company… ┆ CSEM Neuchatel      │
│ https://www.linkedin.com/company… ┆ ROBIN RADAR SYSTEMS │
│ https://www.linkedin.com/company… ┆ UAB “FEK COMPANY”   │
│ https://www.linkedin.com/company… ┆ APISSYS             │
│ https://www.linkedin.com/company… ┆ HUTCHINSON          │
└───────────────────────

In [6]:
def find_id(company: tuple, companies: list) -> None:
    linkedin_url, input_name = company
    res = requests.post(
        API_URL_ID,
        headers={
            "Authorization": f"Bearer {TOKEN}",
            "Content-Type": "application/json",
            "accept": "application/json"
        },
        json={
            "name": input_name
        }
    )

    if res.status_code == 200:
        try:
            data = res.content.decode()
            data = json.loads(data)
        except:
            data = []
    else:
        data = []

    if data:
        company_id = str(data[0])
    else:
        company_id = ""

    companies.append({"linkedin_url": linkedin_url, "company_id": company_id})

In [7]:
companies, threads = [], []

for company in tqdm(df_sample[:75].rows(), "Curling data..."):
    process = Thread(target=find_id, args=[company, companies])
    process.start()
    threads.append(process)

for process in tqdm(threads, "Joining threads..."):
    process.join()

print(len(companies))

Curling data...:   0%|          | 0/75 [00:00<?, ?it/s]

Curling data...: 100%|██████████| 75/75 [00:00<00:00, 186.24it/s]
Joining threads...: 100%|██████████| 75/75 [00:00<00:00, 176.31it/s]

75





In [8]:
df_companies = pl.DataFrame(companies)
print(df_companies)

shape: (75, 2)
┌───────────────────────────────────┬────────────┐
│ linkedin_url                      ┆ company_id │
│ ---                               ┆ ---        │
│ str                               ┆ str        │
╞═══════════════════════════════════╪════════════╡
│ https://www.linkedin.com/company… ┆            │
│ https://www.linkedin.com/company… ┆            │
│ https://www.linkedin.com/company… ┆            │
│ https://www.linkedin.com/company… ┆            │
│ https://www.linkedin.com/company… ┆            │
│ …                                 ┆ …          │
│ https://www.linkedin.com/company… ┆ 23929594   │
│ https://www.linkedin.com/company… ┆ 4016880    │
│ https://www.linkedin.com/company… ┆ 46252890   │
│ https://www.linkedin.com/company… ┆            │
│ https://www.linkedin.com/company… ┆            │
└───────────────────────────────────┴────────────┘


In [10]:
def find_linkedin(company: tuple, matching_linkedin: list) -> None:
    linkedin_url, company_id = company
    res = requests.get(
        f"{API_URL_LINKEDIN}/{company_id}",
        headers={
            "Authorization": f"Bearer {TOKEN}"
        }
    )

    if res.status_code == 200:
        try:
            data = res.content.decode()
            data = json.loads(data)
        except:
            data = {}
    else:
        data = {}    
    
    linkedin_url_found = data.get("url", "")
    if linkedin_url_found:
        if linkedin_url_found[-1] == "/":
            linkedin_url_found = linkedin_url_found[:len(linkedin_url_found)-1]
    
    success = linkedin_url == linkedin_url_found

    matching_linkedin.append({"expected": linkedin_url, "got": linkedin_url_found, "ok": success})

In [11]:
matching_linkedin, threads = [], []

for company in tqdm(df_companies.rows(), "Curling data..."):
    if company[-1] != "":
        process = Thread(target=find_linkedin, args=[company, matching_linkedin])
        process.start()
        threads.append(process)

for process in tqdm(threads, "Joining threads..."):
    process.join()

Curling data...: 100%|██████████| 75/75 [00:00<00:00, 1785.84it/s]


In [12]:
df_matching_linkedin = pl.DataFrame(matching_linkedin)
print(df_matching_linkedin)

shape: (19, 3)
┌───────────────────────────────────┬───────────────────────────────────┬───────┐
│ expected                          ┆ got                               ┆ ok    │
│ ---                               ┆ ---                               ┆ ---   │
│ str                               ┆ str                               ┆ bool  │
╞═══════════════════════════════════╪═══════════════════════════════════╪═══════╡
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ true  │
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ false │
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ true  │
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ false │
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ true  │
│ …                                 ┆ …                                 ┆ …     │
│ https://www.linkedin.com/company… ┆ https://www.linkedin.com/company… ┆ true  │
│

In [16]:
df_matching_linkedin.write_parquet(RESULTS_FILE)