In [14]:
import requests
import polars as pl

from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from tqdm import tqdm
from typing import Callable
from threading import Thread

In [15]:
df = pl.read_csv("./samples/samples_aprex.csv")
df = df.drop_nulls()
print(df)

shape: (130, 3)
┌───────────────────────────────────┬──────────────────────────┬──────────────────────────────┐
│ linkedin_url                      ┆ input_name               ┆ website                      │
│ ---                               ┆ ---                      ┆ ---                          │
│ str                               ┆ str                      ┆ str                          │
╞═══════════════════════════════════╪══════════════════════════╪══════════════════════════════╡
│ https://www.linkedin.com/company… ┆ FAURECIA CLEAN  MOBILITY ┆ https://www.faurecia.com/    │
│                                   ┆ (groupe…                 ┆                              │
│ https://www.linkedin.com/company… ┆ PRYSMIAN GROUP           ┆ https://fr.prysmian.com/     │
│ https://www.linkedin.com/company… ┆ PETIT BATEAU             ┆ https://www.petit-bateau.fr/ │
│ https://www.linkedin.com/company… ┆ VALEO VISION             ┆ https://www.valeo.com/fr/    │
│ https://www.linkedin.c

In [27]:
def get_soup(url: str, driver) -> BeautifulSoup|None:
    # try:
    #     res = requests.get(url, timeout=10)
    # except:
    #     return None
    # if res.status_code != 200:
    #     return None

    # try:
    #     return BeautifulSoup(res.text)
    # except:
    #     return None

    driver.get(url)
    return BeautifulSoup(driver.page_source)
    
def extract_linkedins(soup: BeautifulSoup) -> list[str]|None:
    if soup is None:
        return None
    
    linkedins = set()
    for link in soup.find_all("a", href=True):
        if "linkedin.com/company" in link.get("href"):
            linkedins.add(link.get("href"))
    return list(linkedins) or None

def clean_url(url: str) -> str:
    path = urlparse(url).path
    return urljoin(url, path).strip("/").split("/")[-1].lower()

def find_linkedins(company: tuple, driver,  companies: list) -> None:
    linkedin_url, input_name, website = company
    website = website.strip("/")
    soup = get_soup(website, driver)
    linkedins = extract_linkedins(soup)
    linkedin_id_found = None

    if linkedins:
        linkedin_id_found = clean_url(linkedins[0]) if linkedins[0] else None

    companies.append({
        "linkedin_url": linkedin_url,
        "input_name": input_name,
        "website": website,
        "linkedin_id_found": linkedin_id_found,
    })

def send_requests(func: Callable, df: pl.DataFrame) -> list:
    companies, threads = [], []

    for company in tqdm(df.rows(), "Curling data..."):
        process = Thread(target=func, args=[company, companies])
        process.start()
        threads.append(process)

    for process in tqdm(threads, "Joining threads..."):
        process.join()

    return companies

In [28]:
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))




Curling data...: 100%|██████████| 5/5 [00:14<00:00,  2.95s/it]


In [35]:
companies = []
for company in tqdm(df.rows(), "Curling data..."):
    find_linkedins(company, driver, companies)

companies

Curling data...: 100%|██████████| 130/130 [11:31<00:00,  5.32s/it]


[{'linkedin_url': 'https://www.linkedin.com/company/faurecia/',
  'input_name': 'FAURECIA CLEAN  MOBILITY (groupe FORVIA)',
  'website': 'https://www.faurecia.com',
  'linkedin_id_found': 'faurecia'},
 {'linkedin_url': 'https://www.linkedin.com/company/prysmian/',
  'input_name': 'PRYSMIAN GROUP',
  'website': 'https://fr.prysmian.com',
  'linkedin_id_found': 'prysmian'},
 {'linkedin_url': 'https://www.linkedin.com/company/petit-bateau/',
  'input_name': 'PETIT BATEAU',
  'website': 'https://www.petit-bateau.fr',
  'linkedin_id_found': None},
 {'linkedin_url': 'https://www.linkedin.com/company/valeo/',
  'input_name': 'VALEO VISION',
  'website': 'https://www.valeo.com/fr',
  'linkedin_id_found': 'valeo'},
 {'linkedin_url': 'https://www.linkedin.com/company/antolin/',
  'input_name': 'GRUPO ANTOLIN BESANCON',
  'website': 'https://www.antolin.com',
  'linkedin_id_found': 'grupo-antolin'},
 {'linkedin_url': 'https://www.linkedin.com/company/lisi-automotive/',
  'input_name': 'LISI AUTOM

In [45]:
df_companies = pl.DataFrame(companies)
print(df_companies)

shape: (130, 4)
┌──────────────────────────┬─────────────────────────┬─────────────────────────┬───────────────────┐
│ linkedin_url             ┆ input_name              ┆ website                 ┆ linkedin_id_found │
│ ---                      ┆ ---                     ┆ ---                     ┆ ---               │
│ str                      ┆ str                     ┆ str                     ┆ str               │
╞══════════════════════════╪═════════════════════════╪═════════════════════════╪═══════════════════╡
│ https://www.linkedin.com ┆ FAURECIA CLEAN          ┆ https://www.faurecia.co ┆ faurecia          │
│ /company…                ┆ MOBILITY (groupe…       ┆ m                       ┆                   │
│ https://www.linkedin.com ┆ PRYSMIAN GROUP          ┆ https://fr.prysmian.com ┆ prysmian          │
│ /company…                ┆                         ┆                         ┆                   │
│ https://www.linkedin.com ┆ PETIT BATEAU            ┆ https://www.petit-ba

In [46]:
print(df_companies.describe())

shape: (9, 5)
┌────────────┬──────────────────────────┬────────────┬─────────────────────────┬───────────────────┐
│ statistic  ┆ linkedin_url             ┆ input_name ┆ website                 ┆ linkedin_id_found │
│ ---        ┆ ---                      ┆ ---        ┆ ---                     ┆ ---               │
│ str        ┆ str                      ┆ str        ┆ str                     ┆ str               │
╞════════════╪══════════════════════════╪════════════╪═════════════════════════╪═══════════════════╡
│ count      ┆ 130                      ┆ 130        ┆ 130                     ┆ 103               │
│ null_count ┆ 0                        ┆ 0          ┆ 0                       ┆ 27                │
│ mean       ┆ null                     ┆ null       ┆ null                    ┆ null              │
│ std        ┆ null                     ┆ null       ┆ null                    ┆ null              │
│ min        ┆ https://www.linkedin.com ┆ ACOME      ┆ http://liquid-robotics

In [47]:
print(sum(1 for i in df_companies.rows() if i[-1]))

103


In [48]:
def compare_linkedin_ids(df: pl.DataFrame) -> pl.DataFrame:
    success = []
    for company in df.rows(named=True):
        linkedin_url = company.get("linkedin_url")
        linkedin_id = linkedin_url.strip("/").split("/")[-1].lower()
        linkedin_id_found = company.get("linkedin_id_found")
        success.append(linkedin_id == linkedin_id_found)

    serie = pl.Series("success", success)
    return df.with_columns(serie)

In [49]:
df_results = compare_linkedin_ids(df_companies)
print(df_results)

shape: (130, 5)
┌────────────────────────┬───────────────────┬───────────────────────┬───────────────────┬─────────┐
│ linkedin_url           ┆ input_name        ┆ website               ┆ linkedin_id_found ┆ success │
│ ---                    ┆ ---               ┆ ---                   ┆ ---               ┆ ---     │
│ str                    ┆ str               ┆ str                   ┆ str               ┆ bool    │
╞════════════════════════╪═══════════════════╪═══════════════════════╪═══════════════════╪═════════╡
│ https://www.linkedin.c ┆ FAURECIA CLEAN    ┆ https://www.faurecia. ┆ faurecia          ┆ true    │
│ om/company…            ┆ MOBILITY (groupe… ┆ com                   ┆                   ┆         │
│ https://www.linkedin.c ┆ PRYSMIAN GROUP    ┆ https://fr.prysmian.c ┆ prysmian          ┆ true    │
│ om/company…            ┆                   ┆ om                    ┆                   ┆         │
│ https://www.linkedin.c ┆ PETIT BATEAU      ┆ https://www.petit-bat ┆ null

In [54]:
sum(1 for i in df_results.rows() if i[-1])

82

In [58]:
print(103/130*100)
print(82/130*100)

79.23076923076923
63.07692307692307


In [64]:
27/130*100

20.76923076923077

In [59]:
df_results.write_csv("./results/scraping.csv")

In [61]:
sum(1 for i in df_results.rows() if i[-2] is None)

27

In [62]:
for i in df_results.rows():
    if not i[-1] and i[-2]:
        print(i)

('https://www.linkedin.com/company/antolin/', 'GRUPO ANTOLIN BESANCON', 'https://www.antolin.com', 'grupo-antolin', False)
('https://www.linkedin.com/company/lisi-automotive/', 'LISI AUTOMOTIVE DELLE', 'https://www.lisi-automotive.com/fr', '83710', False)
('https://www.linkedin.com/company/saint-gobain/', 'SAINT GOBAIN PLACO', 'https://www.placo.fr', 'placo', False)
('https://www.linkedin.com/company/herta-sas', 'HERTA', 'https://www.herta.fr', 'herta-france', False)
('https://www.linkedin.com/company/wienerbergerfr/', 'WIENERBERGER', 'https://www.wienerberger.fr', 'wienerbergerfr%20%20', False)
('https://www.linkedin.com/company/hagergroup/', 'HAGER CONTROLS', 'https://hager.com/fr', 'hager-france', False)
('https://www.linkedin.com/company/europe-technologies-group/', 'EUROPE TECHNOLOGIES', 'https://oratech-et.fr', 'mps-oratech', False)
('https://www.linkedin.com/company/constellium/', 'CONSTELLIUM EXTRUSIONS France', 'https://www.constellium.com', '2249972', False)
('https://www.lin