In [18]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
# Import ChromeOptions to set headless mode
from selenium.webdriver.chrome.options import Options
import time

In [19]:
from bs4 import BeautifulSoup
import json
import re

In [20]:
def scrape_with_selenium(url : str) -> str | None:
        """
        Uses headless Selenium with a generic waiting strategy.
        """
        # Configure headless options ---
        chrome_options = Options()
        chrome_options.add_argument("--headless=new") # Runs Chrome without a UI
        chrome_options.add_argument("--window-size=1920,1080") # Optional: Specify window size

        # Initialize the driver with the new options
        driver = webdriver.Chrome(options=chrome_options)

        html_content = None
        try:
            driver.get(url)
            print("Waiting for page to load in headless mode...")
            WebDriverWait(driver, 20).until(
                lambda d: d.execute_script("return document.readyState") == 'complete'
            )
            time.sleep(2)
            
            print("Content loaded successfully.")
            html_content = driver.page_source

        except TimeoutException:
            print("Timed out waiting for page to load.")
            html_content = driver.page_source
        except Exception as e:
            print(f"An error occurred during Selenium scraping: {e}")
        finally:
            driver.quit()
        
        if html_content:
            return html_content
        return None


In [21]:
url = "https://www.linkedin.com/in/jai-soni-879764257/"
res = scrape_with_selenium(url)

Waiting for page to load in headless mode...
Content loaded successfully.


In [39]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_profile_html_with_cookies(profile_url, cookies_file="cookies.json"):
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import json, time

        chrome_options = Options()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)

    with open(cookies_file, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    driver.get("https://www.linkedin.com")
    time.sleep(2)

    for cookie in cookies:
        try:
            driver.add_cookie({
                "name": cookie.get("name"),
                "value": cookie.get("value"),
                "domain": cookie.get("domain"),
                "path": cookie.get("path"),
                "secure": cookie.get("secure"),
                "httpOnly": cookie.get("httpOnly")
            })
        except:
            pass

    driver.get(profile_url)

    # Wait for the name element
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.text-heading-xlarge"))
        )
    except:
        print("⚠ Name element not found — possibly not logged in or restricted profile.")

    # Scroll slowly to load all sections
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html_content = driver.page_source
    driver.quit()
    return html_content


In [40]:
from bs4 import BeautifulSoup
import re

def parse_linkedin_profile(html_content: str, profile_url: str):
    soup = BeautifulSoup(html_content, "html.parser")

    profile = {
        "id": None,
        "name": None,
        "city": None,
        "country_code": None,
        "position": None,
        "about": None,
        "current_company": {
            "name": None,
            "id": None,
            "title": None,
            "location": None
        },
        "experience": [],
        "url": profile_url,
        "education": [],
        "avatar": None,
        "certifications": [],
        "followers": None,
        "connections": None,
        "projects": [],
        "location": None,
        "activity": []
    }

    # ID from meta tag or URL
    canonical_link = soup.find("link", {"rel": "canonical"})
    if canonical_link and "linkedin.com/in/" in canonical_link.get("href", ""):
        profile["id"] = canonical_link["href"].split("/")[-2]
    
    # Name
    name_tag = soup.find("h1", {"class": re.compile(".*text-heading-xlarge.*")})
    if name_tag:
        profile["name"] = name_tag.get_text(strip=True)

    # Headline
    headline_tag = soup.find("div", {"class": re.compile(".*text-body-medium.*")})
    if headline_tag:
        profile["position"] = headline_tag.get_text(strip=True)

    # Location
    location_tag = soup.find("span", {"class": re.compile(".*text-body-small.*")})
    if location_tag:
        location_text = location_tag.get_text(strip=True)
        profile["location"] = location_text
        parts = location_text.split(",")
        if len(parts) >= 1:
            profile["city"] = parts[0].strip()
        if len(parts) >= 2:
            profile["country_code"] = parts[-1].strip()

    # Avatar
    avatar_tag = soup.find("img", {"class": re.compile(".*pv-top-card-profile-picture__image.*")})
    if avatar_tag and avatar_tag.get("src"):
        profile["avatar"] = avatar_tag["src"]

    # About
    about_section = soup.find("section", {"id": "about"})
    if about_section:
        profile["about"] = about_section.get_text(separator=" ", strip=True)

    # Experience
    experience_section = soup.find("section", {"id": "experience"})
    if experience_section:
        for role in experience_section.find_all("li", recursive=True):
            title = role.find("span", {"class": re.compile(".*mr1.*")})
            company = role.find("span", {"class": re.compile(".*t-14.*")})
            date_range = role.find(string=re.compile(r"\d{4}"))
            loc = role.find("span", {"class": re.compile(".*t-14.*")})
            profile["experience"].append({
                "title": title.get_text(strip=True) if title else None,
                "company": company.get_text(strip=True) if company else None,
                "start_date": date_range.strip() if date_range else None,
                "location": loc.get_text(strip=True) if loc else None
            })

    # Education
    education_section = soup.find("section", {"id": "education"})
    if education_section:
        for school in education_section.find_all("li"):
            school_name = school.find("span", {"class": re.compile(".*mr1.*")})
            degree = school.find("span", string=re.compile("BTech|Bachelor|Master|B.Sc|M.Sc"))
            year_tag = school.find("span", string=re.compile(r"\d{4}"))
            profile["education"].append({
                "title": school_name.get_text(strip=True) if school_name else None,
                "degree": degree.get_text(strip=True) if degree else None,
                "start_year": year_tag.get_text(strip=True) if year_tag else None
            })

    # Followers
    followers_tag = soup.find(string=re.compile(r"followers"))
    if followers_tag:
        match = re.findall(r"\d[\d,]*", followers_tag)
        if match:
            profile["followers"] = int(match[0].replace(",", ""))

    # Connections
    connections_tag = soup.find(string=re.compile(r"connections"))
    if connections_tag:
        match = re.findall(r"\d[\d,]*", connections_tag)
        if match:
            profile["connections"] = int(match[0].replace(",", ""))

    return profile


In [42]:
profile_url = "https://www.linkedin.com/in/jai-soni-879764257/"
html = fetch_profile_html_with_cookies(profile_url)
parse_linkedin_profile(html, profile_url)

⚠ Name element not found — possibly not logged in or restricted profile.


{'id': None,
 'name': None,
 'city': None,
 'country_code': None,
 'position': None,
 'about': None,
 'current_company': {'name': None,
  'id': None,
  'title': None,
  'location': None},
 'experience': [],
 'url': 'https://www.linkedin.com/in/jai-soni-879764257/',
 'education': [],
 'avatar': None,
 'certifications': [],
 'followers': None,
 'connections': None,
 'projects': [],
 'location': None,
 'activity': []}