In [18]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
# Import ChromeOptions to set headless mode
from selenium.webdriver.chrome.options import Options
import time

In [19]:
from bs4 import BeautifulSoup
import json
import re

In [20]:
def scrape_with_selenium(url : str) -> str | None:
        """
        Uses headless Selenium with a generic waiting strategy.
        """
        # Configure headless options ---
        chrome_options = Options()
        chrome_options.add_argument("--headless=new") # Runs Chrome without a UI
        chrome_options.add_argument("--window-size=1920,1080") # Optional: Specify window size

        # Initialize the driver with the new options
        driver = webdriver.Chrome(options=chrome_options)

        html_content = None
        try:
            driver.get(url)
            print("Waiting for page to load in headless mode...")
            WebDriverWait(driver, 20).until(
                lambda d: d.execute_script("return document.readyState") == 'complete'
            )
            time.sleep(2)
            
            print("Content loaded successfully.")
            html_content = driver.page_source

        except TimeoutException:
            print("Timed out waiting for page to load.")
            html_content = driver.page_source
        except Exception as e:
            print(f"An error occurred during Selenium scraping: {e}")
        finally:
            driver.quit()
        
        if html_content:
            return html_content
        return None


In [21]:
# Sample profile address; `url` is a notebook-level name that later cells read as well.
url = "https://www.linkedin.com/in/jai-soni-879764257/"
# Cookie-less fetch: `res` is the rendered page source, or None if scraping failed.
res = scrape_with_selenium(url)

Waiting for page to load in headless mode...
Content loaded successfully.


In [25]:
from bs4 import BeautifulSoup
import json
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def fetch_profile_html_with_cookies(profile_url, cookies_file="cookies.json", output_file="res.txt"):
    """
    Load ``profile_url`` in headless Chrome using previously saved cookies.

    The cookies are read from a JSON file (a list of cookie objects),
    injected after first visiting ``https://www.linkedin.com`` so that the
    cookie domain matches the current page, and the profile page is then
    opened. The rendered HTML is written to ``output_file`` and returned.

    Args:
        profile_url: Address of the profile page to load.
        cookies_file: Path to the JSON file holding the cookie list.
        output_file: Where to save a copy of the HTML; pass ``None`` (or an
            empty string) to skip writing the copy.

    Returns:
        The page source of the profile page as a string.

    Raises:
        OSError / json.JSONDecodeError: if the cookie file cannot be read or
            parsed (raised before any browser is started).
    """
    # Read the cookies first: if the file is missing or malformed we fail
    # before launching Chrome and therefore leave no browser process behind.
    with open(cookies_file, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    # Set Chrome to headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Chrome 109+ headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Visit LinkedIn first so domain matches
        driver.get("https://www.linkedin.com")
        time.sleep(2)

        # Inject cookies, forwarding only the fields the file actually
        # provides; absent fields are left for the driver to default.
        cookie_fields = ("name", "value", "domain", "path", "secure", "httpOnly")
        for cookie in cookies:
            cookie_dict = {
                field: cookie[field]
                for field in cookie_fields
                if cookie.get(field) is not None
            }
            try:
                driver.add_cookie(cookie_dict)
            except Exception as e:
                # Best effort: one rejected cookie must not abort the run.
                print(f"Skipping cookie {cookie.get('name')} → {e}")

        # Load profile page
        driver.get(profile_url)
        time.sleep(5)  # wait for full load

        html_content = driver.page_source
    finally:
        # Release the browser even when navigation or cookie loading fails.
        driver.quit()

    if output_file:
        # The context manager closes the file; no explicit close() is needed.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(html_content)
    return html_content

def parse_linkedin_profile_with_cookies(profile_url, cookies_file="cookies.json"):
    """
    Fetch a LinkedIn profile page with saved cookies and parse it into a dict.

    Args:
        profile_url: Address of the profile page.
        cookies_file: Path to the JSON cookie file handed to
            ``fetch_profile_html_with_cookies``.

    Returns:
        A dict with the keys ``id``, ``name``, ``city``, ``country_code``,
        ``position``, ``about``, ``current_company``, ``experience``, ``url``,
        ``education``, ``avatar``, ``certifications``, ``followers``,
        ``connections``, ``projects``, ``location`` and ``activity``.
        Fields that cannot be found keep their empty default. ``id``,
        ``city``, ``country_code``, ``current_company``, ``certifications``,
        ``projects`` and ``activity`` are never populated by this parser.
    """
    html_content = fetch_profile_html_with_cookies(profile_url, cookies_file)
    return _parse_profile_html(html_content, profile_url)


def _text_or_none(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else None


def _first_count(soup, keyword):
    """Return the first integer found in the first text node containing ``keyword``."""
    text_node = soup.find(string=re.compile(keyword))
    if not text_node:
        return None
    numbers = re.findall(r"\d[\d,]*", text_node)
    # Thousands separators are dropped so "1,234 followers" yields 1234.
    return int(numbers[0].replace(",", "")) if numbers else None


def _parse_experience(section):
    """Build one experience dict per ``<li>`` inside the experience section."""
    entries = []
    for role in section.find_all("li"):
        title_tag = role.find("span", {"class": re.compile(".*mr1.*")})
        date_tag = role.find("span", string=re.compile(r"\d{4}"))

        detail_spans = role.find_all("span", {"class": re.compile(".*t-14.*")})
        company_tag = detail_spans[0] if detail_spans else None
        # The original looked up company and location with the same selector,
        # so the location always repeated the company. Take the next detail
        # span that does not look like a date range instead.
        # NOTE(review): assumes company, dates and location are separate
        # "t-14" spans in that order - confirm against a saved page.
        loc_tag = next(
            (span for span in detail_spans[1:] if not re.search(r"\d{4}", span.get_text())),
            None,
        )

        entries.append({
            "title": _text_or_none(title_tag),
            "company": _text_or_none(company_tag),
            "start_date": _text_or_none(date_tag),
            "location": _text_or_none(loc_tag),
        })
    return entries


def _parse_education(section):
    """Build one education dict per ``<li>`` inside the education section."""
    entries = []
    for school in section.find_all("li"):
        school_name = school.find("span", {"class": re.compile(".*mr1.*")})
        # Dots are escaped so "B.Sc" / "M.Sc" match literally.
        degree = school.find("span", string=re.compile(r"BTech|Bachelor|Master|B\.Sc|M\.Sc"))
        year_tag = school.find("span", string=re.compile(r"\d{4}"))

        entries.append({
            "title": _text_or_none(school_name),
            "degree": _text_or_none(degree),
            "start_year": _text_or_none(year_tag),
        })
    return entries


def _parse_profile_html(html_content, profile_url):
    """Extract the profile fields from already-fetched profile HTML."""
    soup = BeautifulSoup(html_content, "html.parser")

    profile = {
        "id": None,
        "name": None,
        "city": None,
        "country_code": None,
        "position": None,
        "about": None,
        "current_company": {},
        "experience": [],
        "url": profile_url,
        "education": [],
        "avatar": None,
        "certifications": [],
        "followers": None,
        "connections": None,
        "projects": [],
        "location": None,
        "activity": []
    }

    # BASIC INFO
    profile["name"] = _text_or_none(
        soup.find("h1", {"class": re.compile(".*text-heading-xlarge.*")})
    )
    profile["position"] = _text_or_none(
        soup.find("div", {"class": re.compile(".*text-body-medium.*")})
    )

    # The first "text-body-small" span on the page can be the pronouns
    # (the notebook run produced 'He/Him' as location). Prefer the span that
    # also carries the "inline" class and fall back to the first match.
    # NOTE(review): "inline" marking the location span is based on
    # LinkedIn's current markup - confirm against a saved page.
    small_spans = soup.find_all("span", {"class": re.compile(".*text-body-small.*")})
    location_tag = next(
        (span for span in small_spans if "inline" in (span.get("class") or [])),
        small_spans[0] if small_spans else None,
    )
    profile["location"] = _text_or_none(location_tag)

    avatar_tag = soup.find("img", {"class": re.compile(".*pv-top-card-profile-picture__image.*")})
    if avatar_tag and avatar_tag.get("src"):
        profile["avatar"] = avatar_tag["src"]

    # ABOUT
    about_section = soup.find("section", {"id": "about"})
    if about_section:
        profile["about"] = about_section.get_text(separator=" ", strip=True)

    # EXPERIENCE
    experience_section = soup.find("section", {"id": "experience"})
    if experience_section:
        profile["experience"] = _parse_experience(experience_section)

    # EDUCATION
    education_section = soup.find("section", {"id": "education"})
    if education_section:
        profile["education"] = _parse_education(education_section)

    # FOLLOWERS & CONNECTIONS
    profile["followers"] = _first_count(soup, r"followers")
    profile["connections"] = _first_count(soup, r"connections")

    return profile


In [26]:
# Fetch the profile with the saved session cookies and display the parsed fields
# (relies on `url` defined in an earlier cell).
parse_linkedin_profile_with_cookies(url)

{'id': None,
 'name': None,
 'city': None,
 'country_code': None,
 'position': 'Software Engineer @FOG Technologies | Full Stack Developer | AI Enthusiast',
 'about': None,
 'current_company': {},
 'experience': [],
 'url': 'https://www.linkedin.com/in/jai-soni-879764257/',
 'education': [],
 'avatar': 'https://media.licdn.com/dms/image/v2/D4D35AQHYv2na6-1-iA/profile-framedphoto-shrink_400_400/B4DZgSIZbzG8Ac-/0/1752650848394?e=1755446400&v=beta&t=FJTxRkhz7jPVyUYu5Yaj51wtUl2pbP2UG5EpVwp1fCg',
 'certifications': [],
 'followers': 4,
 'connections': 95,
 'projects': [],
 'location': 'He/Him',
 'activity': []}