## Linkedin Profile Scraper

In [1]:
# Import the warnings module and suppress any warnings 
import warnings
warnings.filterwarnings("ignore")

###  Import Necessary Libraries

In [2]:
import os #For handling environment variables
from bs4 import BeautifulSoup #For parsing and extracting HTML content (if needed)
from selenium import webdriver #For browser automation
from selenium.webdriver.common.by import By #For locating elements on a web page
from time import sleep #For adding delays
from dotenv import load_dotenv #For securely loading environment variables
from selenium.webdriver.common.action_chains import ActionChains #For advanced browser interactions
from selenium.common.exceptions import NoSuchElementException #For handling exceptions
# Load sensitive information like email and password from a .env file
load_dotenv()

True

### Load Environment Variables

In [None]:
# Set up environment variables for email and password using dotenv
os.environ['EMAIL'], os.environ['PASSWORD']

### Initialize the Browser Driver

In [4]:
# Set up the Chrome WebDriver to automate browser actions
driver = webdriver.Chrome()

### Open LinkedIn Login Page

In [5]:
# Navigate to LinkedIn's login page and wait for the page to load
driver.get('https://www.linkedin.com/login')
sleep(2)

### Login to LinkedIn

In [6]:
# Find and fill the email and password fields, then submit the login form
email = driver.find_element(By.ID,'username')
email.send_keys(os.environ['EMAIL'])

password = driver.find_element(By.ID,'password')
password.send_keys(os.environ['PASSWORD'])

# Submit the login form
password.submit()

### Take Input for Search Query

In [7]:
# Take the input for search query (first name and last name)
first_name = input("Enter first name: ")
last_name = input("Enter last name: ")



### Navigate to LinkedIn Search Page

In [8]:
# Navigate to LinkedIn search page
search_url = f"https://www.linkedin.com/search/results/people/?keywords={first_name}%20{last_name}"
driver.get(search_url)
sleep(3)


### Extract Profile Data

In [9]:
# Initialize an empty list to store extracted profile data
profile_info = []
# Identify and loop through search result profiles
profiles = driver.find_elements(By.XPATH, ".//ul[contains(@class, 'list-style-none')]/li")  # Target 'li' elements

### Sample
##### If giving N/A, Rerun both cells below again

In [10]:
profil = {}  # Initialize as a dictionary

try:
    # Locate the profile link
    profile_link = driver.find_element(
        By.XPATH,
        f"//ul/li[1]//a[contains(@href, '/in/')]"
    )

    # Scroll into view and click the link
    ActionChains(driver).move_to_element(profile_link).perform()
    profile_link.click()

    # Extract the page source again after navigating
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    # ==========================================
    # Extract Name
    # ==========================================
    name = soup.find('h1', {'class': 'tltAylKhFAgBRttOJxuNkBYpMngWHBDlKY inline t-24 v-align-middle break-words'})
    profil['name'] = name.get_text().strip() if name else 'N/A'
        
except Exception as e:
    print(f"Error occurred: {e}")

sleep(3)
driver.back()
sleep(3)

print(profil)  # Print profile data for each iteration


{'name': 'N/A'}


### Loop Through Profiles and Extract Details

In [14]:
# Initialize a list to store all profile data
all_profiles = []

# Get the page source
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')

for list in range(1, 6): # Extract data for up to the first 5 profiles (or adjust the loop as needed)
    profile_data = {}  # Initialize as a dictionary

    try:
        # Locate the profile link
        profile_link = driver.find_element(
            By.XPATH,
            f"//ul/li[{list}]//a[contains(@href, '/in/')]"
        )

        # Scroll into view and click the link
        ActionChains(driver).move_to_element(profile_link).perform()
        profile_link.click()

        # Extract the page source again after navigating
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')

        # ==========================================
        # Extract Name
        # ==========================================
        name = soup.find('h1', {'class': 'tltAylKhFAgBRttOJxuNkBYpMngWHBDlKY inline t-24 v-align-middle break-words'})
        profile_data['name'] = name.get_text().strip() if name else 'N/A'

        # ==========================================
        # Extract Profile URL
        # ==========================================
        url_element = soup.find('a', {'class': 'ember-view SFhPYJROBTIphhHbqiXhFZLdcoPAUark'})
        if url_element:
            url = url_element['href']
            full_url = f"https://www.linkedin.com{url}" if url.startswith("/") else url
        else:
            full_url = 'N/A'
        profile_data['url'] = full_url

        # ==========================================
        # Extract Headline
        # ==========================================
        headline = soup.find('div', {'class': 'text-body-medium break-words'})
        profile_data['headline'] = headline.get_text().strip() if headline else 'N/A'

        # ==========================================
        # Extract Location
        # ==========================================
        location = soup.find('span', {'class': 'text-body-small inline t-black--light break-words'})
        profile_data['location'] = location.get_text().strip() if location else 'N/A'

        # ==========================================
        # Extract About
        # ==========================================
        
        # try:
        #     # Try clicking the "See More" button if it exists
        #     see_more_button = driver.find_element(By.XPATH, "/html/body/div[6]/div[3]/div/div/div[2]/div/div/main/section[2]/div[3]/div/div/div/span[3]/button")
        #     ActionChains(driver).move_to_element(see_more_button).perform()
        #     see_more_button.click()
        # except NoSuchElementException:
        #     print("See More button not found. Proceeding with available content.")

        about = soup.find('div', {'class': 'display-flex ph5 pv3'})
        profile_data['about'] = about.get_text().strip() if about else "No About section found"

        # ==========================================
        # Extract Education
        # ==========================================
        
        # Locate all sections in the page containing profile details
        sections = soup.find_all('section', {'class': 'artdeco-card pv-profile-card break-words mt2'})
        
        # Initialize 'educations' as None and loop through sections to locate the education section
        educations = None
        for section in sections:
            if section.find('div', {'id': 'education'}):
                educations = section
                break  # Stop once the education section is found

        if educations:
            # Find all individual education items within the education section
            items = educations.find_all(
                'div',
                {'class': 'MLWKRCqgqPIIoWDJwvNDfYgezWKkGUDZeHwo ZTfGgnxuIQmWwjZNmBsQXDyQiCdTxMAxaTMA kJpxNUwEyrItimxpmdlHFJCfPhhUDuJIks'}
            )

            def get_edu(item):
                """
                Extracts college, degree, and duration from an education item.

                Args:
                    item (Tag): A BeautifulSoup tag containing education details.

                Returns:
                    dict: A dictionary with keys 'college', 'degree', and 'duration'.
                """
                item_dict = {}
                spans = item.find_all('span', {'class': 'visually-hidden'})
                item_dict['college'] = spans[0].get_text().strip() if len(spans) > 0 else 'N/A'
                item_dict['degree'] = spans[1].get_text().strip() if len(spans) > 1 else 'N/A'
                item_dict['duration'] = spans[2].get_text().strip() if len(spans) > 2 else 'N/A'
                return item_dict
            
            # Generate a list of dictionaries containing education details for each item
            item_list = [get_edu(item) for item in items]
            profile_data['education'] = item_list
        else:
            profile_data['education'] = []
            
        # Append this profile's data to the list of all profiles
        all_profiles.append(profile_data)    

    except Exception as e:
        print(f"Error occurred: {e}")

    sleep(15)
    driver.back()
    sleep(3)
    
    print(all_profiles)  # Print profile data for each iteration


[{'name': 'Jenish Kumar', 'url': 'https://www.linkedin.com/in/jenish-kumar-b1427514/overlay/about-this-profile/', 'headline': 'Strategic HR, Culture & Workplace Transformation, Change Management & Talent Management', 'location': 'Pune, Maharashtra, India', 'about': 'Proven track record in driving strategic HR initiatives to realize bottom line results and build organizational capabilities in pursuit of organizational objectivesProven track record in driving strategic HR initiatives to realize bottom line results and build organizational capabilities in pursuit of organizational objectives', 'education': [{'college': 'MBA in Human Resource Management , University of Wales, UK', 'degree': 'N/A', 'duration': 'N/A'}]}]
[{'name': 'Jenish Kumar', 'url': 'https://www.linkedin.com/in/jenish-kumar-b1427514/overlay/about-this-profile/', 'headline': 'Strategic HR, Culture & Workplace Transformation, Change Management & Talent Management', 'location': 'Pune, Maharashtra, India', 'about': 'Proven t

### Save Data to CSV File

In [15]:
import csv

# Save all profiles data to a CSV file
csv_file = "linkedin_profiles.csv"

with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=['name', 'url', 'headline', 'location', 'about', 'education'])
    writer.writeheader()

    for profile in all_profiles:
        # Flatten education data as a string for CSV
        profile['education'] = '; '.join(
            [f"{edu['college']} ({edu['degree']} - {edu['duration']})" for edu in profile['education']]
        )
        writer.writerow(profile)
        
# Confirm the file was saved successfully
print(f"Data saved to {csv_file}")

Data saved to linkedin_profiles.csv


### Close the Browser

In [16]:
# Close the browser and end the WebDriver session
driver.back()