<a href="https://colab.research.google.com/github/11bender/alumni-scraping/blob/main/alumni_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scrape Georgia Tech alumni LinkedIn profile URLs from your connections**

### Import, install, and download browser driver

In [None]:
!pip install selenium

Download ChromeDriver (the Google Chrome browser driver): https://developer.chrome.com/docs/chromedriver/downloads

In [1]:
import os, random, sys, time 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import pandas as pd

from merge_urls import merge_urls

Create a `config.txt` file and put your LinkedIn username on the **1st** line and your password on the **2nd** line.

### Start scraping

Settings that make the script less detectable by anti-scraping bot detection

In [None]:
# Pool of desktop Chrome User-Agent strings; one is chosen at random per session.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
]
# Local path of the ChromeDriver executable; change it to where you unpacked the download.
driver_path = "D:/chromedriver-win64/chromedriver.exe"

# Chrome start-up configuration.
options = Options()
options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")  # Rotate User-Agent
options.add_argument("--disable-blink-features=AutomationControlled")  # Hide WebDriver Flag
options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Disable automation flags
options.add_experimental_option("useAutomationExtension", False)

service = Service(driver_path)
# `options` must be handed to the driver explicitly; otherwise Chrome starts
# with its defaults and none of the settings configured above are applied.
browser = webdriver.Chrome(service=service, options=options)

# Remove navigator.webdriver property
# NOTE(review): execute_script runs in the currently loaded document -- confirm
# whether this override is still in place after the later browser.get() calls.
browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

Go to LinkedIn Login page

In [None]:
# Load the LinkedIn sign-in form, then wait a random 2-4 s before interacting with it.
login_url = "https://www.linkedin.com/login/"
browser.get(login_url)
time.sleep(random.uniform(2, 4))

Read in account info and login

In [None]:
# Read the credentials: username on the 1st line, password on the 2nd line.
# The context manager closes the file even if reading fails.
with open("config.txt") as config_file:
    lines = config_file.readlines()

if len(lines) < 2:
    raise ValueError(
        "config.txt must contain the username on line 1 and the password on line 2"
    )

# readlines() keeps the line terminator; strip it so no newline is typed into
# the username field before the password has been entered.
username = lines[0].rstrip("\r\n")
# NOTE(review): the password line is sent exactly as read. If config.txt ends
# with a newline, that newline is typed as well -- presumably this is what
# submits the form, since the explicit submit below is commented out. Confirm
# before stripping it.
password = lines[1]

# Fill in the login form with short random pauses between the two fields.
username_field = browser.find_element(By.ID, 'username')
username_field.send_keys(username)
time.sleep(random.uniform(1, 3))
password_field = browser.find_element(By.ID, 'password')
password_field.send_keys(password)
time.sleep(random.uniform(1, 2))
# Explicit submission is intentionally left disabled, as in the original cell:
# password_field.submit()
# time.sleep(random.uniform(5, 7))

Define helper functions to scrape profile URLs

In [None]:
def human_like_scroll(driver, scroll_pause=(1,3)):
    """
    Scroll down the page in randomly sized steps, pausing after each step.

    driver: object exposing ``execute_script`` (the Selenium browser).
    scroll_pause: (min, max) bounds, in seconds, of the random pause that
        follows every scroll step.

    Scrolling starts at the current vertical offset and stops once the scroll
    target reaches the page height. The height is re-read after every step, so
    content that gets appended while scrolling is covered as well.
    """
    page_height_script = "return document.body.scrollHeight;"

    position = driver.execute_script("return window.pageYOffset;")
    page_bottom = driver.execute_script(page_height_script)

    while position < page_bottom:
        # Move the target down by a random 200-600 px and scroll to it.
        position += random.randint(200, 600)
        driver.execute_script(f"window.scrollTo(0, {position});")

        # Random pause between steps.
        time.sleep(random.uniform(*scroll_pause))

        # The page may have grown while scrolling; the target never shrinks.
        page_bottom = max(page_bottom, driver.execute_script(page_height_script))

def get_all_links(browser, output_csv):
    """
    Collect profile URLs from every page of the current search results and
    save them to a CSV file.

    browser: Selenium WebDriver currently showing a people-search results page.
    output_csv: path of the CSV file to write; an existing file is overwritten.

    For each results page the function scrolls to the bottom, gathers the
    ``href`` of every matching profile anchor (query string removed), and then
    clicks the 'Next' button. Pagination ends when the button is missing,
    disabled, or cannot be clicked. The unique URLs are written under a single
    ``url`` header column, sorted so repeated runs produce a stable file.

    Returns None.
    """
    collected_links = set()
    # XPATH for <a> tags that start with "https://www.linkedin.com/in/"
    profile_xpath = "//div[@class='display-flex align-items-center']//a[starts-with(@href, 'https://www.linkedin.com/in/')]"
    # XPATH for the 'Next' button
    next_button_xpath = "//button[@aria-label='Next']"

    while True:
        # Scroll through the page like a human before reading the links.
        human_like_scroll(browser, scroll_pause=(1, 3))

        # Collect every profile link currently on the page.
        for anchor in browser.find_elements(By.XPATH, profile_xpath):
            href_value = anchor.get_attribute("href")
            href_value = href_value.split("?")[0]  # Remove query string
            if href_value in collected_links:
                print(f"Duplicate link found: {href_value}")
            else:
                collected_links.add(href_value)

        # Move on to the next results page, or stop when there is none.
        try:
            next_button = browser.find_element(By.XPATH, next_button_xpath)
            if not next_button.is_enabled():
                print("Next button found but not clickable. Exiting loop.")
                break
            next_button.click()
        except Exception:
            # Best effort: any lookup/click failure ends pagination so the
            # links gathered so far are still saved. Unlike a bare `except:`,
            # this lets KeyboardInterrupt and SystemExit propagate.
            print("Next button not found or not clickable. Exiting loop.")
            break
        time.sleep(random.uniform(2, 3))

    # Save results to CSV.
    csv_filename = output_csv
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["url"])
        # Sorted for a deterministic row order (set iteration order is arbitrary).
        writer.writerows([link] for link in sorted(collected_links))

    print(f"Saved {len(collected_links)} links to '{csv_filename}'.")

Go to your 1st-degree connections

In [None]:
# Open the connections page of the logged-in account.
connection_url = "https://www.linkedin.com/mynetwork/invite-connect/connections/"
browser.get(connection_url)
time.sleep(random.uniform(3, 6))

# Follow the "search with filters" link into the people-search view.
browser.find_element(
    By.XPATH,
    '//a[contains(@href, "/search/results/people/") and @class="ember-view mn-connections__search-with-filters link-without-visited-state"]',
).click()
time.sleep(random.uniform(3, 6))

# Open the "All filters" panel.
browser.find_element(By.XPATH, '//button[text()="All filters"]').click()
time.sleep(random.uniform(3, 6))

# Locate the "Georgia Institute of Technology" entry in the School filter,
# move the pointer onto it, and click it after a one-second pause.
school_label_xpath = '//ul[@class="list-style-none flex-1"]//fieldset[h3[text()="School"]]//label[.//span[text()="Georgia Institute of Technology"]]'
gt_check_box = browser.find_element(By.XPATH, school_label_xpath)
ActionChains(browser).move_to_element(gt_check_box).perform()
time.sleep(1)
gt_check_box.click()

# Look up the apply button first, wait a random 3-5 s, then click it.
apply_button_xpath = '//button[@aria-label="Apply current filters to show results"]'
apply_filters_ele = browser.find_element(By.XPATH, apply_button_xpath)
time.sleep(random.uniform(3, 5))
apply_filters_ele.click()


Start collecting 1st degree connection URLs

In [None]:
# Walk through every page of the currently displayed search results and save the profile URLs.
get_all_links(browser=browser, output_csv="profile_urls_1st.csv")

Go to your 2nd-degree connections

In [None]:
# The connection-degree pill buttons share one XPath; only the aria-label differs.
degree_pill_xpath = '//ul[@class="inline-flex list-style-none search-reusables__multiselect-pill-list"]//li//button[@aria-label="{}"]'

# Click the "2nd" pill, then the "1st" pill, pausing 3-4 s after each click
# (presumably this enables 2nd-degree results and clears the 1st-degree filter
# left over from the previous step).
for degree_label in ("2nd", "1st"):
    browser.find_element(By.XPATH, degree_pill_xpath.format(degree_label)).click()
    time.sleep(random.uniform(3, 4))

Start collecting 2nd degree connection URLs

In [None]:
# Walk through every page of the currently displayed search results and save the profile URLs.
get_all_links(browser=browser, output_csv="profile_urls_2nd.csv")

Go to your 3rd+ degree connections

In [None]:
# The connection-degree pill buttons share one XPath; only the aria-label differs.
degree_pill_xpath = '//ul[@class="inline-flex list-style-none search-reusables__multiselect-pill-list"]//li//button[@aria-label="{}"]'

# Click the "3rd+" pill, then the "2nd" pill, pausing 3-4 s after each click
# (presumably this enables 3rd+ degree results and clears the 2nd-degree filter
# left over from the previous step).
for degree_label in ("3rd+", "2nd"):
    browser.find_element(By.XPATH, degree_pill_xpath.format(degree_label)).click()
    time.sleep(random.uniform(3, 4))

Start collecting 3rd+ degree connection URLs

In [None]:
# Walk through every page of the currently displayed search results and save the profile URLs.
get_all_links(browser=browser, output_csv="profile_urls_3rd+.csv")

Quit the browser

In [None]:
# Scraping is finished: end the WebDriver session started in the setup cell.
browser.quit()

### Merge all URL CSVs into one final CSV and remove any duplicates

In [2]:
# Combine the three per-degree CSV files written by get_all_links.
# merge_urls comes from the local merge_urls module (not shown here); see that
# module for the output file name and how duplicates are handled.
merge_urls("profile_urls_1st.csv", "profile_urls_2nd.csv", "profile_urls_3rd+.csv")

Final CSV file length: 2202
