In [87]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
import time
from selenium.webdriver.common.by import By
import logging
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import random
from selenium.webdriver.chrome.service import Service

import json
import os

In [88]:
def random_sleep(start, end):
    """
    Introduces a random sleep interval between the specified start and end range to simulate human-like delays in script execution.

    Args:
        start (int): The minimum number of seconds to sleep.
        end (int): The maximum number of seconds to sleep.

    Returns:
        None
    """

    time.sleep(random.randint(start, end))

def refresh_wd():
    """
    Initializes and returns a new instance of the Chrome WebDriver with specified options to avoid sandboxing and disable shared memory usage. It maximizes the browser window, navigates to LinkedIn's homepage, and sets up action chains for future interactions.

    Args:
        None

    Returns:
        WebDriver: A new instance of the Chrome WebDriver.
    """

    # Define your Chrome options outside this function or ensure they are passed as parameters
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Create a new instance of Chrome
    wd = webdriver.Chrome(options=options)
    
    # Initialize action chains for possible future actions
    actions = ActionChains(wd)
    
    # Maximize the window to avoid elements being out of view
    wd.maximize_window()
    
    # Navigate to LinkedIn
    wd.get('https://www.linkedin.com/')

    return wd

def login(wd, user_name, password):
    """
    Logs into a LinkedIn account using the provided WebDriver instance, username, and password. It waits for the relevant input fields to become available, enters the credentials, and clicks the login button.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        user_name (str): The LinkedIn account username.
        password (str): The LinkedIn account password.

    Returns:
        None
    """

    try:
        # Use WebDriverWait for more reliable element handling
        username_input = WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.ID, "session_key"))
        )
        username_input.clear()
        username_input.send_keys(user_name)
        
        password_input = WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.ID, "session_password"))
        )
        password_input.clear()
        password_input.send_keys(password)

        print("Logging into your LinkedIn account!")
        
        # Locate and click the login button
        login_button = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/main/section[1]/div/div/form/div[2]/button'))
        )
        login_button.click()
    except TimeoutException:
        print("Failed to login: Timeout while waiting for page elements.")
        raise
    except NoSuchElementException:
        print("Failed to login: Could not find one of the elements.")
        raise

def login_alternative(wd, user_name, password):
    """
    Handles logging into LinkedIn using the alternative method. This method clicks the "Sign in" button, enters the username and password into the appropriate fields, and then clicks the "Sign in" button again.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        user_name (str): The LinkedIn account username.
        password (str): The LinkedIn account password.

    Returns:
        None
    """
    try:
        sign_in_button = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[@class="nav__button-secondary btn-md btn-secondary-emphasis"]'))
        )
        sign_in_button.click()

        username_input = WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.ID, "username"))
        )
        username_input.clear()
        username_input.send_keys(user_name)

        password_input = WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.ID, "password"))
        )
        password_input.clear()
        password_input.send_keys(password)

        print("Logging into your LinkedIn account using alternative method!")
        
        login_button = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[@class="btn__primary--large from__button--floating"]'))
        )
        login_button.click()
    except (TimeoutException, NoSuchElementException) as e:
        print(f"Failed to login using alternative method: {str(e)}")

def login_with_fallback(wd, user_name, password):
    """
    Attempts to log into LinkedIn using the primary login method. If an exception occurs, it falls back to the alternative login method.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        user_name (str): The LinkedIn account username.
        password (str): The LinkedIn account password.

    Returns:
        None
    """
    try:
        login(wd, user_name, password)
    except Exception as e:
        print(f"Primary login method failed: {str(e)}")
        login_alternative(wd, user_name, password)

def security_verification(wd):
    """
    Handles the security verification process by prompting the user to input a verification code received from LinkedIn. It waits for the OTP input field to become available, enters the OTP, and submits it.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """
    otp = input("You have been sent a verification code from LinkedIn via your email.\nPlease input that here: ")
    try:
        # Use WebDriverWait to wait for the OTP input field to become available
        otp_input = WebDriverWait(wd, 10).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/main/form/div[1]/input[15]'))
        )
        otp_input.clear()
        otp_input.send_keys(otp)

        # Locate and click the submit button for OTP
        submit_button = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div/main/form/div[2]/button'))
        )
        submit_button.click()
        print("Successfully Authenticated!")
    except TimeoutException:
        print("Authentication failed: Timeout while waiting for page elements.")
    except NoSuchElementException:
        print("Authentication failed: Could not find one of the elements.")

def search_query(wd, query):
    """
    Performs a search on LinkedIn using the provided search query. It waits for the search input field to become clickable, enters the search query, and executes the search.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        query (str): The search query to be entered.

    Returns:
        None
    """

    # Ensure the search input field is visible and clickable
    search_field = WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'search-global-typeahead__input'))
    )
    search_field.click()  # Focus on the search field

    # Introduce a random delay to mimic human typing speed
    random_sleep(1,2)

    # Send the search query
    search_field.send_keys(query)
    random_sleep(1,2)

    # Press ENTER to execute the search
    search_field.send_keys(Keys.ENTER)


def add_filters1(wd, location=None, current_company=None, past_company=None):
    """
    Applies search filters on LinkedIn based on location, current company, and past company. It navigates to the people results, opens the all filters menu, and applies the specified filters.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        location (str, optional): The location filter to be applied.
        current_company (str, optional): The current company filter to be applied.
        past_company (str, optional): The past company filter to be applied.

    Returns:
        str: The current URL after applying the filters.
    """

    navigate_to_people_results(wd)
    open_all_filters(wd)

    if current_company:
        apply_filter(wd, current_company, 'company', filter_index=0)
    if past_company:
        apply_filter(wd, past_company, 'company', filter_index=1)
    if location:
        apply_filter(wd, location, 'location')

    show_all_results(wd)
    random_sleep(1,2)

    url = wd.current_url
    return url

def apply_filter(wd, filter_value, filter_type, filter_index=None):
    """
    Applies a specified filter on LinkedIn based on the filter type and index. It waits for the filter button to become clickable, enters the filter value, and selects the correct option from the listbox.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        filter_value (str): The value to be applied in the filter.
        filter_type (str): The type of filter to be applied (e.g., 'company', 'location').
        filter_index (int, optional): The index of the filter button if multiple buttons exist.

    Returns:
        None
    """

    # Locate the "Add a filter" button based on the type and index (if applicable)
    filter_button_xpath = f"//*[text()='Add a {filter_type}']"
    filter_buttons = wd.find_elements(By.XPATH, filter_button_xpath)
    filter_button = filter_buttons[filter_index] if filter_index is not None else filter_buttons[0]

    wd.execute_script("arguments[0].scrollIntoView();", filter_button)
    filter_button.click()
    time.sleep(random.randint(1, 2))

    # Enter the filter value in the corresponding input field
    input_selector = f'input[placeholder="Add a {filter_type}"]'
    input_field = WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, input_selector))
    )
    input_field.click()
    input_field.send_keys(filter_value)
    time.sleep(random.randint(1, 2))

    # Wait for the listbox to appear and select the correct option
    listbox = WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'basic-typeahead__triggered-content'))
    )
    options = listbox.find_elements(By.XPATH, ".//div[@role='option']")
    for option in options:
        if filter_value in option.text:
            option.click()
            break
    time.sleep(random.randint(1, 2))

def navigate_to_people_results(wd):
    """
    Navigates to the LinkedIn people results page by clicking on the "See all people results" button. It waits for the button to become clickable and then clicks it.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    try:
        # Use WebDriverWait to wait until the button is visible and clickable
        people_results_button = WebDriverWait(wd, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'See all people results')]"))
        )
        wd.execute_script("arguments[0].style.border='2px solid red'", people_results_button)
        people_page_link = people_results_button.get_attribute("href")
        wd.get(people_page_link)
        # Wait for a random time between 1 and 2 seconds after loading the page
        random_sleep(1, 2)
    except TimeoutException:
        print("Failed to find the 'See all people results' button within the expected time.")


def open_all_filters(wd):
    """
    Opens the "All filters" menu on LinkedIn's search results page. It waits for the "All filters" button to become clickable and then clicks it.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    random_sleep(2,3)
    all_filters = wd.find_element(By.CLASS_NAME, "relative.mr2")
    wd.execute_script("arguments[0].style.border='2px solid red'", all_filters)
    all_filters.click()
    random_sleep(1,2)


def industry_filter(wd, industry_list_num=0):
    """
    Applies an industry filter based on the provided index. It navigates to the all filters menu, scrolls to the industry filter, and selects the industry option based on the given index.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        industry_list_num (int): The index of the industry option to be selected.

    Returns:
        None
    """

    # Navigate and click on all filters button
    navigate_and_click_filters(wd)

    # Scroll to and interact with the industry filter
    industry_filter_ul = WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.XPATH, "//*[text()='Add an industry']/ancestor::ul[1]"))
    )
    wd.execute_script("arguments[0].scrollIntoView();", industry_filter_ul)

    # Select the industry based on provided index
    select_industry(wd, industry_filter_ul, industry_list_num)

    # Click show all results
    show_all_results(wd)

def navigate_and_click_filters(wd):
    """
    Navigates to and clicks the "All filters" button on LinkedIn's search results page. It waits for the button to become clickable and then clicks it.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    all_filters = WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "relative.mr2"))
    )
    wd.execute_script("arguments[0].style.border='2px solid red'", all_filters)
    all_filters.click()
    time.sleep(random.randint(1, 2))

def select_industry(wd, industry_filter_ul, industry_list_num):
    """
    Selects an industry filter option based on the provided index. It locates the industry filter element, scrolls to it, and selects the specified option.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        industry_filter_ul (WebElement): The unordered list element containing industry filter options.
        industry_list_num (int): The index of the industry option to be selected.

    Returns:
        None
    """

    li_elements = industry_filter_ul.find_elements(By.TAG_NAME, "li")
    if 0 <= industry_list_num < len(li_elements):
        li_element = li_elements[industry_list_num]
        input_checkbox = li_element.find_element(By.XPATH, ".//input[@type='checkbox']")
        if not input_checkbox.is_selected():
            wd.execute_script("arguments[0].click();", input_checkbox)
        print(f"Added industry option {industry_list_num}")
    else:
        print("Invalid industry index")

def show_all_results(wd):
    """
    Clicks the "Show all results" button on LinkedIn's search filters page. It waits for the button to become clickable and then clicks it.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    all_results = WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[3]/div/div/div[3]/div/button[2]'))
    )
    all_results.click()
    time.sleep(random.randint(1, 2))


def collect_links(wd, page_start, limit, total_results, unlimited=False):
    """
    Collects LinkedIn profile links from search results pages starting from the specified page. It iterates through the search results, extracts profile information, and handles pagination.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        page_start (int): The starting page number for scraping.
        limit (int): The maximum number of pages to scrape.
        total_results (int): The total number of search results.
        unlimited (bool, optional): If True, ignores the page limit.

    Returns:
        DataFrame: A pandas DataFrame containing the collected profile information.
    """

    total_results = min(total_results, 1000)  # Limiting the total results to 1000
    names, curr_jobs, summarys, locations, links = [], [], [], [], []

    try:
        while True:
            # Check for "Retry Search" button at the start of each loop iteration
            handle_retry_search(wd)

            if page_start % 10 == 0:
                print(f"Scraping links on Page {page_start}")

            people_list = wd.find_elements(By.CLASS_NAME, "reusable-search__result-container")
            for person in people_list:
                try:
                    wd.execute_script("arguments[0].style.border='2px solid red'", person)
                    collect_person_info(person, names, curr_jobs, summarys, locations, links)
                except NoSuchElementException as e:
                    print("Required Element Not Found...Moving on")
                    print(e)
                    wd.execute_script("arguments[0].style.border='2px solid green'", person)
                    continue

            # random_sleep(2, 3)
            scroll_down(wd)

            if not process_next_page(wd, page_start, limit, unlimited):
                break

            page_start += 1

    except WebDriverException as e:
        print(f"An error occurred during the web scraping process: {str(e)}")

    return pd.DataFrame({
        'Name': names,
        'Current Job': curr_jobs,
        'Relevant Experience to Keyword': summarys,
        'Location': locations,
        'Profile Link': links
    })

def handle_retry_search(wd):
    """
    Handles the "Retry Search" scenario by waiting for the retry button to appear and clicking it after a random delay. This is useful for handling LinkedIn's anti-scraping measures.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    try:
        retry_button = WebDriverWait(wd, 3).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(text(), 'Retry Search')]"))
        )
        print("Retry Search button found. Waiting before clicking.")
        random_sleep(42, 64)  # Wait for a random time between 42 and 64 seconds
        retry_button.click()
        print("Retry button clicked.")
    except TimeoutException:
        print("No 'Retry Search' button found, continuing with normal process.")

def collect_person_info(person, names, curr_jobs, summarys, locations, links):
    """
    Collects information from a LinkedIn profile element and appends it to the respective lists for names, current jobs, summaries, locations, and profile links.

    Args:
        person (WebElement): The LinkedIn profile element to collect information from.
        names (list): The list to store profile names.
        curr_jobs (list): The list to store current job titles.
        summarys (list): The list to store profile summaries.
        locations (list): The list to store profile locations.
        links (list): The list to store profile links.

    Returns:
        None
    """

    all_links = person.find_elements(By.TAG_NAME, 'a')
    name_text, curr_job_text, link_text, summary_text, location_text = extract_person_details(person, all_links)

    links.append(link_text)
    locations.append(location_text)
    summarys.append(summary_text)
    names.append(name_text)
    curr_jobs.append(curr_job_text)

def extract_person_details(person, all_links):
    """
    Extracts detailed information from a LinkedIn profile element, including name, current job, profile link, summary, and location.

    Args:
        person (WebElement): The LinkedIn profile element to extract information from.
        all_links (list): The list of link elements within the profile element.

    Returns:
        tuple: A tuple containing the extracted name, current job, profile link, summary, and location.
    """

    name_text = "LinkedIn Member"

    if "LinkedIn Member" not in person.text:
        name_element = person.find_element(By.CSS_SELECTOR, ".entity-result__title-text.t-16 a span[aria-hidden='true']")
        name_text = name_element.text

    curr_jobs = person.find_elements(By.CLASS_NAME, "entity-result__primary-subtitle.t-14.t-black.t-normal")
    curr_job_text = '"' + curr_jobs[0].text + '"' if curr_jobs else "Null"

    summary_elements = person.find_elements(By.TAG_NAME ,'p')
    summary_text = '"' + summary_elements[0].text + '"' if summary_elements else "Null"

    location_element = person.find_element(By.CSS_SELECTOR, ".entity-result__secondary-subtitle.t-14.t-normal")
    location_text = location_element.text

    link_text = "Null"
    for a in all_links:
        href = a.get_attribute('href')
        if href.startswith("https://www.linkedin.com/in") and not href.startswith("https://www.linkedin.com/in/ACo") and href not in all_links:
            link_text = href
            break

    return name_text, curr_job_text, link_text, summary_text, location_text

def scroll_down(wd):
    """
    Scrolls down the LinkedIn search results page to load more profiles for scraping.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        None
    """

    scroll_script = "window.scrollBy(0, 2000);"
    wd.execute_script(scroll_script)

def process_next_page(wd, page_start, limit, unlimited):
    """
    Navigates to the next page of LinkedIn search results if the "Next" button is enabled. It handles pagination logic based on the specified limit and the unlimited flag.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        page_start (int): The current page number being processed.
        limit (int): The maximum number of pages to scrape.
        unlimited (bool): If True, ignores the page limit.

    Returns:
        bool: True if navigation to the next page was successful, False otherwise.
    """

    next_page_button = wd.find_element(By.CLASS_NAME, "artdeco-pagination__button.artdeco-pagination__button--next")
    if next_page_button.is_enabled():
        next_page_button.click()
        random_sleep(1, 2)
        return True
    elif not unlimited and page_start == limit + 1:
        return False
    else:
        return False


def experience_json3(wd, link):
    """
    Extracts the "About" section and experience details from a LinkedIn profile and returns them as a JSON-like dictionary.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        link (str): The LinkedIn profile link to extract information from.

    Returns:
        dict: A dictionary containing the "About" section and experience details.
    """

    wd.get(link)
    about_text = ""
    WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))  # Ensure page is loaded

    about_text = extract_about_section(wd)
    experience_list = extract_experience_section(wd)

    return jsonify(about_text, experience_list)

def extract_about_section(wd):
    """
    Extracts the "About" section text from a LinkedIn profile if it exists.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        str: The extracted "About" section text.
    """

    try:
        about_tags = wd.find_elements(By.XPATH, "//*[text()='About']")
        if about_tags:
            about_tag = about_tags[0]
            wd.execute_script("arguments[0].scrollIntoView();", about_tag)
            wd.execute_script("arguments[0].style.border='2px solid red'", about_tag)
            about_section_tag = about_tag.find_element(By.XPATH, "ancestor::section")
            wd.execute_script("arguments[0].style.border='2px solid blue'", about_section_tag)
            random_sleep(0, 1)
            return about_section_tag.text.replace("About\nAbout\n", "", 1)
    except NoSuchElementException:
        print("About section not found.")
    return ""

def extract_experience_section(wd):
    """
    Extracts the experience details from a LinkedIn profile, including company names, job roles, and job durations.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.

    Returns:
        list: A list of dictionaries containing experience details.
    """

    experience_list = []
    try:
        experience_tag = wd.find_element(By.XPATH, "//*[text()='Experience']")
        wd.execute_script("arguments[0].scrollIntoView();", experience_tag)
        wd.execute_script("arguments[0].style.border='2px solid red'", experience_tag)
        section_tag = experience_tag.find_element(By.XPATH, "ancestor::section")
        wd.execute_script("arguments[0].style.border='2px solid blue'", section_tag)
        div_tag = section_tag.find_element(By.XPATH, ".//div[@class='pvs-list__outer-container']")
        jobs = div_tag.find_elements(By.XPATH, "./ul/li")

        for job in jobs:
            process_job_entry(job, experience_list)
    except NoSuchElementException:
        print("Experience section not found.")

    return experience_list

def process_job_entry(job, experience_list):
    """
    Processes a job entry from a LinkedIn profile and appends the extracted details to the experience list.

    Args:
        job (WebElement): The job entry element to process.
        experience_list (list): The list to store experience details.

    Returns:
        None
    """

    try:
        company_name, job_role, job_time = extract_job_details(job)
        if company_name:
            experience_list.append({'company': company_name, 'job_role': job_role, 'job_time': job_time})
    except NoSuchElementException:
        print("Failed to process job entry.")

def extract_job_details(job):
    """
Extracts job details, including company name, job role, and job duration, from a job entry element.

Args:
    job (WebElement): The job entry element to extract details from.

Returns:
    tuple: A tuple containing the extracted company name, job role, and job duration.
"""

    company_name = job.find_element(By.CSS_SELECTOR, "div.display-flex.flex-wrap.align-items-center.full-height span[aria-hidden='true']").text.split('·')[0].strip()
    job_role = job.find_element(By.XPATH, ".//span[@aria-hidden='true']").text.split('·')[0].strip()
    job_time = job.find_element(By.CSS_SELECTOR, "span.t-14.t-normal.t-black--light span.pvs-entity__caption-wrapper").text.split('·')[0].strip()
    return company_name, job_role, job_time


def jsonify(about_text, experience_list):
    """
Converts the "About" section text and experience details into a JSON-like dictionary format.

Args:
    about_text (str): The "About" section text.
    experience_list (list): The list of dictionaries containing experience details.

Returns:
    dict: A dictionary containing the "About" section and experience details.
"""

    final_dict = {"About": about_text, "Experience": {}}
    for experience in experience_list:
        company_dict = {}
        if isinstance(experience['job_role'], list) and isinstance(experience['job_time'], list):
            for role, time in zip(experience['job_role'], experience['job_time']):
                company_dict[role] = time
        else:
            company_dict[experience['job_role']] = experience['job_time']
        final_dict["Experience"][experience['company']] = company_dict
    return final_dict


def get_search_results_number(wd):
    """
Retrieves the number of search results from LinkedIn's search results page.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.

Returns:
    int: The number of search results.
"""

    try:
        results_text = wd.find_element(By.CLASS_NAME, "pb2.t-black--light.t-14").text
        # Assuming the format of results_text is either "1,234 results" or "Showing 1-10 out of 1,234"
        number = int(results_text.split()[-2].replace(',', ''))
        return number
    except NoSuchElementException:
        print("Could not find the search results element.")
        return 0
    except ValueError:
        print("Conversion error, possibly due to unexpected text format.")
        return 0

def search_results_more_than_1000(wd):
    """
Checks if the number of search results exceeds 1000.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.

Returns:
    bool: True if the number of search results is more than 1000, False otherwise.
"""

    number = get_search_results_number(wd)
    return number > 1000


def dataframe_output(wd, search_term, location=None, current_company=None, past_company=None, num_pages=4, unlimited=False, industry=True):
    """
    Conducts a search based on specified parameters and collects results into a DataFrame.

    Args:
    wd (WebDriver): The Selenium WebDriver instance.
    search_term (str): The search query.
    location (str): Filter by location.
    current_company (str): Filter by current company.
    past_company (str): Filter by past company.
    num_pages (int): Number of pages to scrape.
    unlimited (bool): If True, ignores page limits.
    industry (bool): If True, applies industry filters if results exceed 1000.

    Returns:
    tuple: A DataFrame containing the collected data and a list of profile links.
    """
    columns = ['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link']
    df = pd.DataFrame(columns=columns)
    try:
        search_query(wd, search_term)  # Assuming search_query now accepts wd as a parameter
        
        filters_page = add_filters1(wd, location, current_company, past_company)
        if industry and search_results_more_than_1000(wd):
            industry_options = [0, 1, 2, 3, 4]
            for i in industry_options:
                industry_filter(wd, i)
                total_results = get_search_results_number(wd)
                sub_df = collect_links(wd, 1, num_pages, total_results=total_results, unlimited=unlimited)
                df = pd.concat([df, sub_df], ignore_index=True)
                df = df.drop_duplicates(subset=['Profile Link'])
                wd.get(filters_page)
        else:
            total_results = get_search_results_number(wd)
            df = collect_links(wd, 1, num_pages, total_results=total_results, unlimited=unlimited)
    except Exception as e:
        print(f"Something went wrong: {e}")
        print("Exporting current dataframe...")

    links = list(df['Profile Link'])
    return df, links


In [89]:
def main_scraping_process(wd, keywords, companies, username, password):
    """
    This function performs the main scraping process on LinkedIn profiles based on given keywords and companies.
    It logs into LinkedIn, iterates through each keyword and company pair, and scrapes relevant profile information.
    The scraped data is stored in a DataFrame and saved to CSV files. The function handles any exceptions by saving
    the current state of the DataFrame before stopping execution.

    Args:
    wd: WebDriver instance for browser automation.
    keywords (list): A list of keywords to search for in LinkedIn profiles.
    companies (list): A list of companies to search within LinkedIn profiles.
    username (str): LinkedIn account username.
    password (str): LinkedIn account password.
    
    Returns:
    None
    """
    login(wd, username, password)
    columns = ['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link', 'Keyword', 'Company']
    job_df = pd.DataFrame(columns=columns)
    keyword = ""
    company = ""
    query = ""

    try:
        for keyword in keywords:
            timeTotal = 0
            for i in range(0, len(companies) - 2, 2):

                if i != 0:    
                    wd.close()
                    print("Sleeping for 2 minutes")
                    random_sleep(100, 120)
                    wd = refresh_wd()

                login(wd, username, password)
                for company in companies[i:i+2]:
                    query = keyword
                    location = "United States"
                    current_company = company
                    past_company = None
                    unlimited = False
                    num_pages = 5
                    print("Scraping your request...")
                    print((query, current_company))

                    start_time = time.time()
                    df, links = dataframe_output(wd, query, location, current_company, past_company, num_pages, unlimited, industry=True)
                    print(len(df))
                    end_time = time.time()
                    timeTotal += (end_time - start_time)
                    print(end_time - start_time)

                    if len(df) > 0:
                        df['Keyword'] = keyword
                        df['Company'] = current_company
                        df.to_csv(f"{query.replace(' ', '')}_CompanyDFs/{company.replace(' ', '')}.csv")
                        job_df = pd.concat([job_df, df], ignore_index=True)

                    job_df.to_csv(f"{query.replace(' ', '')}_profiles.csv")
                    wd.get('https://www.linkedin.com/')

            print(f"Average Time per Company Scrape: {(timeTotal / len(companies)) / 60} minutes")
            print(f"Total Time per Company Scrape: {timeTotal / 60} minutes")

    except Exception as e:
        if len(df) > 0:
            df['Keyword'] = keyword
            df['Company'] = company
            job_df = pd.concat([job_df, df], ignore_index=True)
        job_df.to_csv(f"{query.replace(' ', '')}_profiles_er.csv")
        print(f'An error occurred: {str(e)} and execution stopped')

In [90]:
username, password = None, None

def set_user_as_PurdueCS(Purdue=True):
    global username, password
    if Purdue:
        username = "huoerxiu@gmail.com"
        password = "PURDUEcs"
    else:
        username = "bhat35@purdue.edu"
        password = "Aymwos@1977!!"

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# options = webdriver.ChromeOptions()
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')

# wd = webdriver.Chrome(options=options)

# # wd = webdriver.Chrome(ChromeDriverManager().install())
# actions = ActionChains(wd)

# wd.maximize_window()
# wd.switch_to.window(wd.current_window_handle)
# wd.implicitly_wait(10)

# wd.get('https://www.linkedin.com/')

# wd.implicitly_wait(1)

keywords = ['Machine Learning Engineer', "Artificial Intelligence", "AI", "Machine Learning", "ML", "Natural Language Processing", "NLP", "Data Mining", 
    "Robotics", "Computer Vision", "CV", "Autonomous Driving", "Self-driving", "Medical Imaging", "object detection", 
    "disease prediction", "disease diagnosis", "Data Science", "Deep Learning", "DL", "Language Modeling", "Language Models", 
    "Data Analysis", "Information Retrieval", "Reinforcement Learning", "Data Scientist", "Data Scientists", "Knowledge Graph", 
    "Knowledge Representation", "Representation Learning", "Generative Models", "Graph Mining", "Graph Model", "Perception Model", 
    "Predictive Model", "Predictive Diagnosis", "Model Prediction", "Model Robustness", "Neural Network", "Neural Model", "DNN", 
    "Bayesian Network", "Bayesian Model", "SVM", "PCA", "LASSO", "Transformer Model", "ChatGPT", "Pre-trained Model", "GPT", 
    "Named Entity Recognition", "Sentiment Analysis", "Image Segmentation", "Relation Extraction", "Information Extraction", 
    "Text Generation", "Conversational Agent", "Autonomous Agent", "Autonomous System", "Robotic Vehicle", "Unmanned Vehicle", 
    "Unmanned Aerial Vehicle", "UVA", "Unmanned Ground Vehicle", "Image Generation", "Diffusion Model", "Stable Diffusion", 
    "QA Model", "Intelligent System", "Intelligent Engine", "Information Retrieval", "Machine Intelligence", "CNN", "ResNet", 
    "VGG", "LeNet", "AlexNet", "Yolo", "DenseNet", "Vision Transformer", "Language Transformer", "Text Transformer", 
    "Semantic Parsing", "DeepViT", "Percepton", "MLP", "Encoder-Decoder", "AutoEncoder", "Variational Encoder", "GAN", 
    "Adversarial Network", "VAE", "Data Labeling", "Data Augmentation", "Semantic Segmentation", "Feature Extraction", 
    "Feature Engineering", "Prompt Engineering", "BERT", "RNN", "CLIP", "Video Generation", "Face Recognition", "Object Tracking", 
    "Machine Translation", "Text Translation", "Sequence-to-Sequence Model", "Seq-to-Seq Model", "Image-to-Image Translation", 
    "Image Translation", "Action Recognition", "Movement Recognition", "Object Segmentation", "Video Recognition", "Pose Estimation", 
    "Depth Estimation", "Image Retrieval", "Document Retrieval", "Text Retrieval", "Model Training", "Adversarial Training", 
    "Adversarial Attack", "Model Safety", "Model Security", "Supervised Learning", "Unsupervised Learning", "Self-supervised Learning", 
    "Generative Training", "Weak Supervision", "Data Annotation", "Explainable Model", "Style Transfer", "Interpretable Model", 
    "Model Explanability", "Model Interpretability", "Upsamplng", "Downsampling", "text representation", "video representation", 
    "image representation", "VQA", "visual grounding", "image transformation", "inpainting", "trajectory prediction", 
    "motion detection", "motion prediction", "posture detection", "posture recognition", "Explainable Model", "Style Transfer"]
companies = ['Netflix', 
             'Amazon',
             'Apple', 
             'Meta', 
             'Google', 
             'Microsoft', 
             'OpenAI',
             'Intel',
             'Cicsco',
             'NVIDIA',
             'Salesforce',
             'LinkedIn',
             'DeepMind',
             'IBM',
             'Bloomberg',
             'Tesla',
             'Mayo Clinic']

In [91]:
def write_total_results(message):
    file_path = 'total_results_per_search.txt'

    with open(file_path, 'a') as file:
        file.write(message + '\n')

In [92]:
def collect_links1(wd, search_term, current_company, state, limit, total_results, unlimited=False, output_file='continuous_data_scrape.csv'):
    """
Collects LinkedIn profile links from search results starting from the specified page and continues until the limit or total results are reached. It saves the collected data to a CSV file and updates the scraping state.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.
    search_term (str): The search query to be entered.
    current_company (str): The company to filter profiles by.
    state (dict): The current state of the scraping process, including the starting page and other parameters.
    limit (int): The maximum number of pages to scrape.
    total_results (int): The total number of search results.
    unlimited (bool, optional): If True, ignores the page limit and continues scraping until all results are collected.
    output_file (str, optional): The file to save the collected data. Default is 'continuous_data_scrape.csv'.

Returns:
    DataFrame: A pandas DataFrame containing the collected profile information.
"""

    state = load_state()
    page_to_start_at = state['page_start']

    total_results = min(total_results, 1000)  # Limiting the total results to 1000
    names, curr_jobs, summarys, locations, links = [], [], [], [], []
    print("Unlimited:", unlimited)

    ### IMPLEMENT THE BELOW METHOD
    if page_to_start_at != 1:
        if not go_to_page(wd, page_to_start_at):
            print(f"Could not navigate to page {page_to_start_at}, trying again...")
            if not go_to_page(wd, page_to_start_at):
                print(f"Failed to navigate to page {page_to_start_at}, aborting...")
                return pd.DataFrame()  # Fail gracefully

    curr_page = page_to_start_at
    
    try:
        while True:
            # Check for "Retry Search" button at the start of each loop iteration
            p_names, p_curr_jobs, p_summarys, p_locations, p_links = [], [], [], [], []
            handle_retry_search1(wd)

            if curr_page % 10 == 0:
                print(f"Scraping links on Page {curr_page}")

            people_list = wd.find_elements(By.CLASS_NAME, "reusable-search__result-container")
            for index, person in enumerate(people_list):
                try:
                    wd.execute_script("arguments[0].style.border='2px solid red'", person)
                    collect_person_info1(person, names, curr_jobs, summarys, locations, links) # Adding to general lists
                    collect_person_info1(person, p_names, p_curr_jobs, p_summarys, p_locations, p_links) # Adding to lists of specific page
                except NoSuchElementException as e:
                    print("Required Element Not Found...Moving on")
                    print(e)
                    wd.execute_script("arguments[0].style.border='2px solid green'", person)
                    continue
            
            df = pd.DataFrame({
                'Name': p_names,
                'Current Job': p_curr_jobs,
                'Relevant Experience to Keyword': p_summarys,
                'Location': p_locations,
                'Profile Link': p_links
            })

            df['Keyword'] = search_term
            df['Company'] = current_company

            if not os.path.isfile(output_file):
                df.to_csv(output_file, index=False)  # Create file and write data
            else:
                df.to_csv(output_file, mode='a', header=False, index=False)  # Append data to existing file

            scroll_down1(wd)
            if not process_next_page1(wd, curr_page, limit, unlimited):
                break

            curr_page += 1
            state.update({'page_start': curr_page})
            save_state(state)

    except WebDriverException as e:
        print(f"An error occurred during the web scraping process: {str(e)}")
        # Handling to save the state or decide what to do next can be added here

    return pd.DataFrame({
        'Name': names,
        'Current Job': curr_jobs,
        'Relevant Experience to Keyword': summarys,
        'Location': locations,
        'Profile Link': links
    })

def go_to_page(wd, page_number):
    """
Navigates to the specified page number in the LinkedIn search results. If the desired page number is not visible, it uses ellipsis buttons to load more pages.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.
    page_number (int): The page number to navigate to.

Returns:
    bool: True if navigation to the specified page was successful, False otherwise.
"""

    try:
        # Wait for the pagination container to be visible
        scroll_down1(wd)
        pagination = WebDriverWait(wd, 5).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'artdeco-pagination__pages'))
        )
        while True:
            pages = pagination.find_elements(By.TAG_NAME, 'button')
            # Attempt to find and click the desired page number
            page_found = False
            ellipses = []
            for page in pages:
                if '…' in page.text:
                    ellipses.append(page)

                if page.text.strip() == str(page_number):
                    page.click()
                    return True

            # If not found, find and click the "..." button to load more pages
            # ellipses = [btn for btn in pages if "..." in btn.text]
            if not ellipses:
                print(f"No '…'found and page {page_number} is not present.")
                return False

            # Click the appropriate "..." button
            # If looking for a higher page number, click the last "..." button, else the first
            target_ellipsis = ellipses[-1] if page_number > int(pages[-3].text.strip()) else ellipses[0]
            target_ellipsis.click()
            WebDriverWait(wd, 3).until(EC.staleness_of(target_ellipsis))  # Ensure the pagination has refreshed

            # Reassign pagination element to handle potential StaleElementReferenceException
            pagination = WebDriverWait(wd, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'artdeco-pagination__pages'))
            )
    except TimeoutException as e:
        print(f"Failed to navigate to page {page_number}: {e}")
        return False

def handle_retry_search1(wd):
    """
Handles the "Retry Search" scenario by waiting for the retry button to appear and clicking it after a random delay. This helps to bypass LinkedIn's anti-scraping measures.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.

Returns:
    None
"""

    try:
        retry_button = WebDriverWait(wd, 1).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(text(), 'Retry Search')]"))
        )
        print("Retry Search button found. Waiting before clicking.")
        random_sleep(42, 64)  # Wait for a random time between 42 and 64 seconds
        retry_button.click()
        print("Retry button clicked.")
    except TimeoutException:
        not_found = True

def collect_person_info1(person, names, curr_jobs, summarys, locations, links):
    """
Collects information from a LinkedIn profile element and appends it to the respective lists for names, current jobs, summaries, locations, and profile links.

Args:
    person (WebElement): The LinkedIn profile element to collect information from.
    names (list): The list to store profile names.
    curr_jobs (list): The list to store current job titles.
    summarys (list): The list to store profile summaries.
    locations (list): The list to store profile locations.
    links (list): The list to store profile links.

Returns:
    None
"""

    all_links = person.find_elements(By.TAG_NAME, 'a')
    name_text, curr_job_text, link_text, summary_text, location_text = extract_person_details1(person, all_links)

    links.append(link_text)
    locations.append(location_text)
    summarys.append(summary_text)
    names.append(name_text)
    curr_jobs.append(curr_job_text)

    

def extract_person_details1(person, all_links):
    """
Extracts detailed information from a LinkedIn profile element, including name, current job, profile link, summary, and location.

Args:
    person (WebElement): The LinkedIn profile element to extract information from.
    all_links (list): The list of link elements within the profile element.

Returns:
    tuple: A tuple containing the extracted name, current job, profile link, summary, and location.
"""

    name_text = "LinkedIn Member"

    if "LinkedIn Member" not in person.text:
        name_element = person.find_element(By.CSS_SELECTOR, ".entity-result__title-text.t-16 a span[aria-hidden='true']")
        name_text = name_element.text

    curr_jobs = person.find_elements(By.CLASS_NAME, "entity-result__primary-subtitle.t-14.t-black.t-normal")
    curr_job_text = '"' + curr_jobs[0].text + '"' if curr_jobs else "Null"

    summary_elements = person.find_elements(By.TAG_NAME ,'p')
    summary_text = '"' + summary_elements[0].text + '"' if summary_elements else "Null"

    location_element = person.find_element(By.CSS_SELECTOR, ".entity-result__secondary-subtitle.t-14.t-normal")
    location_text = location_element.text

    link_text = "Null"
    for a in all_links:
        href = a.get_attribute('href')
        if href.startswith("https://www.linkedin.com/in") and not href.startswith("https://www.linkedin.com/in/ACo") and href not in all_links:
            link_text = href
            break

    return name_text, curr_job_text, link_text, summary_text, location_text

def scroll_down1(wd):
    """
Scrolls down the LinkedIn search results page to load more profiles for scraping.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.

Returns:
    None
"""

    scroll_script = "window.scrollBy(0, 2000);"
    wd.execute_script(scroll_script)

def process_next_page1(wd, curr_page, limit, unlimited):
    """
Navigates to the next page of LinkedIn search results if the "Next" button is enabled. It handles pagination logic based on the specified limit and the unlimited flag.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.
    curr_page (int): The current page number being processed.
    limit (int): The maximum number of pages to scrape.
    unlimited (bool): If True, ignores the page limit.

Returns:
    bool: True if navigation to the next page was successful, False otherwise.
"""

    next_page_button = wd.find_element(By.CLASS_NAME, "artdeco-pagination__button.artdeco-pagination__button--next")
    if not unlimited and curr_page == limit:
        return False

    if next_page_button.is_enabled():
        next_page_button.click()
        random_sleep(1, 2)
        return True
    else:
        return False


def save_state(state):
    """
Saves the current state of the scraping process to a JSON file for persistence across sessions.

Args:
    state (dict): The state of the scraping process to be saved.

Returns:
    None
"""

    with open('state.json', 'w') as f:
        json.dump(state, f)

def load_state():
    """
    Loads the saved state of the scraping process from a JSON file. If the file does not exist, returns None.

    Args:
        None

    Returns:
        dict: The state of the scraping process, or None if the state file does not exist.
    """

    try:
        with open('state.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return None



def dataframe_output1(wd, search_term, location=None, current_company=None, past_company=None, num_pages=4, unlimited=False):
    """
    Conducts a search based on specified parameters and collects results into a DataFrame. Applies filters and collects profile information from multiple pages of search results, updating the state throughout the process.

    Args:
        wd (WebDriver): The Selenium WebDriver instance used for browser automation.
        search_term (str): The search query.
        location (str, optional): Filter by location.
        current_company (str, optional): Filter by current company.
        past_company (str, optional): Filter by past company.
        num_pages (int, optional): Number of pages to scrape. Default is 4.
        unlimited (bool, optional): If True, ignores page limits. Default is False.

    Returns:
        DataFrame: A pandas DataFrame containing the collected profile information.
    """

    state = load_state()
    industry = state['industry']
    
    output_scrape_file = 'continuous_data_scrape.csv'

    if past_company:
        output_scrape_file = "past_company_continuous_scrape.csv"
    
    df = pd.DataFrame(columns=['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link'])
    try:
        search_query(wd, search_term)
        # add_filters1(wd, location, current_company, past_company)
        
        filters_page = add_filters1(wd, location, current_company, past_company)
        print(filters_page)

        total_results = get_search_results_number(wd)
        total_results_string = f'{keywords[state["keyword_index"]]}-{companies[state["company_index"]]}: {total_results}'
        write_total_results(total_results_string)

        if industry and search_results_more_than_1000(wd):
            industry_options = [0, 1, 2, 3, 4]
            industry_options = industry_options[state['industry_number']:]

            for i in industry_options:

                industry_filter(wd, i)
                sub_df = collect_links1(wd, search_term, current_company, state, num_pages, total_results, unlimited, output_file=output_scrape_file)
                df = pd.concat([df, sub_df], ignore_index=True)
                df = df.drop_duplicates()
                print(filters_page)
                wd.get(filters_page)
                random_sleep(1,2)
                
                state = load_state()
                state.update({'page_start': 1, 'industry_number': ((i+1)%5)})
                save_state(state)
        else:
            df = collect_links1(wd, search_term, current_company, state, num_pages, total_results, unlimited, output_file=output_scrape_file)
    except Exception as e:
        print(f"Something went wrong: {e}")
        # Here you might want to save the last state or rethrow the exception to handle it in the main scraping function
        raise

    return df

In [93]:
def main_scraping_process_rob(wd, keywords, companies, username, password, past_company=False, output_file='scraped_data.csv'):
    """
The main robust scraping function that takes in exceptions and restarts the process when an issue occurs. It logs into LinkedIn, iterates through each keyword and company pair, scrapes relevant profile information, and handles the state of the scraping process to resume from where it left off.

Args:
    wd (WebDriver): The Selenium WebDriver instance used for browser automation.
    keywords (list): A list of keywords to search for in LinkedIn profiles.
    companies (list): A list of companies to search within LinkedIn profiles.
    username (str): LinkedIn account username.
    password (str): LinkedIn account password.
    output_file (str, optional): The file to save the collected data. Default is 'scraped_data.csv'.

Returns:
    None
"""

    state = load_state()
    if state is None:
        state = {'keyword_index': 0, 'company_index': 0, 'page_start': 1, 'industry' : True ,'industry_number': 0, 'problem_locs': {}}
        save_state(state)

    try:
        keyword_start_index = state['keyword_index']
        company_start_index = state['company_index']
        for keyword_index in range(keyword_start_index, len(keywords)):
            keyword = keywords[keyword_index]
            
            if keyword_index != keyword_start_index:
                state = load_state()
                state.update({'keyword_index': keyword_index, 'company_index': 0, 'page_start': 1, 'industry_number': 0})
                save_state(state)
                company_start_index = state['company_index']

            for company_index in range(company_start_index, len(companies)):
                company = companies[company_index]
                
                if company_index != company_start_index:
                    state = load_state()
                    state.update({'company_index': company_index, 'page_start': 1, 'industry_number': 0})
                    save_state(state)

                try:
                    print(f"Scraping your request for {keyword} at {company}")
                    if not past_company:
                        df = dataframe_output1(
                            wd=wd, search_term=keyword, location="United States", current_company=company, past_company=None, num_pages=10, unlimited=True)
                    else:
                        df = dataframe_output1(
                            wd=wd, search_term=keyword, location="United States", current_company=None, past_company=company, num_pages=10, unlimited=True)
                    df['Keyword'] = keyword
                    df['Company'] = company
                    if not os.path.isfile(output_file):
                        df.to_csv(output_file, index=False)  # Create file and write data
                    else:
                        df.to_csv(output_file, mode='a', header=False, index=False)  # Append data to existing file

                except Exception as e:
                    print(f"Encountered an issue with {company}: {e}, saving state and restarting...")
                    state = load_state()

                    problem_locs = state['problem_locs']
                    keyword_index_str = str(keyword_index)
                    current_list = problem_locs.get(keyword_index_str, [])
                    current_list.append(f"{company_index}.{state['page_start']}.{state['industry_number']}")
                    problem_locs[keyword_index_str] = current_list

                    state.update({'problem_locs':problem_locs})
                    save_state(state)

                wd.get('https://www.linkedin.com/')

    except Exception as e:
        print(f'Final error occurred: {str(e)}')

In [94]:
def run_scrape_continuously(past_company=False):
    # Opening new selenium window
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    wd = webdriver.Chrome(options=options)

    # wd = webdriver.Chrome(ChromeDriverManager().install())
    actions = ActionChains(wd)

    wd.maximize_window()
    wd.switch_to.window(wd.current_window_handle)
    wd.implicitly_wait(10)

    wd.get('https://www.linkedin.com/')

    wd.implicitly_wait(1)

    # Setting user as Purdue user
    set_user_as_PurdueCS(Purdue=True)
    print(username,password)

    login_with_fallback(wd, username, password)
    
    if not past_company:
        main_scraping_process_rob(wd, keywords, companies, username, password)
    else:
        main_scraping_process_rob(wd, keywords, companies, username, password, past_company=True, output_file='past_company_scraped_data.csv')

## Run this code block to scrape past employees

In [99]:

start_time = time.time()  # Record the start time
run_scrape_continuously(past_company=True)  # Run the job
end_time = time.time()  # Record the end time

duration = end_time - start_time  # Calculate the duration
print(f"Job completed. Duration: {duration / 60:.2f} minutes.") # Run the job
print("Job completed. Waiting for 30 minutes before the next run.")
time.sleep(30 * 60)
print("Sleep over.")

huoerxiu@gmail.com PURDUEcs
Failed to login: Timeout while waiting for page elements.
Primary login method failed: Message: 
Stacktrace:
#0 0x62899972c993 <unknown>
#1 0x628999427136 <unknown>
#2 0x628999471d48 <unknown>
#3 0x628999471e01 <unknown>
#4 0x6289994b4e44 <unknown>
#5 0x628999493cfd <unknown>
#6 0x6289994b2319 <unknown>
#7 0x628999493a73 <unknown>
#8 0x628999464c93 <unknown>
#9 0x62899946565e <unknown>
#10 0x6289996f108b <unknown>
#11 0x6289996f5005 <unknown>
#12 0x6289996df491 <unknown>
#13 0x6289996f5b92 <unknown>
#14 0x6289996c49ef <unknown>
#15 0x62899971bdf8 <unknown>
#16 0x62899971bfcb <unknown>
#17 0x62899972bae4 <unknown>
#18 0x7b5fe7094ac3 <unknown>

Failed to login using alternative method: Message: 
Stacktrace:
#0 0x62899972c993 <unknown>
#1 0x628999427136 <unknown>
#2 0x628999471d48 <unknown>
#3 0x628999471e01 <unknown>
#4 0x6289994b4e44 <unknown>
#5 0x628999493cfd <unknown>
#6 0x6289994b2319 <unknown>
#7 0x628999493a73 <unknown>
#8 0x628999464c93 <unknown>
#9 0x

KeyboardInterrupt: 

### Do not Run This code block below for the time being (Current Employees)

In [86]:
# start_time = time.time()  # Record the start time
# run_scrape_continuously()  # Run the job
# end_time = time.time()  # Record the end time

# duration = end_time - start_time  # Calculate the duration
# print(f"Job completed. Duration: {duration / 60:.2f} minutes.")

huoerxiu@gmail.com PURDUEcs
Failed to login: Timeout while waiting for page elements.
Primary login method failed: Message: 
Stacktrace:
#0 0x58e511aa8993 <unknown>
#1 0x58e5117a3136 <unknown>
#2 0x58e5117edd48 <unknown>
#3 0x58e5117ede01 <unknown>
#4 0x58e511830e44 <unknown>
#5 0x58e51180fcfd <unknown>
#6 0x58e51182e319 <unknown>
#7 0x58e51180fa73 <unknown>
#8 0x58e5117e0c93 <unknown>
#9 0x58e5117e165e <unknown>
#10 0x58e511a6d08b <unknown>
#11 0x58e511a71005 <unknown>
#12 0x58e511a5b491 <unknown>
#13 0x58e511a71b92 <unknown>
#14 0x58e511a409ef <unknown>
#15 0x58e511a97df8 <unknown>
#16 0x58e511a97fcb <unknown>
#17 0x58e511aa7ae4 <unknown>
#18 0x7dede4694ac3 <unknown>

Logging into your LinkedIn account using alternative method!
Scraping your request for image transformation at IBM
https://www.linkedin.com/search/results/people/?currentCompany=%5B%221009%22%5D&geoUrn=%5B%22103644278%22%5D&keywords=image%20transformation&origin=FACETED_SEARCH&sid=9%3Bz
Unlimited: True
Failed to navigat

KeyboardInterrupt: 

# Run this if you want to scrape employees based on current company

In [84]:
for i in range(10):
    print(f"Scrape No. {i+1}")
    start_time = time.time()  # Record the start time
    run_scrape_continuously()  # Run the job
    end_time = time.time()  # Record the end time
    
    duration = end_time - start_time  # Calculate the duration
    print(f"Job completed. Duration: {duration / 60:.2f} minutes.") # Run the job
    print("Job completed. Waiting for 30 minutes before the next run.")
    time.sleep(30 * 60)
    print("Sleep over.")

Scrape No. 1
huoerxiu@gmail.com PURDUEcs
Logging into your LinkedIn account!
Scraping your request for Machine Learning at Salesforce
https://www.linkedin.com/search/results/people/?currentCompany=%5B%223185%22%5D&geoUrn=%5B%22103644278%22%5D&keywords=machine%20learning&origin=FACETED_SEARCH&sid=alu
Added industry option 0
Unlimited: True
Failed to navigate to page 83: Message: 
Stacktrace:
#0 0x5612d9969993 <unknown>
#1 0x5612d9664136 <unknown>
#2 0x5612d96aed48 <unknown>
#3 0x5612d96aee01 <unknown>
#4 0x5612d96f1e44 <unknown>
#5 0x5612d96d0cfd <unknown>
#6 0x5612d96ef319 <unknown>
#7 0x5612d96d0a73 <unknown>
#8 0x5612d96a1c93 <unknown>
#9 0x5612d96a265e <unknown>
#10 0x5612d992e08b <unknown>
#11 0x5612d9932005 <unknown>
#12 0x5612d991c491 <unknown>
#13 0x5612d9932b92 <unknown>
#14 0x5612d99019ef <unknown>
#15 0x5612d9958df8 <unknown>
#16 0x5612d9958fcb <unknown>
#17 0x5612d9968ae4 <unknown>
#18 0x7138d0894ac3 <unknown>

Could not navigate to page 83, trying again...
Failed to navigat

KeyboardInterrupt: 

# Run this if you want to scrape employees based on past company

In [None]:

start_time = time.time()  # Record the start time
run_scrape_continuously(past_company=True)  # Run the job
end_time = time.time()  # Record the end time

duration = end_time - start_time  # Calculate the duration
print(f"Job completed. Duration: {duration / 60:.2f} minutes.") # Run the job
print("Job completed. Waiting for 30 minutes before the next run.")
time.sleep(30 * 60)
print("Sleep over.")