In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
import time
from selenium.webdriver.common.by import By
import logging
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import random

import json

In [2]:
def random_sleep(start, end):
    """Block for a random whole number of seconds in the range [start, end]."""
    delay_seconds = random.randint(start, end)
    time.sleep(delay_seconds)

def refresh_wd():
    """Start a new Chrome session and open the LinkedIn landing page.

    Returns:
        The newly created WebDriver instance. The caller is responsible for
        shutting it down when finished.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    wd = webdriver.Chrome(options=options)

    # Maximize the window to avoid elements being out of view
    wd.maximize_window()

    wd.get('https://www.linkedin.com/')
    return wd

def login(wd, user_name, password):
    """Fill in the LinkedIn sign-in form on the current page and submit it.

    Args:
        wd (WebDriver): Driver already showing the page with the sign-in form.
        user_name (str): Value typed into the "session_key" field.
        password (str): Value typed into the "session_password" field.

    Failures to find the form within 10 seconds per element are reported with
    a printed message; nothing is raised for them and nothing is returned.
    """
    wait = WebDriverWait(wd, 10)
    try:
        user_field = wait.until(EC.presence_of_element_located((By.ID, "session_key")))
        user_field.clear()
        user_field.send_keys(user_name)

        secret_field = wait.until(EC.presence_of_element_located((By.ID, "session_password")))
        secret_field.clear()
        secret_field.send_keys(password)

        print("Logging into your LinkedIn account!")

        submit_locator = (By.XPATH, '/html/body/main/section[1]/div/div/form/div[2]/button')
        wait.until(EC.element_to_be_clickable(submit_locator)).click()
    except TimeoutException:
        print("Failed to login: Timeout while waiting for page elements.")
    except NoSuchElementException:
        print("Failed to login: Could not find one of the elements.")

def security_verification(wd):
    """Prompt the user for the emailed verification code and submit it.

    The code is read from standard input before the page is touched, then
    typed into the code field and the form's button is clicked. Timeouts and
    missing elements are reported with a printed message rather than raised.
    """
    otp = input("You have been sent a verification code from LinkedIn via your email.\nPlease input that here: ")
    wait = WebDriverWait(wd, 10)
    try:
        code_locator = (By.XPATH, '/html/body/div/main/form/div[1]/input[15]')
        code_field = wait.until(EC.presence_of_element_located(code_locator))
        code_field.clear()
        code_field.send_keys(otp)

        submit_locator = (By.XPATH, '/html/body/div/main/form/div[2]/button')
        wait.until(EC.element_to_be_clickable(submit_locator)).click()
        print("Successfully Authenticated!")
    except TimeoutException:
        print("Authentication failed: Timeout while waiting for page elements.")
    except NoSuchElementException:
        print("Authentication failed: Could not find one of the elements.")

def search_query(wd, query):
    """Type `query` into the global search box and submit it with ENTER.

    Waits up to 10 seconds for the search box to become clickable; a
    TimeoutException from that wait propagates to the caller.
    """
    box_locator = (By.CLASS_NAME, 'search-global-typeahead__input')
    search_box = WebDriverWait(wd, 10).until(EC.element_to_be_clickable(box_locator))

    # Click first so the field has focus before typing.
    search_box.click()
    random_sleep(1, 2)

    search_box.send_keys(query)
    random_sleep(1, 2)

    search_box.send_keys(Keys.ENTER)


def add_filters1(wd, location=None, current_company=None, past_company=None):
    """Open the people results, apply the requested filters and show the results.

    Args:
        wd (WebDriver): Driver showing the search results for a query.
        location (str): Location filter value, skipped when falsy.
        current_company (str): Value for the first "Add a company" control, skipped when falsy.
        past_company (str): Value for the second "Add a company" control, skipped when falsy.

    Returns:
        str: The browser URL after the filtered results are shown.
    """
    navigate_to_people_results(wd)
    open_all_filters(wd)

    # (value, filter type, index among same-named "Add a ..." buttons), applied in this order
    requested = (
        (current_company, 'company', 0),
        (past_company, 'company', 1),
        (location, 'location', None),
    )
    for value, kind, position in requested:
        if value:
            apply_filter(wd, value, kind, filter_index=position)

    show_all_results(wd)
    return wd.current_url

def apply_filter(wd, filter_value, filter_type, filter_index=None):
    """Add one typeahead filter value in the filters panel.

    Args:
        wd (WebDriver): Driver with the filters panel open.
        filter_value (str): Text typed into the filter input and matched against the suggestions.
        filter_type (str): Word completing the "Add a ..." label, e.g. 'company' or 'location'.
        filter_index (int): Which of the same-labelled buttons to use; None means the first.

    Raises:
        IndexError: If no button exists at the requested position.
    """
    add_buttons = wd.find_elements(By.XPATH, f"//*[text()='Add a {filter_type}']")
    position = 0 if filter_index is None else filter_index
    add_button = add_buttons[position]

    wd.execute_script("arguments[0].scrollIntoView();", add_button)
    add_button.click()
    time.sleep(random.randint(1, 2))

    # Type the value into the input that the button revealed.
    text_box = WebDriverWait(wd, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, f'input[placeholder="Add a {filter_type}"]'))
    )
    text_box.click()
    text_box.send_keys(filter_value)
    time.sleep(random.randint(1, 2))

    # Pick the first suggestion whose text contains the value; none matching means no click.
    suggestion_box = WebDriverWait(wd, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'basic-typeahead__triggered-content'))
    )
    suggestions = suggestion_box.find_elements(By.XPATH, ".//div[@role='option']")
    match = next((item for item in suggestions if filter_value in item.text), None)
    if match is not None:
        match.click()
    time.sleep(random.randint(1, 2))

def navigate_to_people_results(wd):
    """Follow the 'See all people results' link from the current search page.

    The link element is outlined in red, its href is read and loaded directly.
    If the element is not clickable within 5 seconds a message is printed and
    the page is left as it is.
    """
    link_locator = (By.XPATH, "//*[contains(text(), 'See all people results')]")
    try:
        see_all_link = WebDriverWait(wd, 5).until(EC.element_to_be_clickable(link_locator))
        wd.execute_script("arguments[0].style.border='2px solid red'", see_all_link)
        target_url = see_all_link.get_attribute("href")
        wd.get(target_url)
        # Short random pause after the navigation.
        random_sleep(1, 2)
    except TimeoutException:
        print("Failed to find the 'See all people results' button within the expected time.")


def open_all_filters(wd):
    """Outline in red and click the element located by class "relative.mr2", then pause 1-2 s.

    Uses a direct lookup, so NoSuchElementException propagates if the element is absent.
    """
    filters_button = wd.find_element(By.CLASS_NAME, "relative.mr2")
    wd.execute_script("arguments[0].style.border='2px solid red'", filters_button)
    filters_button.click()
    random_sleep(1, 2)


def industry_filter(wd, industry_list_num=0):
    """Tick one industry checkbox in the filters panel and show the results.

    Args:
        wd (WebDriver): Driver showing a people-results page.
        industry_list_num (int): Position of the industry entry to select
            within the list that contains the 'Add an industry' control.
    """
    navigate_and_click_filters(wd)

    # The industry options live in the nearest <ul> ancestor of the "Add an industry" label.
    list_locator = (By.XPATH, "//*[text()='Add an industry']/ancestor::ul[1]")
    industry_list = WebDriverWait(wd, 10).until(EC.presence_of_element_located(list_locator))
    wd.execute_script("arguments[0].scrollIntoView();", industry_list)

    select_industry(wd, industry_list, industry_list_num)
    show_all_results(wd)

def navigate_and_click_filters(wd):
    """Wait up to 10 s for the "relative.mr2" element to be clickable, outline it and click it."""
    button_locator = (By.CLASS_NAME, "relative.mr2")
    filters_button = WebDriverWait(wd, 10).until(EC.element_to_be_clickable(button_locator))
    wd.execute_script("arguments[0].style.border='2px solid red'", filters_button)
    filters_button.click()
    time.sleep(random.randint(1, 2))

def select_industry(wd, industry_filter_ul, industry_list_num):
    """Ensure the checkbox of the chosen list entry is ticked.

    Args:
        wd (WebDriver): Driver used to click the checkbox through JavaScript.
        industry_filter_ul (WebElement): List element whose <li> children are the options.
        industry_list_num (int): Zero-based position of the option to tick.

    An out-of-range position only prints "Invalid industry index".
    """
    entries = industry_filter_ul.find_elements(By.TAG_NAME, "li")
    if industry_list_num < 0 or industry_list_num >= len(entries):
        print("Invalid industry index")
        return

    checkbox = entries[industry_list_num].find_element(By.XPATH, ".//input[@type='checkbox']")
    if not checkbox.is_selected():
        # Clicked via script rather than a native click.
        wd.execute_script("arguments[0].click();", checkbox)
    print(f"Added industry option {industry_list_num}")

def show_all_results(wd):
    """Click the button at a fixed absolute XPath once it is clickable, then pause 1-2 s.

    A TimeoutException from the 10-second wait propagates to the caller.
    """
    button_locator = (By.XPATH, '/html/body/div[3]/div/div/div[3]/div/button[2]')
    WebDriverWait(wd, 10).until(EC.element_to_be_clickable(button_locator)).click()
    time.sleep(random.randint(1, 2))


def collect_links(wd, page_start, limit, total_results, unlimited=False):
    """Walk the result pages and gather one row per result card.

    Args:
        wd (WebDriver): Driver showing the first results page to scrape.
        page_start (int): Page counter value for the first page.
        limit (int): Forwarded to process_next_page.
        total_results (int): Clamped to 1000; not otherwise used here.
        unlimited (bool): Forwarded to process_next_page.

    Returns:
        DataFrame: Columns 'Name', 'Current Job', 'Relevant Experience to Keyword',
        'Location' and 'Profile Link'. Rows gathered before a WebDriverException
        are still returned.
    """
    total_results = min(total_results, 1000)  # Limiting the total results to 1000
    records = {
        'Name': [],
        'Current Job': [],
        'Relevant Experience to Keyword': [],
        'Location': [],
        'Profile Link': [],
    }

    try:
        while True:
            # A "Retry Search" button may be present at the start of any page.
            handle_retry_search(wd)

            if page_start % 10 == 0:
                print(f"Scraping links on Page {page_start}")

            for card in wd.find_elements(By.CLASS_NAME, "reusable-search__result-container"):
                try:
                    wd.execute_script("arguments[0].style.border='2px solid red'", card)
                    collect_person_info(
                        card,
                        records['Name'],
                        records['Current Job'],
                        records['Relevant Experience to Keyword'],
                        records['Location'],
                        records['Profile Link'],
                    )
                except NoSuchElementException as e:
                    # Cards that could not be parsed are outlined in green and skipped.
                    print("Required Element Not Found...Moving on")
                    print(e)
                    wd.execute_script("arguments[0].style.border='2px solid green'", card)

            scroll_down(wd)

            moved_on = process_next_page(wd, page_start, limit, unlimited)
            if not moved_on:
                break
            page_start += 1

    except WebDriverException as e:
        print(f"An error occurred during the web scraping process: {str(e)}")

    return pd.DataFrame(records)

def handle_retry_search(wd):
    """Click the 'Retry Search' button if one shows up within 3 seconds.

    When the button is present the function sleeps 42-64 seconds before
    clicking it. When it is absent a message is printed and nothing else happens.
    """
    retry_locator = (By.XPATH, "//button[contains(text(), 'Retry Search')]")
    try:
        retry = WebDriverWait(wd, 3).until(EC.presence_of_element_located(retry_locator))
        print("Retry Search button found. Waiting before clicking.")
        random_sleep(42, 64)  # long randomized pause before retrying
        retry.click()
        print("Retry button clicked.")
    except TimeoutException:
        print("No 'Retry Search' button found, continuing with normal process.")

def collect_person_info(person, names, curr_jobs, summarys, locations, links):
    """Extract one result card's details and append them to the five output lists.

    Nothing is appended if extraction raises, so the lists stay the same length.
    """
    anchors = person.find_elements(By.TAG_NAME, 'a')
    name, job, link, summary, location = extract_person_details(person, anchors)

    for bucket, value in (
        (links, link),
        (locations, location),
        (summarys, summary),
        (names, name),
        (curr_jobs, job),
    ):
        bucket.append(value)

def extract_person_details(person, all_links):
    """Read name, job, profile link, summary and location from one result card.

    Args:
        person (WebElement): The result card.
        all_links (list): The <a> elements found inside the card.

    Returns:
        tuple: (name_text, curr_job_text, link_text, summary_text, location_text).
        The name defaults to "LinkedIn Member"; job, summary and link default
        to the string "Null" when absent. Job and summary are wrapped in
        double quotes.

    Raises:
        NoSuchElementException: If the name (for named members) or the
            location element is missing from the card.
    """
    name_text = "LinkedIn Member"

    if "LinkedIn Member" not in person.text:
        name_element = person.find_element(By.CSS_SELECTOR, ".entity-result__title-text.t-16 a span[aria-hidden='true']")
        name_text = name_element.text

    curr_jobs = person.find_elements(By.CLASS_NAME, "entity-result__primary-subtitle.t-14.t-black.t-normal")
    curr_job_text = '"' + curr_jobs[0].text + '"' if curr_jobs else "Null"

    summary_elements = person.find_elements(By.TAG_NAME, 'p')
    summary_text = '"' + summary_elements[0].text + '"' if summary_elements else "Null"

    location_element = person.find_element(By.CSS_SELECTOR, ".entity-result__secondary-subtitle.t-14.t-normal")
    location_text = location_element.text

    link_text = "Null"
    for a in all_links:
        href = a.get_attribute('href')
        # Anchors without an href return None; skip them instead of failing on .startswith.
        if not href:
            continue
        # Take the first /in profile URL that does not use an "ACo..." identifier.
        if href.startswith("https://www.linkedin.com/in") and not href.startswith("https://www.linkedin.com/in/ACo"):
            link_text = href
            break

    return name_text, curr_job_text, link_text, summary_text, location_text

def scroll_down(wd):
    """Scroll the window down by 2000 pixels."""
    wd.execute_script("window.scrollBy(0, 2000);")

def process_next_page(wd, page_start, limit, unlimited):
    """Advance to the next results page if allowed and possible.

    Args:
        wd (WebDriver): Driver showing a results page.
        page_start (int): Number of the page that was just scraped.
        limit (int): Last page number to scrape when `unlimited` is False.
        unlimited (bool): If True, `limit` is ignored.

    Returns:
        bool: True when the next page button was clicked, False when the page
        limit has been reached, no next button exists, or it is disabled.
    """
    # Enforce the page limit before touching the browser.
    if not unlimited and page_start >= limit:
        return False

    # find_elements returns an empty list (instead of raising) when the
    # results have no pagination at all.
    next_buttons = wd.find_elements(By.CLASS_NAME, "artdeco-pagination__button.artdeco-pagination__button--next")
    if not next_buttons or not next_buttons[0].is_enabled():
        return False

    next_buttons[0].click()
    random_sleep(1, 2)
    return True


def experience_json3(wd, link):
    """Load a profile URL and return its About text and experience as a dict.

    Args:
        wd (WebDriver): Driver used to load the profile.
        link (str): Profile URL to open.

    Returns:
        dict: The structure built by jsonify from the extracted sections.
    """
    wd.get(link)
    # Wait until the <body> element is present before reading the page.
    body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
    WebDriverWait(wd, 10).until(body_present)

    about = extract_about_section(wd)
    jobs = extract_experience_section(wd)
    return jsonify(about, jobs)

def extract_about_section(wd):
    """Return the text of the section that contains the 'About' heading, or "".

    The first element whose text is exactly 'About' is scrolled into view and
    its enclosing <section> is read; one leading "About\\nAbout\\n" prefix is
    stripped from the result.
    """
    try:
        headings = wd.find_elements(By.XPATH, "//*[text()='About']")
        if not headings:
            return ""

        heading = headings[0]
        wd.execute_script("arguments[0].scrollIntoView();", heading)
        wd.execute_script("arguments[0].style.border='2px solid red'", heading)

        section = heading.find_element(By.XPATH, "ancestor::section")
        wd.execute_script("arguments[0].style.border='2px solid blue'", section)
        random_sleep(0, 1)
        return section.text.replace("About\nAbout\n", "", 1)
    except NoSuchElementException:
        print("About section not found.")
    return ""

def extract_experience_section(wd):
    """Collect the job entries listed under the profile's 'Experience' heading.

    Returns:
        list: One dict per entry, as appended by process_job_entry. Entries
        gathered before a NoSuchElementException are still returned; a
        missing section yields an empty list.
    """
    collected = []
    try:
        heading = wd.find_element(By.XPATH, "//*[text()='Experience']")
        wd.execute_script("arguments[0].scrollIntoView();", heading)
        wd.execute_script("arguments[0].style.border='2px solid red'", heading)

        section = heading.find_element(By.XPATH, "ancestor::section")
        wd.execute_script("arguments[0].style.border='2px solid blue'", section)

        container = section.find_element(By.XPATH, ".//div[@class='pvs-list__outer-container']")
        # Only the direct list items of the container are treated as jobs.
        for entry in container.find_elements(By.XPATH, "./ul/li"):
            process_job_entry(entry, collected)
    except NoSuchElementException:
        print("Experience section not found.")

    return collected

def process_job_entry(job, experience_list):
    """Append a {'company', 'job_role', 'job_time'} dict for one job entry.

    Entries with an empty company name are skipped, and entries missing any
    expected element are reported with a printed message.
    """
    try:
        company, role, period = extract_job_details(job)
    except NoSuchElementException:
        print("Failed to process job entry.")
        return

    if company:
        experience_list.append({'company': company, 'job_role': role, 'job_time': period})

def extract_job_details(job):
    """Return (company_name, job_role, job_time) read from one job list item.

    Each value is the element text up to the first '·' separator, stripped of
    surrounding whitespace. NoSuchElementException propagates if any of the
    three elements is missing.
    """
    def before_separator(element):
        # Keep only the part of the text preceding the first '·'.
        return element.text.split('·')[0].strip()

    company_name = before_separator(
        job.find_element(By.CSS_SELECTOR, "div.display-flex.flex-wrap.align-items-center.full-height span[aria-hidden='true']")
    )
    job_role = before_separator(job.find_element(By.XPATH, ".//span[@aria-hidden='true']"))
    job_time = before_separator(
        job.find_element(By.CSS_SELECTOR, "span.t-14.t-normal.t-black--light span.pvs-entity__caption-wrapper")
    )
    return company_name, job_role, job_time


def jsonify(about_text, experience_list):
    """Build the nested profile dict from the About text and experience entries.

    Args:
        about_text (str): Text stored under the "About" key.
        experience_list (list): Dicts with 'company', 'job_role' and 'job_time'
            keys. Role and time are either single values or parallel lists.

    Returns:
        dict: {"About": about_text, "Experience": {company: {role: time}}}.
        Entries that share a company are merged into one role mapping rather
        than the later entry replacing the earlier one.
    """
    final_dict = {"About": about_text, "Experience": {}}
    for experience in experience_list:
        roles = experience['job_role']
        periods = experience['job_time']
        # Reuse the existing mapping when the same company appears more than once.
        company_dict = final_dict["Experience"].setdefault(experience['company'], {})
        if isinstance(roles, list) and isinstance(periods, list):
            company_dict.update(zip(roles, periods))
        else:
            company_dict[roles] = periods
    return final_dict


def get_search_results_number(wd):
    """Return the total result count shown on the search page, or 0 if unknown.

    The summary text is expected to look like "1,234 results" or
    "Showing 1-10 out of 1,234"; in both shapes the total is the last number
    in the text, so that is the one returned.

    Args:
        wd (WebDriver): Driver showing a search results page.

    Returns:
        int: The parsed count, or 0 when the element is missing or its text
        contains no number.
    """
    import re  # local import: nothing else in this module needs regular expressions

    try:
        results_text = wd.find_element(By.CLASS_NAME, "pb2.t-black--light.t-14").text
    except NoSuchElementException:
        print("Could not find the search results element.")
        return 0

    # Digit runs that may contain thousands separators, e.g. "1,234".
    numbers = re.findall(r"\d[\d,]*", results_text)
    if not numbers:
        print("Conversion error, possibly due to unexpected text format.")
        return 0
    return int(numbers[-1].replace(',', ''))

def search_results_more_than_1000(wd):
    """Return True when the reported result count is strictly greater than 1000."""
    return get_search_results_number(wd) > 1000


def dataframe_output(wd, search_term, location=None, current_company=None, past_company=None, num_pages=4, unlimited=False, industry=True):
    """
    Run a filtered people search and gather the results into a DataFrame.

    Args:
    wd (WebDriver): The Selenium WebDriver instance.
    search_term (str): The search query.
    location (str): Filter by location.
    current_company (str): Filter by current company.
    past_company (str): Filter by past company.
    num_pages (int): Page limit forwarded to collect_links.
    unlimited (bool): Forwarded to collect_links.
    industry (bool): If True and more than 1000 results are reported, the
        search is repeated once for each of the first five industry options.

    Returns:
    tuple: The DataFrame of collected rows and the list of its 'Profile Link'
    values. Any exception is printed and whatever was collected so far is returned.
    """
    df = pd.DataFrame(columns=['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link'])
    try:
        search_query(wd, search_term)
        filters_page = add_filters1(wd, location, current_company, past_company)

        split_by_industry = industry and search_results_more_than_1000(wd)
        if not split_by_industry:
            total_results = get_search_results_number(wd)
            df = collect_links(wd, 1, num_pages, total_results=total_results, unlimited=unlimited)
        else:
            for industry_index in range(5):
                industry_filter(wd, industry_index)
                total_results = get_search_results_number(wd)
                sub_df = collect_links(wd, 1, num_pages, total_results=total_results, unlimited=unlimited)
                df = pd.concat([df, sub_df], ignore_index=True)
                df = df.drop_duplicates(subset=['Profile Link'])
                # Go back to the filtered results before choosing the next industry.
                wd.get(filters_page)
    except Exception as e:
        print(f"Something went wrong: {e}")
        print("Exporting current dataframe...")

    links = list(df['Profile Link'])
    return df, links


In [3]:
def main_scraping_process(wd, keywords, companies, username, password):
    """Scrape every keyword/company combination and write the results to CSV.

    Companies are processed in batches of two. Between batches the browser is
    shut down, the process sleeps 100-120 seconds, and a fresh logged-in
    browser is started.

    Args:
        wd (WebDriver): Driver already showing the LinkedIn sign-in page.
        keywords (list): Search terms; each is combined with every company.
        companies (list): Company names used as the current-company filter.
        username (str): LinkedIn account name.
        password (str): LinkedIn password.

    Files written (relative to the working directory):
        "<keyword>_CompanyDFs/<company>.csv" for each non-empty company result,
        "<keyword>_profiles.csv" after every company, and
        "<keyword>_profiles_er.csv" if an exception stops the run.
    """
    import os  # local import: only needed to create the per-keyword output directory

    batch_size = 2
    columns = ['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link', 'Keyword', 'Company']
    job_df = pd.DataFrame(columns=columns)
    keyword = ""
    company = ""
    query = ""
    # Latest per-company frame that has not been merged into job_df yet;
    # None once merged, so the error handler never merges it twice.
    df = None

    login(wd, username, password)

    try:
        for keyword in keywords:
            time_total = 0
            # Step through the whole list so a trailing partial batch is included.
            for i in range(0, len(companies), batch_size):

                if i != 0:
                    # quit() ends the whole driver session, not just the current window.
                    wd.quit()
                    print("Sleeping for 2 minutes")
                    random_sleep(100, 120)
                    wd = refresh_wd()
                    # Only a fresh browser needs to sign in again.
                    login(wd, username, password)

                for company in companies[i:i + batch_size]:
                    query = keyword
                    location = "United States"
                    current_company = company
                    past_company = None
                    unlimited = False
                    num_pages = 5
                    print("Scraping your request...")
                    print((query, current_company))

                    start_time = time.time()
                    df, links = dataframe_output(wd, query, location, current_company, past_company, num_pages, unlimited, industry=True)
                    print(len(df))
                    end_time = time.time()
                    time_total += (end_time - start_time)
                    print(end_time - start_time)

                    if len(df) > 0:
                        df['Keyword'] = keyword
                        df['Company'] = current_company
                        company_dir = f"{query.replace(' ', '')}_CompanyDFs"
                        # to_csv does not create missing directories.
                        os.makedirs(company_dir, exist_ok=True)
                        df.to_csv(f"{company_dir}/{company.replace(' ', '')}.csv")
                        job_df = pd.concat([job_df, df], ignore_index=True)
                    df = None

                    job_df.to_csv(f"{query.replace(' ', '')}_profiles.csv")
                    wd.get('https://www.linkedin.com/')

            if companies:
                print(f"Average Time per Company Scrape: {(time_total / len(companies)) / 60} minutes")
            print(f"Total Time per Company Scrape: {time_total / 60} minutes")

    except Exception as e:
        # Keep whatever the interrupted company produced, if it was not merged already.
        if df is not None and len(df) > 0:
            df['Keyword'] = keyword
            df['Company'] = company
            job_df = pd.concat([job_df, df], ignore_index=True)
        job_df.to_csv(f"{query.replace(' ', '')}_profiles_er.csv")
        print(f'An error occurred: {str(e)} and execution stopped')

In [5]:
import getpass
import os

# Credentials are read from the environment, falling back to an interactive
# prompt, so that no secret is stored in the notebook itself.
username = os.environ.get("LINKEDIN_USERNAME") or input("LinkedIn username: ")
password = os.environ.get("LINKEDIN_PASSWORD") or getpass.getpass("LinkedIn password: ")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome(options=options)
actions = ActionChains(wd)

wd.maximize_window()
wd.switch_to.window(wd.current_window_handle)
# Generous implicit wait while the landing page loads...
wd.implicitly_wait(10)

wd.get('https://www.linkedin.com/')

# ...then a short one, since the scraping functions use explicit waits.
wd.implicitly_wait(1)

keywords = ['Machine Learning Engineer']
companies = ['Meta',
             'Amazon',
             'Apple',
             'Netflix',
             'Google',
             'Microsoft',
             'Open AI',
             'Intel',
             'Cisco',
             'Nvidia',
             'Salesforce',
             'LinkedIn',
             'DeepMind',
             'IBM',
             'Bloomberg',
             'Tesla',
             'Mayo Clinic']

KeyboardInterrupt: 

In [4]:
# Start the scrape. This cell uses wd, keywords, companies, username and
# password, all of which are defined by the setup cell, so run that cell first.
main_scraping_process(wd, keywords, companies, username, password)

NameError: name 'wd' is not defined

In [None]:
def collect_links(wd, page_start, limit, total_results, unlimited=False, last_person_index=0):
    """Walk the result pages, gather one row per result card and report progress.

    Args:
        wd (WebDriver): Driver showing the first results page to scrape.
        page_start (int): Page counter value for the first page.
        limit (int): Forwarded to process_next_page.
        total_results (int): Clamped to 1000; not otherwise used here.
        unlimited (bool): Forwarded to process_next_page.
        last_person_index (int): Resume position. Cards on the first page
            whose index is below this value are skipped. The default of 0
            (or None) skips nothing.

    Returns:
        tuple: (DataFrame of collected rows, last page number that was
        followed by a successful page change, index of the last card
        processed on the most recent page or None if none was processed).
    """
    total_results = min(total_results, 1000)  # Limiting the total results to 1000
    names, curr_jobs, summarys, locations, links = [], [], [], [], []
    last_processed_person = None
    last_processed_page = page_start
    # The resume offset applies to the first page only.
    skip_before = last_person_index or 0
    on_first_page = True

    try:
        while True:
            # Check for "Retry Search" button at the start of each loop iteration
            handle_retry_search(wd)

            if page_start % 10 == 0:
                print(f"Scraping links on Page {page_start}")

            people_list = wd.find_elements(By.CLASS_NAME, "reusable-search__result-container")
            for index, person in enumerate(people_list):
                if on_first_page and index < skip_before:
                    continue
                try:
                    wd.execute_script("arguments[0].style.border='2px solid red'", person)
                    collect_person_info(person, names, curr_jobs, summarys, locations, links)
                    last_processed_person = index  # Update last processed person
                except NoSuchElementException as e:
                    print("Required Element Not Found...Moving on")
                    print(e)
                    wd.execute_script("arguments[0].style.border='2px solid green'", person)
                    continue
            on_first_page = False

            scroll_down(wd)
            if not process_next_page(wd, page_start, limit, unlimited):
                break
            last_processed_page = page_start
            page_start += 1

    except WebDriverException as e:
        # Rows gathered so far are still returned below.
        print(f"An error occurred during the web scraping process: {str(e)}")

    return pd.DataFrame({
        'Name': names,
        'Current Job': curr_jobs,
        'Relevant Experience to Keyword': summarys,
        'Location': locations,
        'Profile Link': links
    }), last_processed_page, last_processed_person

def handle_retry_search(wd):
    """Click the 'Retry Search' button if one shows up within 3 seconds.

    When the button is present the function sleeps 42-64 seconds before
    clicking it. When it is absent a message is printed and nothing else happens.
    """
    retry_locator = (By.XPATH, "//button[contains(text(), 'Retry Search')]")
    try:
        retry = WebDriverWait(wd, 3).until(EC.presence_of_element_located(retry_locator))
        print("Retry Search button found. Waiting before clicking.")
        random_sleep(42, 64)  # long randomized pause before retrying
        retry.click()
        print("Retry button clicked.")
    except TimeoutException:
        print("No 'Retry Search' button found, continuing with normal process.")

def collect_person_info(person, names, curr_jobs, summarys, locations, links):
    """Extract one result card's details and append them to the five output lists.

    Nothing is appended if extraction raises, so the lists stay the same length.
    """
    anchors = person.find_elements(By.TAG_NAME, 'a')
    name, job, link, summary, location = extract_person_details(person, anchors)

    for bucket, value in (
        (links, link),
        (locations, location),
        (summarys, summary),
        (names, name),
        (curr_jobs, job),
    ):
        bucket.append(value)

def extract_person_details(person, all_links):
    """Read name, job, profile link, summary and location from one result card.

    Args:
        person (WebElement): The result card.
        all_links (list): The <a> elements found inside the card.

    Returns:
        tuple: (name_text, curr_job_text, link_text, summary_text, location_text).
        The name defaults to "LinkedIn Member"; job, summary and link default
        to the string "Null" when absent. Job and summary are wrapped in
        double quotes.

    Raises:
        NoSuchElementException: If the name (for named members) or the
            location element is missing from the card.
    """
    name_text = "LinkedIn Member"

    if "LinkedIn Member" not in person.text:
        name_element = person.find_element(By.CSS_SELECTOR, ".entity-result__title-text.t-16 a span[aria-hidden='true']")
        name_text = name_element.text

    curr_jobs = person.find_elements(By.CLASS_NAME, "entity-result__primary-subtitle.t-14.t-black.t-normal")
    curr_job_text = '"' + curr_jobs[0].text + '"' if curr_jobs else "Null"

    summary_elements = person.find_elements(By.TAG_NAME, 'p')
    summary_text = '"' + summary_elements[0].text + '"' if summary_elements else "Null"

    location_element = person.find_element(By.CSS_SELECTOR, ".entity-result__secondary-subtitle.t-14.t-normal")
    location_text = location_element.text

    link_text = "Null"
    for a in all_links:
        href = a.get_attribute('href')
        # Anchors without an href return None; skip them instead of failing on .startswith.
        if not href:
            continue
        # Take the first /in profile URL that does not use an "ACo..." identifier.
        if href.startswith("https://www.linkedin.com/in") and not href.startswith("https://www.linkedin.com/in/ACo"):
            link_text = href
            break

    return name_text, curr_job_text, link_text, summary_text, location_text

def scroll_down(wd):
    """Scroll the window down by 2000 pixels."""
    wd.execute_script("window.scrollBy(0, 2000);")

def process_next_page(wd, page_start, limit, unlimited):
    """Advance to the next results page if allowed and possible.

    Args:
        wd (WebDriver): Driver showing a results page.
        page_start (int): Number of the page that was just scraped.
        limit (int): Last page number to scrape when `unlimited` is False.
        unlimited (bool): If True, `limit` is ignored.

    Returns:
        bool: True when the next page button was clicked, False when the page
        limit has been reached, no next button exists, or it is disabled.
    """
    # Enforce the page limit before touching the browser.
    if not unlimited and page_start >= limit:
        return False

    # find_elements returns an empty list (instead of raising) when the
    # results have no pagination at all.
    next_buttons = wd.find_elements(By.CLASS_NAME, "artdeco-pagination__button.artdeco-pagination__button--next")
    if not next_buttons or not next_buttons[0].is_enabled():
        return False

    next_buttons[0].click()
    random_sleep(1, 2)
    return True


def save_state(state):
    """Serialize `state` as JSON into 'state.json' in the working directory, replacing any existing file."""
    with open('state.json', 'w') as state_file:
        state_file.write(json.dumps(state))

def load_state():
    """Load the saved scraping state from 'state.json'.

    Returns:
        The decoded JSON content, or None when the file does not exist or
        does not contain valid JSON (for example after an interrupted write).
    """
    try:
        with open('state.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError:
        # Treat an unreadable state file like a missing one so a run can still start.
        print("state.json could not be parsed; starting without saved state.")
        return None



def dataframe_output(wd, search_term, location=None, current_company=None, past_company=None, num_pages=4, unlimited=False, industry=True, start_page=1, last_person_index=0):
    """Run a filtered people search and return the rows plus resume information.

    Args:
        wd (WebDriver): The Selenium WebDriver instance.
        search_term (str): The search query.
        location (str): Filter by location.
        current_company (str): Filter by current company.
        past_company (str): Filter by past company.
        num_pages (int): Page limit forwarded to collect_links.
        unlimited (bool): Forwarded to collect_links.
        industry (bool): If True and more than 1000 results are reported, the
            search is repeated once for each of the first five industry options.
        start_page (int): Page counter value forwarded to collect_links.
        last_person_index (int): Resume position forwarded to collect_links
            as its sixth positional argument.

    Returns:
        tuple: (DataFrame, last_page, last_person) where the last two values
        come from the most recent collect_links call.

    Raises:
        Exception: Anything raised while searching or collecting is printed
            and re-raised.
    """
    df = pd.DataFrame(columns=['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link'])
    try:
        search_query(wd, search_term)
        # Remember the filtered results URL so each industry pass starts from it.
        filters_page = add_filters1(wd, location, current_company, past_company)

        if industry and search_results_more_than_1000(wd):
            industry_options = [0, 1, 2, 3, 4]
            for i in industry_options:
                industry_filter(wd, i)
                total_results = get_search_results_number(wd)
                sub_df, last_page, last_person = collect_links(wd, start_page, num_pages, total_results, unlimited, last_person_index)
                df = pd.concat([df, sub_df], ignore_index=True)
                df = df.drop_duplicates(subset=['Profile Link'])
                # Reload the filtered results so the next industry is applied on
                # its own instead of on top of the previous selection.
                wd.get(filters_page)
        else:
            df, last_page, last_person = collect_links(wd, start_page, num_pages, 1000, unlimited, last_person_index)
    except Exception as e:
        print(f"Something went wrong: {e}")
        # Re-raised so the calling scraping loop can save its state and restart.
        raise

    return df, last_page, last_person

In [None]:
def main_scraping_process_rob(wd, keywords, companies, username, password):
    """Scrape every keyword/company pair, saving progress so a run can resume.

    Progress is kept in 'state.json' (via load_state/save_state) and the
    collected rows are written to 'last_saved_dataframe.csv' after every
    company that completes.

    Args:
        wd (WebDriver): Driver to scrape with. It is only signed in here when
            no saved state exists; a resumed run uses it as given.
        keywords (list): Search terms.
        companies (list): Company names used as the current-company filter.
        username (str): LinkedIn account name.
        password (str): LinkedIn password.

    Returns:
        DataFrame: All rows collected so far, including rows loaded from a
        previous run's CSV when resuming.
    """
    output_path = "last_saved_dataframe.csv"
    columns = ['Name', 'Current Job', 'Relevant Experience to Keyword', 'Location', 'Profile Link', 'Keyword', 'Company']

    state = load_state()
    initial_login = state is None
    if initial_login:
        state = {'keyword_index': 0, 'company_index': 0, 'page_start': 1, 'last_person_index': 0}
        results = pd.DataFrame(columns=columns)
    else:
        # Resuming: continue from the rows the previous run already saved.
        try:
            results = pd.read_csv(output_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            results = pd.DataFrame(columns=columns)

    try:
        if initial_login:
            login(wd, username, password)
        for keyword_index in range(state['keyword_index'], len(keywords)):
            keyword = keywords[keyword_index]
            for company_index in range(state['company_index'], len(companies)):
                company = companies[company_index]
                try:
                    print(f"Scraping your request for {keyword} at {company}")
                    df, last_page, last_person = dataframe_output(
                        wd, keyword, "United States", company, None, 5, False, True,
                        state['page_start'], state['last_person_index'])

                    if len(df) > 0:
                        df['Keyword'] = keyword
                        df['Company'] = company
                        results = pd.concat([results, df], ignore_index=True)
                    # Persist after every company so an interrupted run loses little.
                    results.to_csv(output_path, index=False)

                    # Reset page start and last person index after successful scrape
                    state.update({'page_start': 1, 'last_person_index': 0})
                    save_state(state)
                except Exception as e:
                    print(f"Encountered an issue with {company}: {e}, saving state and restarting...")
                    # dataframe_output reports no progress when it raises, so the
                    # saved position restarts this company from the first page.
                    state.update({'keyword_index': keyword_index, 'company_index': company_index,
                                  'page_start': 1, 'last_person_index': 0})
                    save_state(state)
                    wd.quit()
                    wd = refresh_wd()
                    login(wd, username, password)
                    continue
                wd.get('https://www.linkedin.com/')
            state.update({'company_index': 0})  # Reset company index after finishing all companies for a keyword
        state.update({'keyword_index': 0})  # Reset keyword index after finishing all keywords
        save_state(state)
    except Exception as e:
        print(f'Final error occurred: {str(e)}')

    return results