# Linkedin Resume Scraper
## Importing Packages

In [1]:
import selenium
from selenium import webdriver as wb
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from random import randrange
from tqdm import tqdm
from config import Username,Password
import time
import os
import re
import shutil
import glob
import sys

In [2]:
def Driver_Properties(download_folder):
    '''
    Defining properties of Selenium webdriver
    download_folder: Takes resumes download folder path 
    '''
    options = Options()
    options = wb.ChromeOptions()
    prefs = {
            "download.default_directory": download_folder,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.disable_download_protection": True,
            "safebrowsing.enabled": True
            }
    options.add_experimental_option("prefs", prefs)
    options.add_argument("window-size=1200x600")
    options.add_argument('start-maximized')
    options.add_argument('disable-infobars')
    driver = wb.Chrome(options=options)
    return driver

In [3]:
def Linkedin_Login(Username,Password):
    '''
    Allows you to login into account by taking the credentials from a config file
    '''
    driver.get('https://www.linkedin.com/login')
    username = driver.find_element_by_id('username')
    username.send_keys(Username)
    passwd = driver.find_element_by_id('password')
    passwd.send_keys(Password)
    passwd.send_keys(Keys.ENTER)
    time.sleep(2)

In [4]:
def Search_Role(role):
    '''
    Searches for a particular job role 
    '''
    s = driver.find_element_by_class_name('nav-search-typeahead')
    sr = s.find_element_by_class_name('search-global-typeahead__input')
    sr.send_keys(role)
    sr.send_keys(Keys.ENTER)
    time.sleep(2)
    #navigate to People tab
    ao = driver.find_element_by_class_name('authentication-outlet')
    ng = ao.find_element_by_class_name('neptune-grid')
    sf = ng.find_element_by_class_name('search-filters-bar') 
    people = sf.find_element_by_class_name('search-vertical-filter__filter-item').click()
    time.sleep(2)

In [5]:
def Filter_Location(location):
    '''
    Filters the role according to the location given
    Clicks on the Apply button when filter is set
    '''
    l = driver.find_element_by_class_name('peek-carousel')
    l.find_element_by_class_name('search-s-facet--geoRegion').click()
    to = driver.find_element_by_xpath("//input[@role='combobox'][@placeholder='Add a country/region']") # changed from l to driver
    to.clear
    to.send_keys(location)
    time.sleep(2)
    to.send_keys(Keys.DOWN, Keys.RETURN)
    driver.find_elements_by_tag_name('button')[11].click()

In [6]:
def Get_Page_Urls(page_limit):
    '''
    Navigates to the page range as required
    '''
    page_urls = []
    initial_url = driver.current_url
    page_urls.append(initial_url)
    for i in range(1,2):
        url = initial_url+"&page=" + str(i)
        page_urls.append(url)
    return page_urls 

In [7]:
def Profile_Links(page_urls):
    '''
    Collects all the profile urls that appear from the pages we want
    '''
    profilelinks = []
    for i in tqdm(page_urls):
        driver.get(i)
        results = driver.find_elements_by_class_name("search-result__occluded-item")
        for result in results:
            hover = ActionChains(driver).move_to_element(result)
            hover.perform()
            time.sleep(randrange(3,6))
            links = result.find_element_by_class_name('search-result__result-link').get_property('href')
            profilelinks.append(links)
            time.sleep(randrange(3,6))
    return profilelinks

In [8]:
def download_wait(directory, timeout, nfiles=None):
    """
    Wait for downloads to finish with a specified timeout.

    Args
    ----
    directory : str
        The path to the folder where the files will be downloaded.
    timeout : int
        How many seconds to wait until timing out.
    nfiles : int, defaults to None
        If provided, also wait for the expected number of files.

    """
    seconds = 0
    dl_wait = True
    while dl_wait and seconds < timeout:
        time.sleep(1)
        dl_wait = False
        files = os.listdir(directory)
        if nfiles and len(files) != nfiles:
            dl_wait = True

        for fname in directory:
            if fname.endswith('.crdownload'):
                dl_wait = True

        seconds += 1
    return seconds

In [9]:
def Download_Pdf(profilelinks,download_folder):
    '''
    profile_links:
        From the urls collected it downloads and saves the profile as PDF
    dowload_folder: 
        The downloaded pdfs will be stored in the folder path given
    filename:
        The downloaded pdfs are given the name of the person
    '''
    for i in tqdm(profilelinks):
        driver.get(i)
        m = driver.find_element_by_class_name("flex-1")
        m.find_element_by_class_name("pv-s-profile-actions__overflow").click()
        p_name = driver.find_element_by_css_selector('.inline').text
        name = p_name.split(',')[0]
        m.find_element_by_class_name('pv-s-profile-actions--save-to-pdf').click()
        time.sleep(randrange(5, 10))
        filename = max(glob.iglob(download_folder +'/*'), key=os.path.getmtime)
        shutil.move(filename,os.path.join(download_folder, name + '.pdf'))
        download_wait(download_folder, 20)
        print(name)

In [10]:
# To run all the above functions
if __name__ == '__main__':
    download_folder = "C:\\Users\\Bhagya\\Resume Analyzer\\ACTUAL PROJECT\\Resumes_FL1"
    driver = Driver_Properties(download_folder)
    Linkedin_Login(Username,Password)
    role = "Data Scientist"
    Search_Role(role)    
    location = "Orange County, California Area"                #"Orange County, California Area" 
    Filter_Location(location)
    page_limit = 1
    page_urls = Get_Page_Urls(page_limit)
    profilelinks = Profile_Links(page_urls)
    Download_Pdf(profilelinks,download_folder)
    driver.close()

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:57<00:00, 88.76s/it]
  5%|████▏                                                                              | 1/20 [00:12<03:55, 12.40s/it]

Roujing Chen


 10%|████████▎                                                                          | 2/20 [00:26<03:51, 12.85s/it]

Tianyu Feng


 15%|████████████▍                                                                      | 3/20 [00:38<03:35, 12.67s/it]

Ruozhang (Olivia) Yao


 20%|████████████████▌                                                                  | 4/20 [00:49<03:16, 12.29s/it]

Farah Farouk


 25%|████████████████████▊                                                              | 5/20 [01:02<03:04, 12.30s/it]

Janet Aquino


 30%|████████████████████████▉                                                          | 6/20 [01:17<03:05, 13.24s/it]

Sharon Teo


 35%|█████████████████████████████                                                      | 7/20 [01:31<02:53, 13.34s/it]

Lawrence Wu


 40%|█████████████████████████████████▏                                                 | 8/20 [01:43<02:36, 13.08s/it]

Ashima Horra


 45%|█████████████████████████████████████▎                                             | 9/20 [01:55<02:20, 12.80s/it]

Mike Yung


 50%|█████████████████████████████████████████                                         | 10/20 [02:08<02:06, 12.66s/it]

Jackson Leung


 55%|█████████████████████████████████████████████                                     | 11/20 [02:20<01:51, 12.39s/it]

Roujing Chen


 60%|█████████████████████████████████████████████████▏                                | 12/20 [02:35<01:46, 13.36s/it]

Tianyu Feng


 65%|█████████████████████████████████████████████████████▎                            | 13/20 [02:50<01:35, 13.68s/it]

Ruozhang (Olivia) Yao


 70%|█████████████████████████████████████████████████████████▍                        | 14/20 [03:04<01:23, 13.92s/it]

Farah Farouk


 75%|█████████████████████████████████████████████████████████████▌                    | 15/20 [03:18<01:09, 13.96s/it]

Janet Aquino


 80%|█████████████████████████████████████████████████████████████████▌                | 16/20 [03:33<00:56, 14.17s/it]

Sharon Teo


 85%|█████████████████████████████████████████████████████████████████████▋            | 17/20 [03:44<00:40, 13.37s/it]

Lawrence Wu


 90%|█████████████████████████████████████████████████████████████████████████▊        | 18/20 [04:00<00:27, 13.94s/it]

Ashima Horra


 95%|█████████████████████████████████████████████████████████████████████████████▉    | 19/20 [04:14<00:13, 13.96s/it]

Mike Yung


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [04:28<00:00, 13.41s/it]

Jackson Leung



