In [1]:
import itertools
import os
import pickle
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

#### Get resumé page links. Each resumé appears on a different webpage.
#### Note: you might want to sign in to an account first - after a while the site stops you from paging through results unless you are signed in.

In [None]:
# Path to the local ChromeDriver binary (machine-specific; adjust for your setup).
chromedriver = '/Applications/chromedriver'
os.environ["webdriver.chrome.driver"] = chromedriver

# Launch Chrome and open the resumes.indeed.com landing page.
driver = webdriver.Chrome(chromedriver)
driver.get("https://resumes.indeed.com/?rsrdr=1&hl=en&co=US")

# Job titles to search for; scrape_pages() appends [href, link text] pairs to roles_link.
roles = ['Data Scientist','Data Analyst','Data Engineer']
roles_link = []

def _collect_resume_links():
    """Append an ``[href, link text]`` pair to ``roles_link`` for each result link on the current page."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for person in soup.find_all('a', target='_blank'):
        href = person.get('href')
        if href is None:
            # Anchor without an href attribute: nothing to follow, skip it.
            continue
        roles_link.append([href, person.text])


def scrape_pages(first_page, last_page):
    """Collect resume links for every title in ``roles`` into ``roles_link``.

    For each role the function types the role into the search box of the page
    the global ``driver`` was on when the function was called, submits the
    search, and then walks the result pages by requesting the search URL with
    ``start`` offsets ``first_page, first_page + 50, ...`` (strictly below
    ``last_page``). Every page that is loaded is parsed for result links.

    Parameters
    ----------
    first_page : int
        Offset of the second results page (each page lists 50 resumes, so
        this is normally 50).
    last_page : int
        Exclusive upper bound for the ``start`` offset.

    Returns
    -------
    None on success, or the string ``"Error"`` if a results page could not
    be requested (the browser is left open in that case).

    Side effects: appends to the global ``roles_link`` and closes the global
    ``driver`` once all roles have been processed.
    """
    search_url = "https://resumes.indeed.com/search?l=United%20States&q={}&searchFields=jt&start={}"
    # Remember where the search form lives so later roles can return to it.
    landing_url = driver.current_url

    for position, role in enumerate(roles):
        if position:
            # The previous role left the browser on a results page.
            driver.get(landing_url)

        time.sleep(2)
        query = driver.find_element_by_xpath('//*[@id="input-q"]')
        # Clear first so a previous role's text is not concatenated with this one.
        query.clear()
        query.send_keys(role)
        time.sleep(1)

        submit = driver.find_element_by_xpath('//*[@id="content"]/div/div[2]/div/div[1]/div[2]/div/form/div[3]/button')
        submit.click()
        time.sleep(5)

        # First results page (offset 0) is the one the form submission landed on.
        _collect_resume_links()

        # for each page, there are 50 resumé links and this is displayed in the hyperlink
        # identify how many resumés there are in the last page and input it into the function
        # e.g. first_page will always be = 50 and last_pages could be ~100000 if there are that many resumés
        for num in range(first_page, last_page, 50):
            try:
                # Query the role being scraped (URL-encoded), not a fixed title.
                driver.get(search_url.format(quote(role), num))
                time.sleep(5)
            except Exception:
                return "Error"
            _collect_resume_links()

    # Close the browser only after every role has been processed.
    driver.close()
    
# Walk result offsets 50, 100, ... (below 100000) for every role in `roles`.
scrape_pages(50, 100000)

#### Removing duplicated links and storing in pickle.

In [None]:
# Sort so that equal [href, text] pairs sit next to each other, then keep one
# representative per run of equal neighbours.
roles_link.sort()
roles_link = [unique_link for unique_link, _run in itertools.groupby(roles_link)]

# with open('se_links.pkl','wb') as file:
#     pickle.dump(roles_link,file)

# with open('roles_link.pkl','rb') as file:
#     roles_link = pickle.load(file)

len(roles_link)

#### Removing links whose titles are not relevant to "Data" or that are "Data Entry" roles.

In [None]:
# Keep a link only when its text fully matches 'Data' under fuzzy partial
# matching and it is not a "Data Entry" posting.
roles_target = [
    entry
    for entry in roles_link
    if fuzz.partial_ratio(entry[1], 'Data') == 100 and 'Data Entry' not in entry[1]
]

len(roles_target)

#### Scraping the resumés from their respective webpages.

In [3]:
# Start a new Chrome session for visiting the individual resume pages.
chromedriver = '/Applications/chromedriver'
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

def _extract_or_error(extract):
    """Return ``extract()``, or ``['error']`` if the extraction raises."""
    try:
        return extract()
    except Exception:
        # Best-effort scraping: one missing element must not abort the whole
        # resume. KeyboardInterrupt is deliberately not caught.
        return ['error']


def scrape_resume(resume):
    """Load one resume page with the global ``driver`` and parse it into a dict.

    Parameters
    ----------
    resume : sequence
        ``resume[0]`` is the link path appended to ``https://resumes.indeed.com``
        and ``resume[1]`` is the title text stored under ``'Title'``.

    Returns
    -------
    dict
        Always contains the keys ``'Title'``, ``'Companies'``,
        ``'Resume Summary'``, ``'Current Location'``, ``'Start Dates'``,
        ``'Work Experiences'``, ``'Universities'``, ``'Degrees'``, ``'in'``,
        ``'Skills'``, ``'Additionals'`` and ``'Whole Resume'``. Each value is a
        list; a field whose extraction fails is ``['error']``.

    Side effects: navigates the global ``driver`` and sleeps 5 seconds before
    returning, so consecutive calls are spaced out.
    """
    res = {}
    driver.get('https://resumes.indeed.com'+ resume[0])
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def start_dates():
        # Only the last work-experience subtitle on the page is inspected; the
        # first "Month Year"-shaped match of each of its date divs is kept.
        date_pattern = re.compile("[A-Z][a-z]* [0-9]+")
        last_subtitle = soup.find_all('div', class_='rezemp-WorkExperience-subtitle')[-1]
        return [date_pattern.findall(str(start.text))[0]
                for start in last_subtitle.find_all('div', class_='icl-u-textColor--tertiary')]

    def skills():
        # Text of every span nested three levels deep inside a content section.
        return [z.text
                for section in soup.find_all('div',class_='rezemp-ResumeDisplaySection-content')
                for x in section.find_all('span')
                for y in x.find_all('span')
                for z in y.find_all('span')]

    res['Title'] = _extract_or_error(lambda: [resume[1]])
    res['Companies'] = _extract_or_error(
        lambda: [company.text for company in soup.find_all('span', class_='icl-u-textBold')])
    # Fourth child of the first div inside the resume body.
    res['Resume Summary'] = _extract_or_error(
        lambda: [[summary.text for summary in soup.find('div', class_='rezemp-ResumeDisplay-body').find('div')][3]])
    # Third span of the header line.
    res['Current Location'] = _extract_or_error(
        lambda: [[country.text for country in soup.find('div', class_='rezemp-u-h5 icl-u-textColor--secondary').find_all('span')][2]])
    res['Start Dates'] = _extract_or_error(start_dates)
    res['Work Experiences'] = _extract_or_error(
        lambda: [list(exp.children)[2].text for exp in soup.find_all('div', 'rezemp-WorkExperience')])

    degree = []
    university = []
    in_ = []
    # Register the education keys up front so every record has them, even when
    # the page contains no content sections at all.
    res['Universities'] = university
    res['Degrees'] = degree
    res['in'] = in_
    for section in soup.find_all('div',class_='rezemp-ResumeDisplaySection-content'):
        try:
            for x in section.find_all('div',class_="rezemp-ResumeDisplay-university"):
                university.append(x.find('span',class_="icl-u-textBold").text)
            res['Universities'] = university
            for x in section.find_all('span',class_='rezemp-ResumeDisplay-itemTitle'):
                degree.append(x.find('span').text)
                in_.append(x.find_all('span')[2].text)
            res['Degrees'] = degree
            res['in'] = in_
        except Exception:
            # A failing section marks all three fields; a later section that
            # parses cleanly puts the accumulated lists back.
            res['Universities'] = ['error']
            res['Degrees'] = ['error']
            res['in'] = ['error']

    res['Skills'] = _extract_or_error(skills)
    # Text of the last display section on the page.
    res['Additionals'] = _extract_or_error(
        lambda: [[add.text for add in soup.find_all('div',class_='rezemp-ResumeDisplaySection')][-1]])
    res['Whole Resume'] = _extract_or_error(
        lambda: [text.text for text in soup.find('div',class_='rezemp-ResumeDisplay')])

    time.sleep(5)
    return res

#### Put each resumé entry into a dictionary.

In [4]:
# Scrape results keyed by resume link; each value is the dict returned by
# scrape_resume() or the string 'Error'. Re-running this cell starts empty.
list_of_resumes = {}


def dic(lis):
    """Scrape every resume in ``lis`` and store the outcome in ``list_of_resumes``.

    Parameters
    ----------
    lis : iterable
        Entries accepted by ``scrape_resume``; ``entry[0]`` is used as the key.

    A resume whose scrape raises is recorded as the string ``'Error'`` so it
    can be retried later. ``KeyboardInterrupt`` is not swallowed, so
    interrupting the kernel stops the loop.
    """
    for resume in lis:
        try:
            list_of_resumes[resume[0]] = scrape_resume(resume)
        except Exception:
            list_of_resumes[resume[0]] = 'Error'

# Scrape every filtered resume link; results land in list_of_resumes keyed by link.
dic(roles_target)

#### Making sure that all resumés are included - re-scraping any that errored.

In [5]:
def if_error(lis):
    """Return the ``roles_target`` entries whose scrape is recorded as ``'Error'``.

    Parameters
    ----------
    lis : dict
        Mapping of resume link to scrape result, as filled in by ``dic``.

    Returns
    -------
    list
        Every entry of the global ``roles_target`` whose first element is a
        key of ``lis`` with the value ``'Error'``. Entries are ordered by the
        failed keys in ``lis``; the list is empty when nothing failed.
    """
    failed_links = [link for link, outcome in lis.items() if outcome == 'Error']

    # Index roles_target by link once instead of rescanning it per failure.
    targets_by_link = {}
    for target in roles_target:
        targets_by_link.setdefault(target[0], []).append(target)

    error_resumes = []
    for link in failed_links:
        error_resumes.extend(targets_by_link.get(link, []))
    return error_resumes

# Retry the resumes whose earlier scrape was recorded as 'Error'.
dic(if_error(list_of_resumes))

In [77]:
# Sanity check: an empty list means no resume is still marked 'Error'.
if_error(list_of_resumes)

[]

#### Storing the resumé dictionary in a pickle file and loading it back.

In [3]:
# with open('resumes_list.pkl','wb') as file:
#     pickle.dump(list_of_resumes,file)

# Reload the previously pickled scrape results.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
with open('resumes_list.pkl','rb') as file:
    resumes_list = pickle.load(file)

In [44]:
# Store each resume's link (its dictionary key) inside the record itself.
for link, record in resumes_list.items():
    record.update(Link=link)

In [56]:
# Print only the first stored record as a quick look at the data.
for first_record in resumes_list.values():
    print(first_record)
    break

{'Title': ['Financial Data Analyst'], 'Companies': ['Yoox Net-A-Porter Group', 'Yoox Net-A-Porter Group', '', 'Critical Planning Assoc. LLC', 'Bloomsburg University of Pennsylvania'], 'Resume Summary': ['Goal-oriented team player with experience in various areas within finance, data analysis, sales and customer service. Proven ability to drive process efficiency and financial integrity as evidenced by years of experience leading teams and reporting data metrics. Dedicated and motivated individual seeking a position in a fast-growth company to promote the corporate strategy by achieving and exceeding business goals and objectives.'], 'Current Location': [''], 'Start Dates': ['May 2012'], 'Work Experiences': ['• Formulate sales reports and other metrics using Microsoft Excel and SPSS/SQL software.• Prepare and analyze sales and productivity reports relative to projections with the use of Pivot Tables and VLOOKUP functions.• Record, monitor and report net spend of various customer account

#### Storing dictionaries in MongoDB - decided on doing this over SQL because 1. there are different numbers of elements in each category and 2. it's convenient because I scraped dictionaries.

In [53]:
def to_mongodb(dictionary):
    """Insert every value of ``dictionary`` into the ``resumes.resumes`` collection.

    Parameters
    ----------
    dictionary : dict
        Mapping whose values are the documents to insert; the keys are not
        stored.

    Connects to MongoDB on ``localhost:27017``, inserts the documents one by
    one in iteration order, and always closes the connection afterwards, even
    if an insert raises.
    """
    client = MongoClient('localhost', 27017)
    try:
        resumes = client['resumes']['resumes']
        for record in dictionary.values():
            # NOTE(review): pymongo adds an `_id` field to the inserted dict in
            # place, so inserting the same dict twice fails - confirm before re-running.
            resumes.insert_one(record)
    finally:
        client.close()

In [54]:
# Insert every loaded resume record into MongoDB.
to_mongodb(resumes_list)