# Scrape Resume Data From Indeed For All States

In [459]:
import pandas as pd # For Data Storage
import requests # For website connection
from bs4 import BeautifulSoup # For HTML parsing
import time # For sleep 
import re # Regular expressions for removing non-ascii terms
import math # Need ceiling expression
from nltk.corpus import stopwords # For filtering out words like 'is', 'the', 'of'
stop_words = set(stopwords.words("english")) #initialize stopwords    
import matplotlib.pyplot as plt
import random
import datetime

In [460]:
from selenium import webdriver # Needed to open webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities # Needed to assign a user agent
from selenium.webdriver.common.by import By # Needed to locate a JS renderred page element
from selenium.webdriver.support.wait import WebDriverWait # Needed to wait until JS renderred webpage element is found
from selenium.webdriver.support import expected_conditions as EC # Also needed to wait until JS renderred webpage element is found

# Create the Dictionary

## Import the list of skills from csv

In [461]:
# Only the first line of the file is used: transposing turns that row into
# column 0, which is then converted to a plain Python list.
skill_list_file = 'data_scientist.txt'
skills = list(
    pd.read_csv('skill_lists/' + skill_list_file, header=None, sep=',').T[0].values
)

In [462]:
# make sure there are no duplicates in the skills list
# (the first occurrence of each skill is kept and the original order is preserved)
seen_skills = set()  # set membership is O(1), so the pass is linear in len(skills)
no_dups = []
for x in skills:
    if x not in seen_skills:
        seen_skills.add(x)
        no_dups.append(x)
skills = no_dups

## Initialize the list of extra factors to consider in each posted resume

In [463]:
# Metadata columns recorded for every resume next to the skill flags
other_factors = ['state', 'title', 'total_resumes']

## Initialize the dictionary with keys and empty lists

In [464]:
# Two groups of columns; every column starts as an empty list and receives
# one value per scraped resume.
d = {
    # One column per skill
    'skills': {skill: [] for skill in skills},
    # One column per additional factor
    'other_factors': {other_factor: [] for other_factor in other_factors},
}
    


## The final dictionary will have two groups and will look like:

### d = {
###     'skills' : { 'skill_a' : [ ], ..., 'skill_z' : [ ] },
###      'other_factors' : { 'factor_a' : [], ..., 'factor_z' : [ ] } 
### } 


# Define Func: Add A Row of Data To Dictionary (For a Resume Description)

In [465]:
def add_vals_to_d(state,title,total_resumes,text):

    '''
    Record one posted resume as a new row in the global dictionary `d`.

    The input is: 
    state: the state of the posted resume,
    title: the title of the resume,
    total_resumes: the total resumes in the state,
    text: the processed text from the posted resume; every skill in the
          global `skills` list is tested against it with `in`

    Nothing is returned; the lists inside `d` are extended in place.
    '''

    # Metadata columns: one value each per resume
    factor_columns = d['other_factors']
    factor_columns['state'].append(state)
    factor_columns['title'].append(title)
    factor_columns['total_resumes'].append(total_resumes)

    # Skill columns: 1 when the skill is found in the wrangled text, else 0
    skill_columns = d['skills']
    for skill in skills:
        skill_columns[skill].append(1 if skill in text else 0)

# Define States List

In [602]:
# Two-letter codes for the 50 states, listed in the order they are scraped
states = (
    'CA TX NY NJ MA MD PA VA NC IL '
    'GA CO OH MI MN MO AZ CT IN TN '
    'OR SC AL LA UT KY NM KS NV IA '
    'NH OK AR RI ID MS NE ME MT VT '
    'SD ND AK DE FL HI WA WV WI WY'
).split()

## Import proxies

# Get top 10 proxies from http://free-proxy.cz/en/proxylist/country/US/https/ping/level1
# Each entry is written as 'ip:port'.
https_proxys = [
    '34.192.220.22:808',
    '206.189.112.106:3128',
    '209.97.191.169:3128',
    '157.230.34.190:1111',
    '157.230.45.121:1111',
    '23.108.64.65:8118',
    '157.230.33.37:1111',
    '52.128.60.130:50692',
    '12.218.209.130:53281',
    '38.134.10.106:53281',
]

In [467]:
# Get proxies from filter
# (the first column of the header-less file replaces any list defined earlier)
https_proxys = list(
    pd.read_csv('filtered_https_proxys_US.txt', header=None)[0]
)

## import headers

In [468]:
# Only the first line of the tab-separated file is used: after the transpose
# it becomes column 0, one field per row.
user_agents = pd.read_csv(
    'headers.txt',
    delimiter='\t',
    header=None,
).T[0]

In [469]:
# Get rid of last element (it is nan)
user_agents = user_agents[:-1]

# Define Funcs: 
- # select a random index,
- # select a random proxy,
- # select a random header

In [471]:
def randIndex(lengthList):
    '''Return a random index between 0 and lengthList-1 (both inclusive).'''
    return random.randrange(lengthList)

def randProxy():
    '''Return a randomly chosen entry of the global https_proxys list.'''
    return https_proxys[randIndex(len(https_proxys))]
    
def randHeader():
    '''Return a randomly chosen entry of the global user_agents collection.'''
    return user_agents[randIndex(len(user_agents))]

## Define Func: Retrieve Text from A Single URL ( We will need to use phantomJS with Proxy Rotation to prevent ip blocking )
def connect_for_specific_resume(url):
    '''
    This function loads a single resume page through PhantomJS, switching to a
    new random proxy on every attempt, and returns the soup object for the page.

    Attempts are repeated without limit until the footer element that the wait
    below looks for becomes visible.

    The input:
    - url: the url of the website

    The output:
    - soup: a soup object for a url
    '''
    # Instantiate webdriver (a single browser is reused for every attempt)
    driver = webdriver.PhantomJS(service_args=['--load-images=no'])

    # Register the PhantomJS command that lets us change the proxy at runtime
    driver.command_executor._commands['executePhantomScript'] = ('POST', '/session/$sessionId/phantom/execute')

    soup = None

    try:
        while soup is None:

            try:
                print('connecting...')

                # Get random proxy with port. Remove the scheme as a prefix:
                # str.strip('https://') would strip any of those characters
                # from both ends and can eat into a host name.
                address = randProxy()
                scheme = 'https://'
                if address.startswith(scheme):
                    address = address[len(scheme):]
                rand_prox_port = address.split(':')

                # Get random proxy part
                proxy = rand_prox_port[0]

                # Get random port part
                port = rand_prox_port[1]

                # Set proxy to driver
                driver.execute('executePhantomScript', {'script': 'phantom.setProxy("{}","{}");'.format(proxy,port), 'args' : [] })

                # Connect driver to website
                # (sometimes driver.get(url) throws an error when using a bad proxy)
                driver.get(url)

                # We need to make the webpage wait until the JavaScript-rendered
                # footer element appears before requesting the html
                # NOTE(review): the XPath only matches a footer containing '2019' -
                # confirm it still matches the live page.
                print('checking a single resume')
                wait = WebDriverWait(driver, 10)
                wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[(@class='rezemp-FooterCopyright') and (contains(. ,'2019'))]")))

                # Assigning soup completes the loop
                soup = BeautifulSoup(driver.page_source,'html.parser')
                time.sleep(2)

            except Exception as e:
                print(str(e))
                print('assigning new proxy...')

    finally:
        # quit() ends the whole browser session; it also runs when the loop is
        # interrupted, so no PhantomJS process is left behind
        driver.quit()

    print('Success!')
    return soup

In [528]:
def connect_for_specific_resume(url):
    '''
    This function loads a single resume page in headless Chrome and returns
    the soup object for the rendered page.

    A fresh browser is started for every attempt and attempts are repeated
    without limit until the footer element that the wait below looks for
    becomes visible.

    The input:
    - url: the url of the website

    The output:
    - soup: a soup object for a url
    '''
    soup = None

    while soup is None:

        print('connecting...')

        # Set up chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        driver = webdriver.Chrome('/Users/mikegiacomazza/anaconda3/bin/chromedriver',options=chrome_options)

        try:
            # Connect driver to website (inside the try so a failed page load
            # is retried like any other failed attempt)
            driver.get(url)

            # We need to make the webpage wait until the JavaScript-rendered
            # footer element appears before requesting the html
            # NOTE(review): the XPath only matches a footer containing '2019' -
            # confirm it still matches the live page.
            print('attempting to retrieve resume description...')
            wait = WebDriverWait(driver, 20)
            wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[(@class='rezemp-FooterCopyright') and (contains(. ,'2019'))]")))

            # Assigning soup completes the loop
            soup = BeautifulSoup(driver.page_source,'html.parser')

        except Exception as e:
            print(str(e))

        finally:
            # Always end the browser session; otherwise every failed attempt
            # leaves a headless Chrome process running
            driver.quit()

        if soup is not None:
            time.sleep(2)

    print('Success!')
    return soup

## Define Func: Retrieve soup for Total Resumes Text from general search URLS for each state

In [521]:
def connect_for_total_resumes(url):
    '''
    This function connects to a resume search page in headless Chrome and
    returns the soup object once the page shows either its resume-count element
    or its "not match any" notice.

    A fresh browser is started for every attempt and attempts are repeated
    without limit until one of the two elements becomes visible.

    The input:
    - url: the url of the website

    The output:
    - soup: a soup object for a url
    '''
    flag = 0

    while flag==0:

        print('connecting...')

        # Set up chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        driver = webdriver.Chrome('/Users/mikegiacomazza/anaconda3/bin/chromedriver',options=chrome_options)

        try:
            # Connect driver to website (inside the try so a failed page load
            # is retried like any other failed attempt)
            driver.get(url)

            # We need to make the webpage wait until the JavaScript-rendered
            # element appears before requesting the html
            try:
                print('checking if any resumes exist')
                wait = WebDriverWait(driver, 20)
                wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[(@class='icl-u-textColor--tertiary') and (contains(. ,'resumes'))]")))

            except Exception as e:
                print(str(e))

                # No resume count appeared; look for the "no match" notice instead.
                # If this wait fails as well, the outer handler retries the page.
                print("checking if resumes don't exist")
                wait = WebDriverWait(driver, 20)
                wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//span[contains(. ,'not match any')]")))

            soup = BeautifulSoup(driver.page_source,'html.parser')

            # Complete the loop when the page loads successfully
            flag = 1

        except Exception as e:
            print(str(e))

        finally:
            # close driver to website; quit() ends the whole browser session
            # on success, on failure and on interruption
            driver.quit()

        if flag == 1:
            time.sleep(2)

    print('Success!')
    return soup

## Define Func: Retrieve Soup for List of Individual Resume URLs for a General Resume Search for each State

In [529]:
def connect_to_get_individual_resume_urls(url):
    '''
    This function connects to one page of a resume search in headless Chrome
    and returns the soup object of the rendered page, from which the caller
    can collect the links it contains.

    A fresh browser is started for every attempt and attempts are repeated
    without limit until the resume-count element or the "not match any" notice
    becomes visible.

    The input:
    - url: the url of the website

    The output:
    - soup: a soup object for a url
    '''
    flag = 0

    while flag==0:

        print('connecting...')

        # Set up chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920x1080")
        driver = webdriver.Chrome('/Users/mikegiacomazza/anaconda3/bin/chromedriver',options=chrome_options)

        try:
            # Connect driver to website (inside the try so a failed page load
            # is retried like any other failed attempt)
            driver.get(url)

            # We need to make the webpage wait until the JavaScript-rendered
            # element appears before requesting the html
            try:
                print('checking if any resumes exist')
                wait = WebDriverWait(driver, 20)
                wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[(@class='icl-u-textColor--tertiary') and (contains(. ,'resumes'))]")))

            except Exception as e:
                print(str(e))

                # No resume count appeared; look for the "no match" notice instead.
                # If this wait fails as well, the outer handler retries the page.
                print("checking if resumes don't exist")
                wait = WebDriverWait(driver, 10)
                wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//span[contains(. ,'not match any')]")))

            soup = BeautifulSoup(driver.page_source,'html.parser')

            # Complete the loop when the page loads successfully
            flag = 1

        except Exception as e:
            print(str(e))

        finally:
            # close driver to website; without this a failed attempt would
            # leave its headless Chrome process running
            driver.quit()

        if flag == 1:
            time.sleep(2)

    print('Success!')
    return soup

# We need to render JS for every resume, so we can't use requests :(
def connect_individual_resume(url):
    '''
    This function fetches a url with a plain HTTP request, sent through a
    random proxy with a random User-Agent, and returns the soup object.

    No JavaScript is executed, so the soup holds only the raw response HTML.
    Attempts are repeated without limit, each with a newly drawn proxy and
    header, until a request completes.

    The input:
    - url: the url of the website

    The output:
    - soup: a soup object for a url
    '''
    flag = 0

    while flag==0:

        print('connecting...')
        try:
            # Assign a random proxy. The dictionaries are built locally: they are
            # not defined anywhere else, and writing into undefined names would
            # raise a NameError that the handler below swallows on every pass.
            proxy = {'https': randProxy()}

            # Assign random header
            header = {'User-Agent': randHeader()}

            # Make the request
            response = requests.get(url,proxies=proxy,headers=header,timeout=3) #Get the html code
            soup = BeautifulSoup(response.text, "html.parser") #Store html in a soup object

            # Sleeping can prevent ip blocking
            time.sleep(1)
            flag = 1
        except Exception as e:
            print(str(e))
            print('assigning a new proxy...')
    print('Success!')
    return soup

## For Each State:

- ## Get the URLs that list up to 250 resumes

In [530]:
# Set the general search term in lowercase, with every space URL-encoded as '%20'
# i.e. a search for the term Professional Baseball Player would be search_term = 'professional%20baseball%20player'
search_term = 'data%20scientist'

In [531]:
count = 0

# Initialize the dictionary that maps each state to its total number of resumes
total_resumes_dict = {}

# A single pass over the states is enough: every iteration stores a value for
# its state, even when the count cannot be read. The progress message uses
# len(states), so nothing here assumes exactly 50 entries.
for state in states:
    if state in total_resumes_dict:
        continue

    count += 1
    print("Number of states left to scrape: "+str(len(states)-count))
    url = 'https://resumes.indeed.com/search?l='+state+'&q='+search_term+'&searchFields=&start=0'

    soup = connect_for_total_resumes(url) # Connect and retrieve a soup object from a url

    # Sometimes there are no resumes and an error like 'expected string or buffer'
    # is raised when trying to read the number of resumes from the webpage
    try:
        # Get the total number of listed resumes by keeping only the digits of
        # the first child of the 'icl-u-textColor--tertiary' div
        totalResumes = int(re.sub('[^0-9]','',soup.find('div',{'class':'icl-u-textColor--tertiary'}).contents[0]))
        print(state +' has '+str(totalResumes)+' resumes.')

    # when the error is thrown, there are no resumes listed and we set the total resumes to zero
    except Exception as e:
        print(str(e))
        totalResumes = 0
        print(state+' has no resumes.')

    # We will store the total number of resumes for each state
    total_resumes_dict[state] = totalResumes


Number of states left to scrape: 49
connecting...
checking if any resumes exist
Success!
AL has 328 resumes.
Number of states left to scrape: 48
connecting...
checking if any resumes exist
Success!
AK has 33 resumes.
Number of states left to scrape: 47
connecting...
checking if any resumes exist
Success!
AZ has 594 resumes.
Number of states left to scrape: 46
connecting...
checking if any resumes exist
Success!
AR has 159 resumes.
Number of states left to scrape: 45
connecting...
checking if any resumes exist
Success!
CA has 6662 resumes.
Number of states left to scrape: 44
connecting...
checking if any resumes exist
Success!
CO has 1091 resumes.
Number of states left to scrape: 43
connecting...
checking if any resumes exist
Success!
CT has 568 resumes.
Number of states left to scrape: 42
connecting...
checking if any resumes exist
Success!
DE has 173 resumes.
Number of states left to scrape: 41
connecting...
checking if any resumes exist
Success!
FL has 1808 resumes.
Number of states 

# total_resumes_dict = temp_total_resumes_dict

In [493]:
# Keep an independent copy as a backup. A plain assignment would only create a
# second name for the same dictionary, so entries deleted from
# total_resumes_dict during scraping would disappear from the backup as well.
temp_total_resumes_dict = dict(total_resumes_dict)

## Define Func: Process the text from a resume description 

In [532]:
def process_resume_desc(soup):
    ''' This function extracts the resume description text from a soup object for a resume url
        and processes the text into a list of individual words.

        Each word appears once, in order of first appearance; stop words
        (the global stop_words collection) and empty strings are left out.
        
        The input:
        - soup: the soup object for the url of a resume description.

        The output:
        - wrangled_text: a list of individual words from the resume description
    '''

    # Get the text of every matching element in the html body
    html_body_desc = [item.text for item in soup.body.find_all(['div','p','li','strong','ul','br'])]

    # Join the pieces into one lower-case string, separated by ','
    html_body_as_string = ','.join(html_body_desc).lower()

    # Keep only letters, '+', '#', '2', '3' and '4' (so tokens such as 'c++',
    # 'c#' or '3d' survive); every other character becomes a ',' separator
    separated_text = re.sub("[^a-zA-Z+#234]",',',html_body_as_string)

    # Split on ',' and keep the first occurrence of each word. The set makes
    # the duplicate check constant-time instead of scanning a growing list.
    seen_words = set()
    wrangled_text = []
    for word in str(separated_text).split(','):
        if word == '' or word in seen_words:
            continue
        seen_words.add(word)

        # Remove the stopwords
        if word not in stop_words:
            wrangled_text.append(word)

    return wrangled_text
    

## Define Func: Make a Dictionary of Indeed Resume Search Urls

In [488]:
# test
# Scratch check of the page arithmetic for a search that returns 53 resumes
test_resume_count = 53
range(0, min(1 + int(math.floor((test_resume_count - 1) / 50.0)), 1)), math.floor(test_resume_count / 50.0)

([0], 1.0)

In [533]:
def generate_general_resume_urls(total_resumes,state,max_pages=1,query='data%20scientist'):
    '''
    This function generates indeed URLS for up to the first `max_pages` pages
    of a resume search, with 50 indeed resumes per page.
    
    input: 
    - total_resumes: the total amount of resumes for a resume search 
    - state: the state in the resume search
    - max_pages: the largest number of result pages to generate (default 1)
    - query: the url-encoded search term placed in the URL
             (default 'data%20scientist')
    
    output:
    - resume_urls_dict: maps 'page<start>' to the search URL starting at that
      offset (0, 50, ...); it is empty when total_resumes is 0
    '''
    
    # URL e.g. - https://resumes.indeed.com/search?l=NY&q=data%20scientist&start=50
    # URL in variable form - https://resumes.indeed.com/search?l= + state + &q= + query + &start= + {0,50,...}
    
    resumes_per_page = 50

    # Pages needed to show every resume; this is 0 when there are no resumes,
    # so no separate zero check is required
    pages_needed = 1 + int(math.floor(float(total_resumes - 1) / float(resumes_per_page)))

    resumes_urls_dict = {}
    for page_index in range(0, min(pages_needed, max_pages)):
        page = page_index * resumes_per_page
        resumes_urls_dict['page' + str(page)] = 'https://resumes.indeed.com/search?l=' + state + '&q=' + query + '&start=' + str(page)

    return resumes_urls_dict

- ## Get the list of posted resume URLs -> For each URL: check if each skill exists and store data in dictionary

In [534]:
# A state is deleted from the total resumes dictionary only after all of its
# resumes have been processed, so the dictionary always holds the states that
# still need to be scraped.
# Every loop below walks a snapshot (list) of the keys because entries are
# deleted while looping; deleting from a dictionary that is being iterated
# directly raises a RuntimeError on Python 3.
for state in list(total_resumes_dict.keys()):

    print("Number of states left to scrape: "+str(len(total_resumes_dict.keys())))

    # We need the total resumes to generate the search urls for that state
    total_resumes = total_resumes_dict[state]

    # Make the page URLS to connect to for each state
    resume_general_urls_dict = generate_general_resume_urls(total_resumes,state)

    # initialize a dictionary to store each specific resume url (key) and its title (value)
    resume_specific_urls_and_titles_dict = {}

    # Collect the specific resume urls listed on every general search page
    for url_key in list(resume_general_urls_dict.keys()):
        soup = connect_to_get_individual_resume_urls(resume_general_urls_dict[url_key])
        print("Number of general search links left to scrape: "+str(len(resume_general_urls_dict.keys())))
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href give None and are skipped;
            # a resume link always begins with '/resume/'
            if href and href.startswith('/resume/'):
                resume_specific_urls_and_titles_dict['https://resumes.indeed.com'+href] = link.text

        # We will delete the general url from the dictionary when we are done with it
        del resume_general_urls_dict[url_key]

    # Every time a specific resume url is used successfully, it is deleted
    for resume_url_key in list(resume_specific_urls_and_titles_dict.keys()):

        print("Number of resumes left to scrape: "+str(len(resume_specific_urls_and_titles_dict.keys())))

        # Access one of the resume pages for a state
        soup = connect_for_specific_resume(resume_url_key)

        # Get the filtered list of words from the resume description in the soup object
        text_final = process_resume_desc(soup)

        # Add the values to the Dictionary
        add_vals_to_d(state, resume_specific_urls_and_titles_dict[resume_url_key], total_resumes, text_final)

        # delete the specific resume link when finished
        del resume_specific_urls_and_titles_dict[resume_url_key]

    # delete state in the total resumes dictionary once all of its resumes are stored
    del total_resumes_dict[state]

Number of states left to scrape: 50
connecting...
checking if any resumes exist
Success!
Number of general search links left to scrape: 1
Number of resumes left to scrape: 50
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 49
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 48
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 47
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 46
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 45
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 44
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 43
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left

attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 24
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 23
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 22
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 21
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 20
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 19
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 18
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 17
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 16
connecting...
attempting to retrieve resume description...
Success!
N

attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 48
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 47
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 46
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 45
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 44
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 43
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 42
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 41
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 40
connecting...
attempting to retrieve resume description...
Success!
N

attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 21
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 20
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 19
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 18
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 17
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 16
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 15
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 14
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 13
connecting...
attempting to retrieve resume description...
Success!
N

attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 23
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 22
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 21
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 20
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 19
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 18
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 17
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 16
connecting...
attempting to retrieve resume description...
Success!
Number of resumes left to scrape: 15
connecting...
attempting to retrieve resume description...
Success!
N

Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connecting...
attempting to retrieve resume description...
Message: 

connectin

attempting to retrieve resume description...


KeyboardInterrupt: 

# ~400 resumes per ip address with requests with small delay

{'other_factors': {'state': ['WI',
   'WI',
   'WI',
   'WI',
   'WI',
   'AK',
   'AK',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'WA',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'DE',
   'WI',
  

# Make the DataFrame

In [540]:
# Build one dataframe per column group and place them side by side:
# the extra factors first, then the skill flags
data = pd.concat(
    [pd.DataFrame(d[group]) for group in ('other_factors', 'skills')],
    axis=1,
)

# View some of the data
data.head()

Unnamed: 0,state,title,total_resumes,alteryx,amazon,anaconda,angoss,azure,c,c#,...,tibco,torch,trifacta,tsql,unix,vba,watson,weka,yarn,zookeeper
0,WI,Scientist,436,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,WI,Data Science Expert,436,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,WI,Scientist,436,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,WI,Scientist,436,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WI,Data Science Expert,436,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Store to File

In [537]:
# Stamp the file name with today's date as month-day-year (no zero padding)
now = datetime.datetime.now()
date = '{}-{}-{}'.format(now.month, now.day, now.year)
data.to_csv(
    'scraped_resume_data/scraped_resume_data_' + search_term + '_' + date + '.txt',
    encoding='utf-8',
    index=None,
)

# Check the File 

In [None]:
# Read the stored file back in, using the same path it was written to
# (the opening quote of the path was missing, which is a syntax error)
df = pd.read_csv('scraped_resume_data/scraped_resume_data_'+search_term+'_'+date+'.txt')