# Scrape Data From Monster For All States

In [28]:
import pandas as pd # For Data Storage
import requests # For website connection
from bs4 import BeautifulSoup # For HTML parsing
import time # For sleep 
import re # Regular expressions for removing non-ascii terms
from itertools import groupby # For removing duplicates from lists
import math # Need ceiling expression
from nltk.corpus import stopwords # For filtering out words like 'is', 'the', 'of'
stop_words = set(stopwords.words("english")) #initialize stopwords    
import matplotlib.pyplot as plt
import random
import datetime

# Create the Dictionary

## Import the list of skills from csv

In [29]:
# File (inside skill_lists/) that holds the comma-separated skill names
skill_list_file = 'data_scientist.txt'

# Read the file without a header row and keep its first row as a plain list
skills = list(
    pd.read_csv(
        'skill_lists/' + skill_list_file,
        sep=',',
        header=None,
    ).T[0].values
)

In [30]:
# make sure there are no duplicates in the skills list
# (the first occurrence of each skill is kept, so the original order is preserved)
no_dups = []
seen_skills = set()  # O(1) membership test instead of rescanning no_dups for every skill
for x in skills:
    if x not in seen_skills:
        seen_skills.add(x)
        no_dups.append(x)
skills = no_dups

## Initialize the list of extra factors to consider in each job post

In [31]:
# Non-skill values stored for every job post, next to the per-skill flags
other_factors = ['state', 'job_title', 'total_jobs']

## Initialize the dictionary with keys and empty lists

In [32]:
# Two groups of columns; every column name maps to its own, initially empty, list
d = {
    # one list per skill
    'skills': {skill_name: [] for skill_name in skills},
    # one list per additional factor
    'other_factors': {factor_name: [] for factor_name in other_factors},
}
    


## The final dictionary will have two groups and will look like:

### d = {
###     'skills' : { 'skill_a' : [ ], ..., 'skill_z' : [ ] },
###      'other_factors' : { 'factor_a' : [], ..., 'factor_z' : [ ] } 
### } 


# Define Func: Add A Row of Data To Dictionary (For a Job Description)

In [33]:
def add_vals_to_d(state,job_title,total_jobs,text):
    '''
    Record one job post as a new entry in every list of the global dictionary d.

    The input is:
    state: the state of the job post,
    job_title: the job title of the job post,
    total_jobs: the total jobs in the state
    text: the processed text from the job post; every skill is looked up in it with "in"
    '''
    # The three descriptive values go into the 'other_factors' group
    factors = d['other_factors']
    factors['state'].append(state)
    factors['job_title'].append(job_title)
    factors['total_jobs'].append(total_jobs)

    # Every skill gets a flag: 1 when it is found in the text, 0 otherwise
    skill_flags = d['skills']
    for skill in skills:
        skill_flags[skill].append(1 if skill in text else 0)

# Define States List

In [34]:
# Two-letter codes of the states to scrape, in the order they are processed
states = (
    'CA TX NY NJ MA MD PA VA NC IL '
    'GA CO OH MI MN MO AZ CT IN TN '
    'OR SC AL LA UT KY NM KS NV IA '
    'NH OK AR RI ID MS NE ME MT VT '
    'SD ND AK DE FL HI WA WV WI WY'
).split()

## Import proxies

# Top 10 proxies from http://free-proxy.cz/en/proxylist/country/US/https/ping/level1
# Every entry is written as 'ip:port'
https_proxys = [
    '34.192.220.22:808',
    '206.189.112.106:3128',
    '209.97.191.169:3128',
    '157.230.34.190:1111',
    '157.230.45.121:1111',
    '23.108.64.65:8118',
    '157.230.33.37:1111',
    '52.128.60.130:50692',
    '12.218.209.130:53281',
    '38.134.10.106:53281',
]

In [35]:
# Load the filtered proxies (first column of a header-less file);
# this assignment replaces whatever https_proxys held before
https_proxys = list(
    pd.read_csv('filtered_https_proxys.txt', header=None)[0]
)

## import headers

In [36]:
# headers.txt is tab-delimited and has no header row; keep its first row
user_agents = pd.read_csv('headers.txt', sep='\t', header=None).T[0]

In [37]:
# Drop the last element (it is nan)
user_agents = user_agents[:-1]

In [38]:
# Turn the remaining entries into a plain Python list so they can be indexed by position
user_agents = user_agents.tolist()

# Define Funcs: 
- # select a random index,
- # select a random proxy,
- # select a random header

In [39]:
def randIndex(lengthList):
    '''Return a random valid index (0 .. lengthList-1) for a list of length lengthList.'''
    # randint includes both end points, hence the -1 on the upper bound
    return random.randint(0, lengthList - 1)

def randProxy():
    '''Return one randomly chosen entry of the global https_proxys list.'''
    return https_proxys[randIndex(len(https_proxys))]
    
def randHeader():
    '''Return one randomly chosen entry of the global user_agents list.'''
    return user_agents[randIndex(len(user_agents))]

# Initialize Headers Dictionary 

In [40]:
# Request headers; start with a randomly chosen user agent
header = {}
header['User-Agent'] = randHeader()

# Initialize Proxy Dictionary

In [41]:
# Proxy settings for https requests; start with a randomly chosen proxy
proxy = {}
proxy['https'] = randProxy()

## Define Func: Assign a Random Proxy and Header and Connect to a URL

In [42]:
def connect(url, max_retries=None):
    '''
    This function attempts to connect to a website and returns the soup object.
    A new random proxy and a new random user agent are used for every attempt,
    and a failed attempt is followed by another one.

    The input:
    - url: the url of the website
    - max_retries: optional maximum number of attempts. The default (None)
      keeps trying until one attempt succeeds.

    The output:
    - soup: a soup object for a url

    Raises:
    - the exception of the last failed attempt, but only when max_retries is
      given and that many attempts have failed
    '''
    attempts = 0
    while True:
        attempts += 1
        try:
            # Assign random proxy (updates the shared proxy dictionary)
            proxy['https'] = randProxy()

            # Assign random header (updates the shared header dictionary)
            header['User-Agent'] = randHeader()

            # Get the html code and store it in a soup object
            response = requests.get(url,proxies=proxy,headers=header,timeout=3)
            soup = BeautifulSoup(response.text, "html.parser")

        except Exception as e:
            # Sometimes a connection doesn't work, so report the error and try again.
            # "except ... as" and print(...) are valid on Python 2.6+ and on Python 3.
            print(str(e))
            if max_retries is not None and attempts >= max_retries:
                raise
            continue

        # Sleeping can prevent ip blocking
        time.sleep(1)

        return soup

## For Each State:

- ## Get the URL that lists up to 250 job openings

In [43]:
# The search term is written in lowercase with dashes ('-') in place of spaces,
# i.e. a search for Professional Baseball Player uses 'professional-baseball-player'.
# The conversion below applies exactly that rule to the plain job title.
search_term = 'Data Scientist'.lower().replace(' ', '-')

In [44]:
count = 0

# Initialize Empty URL Dictionary
final_state_url_dict = {}

# We need to keep track of the total jobs in each state
total_jobs_dict = {}

# Number of distinct states that need a url (instead of a hard-coded 50)
total_states = len(set(states))

# End the while loop when a url has been obtained for every state
while len(final_state_url_dict) < total_states:
    for state in states:
        # States handled in an earlier pass are skipped
        if state in final_state_url_dict:
            continue

        try:
            count += 1
            print("Number of states left to scrape: "+str(total_states-count))

            url = 'https://www.monster.com/jobs/search/?q='+search_term+'&where='+state+'&stpage=1'
            soup = connect(url)

            # Sometimes there are no job openings and an error 'list index out of range' is thrown
            # when trying to access number of jobs on the webpage when there are no jobs listed
            try:
                # Get the total number of listed jobs (keep only the digits of the header text)
                totalJobOpenings = int(re.sub('[^0-9]','',soup.find('header',{'title'}).contents[3].text.strip()))

            # when the error is thrown, we set the total jobs to zero
            except Exception as e:
                print(str(e))
                totalJobOpenings = 0
                print(state+' has no jobs.')

            # Get the total number of pages (up to 10) and a max of 25 per page.
            # Divide by 25.0: with integer division (Python 2) the quotient is floored
            # before math.ceil runs, which drops the last, partially filled page.
            totalPages = int(min(math.ceil(totalJobOpenings/25.0),10))

            # Store the final URL for the state
            final_state_url_dict[state] = 'https://www.monster.com/jobs/search/?q='+search_term+'&where='+state+'&stpage=1&page='+str(totalPages)

            # Store the total number of job openings for the state
            total_jobs_dict[state] = totalJobOpenings

        except Exception as e:
            # The state was not stored, so it is counted again in the next pass
            count -= 1
            print(str(e))

Number of states left to scrape: 49
Number of states left to scrape: 48
Number of states left to scrape: 47
Number of states left to scrape: 46
Number of states left to scrape: 45
HTTPSConnectionPool(host='www.monster.com', port=443): Max retries exceeded with url: /jobs/search/?q=data-scientist&where=MA&stpage=1 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of states left to scrape: 44
HTTPSConnectionPool(host='www.monster.com', port=443): Max retries exceeded with url: /jobs/search/?q=data-scientist&where=MD&stpage=1 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1b3d3890>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of states left to scrape: 43
Number of states left to scrape: 42
HTTPSConnectionPool(host='www.monster.com', port=443): Max retries exceeded with url: /jobs/search/?q=data-scientist&where=VA&stpage=1 (Caused by ProxyError

## Define Func: Process the text from a job description 

In [45]:
def process_job_desc(soup):
    '''This function extracts the job description markup from a soup object for a job url
        and processes it into a list of individual words.

        The input:
        - soup: the soup object for the url of a job description.

        The output:
        - wrangled_text: a list of lower-case words from the job description,
          without stop words and without empty strings. A word only contains
          letters and the characters '.', '+', '#', '2', '3' and '4'.
    '''

    # soup.body is None when the document has no <body> (html.parser does not
    # add one to fragments or empty responses); search the whole document then
    # instead of failing with an AttributeError
    root = soup.body if soup.body is not None else soup

    # Get the list of description elements
    html_body_desc = root.find_all(['p','li','strong','ul','br'])

    # Turn every element into a string (markup included) and join them with ','
    html_body_as_string = ','.join(str(item) for item in html_body_desc)

    # Convert to lower case
    wrangle_text = html_body_as_string.lower()

    # Replace every character that is not a letter, '.', '+', '#', '2', '3' or '4' with ','
    wrangle_text = re.sub("[^a-zA-Z.+#234]",',',wrangle_text)

    # Split the string into a list at every ','
    wrangle_text = wrangle_text.split(',')

    # Remove all consecutive duplicates (this also collapses runs of empty strings)
    wrangle_text = [word for word, _ in groupby(wrangle_text)]

    # Remove the stopwords and empty strings
    wrangled_text = [x for x in wrangle_text if x and x not in stop_words]

    return wrangled_text
    

- ## Get the list of job post URLs -> For each URL: check if each skill exists and store data in dictionary

In [46]:
# Process every state's job listing url. A state is deleted from final_state_url_dict
# as soon as it is finished, so the dictionary always holds the states still to do.
# The loop runs over a copy of the keys because entries are deleted inside it
# (deleting while iterating the dictionary itself raises a RuntimeError on Python 3).
for state in list(final_state_url_dict):

    print("Number of states left to scrape: "+str(len(final_state_url_dict)))

    total_jobs = total_jobs_dict[state]

    # Check that there are actually jobs to scrape before requesting the listing page
    if total_jobs == 0:
        # delete the state's job search url and move on to the next state
        del final_state_url_dict[state]
        print(state+' has no jobs')
        continue

    # Access the job listing page for a state
    soup = connect(final_state_url_dict[state])

    # We will need the job urls and their job titles
    job_url_and_title = {}

    # We need to access every link within the page and store only the url for every job opening
    for link in soup.find_all('a'):
        href = link.get('href')

        # get() returns None for an anchor without href, so test that before slicing.
        # A job opening always begins with 'https://job-openings.' (21 characters).
        if href is not None and href[0:21] == 'https://job-openings.':
            # strip('\r\n') removes any mix of line breaks at both ends; the chained
            # strip('\n').strip('\r') left a leading '\n' on titles wrapped in '\r\n'
            job_url_and_title[href] = link.contents[0].strip('\r\n')

    # Access each job posting and store its html code in a beautiful soup object.
    # Again a copy of the keys is used because finished urls are deleted inside the loop.
    for job_url in list(job_url_and_title):

        print("Number of jobs left to scrape: "+str(len(job_url_and_title)))

        soup = connect(job_url)

        # Get the filtered list of words from the job description in the soup object
        text_final = process_job_desc(soup)

        # Add the values to the Dictionary
        add_vals_to_d(state, job_url_and_title[job_url], total_jobs, text_final)

        # delete the job post link when finished
        del job_url_and_title[job_url]

    # delete the state's job search url when all job urls from the state are exhausted
    del final_state_url_dict[state]

            

Number of states left to scrape: 50
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout

Number of jobs left to scrape: 160
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /senior-data-scientist-redmond-wa-us-microsoft-corporation/94fe293e-2ce9-4037-92dd-54cb6ab0bb94 (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x1a19a43790>, 'Connection to 151.106.10.51 timed out. (connect timeout=3)'))
Number of jobs left to scrape: 159
Number of jobs left to scrape: 158
Number of jobs left to scrape: 157
Number of jobs left to scrape: 156
Number of jobs left to scrape: 155
Number of jobs left to scrape: 154
Number of jobs left to scrape: 153
Number of jobs left to scrape: 152
Number of jobs left to scrape: 151
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-scientist-19-01679-bellevue-wa-us-akraya-inc/8308f67b-bea0-48e5-9af6-fee71de83a73 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 150
Numb

Number of jobs left to scrape: 32
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-desk-agency-fees-no-door-to-door-shoreline-wa-us-assurance/cc7e663b-026f-4057-bd98-8f456066fa31 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 31
Number of jobs left to scrape: 30
Number of jobs left to scrape: 29
Number of jobs left to scrape: 28
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /manager-data-scientist-ai-nlp-seattle-wa-us-kpmg/55c0057f-eafa-4502-8eaa-fcd94a87727e (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 27
Number of jobs left to scrape: 26
Number of jobs left to scrape: 25
Number of jobs left to scrape: 24
Number of jobs left to scrape: 23
Number of jobs left to scrape: 22
Number of jobs left to scrape: 21
Number of jobs left to scrape: 20
Number of jo

Number of jobs left to scrape: 56
Number of jobs left to scrape: 55
Number of jobs left to scrape: 54
Number of jobs left to scrape: 53
Number of jobs left to scrape: 52
Number of jobs left to scrape: 51
Number of jobs left to scrape: 50
Number of jobs left to scrape: 49
Number of jobs left to scrape: 48
Number of jobs left to scrape: 47
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /remote-health-insurance-agent-work-when-where-you-want-constant-flow-of-free-leads-seaford-de-us-assurance/0d9bebed-b529-4bf7-bdef-90ceca2e62f6 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 46
Number of jobs left to scrape: 45
Number of jobs left to scrape: 44
Number of jobs left to scrape: 43
Number of jobs left to scrape: 42
Number of jobs left to scrape: 41
Number of jobs left to scrape: 40
Number of jobs left to scrape: 39
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retrie

Number of jobs left to scrape: 217
Number of jobs left to scrape: 216
Number of jobs left to scrape: 215
Number of jobs left to scrape: 214
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-use-assurance-platform-think-uber-meets-insurance-milwaukee-wi-us-assurance/bbbe5e21-a9e9-4b60-ba09-5cecf61922f0 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1a001d10>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 213
Number of jobs left to scrape: 212
Number of jobs left to scrape: 211
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-life-insurance-agent-work-from-home-free-leads-madison-wi-us-assurance/856c4e40-cac4-4e77-959b-d81047435400 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to sc

Number of jobs left to scrape: 108
Number of jobs left to scrape: 107
Number of jobs left to scrape: 106
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-millions-spent-on-inbound-marketing-madison-wi-us-assurance/e84fb55c-dc0c-45ae-a93a-f0acae422121 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1b149bd0>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 105
Number of jobs left to scrape: 104
Number of jobs left to scrape: 103
Number of jobs left to scrape: 102
Number of jobs left to scrape: 101
Number of jobs left to scrape: 100
Number of jobs left to scrape: 99
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /procurement-data-scientist-analyst-milwaukee-wi-us-johnson-controls-inc/76f42de0-f722-4d91-9755-a27736a616f0 (Caused by ProxyErro

Number of jobs left to scrape: 42
Number of jobs left to scrape: 41
Number of jobs left to scrape: 40
Number of jobs left to scrape: 39
Number of jobs left to scrape: 38
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-desk-agency-fees-no-door-to-door-parkersburg-wv-us-assurance/7560379e-e34c-4ccf-9d73-ad74d07a3e77 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1b3b2f90>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 37
Number of jobs left to scrape: 36
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-remote-parkersburg-wv-us-assurance/5c07fb16-d761-4cfe-9fb6-05a02a4d6222 (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1acaf550>, 'Connection to 108.61.220.77 timed out. (co

Number of jobs left to scrape: 13
Number of jobs left to scrape: 12
Number of jobs left to scrape: 11
('Connection aborted.', BadStatusLine("''",))
Number of jobs left to scrape: 10
Number of jobs left to scrape: 9
Number of jobs left to scrape: 8
Number of jobs left to scrape: 7
Number of jobs left to scrape: 6
Number of jobs left to scrape: 5
Number of jobs left to scrape: 4
Number of jobs left to scrape: 3
Number of jobs left to scrape: 2
Number of jobs left to scrape: 1
Number of states left to scrape: 45
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
Number of jobs left to scrape: 260
Number of jobs left to scrape: 259
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-12-month-chargeback-window-3x-daily-payouts-pompano-beach-fl-us-assurance/c160a988-50c4-4cdc-941d-4ebcf57019e9 (Caused 

Number of jobs left to scrape: 178
Number of jobs left to scrape: 177
Number of jobs left to scrape: 176
Number of jobs left to scrape: 175
Number of jobs left to scrape: 174
Number of jobs left to scrape: 173
Number of jobs left to scrape: 172
Number of jobs left to scrape: 171
Number of jobs left to scrape: 170
Number of jobs left to scrape: 169
Number of jobs left to scrape: 168
Number of jobs left to scrape: 167
Number of jobs left to scrape: 166
Number of jobs left to scrape: 165
Number of jobs left to scrape: 164
Number of jobs left to scrape: 163
Number of jobs left to scrape: 162
Number of jobs left to scrape: 161
Number of jobs left to scrape: 160
Number of jobs left to scrape: 159
Number of jobs left to scrape: 158
Number of jobs left to scrape: 157
Number of jobs left to scrape: 156
Number of jobs left to scrape: 155
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-12-month-chargeback-window-3x-daily-payo

Number of jobs left to scrape: 75
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-12-month-chargeback-window-3x-daily-payouts-chipley-fl-us-assurance/60b2c191-ff45-475b-9e6a-e3b68fa3a950 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a19f20b90>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 74
Number of jobs left to scrape: 73
Number of jobs left to scrape: 72
Number of jobs left to scrape: 71
Number of jobs left to scrape: 70
Number of jobs left to scrape: 69
Number of jobs left to scrape: 68
Number of jobs left to scrape: 67
Number of jobs left to scrape: 66
Number of jobs left to scrape: 65
Number of jobs left to scrape: 64
Number of jobs left to scrape: 63
Number of jobs left to scrape: 62
Number of jobs left to scrape: 61
Number of jobs left to scrape: 60
Number of jobs left

Number of jobs left to scrape: 17
Number of jobs left to scrape: 16
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /contract-life-insurance-agent-100-commission-autonomy-f-nashua-nh-us-assurance/222b78e5-a8ea-4cef-9aab-908cdc920b86 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 15
Number of jobs left to scrape: 14
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /contract-health-insurance-agent-work-when-where-you-want-we-generate-thousands-of-free-leads-daily-nashua-nh-us-assurance/3225117c-61f3-4e9e-838b-2d1e21ff2c9f (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 13
Number of jobs left to scrape: 12
Number of jobs left to scrape: 11
Number of jobs left to scrape: 10
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-analyst-bedf

Number of jobs left to scrape: 195
Number of jobs left to scrape: 194
Number of jobs left to scrape: 193
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-100-autonomy-work-when-where-you-want-free-leads-hamilton-nj-us-assurance/7a9c7551-c5a0-4b57-81ce-b0d77ec3b1a9 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 192
Number of jobs left to scrape: 191
Number of jobs left to scrape: 190
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-scientist-ii-florham-park-nj-us-conduent/0d9aa3e0-e7f6-441a-af11-f4b381178dda (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 189
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /devops-windows-systems-administrator-brighton-ma-us-metasys-technologies/206590669 (Caused by ProxyError

HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /ai-consultant-artificial-intelligence-consultant-fulltime-raritan-nj-us-arminus-software-llc/84441b04-173f-4413-b0c6-897ea0815bf8 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 146
Number of jobs left to scrape: 145
Number of jobs left to scrape: 144
Number of jobs left to scrape: 143
Number of jobs left to scrape: 142
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /remote-life-insurance-agent-100-commission-free-leads-no-marketing-costs-camden-nj-us-assurance/81ca62b7-6bbf-4d00-9e70-c91de4b31c28 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a19f5dbd0>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 141
Number of jobs left to scrape: 140
Number of jobs

Number of jobs left to scrape: 62
Number of jobs left to scrape: 61
Number of jobs left to scrape: 60
Number of jobs left to scrape: 59
Number of jobs left to scrape: 58
Number of jobs left to scrape: 57
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-scientist-moorestown-nj-us-asrc-federal-holding/c665e64f-5256-4100-8243-548e97c60e31 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 56
Number of jobs left to scrape: 55
Number of jobs left to scrape: 54
Number of jobs left to scrape: 53
Number of jobs left to scrape: 52
Number of jobs left to scrape: 51
Number of jobs left to scrape: 50
Number of jobs left to scrape: 49
Number of jobs left to scrape: 48
Number of jobs left to scrape: 47
Number of jobs left to scrape: 46
Number of jobs left to scrape: 45
Number of jobs left to scrape: 44
Number of jobs left to scrape: 43
Number of jobs left to scrape: 42
Number of jobs left to scr

Number of jobs left to scrape: 199
Number of jobs left to scrape: 198
Number of jobs left to scrape: 197
Number of jobs left to scrape: 196
Number of jobs left to scrape: 195
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-take-control-change-the-way-you-sell-insurance-rio-rancho-nm-us-assurance/760675ed-a4e5-42d7-baee-929af4ea7495 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 194
Number of jobs left to scrape: 193
Number of jobs left to scrape: 192
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-great-option-of-veterans-free-qualified-leads-south-valley-nm-us-assurance/46ba3513-1ff2-43c6-a606-adfc7458b4ec (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1aa09f90>: Failed to establish a new connection: [Errno 61] C

Number of jobs left to scrape: 101
Number of jobs left to scrape: 100
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /health-insurance-agent-100-autonomy-work-when-where-you-albuquerque-nm-us-assurance/d80ab428-eb89-4644-a96f-5a67489a371a (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1b3d14d0>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 99
Number of jobs left to scrape: 98
Number of jobs left to scrape: 97
Number of jobs left to scrape: 96
Number of jobs left to scrape: 95
Number of jobs left to scrape: 94
Number of jobs left to scrape: 93
Number of jobs left to scrape: 92
Number of jobs left to scrape: 91
Number of jobs left to scrape: 90
Number of jobs left to scrape: 89
Number of jobs left to scrape: 88
Number of jobs left to scrape: 87
Number of jobs left to scrape: 86
Number of jobs left to sc

Number of jobs left to scrape: 11
Number of jobs left to scrape: 10
Number of jobs left to scrape: 9
Number of jobs left to scrape: 8
Number of jobs left to scrape: 7
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-100-commission-remote-free-leads-no-admin-las-cruces-nm-us-assurance/bc8fb27d-f7cd-43e2-8903-1928bea4b142 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 6
Number of jobs left to scrape: 5
Number of jobs left to scrape: 4
Number of jobs left to scrape: 3
Number of jobs left to scrape: 2
Number of jobs left to scrape: 1
Number of states left to scrape: 40
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.com', port=443): Read timed out. (read timeout=3)
HTTPSConnectionPool(host='www.monster.

Number of jobs left to scrape: 202
Number of jobs left to scrape: 201
Number of jobs left to scrape: 200
Number of jobs left to scrape: 199
Number of jobs left to scrape: 198
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /contract-health-insurance-agent-work-when-where-you-want-we-generate-thousands-of-free-leads-daily-cedar-park-tx-us-assurance/fd0f7f6b-080c-4be2-8c6e-6b2788a669eb (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1aca9d10>, 'Connection to 108.61.220.77 timed out. (connect timeout=3)'))
Number of jobs left to scrape: 197
Number of jobs left to scrape: 196
Number of jobs left to scrape: 195
Number of jobs left to scrape: 194
Number of jobs left to scrape: 193
Number of jobs left to scrape: 192
Number of jobs left to scrape: 191
Number of jobs left to scrape: 190
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /ms-sql-application-develope

HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /remote-health-insurance-agent-work-when-where-you-want-constant-flow-of-free-leads-rowlett-tx-us-assurance/bc2953be-b955-495a-846b-0f1937f54926 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 116
Number of jobs left to scrape: 115
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /senior-data-consultant-ai-ml-saas-50-remote-austin-tx-us-cybercoders/207557969 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 114
Number of jobs left to scrape: 113
Number of jobs left to scrape: 112
Number of jobs left to scrape: 111
Number of jobs left to scrape: 110
Number of jobs left to scrape: 109
Number of jobs left to scrape: 108
Number of jobs left to scrape: 107
Number of jobs left to scrape: 106
Number of jobs left to scrape: 105
Number of jobs le

Number of jobs left to scrape: 8
Number of jobs left to scrape: 7
Number of jobs left to scrape: 6
Number of jobs left to scrape: 5
Number of jobs left to scrape: 4
Number of jobs left to scrape: 3
Number of jobs left to scrape: 2
Number of jobs left to scrape: 1
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /contract-health-insurance-agent-commission-paid-daily-100-autonomy-free-leads-no-admin-lewisville-tx-us-assurance/b1ec7415-f376-4e79-8aa2-864d4630d541 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a198b3e90>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of states left to scrape: 39
Number of jobs left to scrape: 201
Number of jobs left to scrape: 200
Number of jobs left to scrape: 199
Number of jobs left to scrape: 198
Number of jobs left to scrape: 197
Number of jobs left to scrape: 196
Number of jobs left to scrape: 

HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-work-as-much-as-you-want-3x-daily-co-bossier-city-la-us-assurance/3fafac34-896f-4e80-a15b-9bb15398263a (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 123
Number of jobs left to scrape: 122
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-1000s-of-free-qualified-leads-100s-of-guides-prospecting-for-you-bossier-city-la-us-assurance/a33b15eb-1e31-4ad5-ae39-a48d45ff3432 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x10390bc90>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 121
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /big-data-architect-iii-monroe-la-us-aptask/4dc

Number of jobs left to scrape: 41
Number of jobs left to scrape: 40
Number of jobs left to scrape: 39
Number of jobs left to scrape: 38
Number of jobs left to scrape: 37
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-life-insurance-agent-work-from-home-free-leads-baton-rouge-la-us-assurance/98665f80-a306-402b-9e50-c05b77cb173a (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a19a79990>: Failed to establish a new connection: [Errno 61] Connection refused',)))
Number of jobs left to scrape: 36
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-1000s-of-free-qualified-leads-get-help-adding-state-licenses-baton-rouge-la-us-assurance/2ab2d133-1015-41f2-a633-8285940993c6 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 35
Number of jobs 

Number of jobs left to scrape: 236
Number of jobs left to scrape: 235
Number of jobs left to scrape: 234
Number of jobs left to scrape: 233
Number of jobs left to scrape: 232
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /application-analyst-sme-charlotte-nc-us-randstad-technologies/207203732 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1a1d9d90>: Failed to establish a new connection: [Errno 61] Connection refused',)))
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /application-analyst-sme-charlotte-nc-us-randstad-technologies/207203732 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 231
Number of jobs left to scrape: 230
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-health-insurance-agent-work

HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-no-desk-agency-fees-no-door-to-door-cary-nc-us-assurance/151af2dd-d454-46dd-9298-f84b32e7b2b0 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 163
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-scientist-greensboro-nc-us-guilford-county-nc/cd6f99b4-2366-434a-8a60-2a3f20a68f97 (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1a3ebf10>, 'Connection to 200.188.151.212 timed out. (connect timeout=3)'))
Number of jobs left to scrape: 162
('Connection aborted.', BadStatusLine("''",))
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /data-scientist-charlotte-nc-us-ibm/1e5d60bd-e651-4402-b048-15b2cd6eb0f4 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.con

Number of jobs left to scrape: 73
Number of jobs left to scrape: 72
Number of jobs left to scrape: 71
Number of jobs left to scrape: 70
Number of jobs left to scrape: 69
Number of jobs left to scrape: 68
Number of jobs left to scrape: 67
Number of jobs left to scrape: 66
Number of jobs left to scrape: 65
Number of jobs left to scrape: 64
Number of jobs left to scrape: 63
Number of jobs left to scrape: 62
Number of jobs left to scrape: 61
Number of jobs left to scrape: 60
Number of jobs left to scrape: 59
Number of jobs left to scrape: 58
Number of jobs left to scrape: 57
Number of jobs left to scrape: 56
Number of jobs left to scrape: 55
Number of jobs left to scrape: 54
Number of jobs left to scrape: 53
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-100-autonomy-work-when-where-you-want-100-commission-asheville-nc-us-assurance/fb7aae49-c01c-4684-be9f-4bbec3888364 (Caused by ProxyError('Cannot connect to proxy.', New

Number of jobs left to scrape: 175
Number of jobs left to scrape: 174
Number of jobs left to scrape: 173
Number of jobs left to scrape: 172
Number of jobs left to scrape: 171
Number of jobs left to scrape: 170
Number of jobs left to scrape: 169
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-100-commission-remote-free-leads-n-wahpeton-nd-us-assurance/fbc1546c-d7fc-4565-8dfc-3eb1ec9a8711 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 168
Number of jobs left to scrape: 167
Number of jobs left to scrape: 166
Number of jobs left to scrape: 165
Number of jobs left to scrape: 164
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-remote-williston-nd-us-assurance/a058ab97-6c98-4cdb-bda4-da433e3f2abb (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSCo

Number of jobs left to scrape: 57
Number of jobs left to scrape: 56
Number of jobs left to scrape: 55
Number of jobs left to scrape: 54
Number of jobs left to scrape: 53
Number of jobs left to scrape: 52
Number of jobs left to scrape: 51
Number of jobs left to scrape: 50
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-life-insurance-agent-work-from-home-free-leads-wahpeton-nd-us-assurance/d43217ce-d221-44ef-b995-36700252bc37 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 49
Number of jobs left to scrape: 48
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-life-insurance-agent-think-uber-meets-insurance-williston-nd-us-assurance/cd1b1daa-f0a0-4b3f-b6c1-0bb5e2ad1745 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 47
Number of jobs left to scrape: 46
Number of jo

Number of jobs left to scrape: 104
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /life-insurance-agent-great-option-for-veterans-free-qualified-leads-papillion-ne-us-assurance/bbf94fe8-62b9-44f2-bb4a-533984371a8f (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a1a7ca0d0>: Failed to establish a new connection: [Errno 61] Connection refused',)))
('Connection aborted.', BadStatusLine("''",))
Number of jobs left to scrape: 103
Number of jobs left to scrape: 102
Number of jobs left to scrape: 101
Number of jobs left to scrape: 100
Number of jobs left to scrape: 99
Number of jobs left to scrape: 98
Number of jobs left to scrape: 97
Number of jobs left to scrape: 96
Number of jobs left to scrape: 95
Number of jobs left to scrape: 94
Number of jobs left to scrape: 93
Number of jobs left to scrape: 92
Number of jobs left to scrape: 91
Number of jobs left to scrape: 90


Number of jobs left to scrape: 21
Number of jobs left to scrape: 20
Number of jobs left to scrape: 19
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /contract-health-insurance-agent-commission-paid-daily-100-autonomy-free-leads-no-admin-hastings-ne-us-assurance/d6f00ac4-0fa9-470e-96c1-ab6caa69e5b1 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
Number of jobs left to scrape: 18
Number of jobs left to scrape: 17
Number of jobs left to scrape: 16
Number of jobs left to scrape: 15
Number of jobs left to scrape: 14
Number of jobs left to scrape: 13
Number of jobs left to scrape: 12
Number of jobs left to scrape: 11
HTTPSConnectionPool(host='job-openings.monster.com', port=443): Max retries exceeded with url: /on-demand-health-insurance-agent-contract-remote-daily-co-bellevue-ne-us-assurance/4b185205-0cba-4d80-8d9d-b599f3d7b152 (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out',)))
HTTPSConnectionPo

KeyboardInterrupt: 

# Make the DataFrame

In [None]:
# Turn each group of the dictionary into its own frame, then place the two
# frames side by side (axis=1): the extra-factor columns come first and the
# skill columns follow.
other_factors_frame = pd.DataFrame(d['other_factors'])
skills_frame = pd.DataFrame(d['skills'])
data = pd.concat([other_factors_frame, skills_frame], axis=1)

# Store to File

In [None]:
# Write the combined table to
#   scraped_data/scraped_data_<search_term>_<month>-<day>-<year>.txt
import os  # local to this cell: only needed to make sure the output folder exists

output_dir = 'scraped_data'
# to_csv does not create missing folders, so create the folder first; otherwise
# the write at the very end of a long scrape fails and nothing is saved.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

now = datetime.datetime.now()
# Date stamp is month-day-year with no zero padding (same format as before).
date = str(now.month)+'-'+str(now.day)+'-'+str(now.year)
# index=False is the documented way to leave the row index out of the file.
data.to_csv(output_dir+'/scraped_data_'+search_term+'_'+date+'.txt',encoding='utf-8',index=False)