# Exploration using beautiful soup
## Finding companies advertising scientist jobs in Houston
- based on this: https://jessesw.com/Data-Science-Skills/

In [1]:
from bs4 import BeautifulSoup # For HTML parsing
# import urllib2 # Website connections
from urllib.request import urlopen
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [97]:
def skills_info(city="Houston", state="TX",job_title='data+scientist'):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
        
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''
    #print("test1")   
    final_job = job_title # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)  %22&l=
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['https://www.indeed.com/jobs?q=', final_job, '&l=', final_city,'%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['https://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string

    # website query today (2017-02-25) is = https://www.indeed.com/jobs?q=data+scientist&l=Houston%2CTX
        
    base_url = 'http://www.indeed.com'
    
    #print("test2")  
    try:
        html = urlopen(final_site).read() # Open up the front page of our search first
    except:
        #print('That city/state combination did not have any jobs. Exiting . . .') # In case the city is invalid
        return
    soup = BeautifulSoup(html,"lxml") # Get the html from the first page
    
    # Now find out how many jobs there were
    
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                        # The 'searchCount' object has this
    #print('num_jobs_area before filtering ', num_jobs_area)
    
    job_numbers = re.findall(b'\d+', num_jobs_area) # Extract the total jobs found from the search result
     
    #print('job number before filtering ', job_numbers[2])
    
    job_numbers_decoded = int(job_numbers[2].decode('utf-8'))
    #print('job_numbers_decoded = ',job_numbers_decoded, "type is ",type(job_numbers_decoded))
    
    #if len(job_numbers[2]) > 3: # Have a total number of jobs greater than 1000
    #    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
    #else:
    #    total_num_jobs = int(job_numbers[2]) 
    
    total_num_jobs = job_numbers_decoded
    
    city_title = city
    if city is None:
        city_title = 'Nationwide'
        
    #print('There were', total_num_jobs, 'jobs found,', city_title) # Display how many jobs were found
    
    # took out /10 here, which seemed to be assuming always 10 results per page? now 15?
    num_pages = int(float(total_num_jobs/10)) # This will be how we know the number of times we need to iterate over each new
                                      # search result page
        
    job_descriptions = [] # Store all our descriptions in this list
    
    # used to be 'xrange' here but I changed it to 'range' because I'm using python3 and xrange is python2 only
    all_company_results = []
    for i in range(1,num_pages+1): # Loop through all of our search result pages
        #print('Getting page', i)
        
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job returns, start collecting the text samples from each
            
        html_page = urlopen(current_page).read() # Get the page
            
        page_obj = BeautifulSoup(html_page,"lxml") # Locate all of the job links
        job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
        #print('page_obj',page_obj)  
            
        #print("link.get('href')",base_url + link.get('href')
        #print("job_link_area.find_all('a')",job_link_area.find_all('a'))
        
        for a in job_link_area.find_all('a', href=True):
            #print ("Found the URL:", a['href'])
            if "/cmp/" in a['href']: 
                #  if a['href'].find("/cmp/"):
                #print("a had /cmp/ in it using find = ",a['href'])
                arrayHrefParts_0 = a['href'].split('/cmp/')
                #print("aarrayHrefParts_0 = ",arrayHrefParts_0)
                arrayHrefParts_1 = arrayHrefParts_0[1].split('/')
                #print("arrayHrefParts_1 = ",arrayHrefParts_1)
                company = arrayHrefParts_1[0]
                #print('company = ',company)
                all_company_results.append(company)
                #print('all_company_results =',all_company_results)
            else:
                continue
                #print("/cmp/ not found in ,",a['href'])
        
        #print("[base_url + link.get('href') for link in job_link_area.find_all('a')]",[base_url + link.get('href') for link in job_link_area.find_all('a')])
        
        #job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')] # Get the URLS for the jobs
            
        #job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS
            
        
        #         for j in xrange(0,len(job_URLS)):
        #             final_description = text_cleaner(job_URLS[j])
        #             if final_description: # So that we only append when the website was accessed correctly
        #                 job_descriptions.append(final_description)
        #             sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
    
        sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
    
    for each in range(len(all_company_results)):
        if '.html' in all_company_results[each]:
            #print('all_company_results[each]',all_company_results[each]," each = ",each)
            new_name_wo_html = all_company_results[each].split('.html')[0]
            #print('new_name_wo_html',new_name_wo_html)
            all_company_results[each] = new_name_wo_html  
            
    all_company_results = list(set(all_company_results))
    #print ('Done with collecting the job postings!')    
    #print ('all_company_results final = ',all_company_results)
    #print ("There were",len(all_company_results)," companies successfully found.")
    return all_company_results


In [70]:
skills_info_result = skills_info("Houston","TX",'data+scientist')
skills_info_result

test1
num_jobs_area before filtering  b'Jobs 1 to 10 of 151'
job number before filtering  b'151'
job_numbers_decoded =  151 type is  <class 'int'>
Done with collecting the job postings!
all_company_results final =  ['Engage-Partners', 'Pennwell-Corporation', 'Alliantgroup', 'Genpact-Headstrong-Capital-Markets', 'Texas-Children%27s-Hospital', 'Edf-Trading-North-America-LLC', 'Baylor-College-of-Medicine-%28bcm%29', 'Houston-Rockets', 'Hewlett-Packard-Enterprise', "Texas-Children's-Hospital", 'Rolls--royce', 'Wyle-Laboratories', 'Houston-Methodist', 'Bellicum-Pharmaceuticals', 'Baylor-College-of-Medicine-(bcm)', 'Stage-3-Separation-1', 'Occidental-Petroleum', 'Strategic-IT-Staffing', 'Booz-Allen-Hamilton', 'Sysco', 'Hdr', 'Compugra-Systems', 'Exxonmobil', 'Texas-A&M-University', 'Encore-Search-Partners', 'Saudi-Aramco', 'Robert-Half-Technology', 'Texas-A%26M-University', 'Sunnova-Energy-Corporation-1', 'Baylor-College-of-Medicine', 'GE-Corporate', 'Cyberonics', 'Md-Anderson-Cancer-Center'

In [71]:
skills_info_result_geo = skills_info("Houston","TX",'geologist')
skills_info_result_geo

test1
num_jobs_area before filtering  b'Jobs 1 to 10 of 41'
job number before filtering  b'41'
job_numbers_decoded =  41 type is  <class 'int'>
Done with collecting the job postings!
all_company_results final =  ['Mott-Macdonald', 'Hess-Corporation', 'High-Country-Executive-Search', 'Weatherford', 'W&T-Offshore,-Inc.', 'SAN-Jacinto-College', 'Jab-Recruitment-3', 'Radarview-LLC-1', 'Bmo-Financial-Group', 'Murray-Resources', 'Rbc', 'Targa-Resources', 'Enrud-Resources', 'Sheridan-Production', 'Lee-College', 'Sinopec-Tech-Houston', 'Occidental-Petroleum', 'Oasis-Petroleum', 'Jefferies']
There were 19  companies successfully found.


In [72]:
skills_info_result_geop = skills_info("Houston","TX",'geophysicist')
skills_info_result_geop

test1
num_jobs_area before filtering  b'Jobs 1 to 10 of 21'
job number before filtering  b'21'
job_numbers_decoded =  21 type is  <class 'int'>
Done with collecting the job postings!
all_company_results final =  ['Walker-Elliott', 'Geotrace', 'Hess-Corporation', 'Saudi-Aramco', 'Sinopec-Tech-Houston-LLC', 'Hilcorp-Energy-Company', 'Penn-Virginia-Corporation', 'Murray-Resources', 'Bhp-Billiton', 'BP', 'Petroleum-Geo--services', 'Sinopec-Tech-Houston', 'Confidential---Oil-%26-Gas-Company', 'Occidental-Petroleum']
There were 14  companies successfully found.


In [98]:
def runMultiple_skills_info(array_of_jobTitles=['puppies']):
    all_results = []
    for each in array_of_jobTitles:
        print('each= ',each)
        #temp_results1 = []
        temp_results1 = skills_info("Houston","TX",each)
        print('temp_results is ',temp_results1)
        #all_results.extend(temp_results1)
        all_results.extend(temp_results1)
    return all_results

In [99]:
array2 = ['geologist','gephysicist']
geo_and_geo = runMultiple_skills_info(array_of_jobTitles=array2)
geo_and_geo

each=  geologist
temp_results is  ['Hess-Corporation', 'Rbc', 'Tolunay--wong-Engineers,-Inc.', 'Warren-Averett-Sraffing-and-Recruiting', 'Confidential---Oil-%26-Gas-Company', 'Occidental-Petroleum', 'Mott-Macdonald', 'Hilcorp-Energy-Company', 'Radarview-LLC-1', 'Murray-Resources', 'Targa-Resources', 'Enrud-Resources', 'Sheridan-Production', 'Sinopec-Tech-Houston', 'Lee-College', 'W&T-Offshore,-Inc.', 'Penn-Virginia-Corporation', 'Lonestar-College-System-1', 'Jefferies-&-Company,-Inc.', 'BP', 'New-Tech-Global', 'Jefferies', 'University-of-Houston--downtown', 'High-Country-Executive-Search', 'Weatherford', 'SAN-Jacinto-College', 'Jab-Recruitment-3', 'Bmo-Financial-Group', 'Fugro', 'Oasis-Petroleum']
each=  gephysicist
temp_results is  ['Walker-Elliott', 'Penn-Virginia-Corporation', 'Sinopec-Tech-Houston-LLC', 'Murray-Resources', 'Sinopec-Tech-Houston', 'Confidential---Oil-%26-Gas-Company']


['Hess-Corporation',
 'Rbc',
 'Tolunay--wong-Engineers,-Inc.',
 'Warren-Averett-Sraffing-and-Recruiting',
 'Confidential---Oil-%26-Gas-Company',
 'Occidental-Petroleum',
 'Mott-Macdonald',
 'Hilcorp-Energy-Company',
 'Radarview-LLC-1',
 'Murray-Resources',
 'Targa-Resources',
 'Enrud-Resources',
 'Sheridan-Production',
 'Sinopec-Tech-Houston',
 'Lee-College',
 'W&T-Offshore,-Inc.',
 'Penn-Virginia-Corporation',
 'Lonestar-College-System-1',
 'Jefferies-&-Company,-Inc.',
 'BP',
 'New-Tech-Global',
 'Jefferies',
 'University-of-Houston--downtown',
 'High-Country-Executive-Search',
 'Weatherford',
 'SAN-Jacinto-College',
 'Jab-Recruitment-3',
 'Bmo-Financial-Group',
 'Fugro',
 'Oasis-Petroleum',
 'Walker-Elliott',
 'Penn-Virginia-Corporation',
 'Sinopec-Tech-Houston-LLC',
 'Murray-Resources',
 'Sinopec-Tech-Houston',
 'Confidential---Oil-%26-Gas-Company']