# Single Notebook With All the Code to Go From List of Science Job Titles to geoJSON of Companies advertising those jobs on a map

------------------------------------------------

# General Outline
### get list of science related jobs
    * Clean up a list of science related job titles I modified from wikipedia in excel, save as csv
### get company names that are hiring for those science related job titles in Houston, TX by scraping job search site(s)
    * use beautiful soup, a python library to gather information from websites programtically and return list of companies
    * Clean up duplication and companies that shouldn't be included
### get addresses and lat longs associated with each company name in json form
    * use google places API to search for company name and a Houston, TX location, returning json of location and company information
    * clean returned list of json data such that it is limited to a lat long box of the greater houston area
    * clean up duplicates and are false positives useing pandas and other python libraries to 
### convert results to geoJSON. Add-in additional data dimensions, such as job title used in seach, as geoJSON properties
    * use geoJSON and JSON python libraries to convert previous json into points, and then features, and then a single feature collection
### get geoJSONs from other sources that contain science location data on things like schools and hospitals
    * use .... a http get call to access the geoJSON files already created by the city of Houston.
### add geoJSONs to map. Use color, size, and shape to represent different geoJSON properties. Tell a story
    * use mapboxGL.js, leaflet.js, and basic html/CSS/JS to create maps

### importing needed python modules

In [6]:
#### Imports for parts #1 & #2
from bs4 import BeautifulSoup # For HTML parsing
from urllib.request import urlopen
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline
import json
import csv

##### Import modules for google API and geoJSON parts. Only duplicate is json
import requests
import configparser
import json
import geojson
from geojson import Point, Feature, FeatureCollection
# https://github.com/frewsxcv/python-geojson
import time
import pandas as pd

===================================================================================

# 1. Load list of job titles

------------------------------------------------------------------------------

In [7]:
with open('../Data/science_jobs_v2.csv', 'rt') as f:
    reader = csv.reader(f)
    science_jobs_list_withInfo = list(reader)

job_title_list = []
for each in science_jobs_list_withInfo:
    each = each[0]
    #print('each',each)
    job_title_list.append(each)

# take out the header 'Job_Title'
job_title_list.pop(0)

print(job_title_list)

['Bioanalytical Scientist', 'Biochemist', 'Biochemistry', 'Bioinformatics Research Scientist', 'Biological Engineer', 'Biologist', 'biology  ', 'biomechanical', 'Biomedical scientist', 'biophysics', 'Cell Line', 'Clinical Data', 'Clinical Data Research', 'Clinical Pharmacology Professor', 'Clinical Pharmacy Assistant', 'Clinical Research Associate', 'Drug Evaluator', 'Drug Regulatory Affairs Manager', 'Environmental Health Scientist', 'Genetic Counselor', 'Health Research', 'Health Technology', 'Healthcare science', 'Hospital Research Assistant', 'Immunology Scientist', 'Life Science', 'Medical', 'Medical Center', 'medical Laboratory', 'Medical laboratory scientist', 'Medical Physics Researcher', 'Medical Research Assistant', 'Medical Research Technician', 'Medical Scientist', 'Microbiologist', 'Molecular Biologist', 'Molecular physics', 'Neuroscientist', 'Nuclear', 'Oncology Researcher', 'Pathologist', 'Pharmaceutical Assistant', 'Pharmaceutical Research Analyst', 'Pharmaceutical Rese

In [43]:
short_job_title_list_VeryShort3 = ['Psychologist', 'Biologist','Geologist']

In [44]:
len(short_job_title_list_VeryShort3)

3

In [9]:
short_job_title_list_1st3rd = ['Biochemistry', 'Bioinformatics', 'Biologist', 'biomechanical', 'Biomedical', 'biophysics', 'Cell+Line', 'Clinical+Data', 'Pharmacology', 'Pharmacy', 'Drug Evaluator', 'Environmental Health', 'Genetic Counselor', 'Healthcare', 'Hospital', 'Immunology', 'Life Science', 'Medical', 'Medical Center', 'medical Laboratory', 'Medical Physics', 'Medical', 'Microbiologist', 'Neuroscientist', 'Nuclear', 'Oncology', 'Pathologist', 'Pharmaceutical', 'Photochemistry', 'Public Health Specialist', 'radiobiology', 'Radiochemistry', 'Toxicologist', 'Agricultural', 'Archaeologist', 'Associate Professor', 'Botanist', 'botany', 'Chemical', 'Chemist', 'Conservation', 'Forensic Chemist', 'Herpetologist', 'Laboratory', 'meteorologist ', 'Naturalist', 'Oceanographer', 'oceanography', 'optomitrist']

In [10]:
len(short_job_title_list_1st3rd)

49

In [11]:
short_job_title_list_2nd3rd = ['paleoecology', 'Paleontologist', 'Petroleum geologist', 'petrophysicist', 'playnology', 'reservoir', 'modeler', 'rock Laboratory', 'sedimentologist', 'stratigraphy', 'subsea', 'aeronautics', 'Aerospace', 'astrobiology', 'Astrochemistry', 'astromaterials', 'Astronaut', 'Astronomer', 'astronomy', 'astrophysics', 'Food chemistry', 'heliophysics', 'International Space Station', 'Johnson Space Center', 'lunar', 'materials', 'NASA', 'Physicist', 'physics', 'Planetary', 'propulsion engineer', 'Satellite', 'Space science', 'telemetry']

In [12]:
len(short_job_title_list_2nd3rd)

34

In [13]:
short_job_title_list_3rd3rd = ['Forensic Scientist','Physical Scientist','Psychologist', 'Research Fellow', 'Science teacher', 'Scientist', 'social science', 'Solid-state', 'STEM', 'Stem Cell', 'Stereochemistry', 'Volcanologist', 'volcanology', 'wind power', 'wind turbine ', 'zoo', 'Zoologist', 'zoology ', 'chemistry', 'Crystallography', 'Earth Science', 'Ecologist', 'Ecology', 'Environmental', 'Exploration', 'fluid dynamics', 'geochemist', 'Geochemistry', 'Geographer', 'Geologist', 'geology ', 'geophysicist', 'GiS ', 'Groundwater Technician', 'hydrology', 'Inorganic', 'lab technician', 'Materials science', 'micropaleontologist', 'operations geologist', 'paleoclimatology']

In [14]:
len(short_job_title_list_3rd3rd)

41

In [15]:
# replace some of the characters so the search is a bit better:
def cleanCompanies(company_array):
    new_array = []
    for each in company_array:
        #print(each, "and type is ",type(each))
        print(each.find("-",0))
        print(type(each.find("-",0)),"type of each.find('-',0")
        #print(type(each.find("%2C",1)))
        if each.find("%2C",1) is not -1:
            each = each.replace("%2C",",")
            print(each,"= each")
        else:
            each = each
        if each.find("%26",1) is not -1:
            each = each.replace("%26","&")
            #print(each,"= each find")
        else:
            each = each
        ##print(each, "again with each")
        if  each.find("-1",1) is not -1:
            each = each.replace("-1","")
        #    print(each,"= each find")
        else:
            each = each
        if  each.find("-2",0) is not -1:
            each = each.replace("-2","")
            print(each,"= each find")
        else: 
            each = each
        if  each.find("-3",0) is not -1:
            each = each.replace("-3","")
        #    print(each,"= each find")
        else:
            each
        if  each.find("-",1) is not -1:
            each = each.replace("-","+")
            print(each,"= each find")
        else:
            each = each
        if  each.find(" ",1) is not -1:
            each = each.replace(" ","+")
            print(each,"= each find")
        else:
            each = each
        print("each at end",each)
        new_array.extend([each])
    print('new_array',new_array)
    return new_array

------------------------------------------------------------------------------
# 2. Get company names that are hiring for those science related job titles in Houston, TX by scraping job search site(s)
------------------------------------------------------------------------------


=========================================================================================
## A data exploration using beautiful soup
- based on this: https://jessesw.com/Data-Science-Skills/

### The function 'skills_info' takes three arguments; city, state, and job title and returns a list of companies searching for that job title in the last 30 days in Houston, TX

In [16]:
def skills_info(city="Houston", state="TX",job_title='data+scientist'):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
        
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    '''
    #print("test1")   
    final_job = job_title # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)  %22&l=
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['https://www.indeed.com/jobs?q=', final_job, '&l=', final_city,'%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['https://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string

    # website query today (2017-02-25) is = https://www.indeed.com/jobs?q=data+scientist&l=Houston%2CTX
        
    base_url = 'http://www.indeed.com'
    
    #print("test2")  
    try:
        html = urlopen(final_site).read() # Open up the front page of our search first
    except:
        #print('That city/state combination did not have any jobs. Exiting . . .') # In case the city is invalid
        return
    soup = BeautifulSoup(html,"lxml") # Get the html from the first page
    
    # Now find out how many jobs there were
    
    if type(soup.find(id = 'searchCount')) is None or type(soup.find(id = 'searchCount')) == None:
        c
    
    else:
        variableS = soup.find(id = 'searchCount')
        if isinstance(variableS,type(None)):
            all_company_results = []
            return all_company_results
        else:
            print("soup.find(id = 'searchCount') = ",soup.find(id = 'searchCount'))
            print("soup.find(id = 'searchCount') = ",type(soup.find(id = 'searchCount')))

            num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                                # The 'searchCount' object has this
            #print('num_jobs_area before filtering ', num_jobs_area)

            job_numbers = re.findall(b'\d+', num_jobs_area) # Extract the total jobs found from the search result

            #print('job number before filtering ', job_numbers[2])

            job_numbers_decoded = int(job_numbers[2].decode('utf-8'))
            #print('job_numbers_decoded = ',job_numbers_decoded, "type is ",type(job_numbers_decoded))

            #if len(job_numbers[2]) > 3: # Have a total number of jobs greater than 1000
            #    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
            #else:
            #    total_num_jobs = int(job_numbers[2]) 

            total_num_jobs = job_numbers_decoded

            city_title = city
            if city is None:
                city_title = 'Nationwide'

            #print('There were', total_num_jobs, 'jobs found,', city_title) # Display how many jobs were found

            # took out /10 here, which seemed to be assuming always 10 results per page? now 15?
            num_pages = int(float(total_num_jobs/10)) # This will be how we know the number of times we need to iterate over each new
                                              # search result page

            job_descriptions = [] # Store all our descriptions in this list

            # used to be 'xrange' here but I changed it to 'range' because I'm using python3 and xrange is python2 only
            all_company_results = []
            for i in range(1,num_pages+1): # Loop through all of our search result pages
                #print('Getting page', i)

                start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
                current_page = ''.join([final_site, '&start=', start_num])
                # Now that we can view the correct 10 job returns, start collecting the text samples from each

                html_page = urlopen(current_page).read() # Get the page

                page_obj = BeautifulSoup(html_page,"lxml") # Locate all of the job links
                job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
                #print('page_obj',page_obj)  

                #print("link.get('href')",base_url + link.get('href')
                #print("job_link_area.find_all('a')",job_link_area.find_all('a'))

                for a in job_link_area.find_all('a', href=True):
                    #print ("Found the URL:", a['href'])
                    if "/cmp/" in a['href']: 
                        #  if a['href'].find("/cmp/"):
                        #print("a had /cmp/ in it using find = ",a['href'])
                        arrayHrefParts_0 = a['href'].split('/cmp/')
                        #print("aarrayHrefParts_0 = ",arrayHrefParts_0)
                        arrayHrefParts_1 = arrayHrefParts_0[1].split('/')
                        #print("arrayHrefParts_1 = ",arrayHrefParts_1)
                        company = arrayHrefParts_1[0]
                        #print('company = ',company)
                        all_company_results.append(company)
                        #print('all_company_results =',all_company_results)
                    else:
                        continue
                        #print("/cmp/ not found in ,",a['href'])

                #print("[base_url + link.get('href') for link in job_link_area.find_all('a')]",[base_url + link.get('href') for link in job_link_area.find_all('a')])

                #job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')] # Get the URLS for the jobs

                #job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS


                #         for j in xrange(0,len(job_URLS)):
                #             final_description = text_cleaner(job_URLS[j])
                #             if final_description: # So that we only append when the website was accessed correctly
                #                 job_descriptions.append(final_description)
                #             sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 

                sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 

            for each in range(len(all_company_results)):
                if '.html' in all_company_results[each]:
                    #print('all_company_results[each]',all_company_results[each]," each = ",each)
                    new_name_wo_html = all_company_results[each].split('.html')[0]
                    #print('new_name_wo_html',new_name_wo_html)
                    all_company_results[each] = new_name_wo_html  

            all_company_results = list(set(all_company_results))
            #print ('Done with collecting the job postings!')    
            #print ('all_company_results final = ',all_company_results)
            #print ("There were",len(all_company_results)," companies successfully found.")
            all_company_results_json = []
            for each in all_company_results:
                print("each in all_company_results = ",each)
                print("job_title in all_company_results = ",job_title)
                temp_obj_holder = {"company":"nothing","job_title":"nothing"}
                print("temp_obj_holder = ",temp_obj_holder)
                temp_obj_holder["job_title"] = job_title
                temp_obj_holder["company"] = each
                print("temp_obj_holder['job_title'] = ",temp_obj_holder["job_title"])
                all_company_results_json.append(temp_obj_holder)
            return all_company_results_json


### The function 'runMultiple_skills_info' runs the 'skills_info' function multiple times, one for each job title in the list given to 'runMultiple_skills_info' function

In [17]:
def runMultiple_skills_info(array_of_jobTitles=['puppies']):
    all_results = []
    for each in array_of_jobTitles:
        print('each= ',each)
        #temp_results1 = []
        temp_results1 = skills_info("Houston","TX",each)
        if temp_results1 is None:
            continue
        else:
            print('temp_results is ',temp_results1)
            #all_results.extend(temp_results1) 
            all_results.extend(temp_results1)
    return all_results

In [18]:
def saveArray_of_JobSearchResults(arrayOfJobsAndSearchTerm, FilePath):
    toCSV = arrayOfJobsAndSearchTerm
    keys = toCSV[0].keys()
    with open(FilePath, 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(toCSV)
    return arrayOfJobsAndSearchTerm
    

#### Review:

3 functions:
- A. skills_info(city="Houston", state="TX",job_title='data+scientist')
- B. runMultiple_skills_info(array_of_jobTitles=['puppies'])
- C. saveArray_of_JobSearchResults(arrayOfJobsAndSearchTerm = geo_and_geo, FilePath ='../DATA/someFile')

A does the web scraping. B does web scraping searching for each of the items in the input array and combines the resuls. C saves the results as a csv file with two columns. One column header is Job Title used in search. The other column is company name looking for that job title in Houston Texas. 

------------------------------------------------------------------------------
# 3. Get addresses and lat longs associated with each company name in json form
------------------------------------------------------------------------------


### Modules imported at top of this notebook

### using <a href="https://docs.python.org/3/library/configparser.html">configparser</a> to handle config file 

In [19]:
config = configparser.ConfigParser()

In [20]:
configFilePath = '../config.conf'
# configparser.ConfigParser().read(configFilePath)
config.read(configFilePath)

['../config.conf']

### next block sets a variety of useful variables used in the google places API

In [21]:
#### sets key as variable that contains the google-places-api key
key = config['google-places-api']['key1']
#### creates a string used as the prefix to the key in the url used in calling the API service
key_pre = '&key='
#### the base url in the google places API get call
base_url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?query='
#### the base url when calling a next page, in other words, when results are more than 20
next_page_base_url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'

In [22]:
# replace some of the characters so the search is a bit better:
def cleanCompanies(company_array):
    company_array = company_array[1:]
    new_array = []
    for each in company_array:
        job_title = each[1]
        each = each[0]
        #print(each, "and type is ",type(each))
        print(each.find("-",0))
        print(type(each.find("-",0)),"type of each.find('-',0")
        #print(type(each.find("%2C",1)))
        if each.find("%2C",1) is not -1:
            each = each.replace("%2C",",")
            print(each,"= each")
        else:
            each = each
        if each.find("%26",1) is not -1:
            each = each.replace("%26","&")
            #print(each,"= each find")
        else:
            each = each
        ##print(each, "again with each")
        if  each.find("-1",1) is not -1:
            each = each.replace("-1","")
        #    print(each,"= each find")
        else:
            each = each
        if  each.find("-2",0) is not -1:
            each = each.replace("-2","")
            print(each,"= each find")
        else: 
            each = each
        if  each.find("-3",0) is not -1:
            each = each.replace("-3","")
        #    print(each,"= each find")
        else:
            each = each
        if  each.find("-",1) is not -1:
            each = each.replace("-","+")
            print(each,"= each find")
        else:
            each = each
        print("each at end",each)
        new_array.extend([[each,job_title]])
    print('new_array',new_array)
    return new_array

### All the functions that load and clean the company + job title list from csv into memory

In [23]:
def load_listCompanyJobtitle(path):
    df=pd.read_csv(path,header=None)
    company_array = df.values
    company_array_formatted = cleanCompanies(company_array)
    return company_array_formatted

### Next few functions run the company names through the google places API

##### Below is the main function that calls the Google Places API to get the initial results

In [52]:
# function that takes a datset that is 
def callGooglePlacesAPI(search_term,nameOfSearch,job_title):
    print('job_title put into googleAPI call is =',job_title)
    dataset_I = requests.get(base_url+search_term+key_pre+key)
    next_page_result = []
    try:
        next_page_token = dataset_I.json()['next_page_token']
        next_page_result = callNextPageResults(nameOfSearch,next_page_token,job_title)
    except: 
        pass
    #####if 'next_page_token' in dataset_I:
    #next_page_token = dataset_I.json()['next_page_token']
    #print("next_page_token",next_page_token)
    #callNextPageResults(nameOfSearch,next_page_token)
    #####next_page_results = makeFeatureCollectionsFromPlaces(dataset_I['next_page_token'],nameOfSearch).json()['results']
    ##### #else: 
    ####    next_page = "none needed"
    dataset = dataset_I.json()['results']
    #if 'next_page_token' in dataset_I:
    #    dataset.extend(next_page_results)
    #    print("dataset up top",dataset)
    #else:
    #    next_page = "none needed"
    array_of_features = []
    for each in dataset:
        photos = []
        # centerpoint for coordinates
        #print(each)
        lat = each['geometry']['location']['lat']
        long = each['geometry']['location']['lng']
        # properties
        address = each['formatted_address']
        id = each['id']
        name = each['name']
        # photos is an array
        try:
            photos = each['photos']
        except:
            photos = []
            #print("photos , attribute error but kept going")
        place_id = each['place_id']
        try:
            rating = each['rating']
        except:
            rating = 'NA'
            #print("rating , attribute error but kept going")
        reference = each['reference']
        # types is an array
        try:
            types = each['types']
        except:
            types = 'NA'
        testPoint = Point((long, lat))
        test2_geoJSOn = Feature(geometry=testPoint, properties={"job_title":job_title,"commpany_searched_for":search_term ,"name": name,"address":address,"id":id,"photos":photos,"place_id":place_id,"rating":rating,"reference":reference,"types":types})
        array_of_features.extend([test2_geoJSOn])
#     new_FeatureCollection = FeatureCollection(array_of_features) 
#     with open('../DATA/'+nameOfSearch+'_test.geojson', 'w') as f:
#         json.dump(new_FeatureCollection, f)
#     return new_FeatureCollection
    print("array_of_features = ",array_of_features," and next_page_result = ",next_page_result)
    print("types: for "+search_term+" type(next_page_result)= ",type(next_page_result)," and type(array_of_features)=  ",type(array_of_features))
    if next_page_result is None or next_page_result==[]:
        fakeVariable = 3
        #print("no second page")
    else:
        #print("next_page_result passed none test and is ",next_page_result)
        array_of_features.extend(next_page_result)
        #print("array_of_features after extension = ",array_of_features)
    #print("2nd statement of array_of_features = ",array_of_features)
    return array_of_features

##### Below is a secondary function that calls the Google Places API "next page" to get any results further additional to the first 40. 

In [53]:
def callNextPageResults(nameOfSearch,keyForNextPage,job_title):
    time.sleep(5)
    #print('next_page_base_url+"&pagetoken="+keyForNextPage+key_pre+key =',next_page_base_url+"pagetoken="+keyForNextPage+key_pre+key)
    dataset_I = requests.get(next_page_base_url+"pagetoken="+keyForNextPage+key_pre+key)
    #print("dataset_I second page full response",dataset_I.json())
    dataset = dataset_I.json()['results']
    array_of_features = []
    for each in dataset:
        photos = []
        # centerpoint for coordinates
        #print(each)
        lat = each['geometry']['location']['lat']
        long = each['geometry']['location']['lng']
        # properties
        address = each['formatted_address']
        id = each['id']
        name = each['name']
        # photos is an array
        try:
            photos = each['photos']
        except:
            photos = []
            #print("photos , attribute error but kept going")
        place_id = each['place_id']
        try:
            rating = each['rating']
        except:
            rating = 'NA'
            #print("rating , attribute error but kept going")
        reference = each['reference']
        # types is an array
        try:
            types = each['types']
        except:
            types = 'NA'
        testPoint = Point((long, lat))
        test2_geoJSOn = Feature(geometry=testPoint, properties={"job_title":job_title,"commpany_searched_for":search_term ,"name": name,"address":address,"id":id,"photos":photos,"place_id":place_id,"rating":rating,"reference":reference,"types":types})
        array_of_features.extend([test2_geoJSOn])
    #new_FeatureCollection = FeatureCollection(array_of_features) 
    #with open('../DATA/'+nameOfSearch+'_2ndPage_test.geojson', 'w') as f:
    #    json.dump(new_FeatureCollection, f)
    #print("second page results = ",new_FeatureCollection)
    #return new_FeatureCollection
    return array_of_features

##### Below is the a function that runs through a list and calls the two functions above and combines the results into a single result

In [54]:
def create_List_Features(cp_array_formatted):
    array_of_all_features = []
    print("in function create_List_Features, cp_array_formatted[0][0] company?",cp_array_formatted[0][0])
    print("in function create_List_Features, cp_array_formatted[0][1] job_title?",cp_array_formatted[0][1])
    for each in cp_array_formatted:
        #try:
        features_1 = callGooglePlacesAPI(each[0]+"+Houston+Texas","random_for_now",each[1])
        #print("features for a company name =",features_1)
        if array_of_all_features == [] or array_of_all_features is None:
            array_of_all_features = features_1
        else:
            if features_1 is None or features_1 == []:
                pass
            else:
                array_of_all_features.extend(features_1)
        #except:
        #    print("error in except",error)
        
        #print("array_of_all_features = ",array_of_all_features)
    
    return array_of_all_features

##### function that takes a list of features and returns a feature collection

In [27]:
# creates feature collection from list of features
def createFeatureCollection(list_of_features,geographic_limits):
    slimmed_featList = []
    print('geographic_limits =',geographic_limits)
    for each in list_of_features:
        print("each = ",each)
        print("each['geometry']['coordinates']",each['geometry']['coordinates'])
        print("each['geometry']['coordinates'][0] =",each['geometry']['coordinates'][0])
        if each['geometry']['coordinates'][0] > geographic_limits["west_limit"] and each['geometry']['coordinates'][0] < geographic_limits["east_limit"] and each['geometry']['coordinates'][1] < geographic_limits["north_limit"] and each['geometry']['coordinates'][1] > geographic_limits["south_limit"]:
            slimmed_featList.append(each)
    
    
    print('finished the function createFeatureCollection and slimmed featureList is slimmed_featList',slimmed_featList)
    return FeatureCollection(slimmed_featList)



##### function that takes the return from the above function and saves it as a geoJSON file

 30°15'6.44"N   96° 5'17.13"W
  29°16'56.13"N  95°57'46.47"W
   29°21'28.30"N   94°38'24.37"W
    30°26'55.88"N   94°48'6.63"W
    
    north limit = 30°26'55.88      30.44885556
    south limit = 29°16'56.13"N     29.28225833
    west limit = 96° 5'17.13"W     -96.08805556
    east limit = 94°38'24.37"W     -94.64000000
   

In [36]:
# convert feature collection to geoJSON
def convertFeatureCollectionToGeoJSON(featCollection,fileNamePath):
    print('got to the function convertFeatureCollectionToGeoJSON')
    with open(fileNamePath, 'w') as f:
        json.dump(featCollection, f)



#### function that calls the others. It takes as arguments the list of companies and job_titles & the path to save the resulting geojson to

In [37]:
def make_geojson_from_CompanyJobList(company_list_formatted,path,geographic_limits):
    #### function that calls the google api and returns results in terms of a list of feature
    print("make_geojson_from_CompanyJobList(company_list_formatted = ",company_list_formatted)
    list_of_features = create_List_Features(company_list_formatted)
    #### function that creates a feature collection from the list of features
    featCollection = createFeatureCollection(list_of_features,geographic_limits)
    #### function that takes the feature collection and file path and saves the feature collection as a geoJSON file
    convertFeatureCollectionToGeoJSON(featCollection,path)
    return featCollection
    

### The geographic limits that trim down the results to only things around Houston as some jobs are advertised in Houston that aren't located in Houston

In [38]:
geographic_limits = {"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

------------------------------------------------------------------------------
## Run Everything As Single Function
------------------------------------------------------------------------------

In [39]:
def getJSON_from_jobsList_everything(array_of_job_Titles,FilePath,path,geographic_limits):
    #### returns list of companies advertising each job in an array of job titles
    list_of_CompaniesAndJobs = runMultiple_skills_info(array_of_job_Titles)
    #### saves the list of lists from above... and formats it a bit further in prep for next step
    Array_of_JobCompany_edA = saveArray_of_JobSearchResults(list_of_CompaniesAndJobs, FilePath)
    print("in geoJSON_from_jobsList_everything, Array_of_JobCompany is =",Array_of_JobCompany_edA)
    ##### opens and cleans the list of companies 
    Array_of_JobCompany_cln = load_listCompanyJobtitle(FilePath)
    #### runs the companies through the google places API
    #### combines the results into an array of features
    #### turns that into a feature collection 
    #### then saves as geojson
    feature_collection = make_geojson_from_CompanyJobList(Array_of_JobCompany_cln,path,geographic_limits)
    #### returns the previously created feature collection for testing of result without reloading from geojson
    return feature_collection



short_job_title_list_VeryShort2

In [45]:
Input_List_JobTitles = short_job_title_list_VeryShort3

FilePath = '../DATA/test_jobTitleCompany_Short2_v1.csv'

path="../DATA/test_geojson_Everything_Short2_v1.geojson"

geographic_limits ={"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

In [46]:
funcArg = {"ListJobTitles":short_job_title_list_VeryShort3,"pCSV":'../DATA/test_jobTitleCompany_Short2_v1.csv',"pGeoJSON":"../DATA/test_geojson_Everything_Short2_v1.geojson","geo_lim":{"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}}

In [47]:
featCollect_short2  = getJSON_from_jobsList_everything(funcArg["ListJobTitles"],funcArg["pCSV"],funcArg["pGeoJSON"],funcArg["geo_lim"])
featCollect_short2][0]][0]

each=  Psychologist
soup.find(id = 'searchCount') =  <div id="searchCount">Jobs 1 to 10 of 126</div>
soup.find(id = 'searchCount') =  <class 'bs4.element.Tag'>
each in all_company_results =  Harris-County
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Clinical-Psychology-Doctoral-Interns-in-Texas
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Rescare
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Lamb-Behavioral-Health-Center%2C-LLC
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder[

In [55]:
Input_List_JobTitles = short_job_title_list_VeryShort3

FilePath = '../DATA/test_jobTitleCompany_Short3_v1.csv'

path="../DATA/test_geojson_Everything_Short3_v1.geojson"

geographic_limits ={"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

In [56]:
funcArg = {"ListJobTitles":short_job_title_list_VeryShort3,"pCSV":'../DATA/test_jobTitleCompany_Short3_v1.csv',"pGeoJSON":"../DATA/test_geojson_Everything_Short3_v1.geojson","geo_lim":{"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}}

In [57]:
featCollect_short3  = getJSON_from_jobsList_everything(funcArg["ListJobTitles"],funcArg["pCSV"],funcArg["pGeoJSON"],funcArg["geo_lim"])

each=  Psychologist
soup.find(id = 'searchCount') =  <div id="searchCount">Jobs 1 to 10 of 128</div>
soup.find(id = 'searchCount') =  <class 'bs4.element.Tag'>
each in all_company_results =  Harris-County
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Clinical-Psychology-Doctoral-Interns-in-Texas
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Rescare
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder['job_title'] =  Psychologist
each in all_company_results =  Lamb-Behavioral-Health-Center%2C-LLC
job_title in all_company_results =  Psychologist
temp_obj_holder =  {'company': 'nothing', 'job_title': 'nothing'}
temp_obj_holder[

# Running this 3 sets of the job titles list to minimize possible problem of wireless dropping in middle:
short_job_title_list_1st3rd
short_job_title_list_2nd3rd
short_job_title_list_3rd3rd 

Input_List_JobTitles = short_job_title_list_1st3rd

FilePath = '../DATA/test_jobTitleCompany_1of3_v1.csv'

path="../DATA/test_geojson_Everything_1of3_v1.geojson"

geographic_limits ={"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

In [None]:
funcArg = {"ListJobTitles":short_job_title_list_1st3rd,"pCSV":'../DATA/test_jobTitleCompany_1of3_v1.csv',"pGeoJSON":"../DATA/test_geojson_Everything_1of3_v1.geojson","geo_lim":{"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}}

In [None]:
featCollect_1of3  = getJSON_from_jobsList_everything(funcArg["ListJobTitles"],funcArg["pCSV"],funcArg["pGeoJSON"],funcArg["geo_lim"])

Input_List_JobTitles = short_job_title_list_2nd3rd

FilePath = '../DATA/test_jobTitleCompany_2of3_v1.csv'

path="../DATA/test_geojson_Everything_2of3_v1.geojson"

geographic_limits ={"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

In [None]:
funcArg = {"ListJobTitles":short_job_title_list_2nd3rd,"pCSV":'../DATA/test_jobTitleCompany_2of3_v1.csv',"pGeoJSON":"../DATA/test_geojson_Everything_2of3_v1.geojson","geo_lim":{"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}}

In [None]:
featCollect_2of3  = getJSON_from_jobsList_everything(funcArg["ListJobTitles"],funcArg["pCSV"],funcArg["pGeoJSON"],funcArg["geo_lim"])

Input_List_JobTitles = short_job_title_list_3rd3rd 

FilePath = '../DATA/test_jobTitleCompany_2of3_v1.csv'

path="../DATA/test_geojson_Everything_2of3_v1.geojson"

geographic_limits ={"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}

In [None]:
funcArg = {"ListJobTitles":short_job_title_list_3rd3rd,"pCSV":'../DATA/test_jobTitleCompany_3of3_v1.csv',"pGeoJSON":"../DATA/test_geojson_Everything_3of3_v1.geojson","geo_lim":{"north_limit":30.454961,"south_limit":28.956857,"west_limit":-96.206159,"east_limit":-94.64000000}}

In [None]:
featCollect_3of3  = getJSON_from_jobsList_everything(funcArg["ListJobTitles"],funcArg["pCSV"],funcArg["pGeoJSON"],funcArg["geo_lim"])