# House-keeping

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time

from IPython.display import Audio, display
import pickle

# Handy functions

In [2]:
def allDone():
    '''Play a short audio clip in the notebook output.

    Intended as an audible "finished" signal: call it as the last statement
    of a long-running cell. The clip is loaded from a remote URL and is set
    to start playing automatically once displayed.
    '''
    clip_url = 'https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'
    clip = Audio(url=clip_url, autoplay=True)
    display(clip)
    

# Collect job postings for predefined job categories

In [3]:
def _jobDescription(suburl, timeout):
    '''Return the description text of one job advert page, or None.

    None is returned when the page cannot be downloaded or when it contains
    no description block, so the caller always gets exactly one value per
    advert link and its parallel lists stay aligned.
    '''
    try:
        subpage = requests.get(suburl, timeout = timeout)
        subpage.raise_for_status()
    except requests.exceptions.RequestException:
        print('--- Failed to retrieve sub-URL ---')
        print('url: ', suburl)
        return None

    subsoup = BeautifulSoup(subpage.text, 'html.parser')
    parts = [des.get_text() for des in subsoup.select('div[class*="jobsearch-JobComponent-description"]')]
    # Several matching blocks are merged into one text so that a single
    # advert can never contribute more than one row.
    return '\n'.join(parts) if parts else None


def indeedScrape(tSearch = 'data scientist', nMax = 50, timeout = 30):
    '''Scrape Indeed UK search results for a query into a DataFrame.

    Result pages are requested with start offsets 0, 10, 20, ... below
    nMax. For every page the location, company name, job title and summary
    are read from the listing, and each advert link is followed to fetch
    the full description.

    Parameters
    ----------
    tSearch : str
        Free-text search query; it is URL-encoded before being sent.
    nMax : int
        Exclusive upper bound for the `start` offset (step 10). A value
        <= 0 performs no requests and returns an empty frame.
    timeout : float
        Seconds to wait for each HTTP request before treating it as failed.

    Returns
    -------
    pandas.DataFrame
        Columns: location, company_name, job_title, summary, full_info, ref.
        `full_info` is None for adverts whose page could not be retrieved
        or had no description block.

    Notes
    -----
    Pages that fail to download are reported with print() and skipped.
    Fields are collected as page-level parallel lists and zipped, so rows
    are truncated to the shortest list; a message is printed whenever the
    lists differ in length, because rows may then be misaligned.
    '''
    # Local import keeps the function self-contained within the notebook.
    from urllib.parse import quote_plus

    columns = ['location', 'company_name', 'job_title', 'summary', 'full_info', 'ref']

    # quote_plus encodes spaces as '+' (as before) and also escapes
    # characters such as '&' or '+' that would otherwise corrupt the query.
    metaUrl = 'https://www.indeed.co.uk/jobs?q=%(search)s&l=United+Kingdom&radius=100&start=' % {'search': quote_plus(tSearch)}

    records = []
    for start in range(0, nMax, 10):
        url = metaUrl + str(start)
        try:
            page = requests.get(url, timeout = timeout)
            page.raise_for_status()
        except requests.exceptions.RequestException:
            print('---Failed to retrieve---')
            print('url:', url)
            continue

        soup = BeautifulSoup(page.text, 'html.parser')

        ## metadata from mainpage
        # company name and job title come from each result card (class "row")
        companies, jobs = [], []
        for div in soup.find_all(name = 'div', attrs = {'class':'row'}):
            company = div.find_all(name = 'span', attrs = {'class':'company'})
            if not company:
                # Fall back to the alternative markup only when the primary
                # company span is absent from this card.
                company = div.find_all(name = 'span', attrs = {'class':'result-link-source'})
            companies.extend(span.text.strip() for span in company)
            jobs.extend(a['title'] for a in div.find_all(name = 'a', attrs = {'data-tn-element':'jobTitle'}))

        # extract location
        locations = [span.text for span in soup.find_all('span', attrs = {'class':'location'})]

        # extract summaries
        summaries = [div.text.strip() for div in soup.find_all('div', attrs = {'class':'summary'})]

        ## crawl to subpages
        descriptions, link_list = [], []
        for adlink in soup.select('a[onmousedown*="return rclk(this,jobmap["]'):
            # NOTE(review): search pages come from indeed.co.uk but advert
            # links are built on indeed.com; kept as in the original,
            # confirm this is intended.
            suburl = "https://www.indeed.com" + adlink['href']
            link_list.append(suburl)
            descriptions.append(_jobDescription(suburl, timeout))

        fields = (locations, companies, jobs, summaries, descriptions, link_list)
        counts = [len(field) for field in fields]
        if len(set(counts)) > 1:
            print('--- Field counts differ; rows truncated to the shortest list ---')
            print('url:', url)
            print('counts:', dict(zip(columns, counts)))
        records.extend(zip(*fields))

    # Build the frame once instead of growing it page by page
    # (DataFrame.append no longer exists in pandas >= 2.0).
    return pd.DataFrame(records, columns = columns)

In [4]:
# Financial part - Done!
# # 13-2099.01
# Financial_Quantitative_Analysts = indeedScrape(tSearch='Financial Quantitative Analysts', nMax=300)
# Financial_Quantitative_Analysts['soc'] = '13-2099.01'
# # 13-2051.00
# Financial_Analysts = indeedScrape(tSearch='Financial Analysts', nMax=1000)
# Financial_Analysts['soc'] = '13-2051.00'
# # 13-2052.00
# Financial_Advisors = indeedScrape(tSearch='Financial Advisors', nMax=2000)
# Financial_Advisors['soc'] = '13-2052.00'

# df = pd.concat([Financial_Analysts, Financial_Advisors, Financial_Quantitative_Analysts], ignore_index=True)
# dirPData = '../data/'
# f_name = dirPData + 'financial_jobs_with_soc.pickle'
# with open(f_name, "wb") as f:
#     pickle.dump(df, f)
    
# allDone()

In [5]:
# Telecommunication
# Each scrape result is tagged with the occupation code noted above its call;
# the code is stored as a constant 'soc' column on every row.

# 17-2071.00
Electrical_Engineers = indeedScrape('Electrical Engineers', 1000).assign(soc = '17-2071.00')

# 17-2141.00
Mechanical_Engineers = indeedScrape('Mechanical Engineers', 1000).assign(soc = '17-2141.00')

In [10]:
# Destination of the pickled result, relative to the notebook's directory.
dirPData = '../data/'
f_name = ''.join((dirPData, 'telecom_jobs_with_soc.pickle'))

# Stack both occupation frames into one table with a fresh 0..n-1 index.
df = pd.concat((Electrical_Engineers, Mechanical_Engineers), ignore_index = True)

# Serialise the combined table to disk.
with open(f_name, mode = 'wb') as f:
    pickle.dump(df, f)