In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime 

In [2]:
def get_str(tup):
    if tup[0] == '':
        return tup[1]
    return tup[0]

In [3]:
def write_files(companies, titles, texts):
    end_point = len(companies) 
    for i in range(0, end_point):
        post_id = 'postings/' + str(i) + '-' + re.sub("[^A-Za-z]", "", companies[i].strip()) + '-' + re.sub("[^A-Za-z]", "", titles[i].strip()) + '.rtf'
        out_file = open(post_id,'w', encoding="utf-8", errors = 'ignore')
        out_file.write(texts[i])
        out_file.close()

In [4]:
def get_text_indeed(companies, titles, urls):
    text_lst = []
    edu_lst = []
    for url in urls:
        education = []
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser")).replace("\n", "")
        paragraph_text = r'<p>(.*?)</p>'
        list_text = r'<li>(.*?)</li>'
        clean = re.compile('<.*?>')
        text = " ".join([re.sub(clean, '', get_str(x)).replace("amp;", "") for x in re.findall(paragraph_text + "|" + list_text, soup)])
        if len(re.findall("(?i) GED |High School", text)) > 0:
            education.append("High School")
        if len(re.findall(" BS | BA | B.S. | B.A. |Bachelor", text)) > 0:
            education.append("Bachelor's")
        if len(re.findall(" MS | M.S. |Master", text)) > 0:
            education.append("Master's")
        if len(re.findall("PhD | Ph.D. |Doctorate", text)) > 0:
            education.append("PhD")
        text_lst.append(text)
        edu_lst.append(" ".join(education))
        
    return text_lst, edu_lst

In [5]:
def add_terms_indeed(df, terms):
    for term in terms:
        url = "https://www.indeed.com/jobs?q=" + term.replace(" ", "+") + "&fromage=1"
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser"))

        title_match = 'rel="noopener nofollow" target="_blank" title="([^"]+)'
        titles = [x.replace("amp;", "") for x in re.findall(title_match, soup)]

        match_1 = r'<span class="company">\n([^<]+)'
        match_2 = r'rel="noopener" target="_blank">\n([^<]+)'

        companies = [get_str(x).replace("amp;", "") for x in re.findall(match_1 + '|' + match_2, soup)]

        url_match = r'data-tn-element="jobTitle" href="([^"]+)' 
        urls = ["https://www.indeed.com" + x.replace("amp;", "") for x in re.findall(url_match, soup)]

        texts, educations = get_text_indeed(companies, titles, urls)
        location_match = r'data-rc-loc="([^"]+)'
        locations = re.findall(location_match, soup)

        new_df = pd.DataFrame()

        new_df['Platform'] = ["Indeed"] * len(companies)
        new_df['Search_Term'] = [term] * len(companies)
        new_df['Title'] = titles
        new_df['Company'] = companies
        new_df['Location'] = locations
        new_df['URL'] = urls
        new_df['Text'] = texts
        new_df['Education'] = educations
        new_df['Date_Queried'] = [datetime.now().date()] * len(companies)

        df = df.append(new_df, sort = False).drop_duplicates().reset_index(drop=True)
        
    return df

In [6]:
def get_info_tjfg(urls):
    titles = [] 
    companies = []  
    locations = []  
    educations = []  
    texts = [] 
    for url in urls:
        education = []
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser"))
        paragraph_text = r'<p>(.*?)</p>'
        list_text = r'<li>(.*?)</li>'
        clean = re.compile('<.*?>')
        text = " ".join([re.sub(clean, '', get_str(x)).replace("amp;", "") for x in re.findall(paragraph_text + "|" + list_text, soup)])
        texts.append(text)
        title_match = '<h1 itemprop="title">(.*?)</h1>'
        titles.append(re.findall(title_match, soup)[0])
        loc_match = '<span itemprop="jobLocation">(.*?)</span>'
        locations.append(re.findall(loc_match, soup)[0])
        company_match = '>(.*?)</a></span>'
        companies.append(re.findall(company_match, soup)[0])
        if len(re.findall("(?i) GED |High School", text)) > 0:
            education.append("High School")
        if len(re.findall(" BS | BA | B.S. | B.A. |Bachelor", text)) > 0:
            education.append("Bachelor's")
        if len(re.findall(" MS | M.S. |Master", text)) > 0:
            education.append("Master's")
        if len(re.findall("PhD | Ph.D. |Doctorate", text)) > 0:
            education.append("PhD")
        educations.append(" ".join(education))
    return titles, companies, locations, educations, texts

In [7]:
def add_terms_tjfg(df, terms):
    for term in terms:
        url = url = "https://techjobsforgood.com/?q=" + term
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser"))
        soup = re.sub("A-1", "", soup)
        
        url_match = "/jobs/(\d+)"
        urls = list(set(["http://techjobsforgood.com/jobs/" + x + "/" for x in re.findall(url_match, soup)]))
        
        titles, companies, locations, educations, texts = get_info_tjfg(urls)

        new_df = pd.DataFrame()

        new_df['Platform'] = ["TJFG"] * len(companies)
        new_df['Search_Term'] = [term] * len(companies)
        new_df['Title'] = titles
        new_df['Company'] = companies
        new_df['Location'] = locations
        new_df['URL'] = urls
        new_df['Text'] = texts
        new_df['Education'] = educations
        new_df['Date_Queried'] = [datetime.now().date()] * len(companies)

        df = df.append(new_df, sort = False).drop_duplicates().reset_index(drop=True)
        
    return df

In [8]:
search_terms = ["Engineer", "Analyst", "Research"]

In [9]:
#New DataFrame code below
#df = pd.DataFrame(columns = ['Platform','Search_Term','Title','Company','Location',
#                             'URL','Education','Text','Date_Queried'])
#start_point = 0

df = pd.read_csv("ectj_postings.csv")

In [10]:
df = add_terms_tjfg(df, search_terms)

In [11]:
df = add_terms_indeed(df, search_terms)

In [14]:
df = df.drop_duplicates(subset=['URL']).reset_index(drop=True)
df = df.loc[df["Text"].notna()].reset_index(drop=True)
df = df.loc[df["Text"] != ""].reset_index(drop=True)

In [15]:
#write_files(df["Company"], df["Title"], df["Text"])

In [16]:
df.tail()

Unnamed: 0,Platform,Search_Term,Title,Company,Location,URL,Education,Text,Date_Queried
273,Indeed,Research,"Research Assistant l, 40 Hours, Days, BWH - Sl...",Brigham & Women's Hospital(BWH),"Boston, MA",https://www.indeed.com/rc/clk?jk=b6948d92e43b1...,,,2020-09-09
274,Indeed,Research,Assistant Research Scientist,New York University,"New York, NY",https://www.indeed.com/company/Center-for-Lati...,Bachelor's,To provide a full range of research related su...,2020-09-09
275,Indeed,Research,Research Coordinator,Texas State University,"Round Rock, TX",https://www.indeed.com/rc/clk?jk=6835710be0e13...,,This position is located on the Round Rock Cam...,2020-09-09
276,Indeed,Research,Senior Research Associate (6610U) - Terner Cen...,University of California Berkeley,"Berkeley, CA",https://www.indeed.com/rc/clk?jk=98b99853d96aa...,Bachelor's,"Exceptional organizational, project management...",2020-09-09
277,Indeed,Research,Epidemiologist II,City & County of San Francisco,"San Francisco, CA",https://www.indeed.com/rc/clk?jk=6533fc9170313...,,2803 Epidemiologist II Department of Public He...,2020-09-09


In [18]:
df.to_csv("ectj_postings.csv", index = False)