In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime 

In [2]:
def get_str(tup):
    if tup[0] == '':
        return tup[1]
    return tup[0]

In [3]:
def write_files(companies, titles, texts, start_point):
    end_point = len(companies) - 1
    for i in range(start_point, end_point):
        post_id = 'postings/' + str(i) + '-' + re.sub("[^A-Za-z]", "", companies[i].strip()) + '-' + re.sub("[^A-Za-z]", "", titles[i].strip()) + '.rtf'
        out_file = open(post_id,'w', encoding="utf-8", errors = 'ignore')
        out_file.write(texts[i])
        out_file.close()

In [4]:
def get_text_indeed(companies, titles, urls):
    text_lst = []
    for url in urls:
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser")).replace("\n", "")
        paragraph_text = r'<p>(.*?)</p>'
        list_text = r'<li>(.*?)</li>'
        clean = re.compile('<.*?>')
        text = " ".join([re.sub(clean, '', get_str(x)).replace("amp;", "") for x in re.findall(paragraph_text + "|" + list_text, soup)])
        text_lst.append(text)
    return text_lst

In [5]:
def add_terms_indeed(df, terms):
    for term in terms:
        url = "https://www.indeed.com/jobs?q=" + term.replace(" ", "+") + "&fromage=1"
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser"))

        title_match = 'rel="noopener nofollow" target="_blank" title="([^"]+)'
        titles = [x.replace("amp;", "") for x in re.findall(title_match, soup)]

        match_1 = r'<span class="company">\n([^<]+)'
        match_2 = r'rel="noopener" target="_blank">\n([^<]+)'

        companies = [get_str(x).replace("amp;", "") for x in re.findall(match_1 + '|' + match_2, soup)]

        url_match = r'data-tn-element="jobTitle" href="([^"]+)' 
        urls = ["https://www.indeed.com" + x.replace("amp;", "") for x in re.findall(url_match, soup)]

        texts = get_text_indeed(companies, titles, urls)
        
        location_match = r'data-rc-loc="([^"]+)'
        locations = re.findall(location_match, soup)

        new_df = pd.DataFrame()

        new_df['Platform'] = ["Indeed"] * len(companies)
        new_df['Search_Term'] = [term] * len(companies)
        new_df['Title'] = titles
        new_df['Company'] = companies
        new_df['Location'] = locations
        new_df['URL'] = urls
        new_df['Text'] = texts
        new_df['Date_Queried'] = [datetime.now().date()] * len(companies)

        df = df.append(new_df).drop_duplicates().reset_index(drop=True)
        
    return df

In [6]:
def get_text_linkedin(companies, titles, urls):
    text_lst = []
    for url in urls:
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser")).replace("\n", "")
        match = r'<br/>(.*?)<br/>'  
        clean = re.compile('<.*?>')
        text = " ".join([re.sub(clean, '', x).replace("amp;", "") for x in re.findall(match, soup)])
        text_lst.append(text)
    return text_lst

In [7]:
def add_terms_linkedin(df, terms):
    for term in terms:
        url =("https://www.linkedin.com/jobs/search?keywords=" + term.replace(" ", "%20") + 
              "&location=United%20States&trk=public_jobs_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&f_TP=1")
        myRequest = requests.get(url)
        soup = str(BeautifulSoup(myRequest.text, "html.parser"))

        title_match = r'<span class="screen-reader-text">([^<]+)'
        titles = [x.replace("amp;", "") for x in re.findall(title_match, soup)]

        company_match = r'alt="([^"]+)'
        companies = [x.replace("amp;", "") for x in re.findall(company_match, soup)][:-1]

        url_match = r'href="https://www.linkedin.com/jobs/view([^"]+)' 
        urls = ["https://www.linkedin.com/jobs/view" + x for x in re.findall(url_match, soup)]

        location_match = r'<span class="job-result-card__location">([^<]+)'
        locations = re.findall(location_match, soup)

        new_df = pd.DataFrame()
        new_df['Platform'] = ["LinkedIn"] * len(companies)
        new_df['Search_Term'] = [term] * len(companies)
        new_df['Title'] = titles
        new_df['Company'] = companies
        new_df['Location'] = locations
        new_df['URL'] = urls
        new_df['Date_Queried'] = [datetime.now().date()] * len(companies)

        df = df.append(new_df).drop_duplicates().reset_index(drop=True)
        
    return df

In [9]:
key_terms = ["Trust", "Privacy", "Diversity", "Equity", 
"Equality", "Inclusion", "Ethics","Policy", "Educational", 
"Compliance","Emerging","Responsible", "Accountability"]

search_terms = [x + " Technology" for x in key_terms]

In [10]:
#New DataFrame code below
#df = pd.DataFrame(columns = ['Platform','Search_Term','Title','Company','Location','URL', 'Text','Date_Queried'])
#start_point = 0

df = pd.read_csv("postings.csv")
start_point = df.shape[0]

In [11]:
#df = add_terms_linkedin(df, search_terms)

In [12]:
df = add_terms_indeed(df, search_terms)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [13]:
df = df.drop_duplicates(subset=['URL']).reset_index(drop=True)

In [14]:
write_files(df["Company"], df["Title"], df["Text"], start_point)

In [15]:
df.to_csv("postings.csv")