In [3]:
import sys
import requests as rq
from bs4 import BeautifulSoup as bs
from time import sleep
from time import time
from random import randint
from warnings import warn
import json
import pandas as pd

Most websites do not leave job postings for more than 2-5 months hence getting job posting of previous years from the jobsites directly is impossible. 

To get old job postings, I'm extracting data from a web archive called [the wayback machine](https://archive.org/web/). The drawback to this is that the links are not clickable hence being able to extract the job posting descriptions might be difficult.

# Scraping Indeed job site 

In [4]:
roles = [
    'Marketing Technologist',
    'SEO Consultant',
    'Web analytics Developer',
    'Digital Marketing Manager',
    'Social media manager',
    'Content Manager',
    'Information Architect',
    'UX designer',
    'UI Designer',
    'Front end designer',
    'Front end developer',
    'Mobile Developer',
    'Full stack developer',
    'Software Developer',
    'WordPress Developer',
    'Python Developer',
    'Systems Engineer',
    'Data Architect',
    'Database Administrator',
    'Data Analyst', 
    'Data scientist',
    'Cloud Architect',
    'DevOps Manager',
    'Agile project manager',
    'Product Manager',
    'Security specialist',
    'QA (Quality Assurance) specialist',
    'Game developer',
    'Computer Graphics animator',
    'Information security analyst',
    'Network and system administrator',
    'Product owner'

]

In [7]:
def check_availability(link):
  url = f'http://archive.org/wayback/available?url={link}'
  urls = rq.get(url).text
  parse_url = json.loads(urls)  
  return parse_url

In [8]:
check_availability('www.indeed.com/jobs?q=data+scientist')

{'url': 'www.indeed.com/jobs?q=data scientist',
 'archived_snapshots': {'closest': {'status': '200',
   'available': True,
   'url': 'http://web.archive.org/web/20201112020605/https://www.indeed.com/jobs?q=data%20scientist',
   'timestamp': '20201112020605'}}}

In [22]:
url = f'http://web.archive.org/cdx/search/cdx?url=www.indeed.com/jobs?q=data+scientist&amp;explvl=entry_level&from=20130101&to=20230215&output=json'
urls = rq.get(url).text
parse_url = json.loads(urls)

In [33]:
def get_archive_link(link):
  url = f'http://web.archive.org/cdx/search/cdx?url={link}&from=20130101&to=20230215&output=json'
  urls = rq.get(url).text
  parse_url = json.loads(urls) #parses the JSON from urls.
  # print(parse_url)
  url_list = []
  for i in range(1,len(parse_url)):
    orig_url = parse_url[i][2]
    tstamp = parse_url[i][1]
    waylink = tstamp+'/'+orig_url
    url_list.append(waylink)
  # print(url_list)

  ## Compiles final url pattern.
  final_list = []
  for url in url_list:
    final_url = 'https://web.archive.org/web/'+url
    final_list.append(final_url)
    # print(final_list)

  return final_list

In [6]:
# extract company
def extract_company(div): 
    company = div.find_all(name="span", attrs={"class":"company"})
    if len(company) > 0:
      for b in company:
        return (b.text.strip())
    else:
      sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
      for span in sec_try:
          return (span.text.strip())
    return 'NOT_FOUND'


# extract job salary
def extract_salary(div): 
    salaries = []
    try:
      return (div.find('nobr').text)
    except:
      try:
        div_two = div.find(name='div', attrs={'class':'salary no-wrap'})
        div_three = div_two.find('div')
        salaries.append(div_three.text.strip())
        return salaries
      except:
        try:
          div_two = div.find(name='div', attrs={'class':'sjcl'})
          div_three = div_two.find('div')
          salaries.append(div_three.text.strip())
          return salaries
        except:
          return ('NOT_FOUND')
    return 'NOT_FOUND'


# extract job location
def extract_location(div):
  for span in div.findAll('span', attrs={'class': 'location'}):
    return (span.text)
  return 'NOT_FOUND'


# extract job title
def extract_job_title(div):
  for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
    return (a['title'])
  return('NOT_FOUND')


# extract jd summary
def extract_summary(div): 
  spans = div.findAll('span', attrs={'class': 'summary'})
  for span in spans:
    return (span.text.strip())
  return 'NOT_FOUND'
 

# extract link of job description 
def extract_link(div): 
  for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
    return (a['href'])
  return('NOT_FOUND')


# extract date of job when it was posted
def extract_date(div):
  try:
    spans = div.findAll('span', attrs={'class': 'date'})
    for span in spans:
      return (span.text.strip())
  except:
    return 'NOT_FOUND'
  return 'NOT_FOUND'


# extract full job description from link
def extract_fulltext(url):
  try:
    page = rq.get('http://www.indeed.com' + url)
    soup = bs(page.text, "lxml", from_encoding="utf-8")
    spans = soup.findAll('span', attrs={'class': 'summary'})
    for span in spans:
      return (span.text.strip())
  except:
    return 'NOT_FOUND'
  return 'NOT_FOUND'

In [7]:
# define dataframe columns
df = pd.DataFrame(columns = ['unique_id', 'job_qry','job_title', 
                             'company_name', 'location', 'summary', 
                             'salary', 'link', 'date', 'full_text'])

In [None]:
for role in roles:
  role_ = role.lower().replace(' ', '+')
  link = f'www.indeed.com/jobs?q={role_}&amp;explvl=entry_level'
  archive_url_list = get_archive_link(link)
  for url in archive_url_list:
    for i in range(3): #retry 3 times if connection error
      while True:
        try:
          pg = rq.get(url).text
          sleep(3) #ensuring 5 seconds sleep after every grab
        except ConnectionError:
          sleep(3)
          continue
        break

    soup = bs(pg,'html.parser')
    divs = soup.find_all(name="div", attrs={"class":"row"})

    cnt = 0
    for div in divs:
      #specifying row num for index of job posting in dataframe
      num = (len(df) + 1) 
      cnt = cnt + 1
      #job data after parsing
      job_post = [] 

      #append unique id
      job_post.append(div['id'])

      #append job qry
      job_post.append(role)

      #grabbing job title
      job_post.append(extract_job_title(div))

      #grabbing company
      job_post.append(extract_company(div))

      #grabbing location name
      job_post.append(extract_location(div))

      #grabbing summary text
      job_post.append(extract_summary(div))

      #grabbing salary
      job_post.append(extract_salary(div))

      #grabbing link
      link = extract_link(div)
      job_post.append(link)

      #grabbing date
      job_post.append(extract_date(div))

      #grabbing full_text
      job_post.append(extract_fulltext(link))

      #appending list of job post info to dataframe at index num
      df.loc[num] = job_post
  roles.remove(role)
  print(roles)

  sleep(5)

df.to_csv('job_data_indeed.csv', index=False)

In [None]:
# todo 
# check availability of all the websites and their links 
# get salary estimates for the years 

In [None]:
df

In [None]:
df.sample(10)

Unnamed: 0,unique_id,job_qry,job_title,company_name,location,summary,salary,link,date,full_text
682,p_f06e91bfa997ad1f,Data Architect,Data Warehouse Architect,AffluentTEK,"Richmond, VA",NOT_FOUND,[AffluentTEK],/web/20190708124257/https://www.indeed.com/rc/...,1 hour ago,NOT_FOUND
907,pj_47c11da7428e0b9f,Data Analyst,Junior Data Analyst,Abbyson Living LLC,"Moorpark, CA 93021",Junior Data Analyst. Data analysts work with l...,NOT_FOUND,/web/20171123153424/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
71,pj_cec48dfd230b35f0,Social media manager,Social Media Manager,Luv N Care,NOT_FOUND,NOT_FOUND,[Luv N Care],/web/20190805185449/https://www.indeed.com/pag...,1 hour ago,NOT_FOUND
952,pj_dd9ac3669ecaa314,Data Analyst,Clinical Trials Data Analyst I,Baylor Scott & White Health,"Temple, TX",Works in-tandem with Clinical Trails Data Anal...,NOT_FOUND,/web/20171228171517/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
148,pj_311e2f802189c34c,Front end developer,JavaScript Developer,Indeed Prime,"Portland, OR",Apply to 100+ top companies with 1 simple appl...,NOT_FOUND,/web/20170901193501/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
107,p_58ae0d22ebaabfcf,Front end developer,"UI Developer - W2 (H1 Transfer, GC and USC)",AAA Global Technologies,"Atlanta, GA","UI Developer - W2 (H1 Transfer, GC and USC). T...",NOT_FOUND,/web/20170811223815/https://www.indeed.com/com...,1 day ago,NOT_FOUND
687,p_6d0ab5dd347455d5,Data Architect,Data Architect,Core Software,"Brooklyn, NY",NOT_FOUND,[Core Software\n\n\n2 reviews],/web/20190708124257/https://www.indeed.com/rc/...,30+ days ago,NOT_FOUND
497,p_b39789d5af33a063,Software Developer,Junior Software Development Engineer,Arxan Technologies,"San Francisco, CA 94108 (Financial District area)",Deliver quality software within the committed ...,NOT_FOUND,/web/20171123163650/https://www.indeed.com/rc/...,23 days ago,NOT_FOUND
734,p_d9038819923f6e99,Data Analyst,Data Quality Analyst,NerdWallet,"San Francisco, CA",NerdWallet is seeking a Data Quality Analyst t...,NOT_FOUND,/web/20170811210024/https://www.indeed.com/rc/...,3 days ago,NOT_FOUND
991,p_7c071c09ee23b304,Data Analyst,Junior Data Analyst,Good Eggs,"San Francisco, CA",Support data gathering and provide review and ...,NOT_FOUND,/web/20180118164443/https://www.indeed.com/rc/...,30 days ago,NOT_FOUND
