In [1]:
import sys
import requests as rq
from bs4 import BeautifulSoup as bs
from time import sleep
from time import time
from random import randint
from warnings import warn
import json
import pandas as pd

Most websites do not leave job postings for more than 2-5 months hence getting job posting of previous years from the jobsites directly is impossible. 

To get old job postings, I'm extracting data from a web archive called [the wayback machine](https://archive.org/web/). The drawback to this is that the links are not clickable hence being able to extract the job posting descriptions might be difficult.

# Scraping Indeed job site 

In [2]:
roles = [
    'Marketing Technologist',
    'SEO Consultant',
    'Web analytics Developer',
    'Digital Marketing Manager',
    'Social media manager',
    'Content Manager',
    'Information Architect',
    'UX designer',
    'UI Designer',
    'Front end designer',
    'Front end developer',
    'Mobile Developer',
    'Full stack developer',
    'Software Developer',
    'WordPress Developer',
    'Python Developer',
    'Systems Engineer',
    'Data Architect',
    'Database Administrator',
    'Data Analyst', 
    'Data scientist',
    'Cloud Architect',
    'DevOps Manager',
    'Agile project manager',
    'Product Manager',
    'Security specialist',
    'QA (Quality Assurance) specialist',
    'Game developer',
    'Computer Graphics animator',
    'Information security analyst',
    'Network and system administrator',
    'Product owner'

]

In [10]:
def check_availability(link):
  url = f'http://archive.org/wayback/available?url={link}'
  urls = rq.get(url).text
  parse_url = json.loads(urls)  
  return parse_url

In [11]:
check_availability('www.indeed.com/jobs?q=data+scientist')

{'url': 'www.indeed.com/jobs?q=data scientist',
 'archived_snapshots': {'closest': {'status': '200',
   'available': True,
   'url': 'http://web.archive.org/web/20201112020605/https://www.indeed.com/jobs?q=data%20scientist',
   'timestamp': '20201112020605'}}}

In [14]:
url = f'http://web.archive.org/cdx/search/cdx?url=www.indeed.com/jobs?q=data+scientist&amp;explvl=entry_level&from=20130101&to=20230215&output=json'
urls = rq.get(url).text
parse_url = json.loads(urls)

KeyboardInterrupt: ignored

In [None]:
def get_archive_link(link):
  url = f'http://web.archive.org/cdx/search/cdx?url={link}&from=20130101&to=20230215&output=json'
  urls = rq.get(url).text
  parse_url = json.loads(urls) #parses the JSON from urls.
  # print(parse_url)
  url_list = []
  for i in range(1,len(parse_url)):
    orig_url = parse_url[i][2]
    tstamp = parse_url[i][1]
    waylink = tstamp+'/'+orig_url
    url_list.append(waylink)
  # print(url_list)

  ## Compiles final url pattern.
  final_list = []
  for url in url_list:
    final_url = 'https://web.archive.org/web/'+url
    final_list.append(final_url)
    # print(final_list)

  return final_list

In [None]:
# extract company
def extract_company(div): 
    company = div.find_all(name="span", attrs={"class":"company"})
    if len(company) > 0:
      for b in company:
        return (b.text.strip())
    else:
      sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
      for span in sec_try:
          return (span.text.strip())
    return 'NOT_FOUND'


# extract job salary
def extract_salary(div): 
    salaries = []
    try:
      return (div.find('nobr').text)
    except:
      try:
        div_two = div.find(name='div', attrs={'class':'salary no-wrap'})
        div_three = div_two.find('div')
        salaries.append(div_three.text.strip())
        return salaries
      except:
        try:
          div_two = div.find(name='div', attrs={'class':'sjcl'})
          div_three = div_two.find('div')
          salaries.append(div_three.text.strip())
          return salaries
        except:
          return ('NOT_FOUND')
    return 'NOT_FOUND'


# extract job location
def extract_location(div):
  for span in div.findAll('span', attrs={'class': 'location'}):
    return (span.text)
  return 'NOT_FOUND'


# extract job title
def extract_job_title(div):
  for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
    return (a['title'])
  return('NOT_FOUND')


# extract jd summary
def extract_summary(div): 
  spans = div.findAll('span', attrs={'class': 'summary'})
  for span in spans:
    return (span.text.strip())
  return 'NOT_FOUND'
 

# extract link of job description 
def extract_link(div): 
  for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
    return (a['href'])
  return('NOT_FOUND')


# extract date of job when it was posted
def extract_date(div):
  try:
    spans = div.findAll('span', attrs={'class': 'date'})
    for span in spans:
      return (span.text.strip())
  except:
    return 'NOT_FOUND'
  return 'NOT_FOUND'


# extract full job description from link
def extract_fulltext(url):
  try:
    page = rq.get('http://www.indeed.com' + url)
    soup = bs(page.text, "lxml", from_encoding="utf-8")
    spans = soup.findAll('span', attrs={'class': 'summary'})
    for span in spans:
      return (span.text.strip())
  except:
    return 'NOT_FOUND'
  return 'NOT_FOUND'

In [None]:
# define dataframe columns
df = pd.DataFrame(columns = ['unique_id', 'job_qry','job_title', 
                             'company_name', 'location', 'summary', 
                             'salary', 'link', 'date', 'full_text'])

In [None]:
for role in roles:
  role_ = role.lower().replace(' ', '+')
  link = f'www.indeed.com/jobs?q={role_}&amp;explvl=entry_level'
  archive_url_list = get_archive_link(link)
  for url in archive_url_list:
    for i in range(3): #retry 3 times if connection error
      while True:
        try:
          pg = rq.get(url).text
          sleep(3) #ensuring 5 seconds sleep after every grab
        except ConnectionError:
          sleep(3)
          continue
        break

    soup = bs(pg,'html.parser')
    divs = soup.find_all(name="div", attrs={"class":"row"})

    cnt = 0
    for div in divs:
      #specifying row num for index of job posting in dataframe
      num = (len(df) + 1) 
      cnt = cnt + 1
     
      #job data after parsing
      job_post = [] 

      #append unique id
      job_post.append(div['id'])

      #append job qry
      job_post.append(role)

      #grabbing job title
      job_post.append(extract_job_title(div))

      #grabbing company
      job_post.append(extract_company(div))

      #grabbing location name
      job_post.append(extract_location(div))

      #grabbing summary text
      job_post.append(extract_summary(div))

      #grabbing salary
      job_post.append(extract_salary(div))

      #grabbing link
      link = extract_link(div)
      job_post.append(link)

      #grabbing date
      job_post.append(extract_date(div))

      #grabbing full_text
      job_post.append(extract_fulltext(link))

      #appending list of job post info to dataframe at index num
      df.loc[num] = job_post
  roles.remove(role)
  print(roles)

  sleep(5)

df.to_csv('job_data_indeed.csv', index=False)

In [3]:
xx = 'Data Analyst'
xx.lower() in ('|'.join(x for x in roles))

False

In [None]:
# filter by role extract all the jobs from the archive links directly
link = f'www.indeed.com/jobs?'
archive_url_list = get_archive_link(link)
for url in archive_url_list:
  for i in range(3): #retry 3 times if connection error
    while True:
      try:
        pg = rq.get(url).text
        sleep(3) #ensuring 5 seconds sleep after every grab
      except ConnectionError:
        sleep(3)
        continue
      break

  soup = bs(pg,'html.parser')
  divs = soup.find_all(name="div", attrs={"class":"row"})

  cnt = 0
  for div in divs:
    #specifying row num for index of job posting in dataframe
    num = (len(df) + 1) 
    cnt = cnt + 1
    
    #job data after parsing
    job_post = [] 

    #append unique id
    job_post.append(div['id'])

    #append job qry
    job_post.append(role)

    #grabbing job title
    job_post.append(extract_job_title(div))

    #grabbing company
    job_post.append(extract_company(div))

    #grabbing location name
    job_post.append(extract_location(div))

    #grabbing summary text
    job_post.append(extract_summary(div))

    #grabbing salary
    job_post.append(extract_salary(div))

    #grabbing link
    link = extract_link(div)
    job_post.append(link)

    #grabbing date
    job_post.append(extract_date(div))

    #grabbing full_text
    job_post.append(extract_fulltext(link))

    #appending list of job post info to dataframe at index num
    df.loc[num] = job_post



In [None]:
df.to_csv('job_data_indeed.csv', index=False)

In [None]:
## Do this if your data has a lot of non tech jobs 
## please update the list of roles with relevant roles in your data 
## you can also inverse it by creating a list of non-tech roles if the 
## amount of non-tech roles are smaller than tech roles

In [56]:
#df = pd.read_csv('./drive/MyDrive/Job-trend-analysis-/job_data_indeed.csv')

In [57]:
df

Unnamed: 0,unique_id,job_qry,job_title,company_name,location,summary,salary,link,date,full_text
0,pj_b9c0fe264fb6191d,Social media manager,Social Media Manager,360SWEATER,NOT_FOUND,NOT_FOUND,['360SWEATER'],/web/20190708121400/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
1,pj_1b5c54a6b7341e61,Social media manager,Social Media Community Manager,A Shoc Beverage,NOT_FOUND,NOT_FOUND,['A Shoc Beverage'],/web/20190708121400/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
2,pj_557f226535d37d66,Social media manager,Social Media Manager - (FT),Advanced Plastic Surgery Solutions,NOT_FOUND,NOT_FOUND,['Advanced Plastic Surgery Solutions'],/web/20190708121400/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
3,pj_88f7ec126a72b559,Social media manager,Social Engagement Manager,Margaritaville Resort and Spa and Margaritavil...,NOT_FOUND,NOT_FOUND,['Margaritaville Resort and Spa and Margaritav...,/web/20190708121400/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
4,pj_c9872f45c191dcea,Social media manager,Social Media Manager,AMResorts,NOT_FOUND,NOT_FOUND,['AMResorts\n\n\n26 reviews'],/web/20190708121400/https://www.indeed.com/pag...,NOT_FOUND,NOT_FOUND
...,...,...,...,...,...,...,...,...,...,...
1224,p_f574e968bf0ef913,Security specialist,Security/Protection Specialist,Yes Sir Security,"Los Angeles, CA","Security guard, security officer, security age...",NOT_FOUND,/web/20180201191539/https://www.indeed.com/com...,17 days ago,NOT_FOUND
1225,p_a0f8d025f0085b6a,Security specialist,Security Specialist 1,Los Alamos National Laboratory,"Los Alamos, NM",Personnel Security ensures that granting a wor...,NOT_FOUND,/web/20180201191539/https://www.indeed.com/rc/...,6 days ago,NOT_FOUND
1226,p_32e6d41f53d528ec,Security specialist,Personnel Security/Industrial Security Specialist,Advanced Integration Technology,"Plano, TX",Previous experience as a personnel security sp...,NOT_FOUND,/web/20180201191539/https://www.indeed.com/rc/...,16 days ago,NOT_FOUND
1227,pj_8b3ccda4b1691a7d,Security specialist,Security Specialist,"Security Industry Specialists, Inc.","Cupertino, CA",The Security Specialist reports to the Securit...,['$18 an hour'],/web/20180201191539/https://www.indeed.com/pag...,20 hours ago,NOT_FOUND


In [59]:
df = df.convert_dtypes(infer_objects=False)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1229 entries, 0 to 1228
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     1229 non-null   string
 1   job_qry       1229 non-null   string
 2   job_title     1229 non-null   string
 3   company_name  1229 non-null   string
 4   location      1229 non-null   string
 5   summary       1229 non-null   string
 6   salary        1229 non-null   string
 7   link          1229 non-null   string
 8   date          1229 non-null   string
 9   full_text     1229 non-null   string
dtypes: string(10)
memory usage: 96.1 KB


In [66]:
df.job_title = df.job_title.apply(lambda row: row.lower())

In [68]:
df2 = df[(df.job_title.str.contains('|'.join(x.lower() for x in roles)))]

  df2 = df[(df.job_title.str.contains('|'.join(x.lower() for x in roles)))]


In [None]:
## to inverse this with a list of non-tech roles do this instead
## where non_tech_roles is the created list of non tech jobs in your data that you wish to remove
# df2 = df[~(df.job_title.str.contains('|'.join(x.lower() for x in non_tech_roles)))]

In [None]:
df2.to_csv('job_data_indeed.csv', index=False)

In [None]:
# todo 
# check availability of all the websites and their links 
# get salary estimates for the years 