In [1]:
import re
import csv
import json
from time import sleep
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def extract_salary_info(job_title, job_city, timeout=10):
    """Fetch one salary.com listing page and return its salary figures.

    The page is expected to embed a JSON-LD ``<script>`` block mentioning
    ``Occupation``; every returned field is read from that block.

    Parameters
    ----------
    job_title : str
        Job slug inserted into the listing URL, e.g. ``'data-scientist'``.
    job_city : str
        Location segment appended to the listing URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    tuple or None
        ``(title, location, description, responsibilities, percentile10,
        percentile25, median, percentile75, percentile90)``, or ``None``
        when the request fails, the status is not 200, the JSON-LD block is
        absent, or the block lacks any of the expected fields.
    """
    template = 'https://www.salary.com/research/salary/listing/{}-salary/{}'
    url = template.format(job_title, job_city)

    try:
        # A timeout keeps one unresponsive page from stalling a whole scrape.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers connection errors as well as timeouts and other transport failures.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # `string=` is the current name of the filter formerly spelled `text=`.
    script = soup.find(
        'script',
        {'type': 'application/ld+json'},
        string=re.compile(r'Occupation'),
    )
    if script is None or not script.contents:
        # No structured-data block on this page: nothing to extract.
        return None

    try:
        json_data = json.loads(script.contents[0])
        salary = json_data['estimatedSalary'][0]
        return (
            json_data['name'],
            json_data['occupationLocation'][0]['name'],
            json_data['description'],
            json_data['responsibilities'],
            salary['percentile10'],
            salary['percentile25'],
            salary['median'],
            salary['percentile75'],
            salary['percentile90'],
        )
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON or a missing field: report "no data" like every
        # other failure path instead of aborting the caller's loop.
        return None

In [3]:
def main(job_title, output_path='salary-results.csv'):
    """Collect salary data for ``job_title`` across the listed US states.

    Reads state abbreviations from a remote CSV, calls
    ``extract_salary_info`` once per state, writes the collected rows to
    ``output_path`` and returns them.

    Parameters
    ----------
    job_title : str
        Job slug forwarded to ``extract_salary_info``.
    output_path : str, optional
        CSV file to write. The file is opened in ``'w'`` mode, so calling
        this function again with the same path replaces the earlier results.

    Returns
    -------
    list of tuple
        One 9-field tuple per state for which data was returned.
    """
    # get the list of us states
    states_frame = pd.read_csv('https://raw.githubusercontent.com/Umerfarooq122/Data_sets/main/tile_data.csv')
    state_codes = states_frame['Abbreviation'].tolist()

    # extract salary data for each state
    salary_data = []
    for state in state_codes:
        result = extract_salary_info(job_title, state)
        if result:
            salary_data.append(result)
        # Pause after every request, successful or not, so failed lookups
        # are throttled the same way as successful ones.
        sleep(0.5)

    # save data to csv file; the header must name all nine fields of each
    # row, including 'Responsibilities', or the later columns are mislabeled
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Location', 'Description', 'Responsibilities',
                         'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'])
        writer.writerows(salary_data)

    return salary_data

In [4]:
# Scrape the data-scientist listings, then tabulate the returned rows.
data_scientist = main('data-scientist')
df_sci = pd.DataFrame.from_records(data=data_scientist)

In [5]:
# Scrape the data-analyst listings, then tabulate the returned rows.
data_analyst = main('data-analyst')
df_anl = pd.DataFrame.from_records(data=data_analyst)

In [6]:
# Scrape the data-engineer listings, then tabulate the returned rows.
# Note: the variables are named `bi` / `df_bi`, but the role requested
# here is 'data-engineer'.
bi = main('data-engineer')
df_bi = pd.DataFrame.from_records(data=bi)

In [7]:
# Scrape the data-architect listings, then tabulate the returned rows.
arch = main('data-architect')
df_arc = pd.DataFrame.from_records(data=arch)

In [8]:
# Scrape the business-analyst listings, then tabulate the returned rows.
ba = main('business-analyst')
df_ba = pd.DataFrame.from_records(data=ba)

In [23]:
# Stack the five per-role frames into a single table.
frames = [df_sci, df_anl, df_bi, df_arc, df_ba]
# ignore_index gives the combined table one continuous 0..N-1 index instead
# of repeating each source frame's own row labels.
result = pd.concat(frames, ignore_index=True)
# Name all nine columns in one step. The former `result.rename(...)` call was
# dropped: its return value was discarded, so it never changed `result`.
result.columns = ['Title', 'State', 'Descr','Responsibilities','Percent10','Percent25','PercentMedian','Percent75','Percent90']


In [25]:
# Write the combined salary table to Excel first, then leave the preview as
# the cell's last expression so the notebook actually displays it.
result.to_excel('2-salary.xlsx')
result.head()

In [None]:
## Scrape top 10 skills mentioned in job descriptions
j_url = 'https://www.ziprecruiter.com/career/Data-Scientist/Resume-Keywords-and-Skills'
# A timeout stops the cell from hanging on an unresponsive server.
j_response = requests.get(j_url, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page.
j_response.raise_for_status()
j_soup = BeautifulSoup(j_response.text, 'html.parser')
# Preview only the start of the document; printing the whole page floods
# the cell output.
print(j_soup.prettify()[:2000])

In [None]:
# Locate the container that holds the employer-required skills.
script = j_soup.find('div', {'class':"skills-required-by-employers-table"})
if script is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `.contents`.
    raise ValueError("No <div class='skills-required-by-employers-table'> found on the page")
# Fourth child node of the container, resolved before anything is written
# to disk so a short child list still fails before the export.
skills_block = script.contents[3]
# NOTE(review): `result` here is the combined salary table built earlier, not
# data taken from `script`; this file name suggests the scraped skills were
# meant to be exported instead. Confirm the intended frame.
result.to_excel('4-Job Description Skills.xlsx')
# Last expression of the cell so the notebook displays it.
skills_block