# Task 1: Scrape Information per Profession

This is from the O*Net website, which is a database of occupational information. The information includes fields like average wages, required skills, etc.

In [None]:
from requests import get
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

### Scrape 'page codes' for occupations to create urls:

In [None]:
# Download the O*Net "find all" listing and collect the page codes used to
# build each occupation's URL.
url = 'https://www.onetonline.org/find/all'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
codes_page = html_soup.find('div', id='content')

# The codes are the text of the table cells carrying this class.
codes_divs = codes_page.find_all('td', class_='text-end w-15 mw-8e')

codes = [cell.text for cell in codes_divs if cell.text is not None]

# Quick sanity check of the first few codes.
print(codes[:10])

### Test example for scraping a single profession (dentist):

In [None]:
# Summary page of the single occupation used as the test example.
url = 'https://www.onetonline.org/link/summary/' + '29-1021.00'
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/88.0.4324.182 Safari/537.36"
    ),
}

In [None]:
# Fetch the page and keep its main content container; the cells below all
# search inside `content`.
response = get(url,headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
content = html_soup.find('div', id ='content')

In [None]:
## profession name:

# The name is the text of the <span class="main"> element.
name_div = content.find('span', class_ = 'main')
# NOTE: when the span is missing, `name` is not assigned here (it keeps any
# value left by an earlier cell, or stays undefined).
if name_div is not None:
  name = name_div.text

In [None]:
## wage:

# Walk down step by step: the wages section first, then the <dd> element whose
# text is the wage.
wage_section = content.find('div', id='WagesEmployment')
wage_dd = wage_section.find('dd', class_="col-sm-9 col-form-label pt-xso-0")
wage_dd.text

'$74.54 hourly, $155,040 annual'

In [None]:
## occupational specific tasks:

tasks_con = content.find('div', class_="reportsection", id='Tasks')
tasks_divs_all = tasks_con.find_all('div', class_="order-2 flex-grow-1")

# One whitespace-stripped string per task entry.
tasks_list = [task_div.text.strip() for task_div in tasks_divs_all]

print(tasks_list)

['Use masks, gloves, and safety glasses to protect patients and self from infectious diseases.', 'Examine teeth, gums, and related tissues, using dental instruments, x-rays, or other diagnostic equipment, to evaluate dental health, diagnose diseases or abnormalities, and plan appropriate treatments.', 'Administer anesthetics to limit the amount of pain experienced by patients during procedures.', 'Use dental air turbines, hand instruments, dental appliances, or surgical implements.', "Formulate plan of treatment for patient's teeth and mouth tissue.", 'Diagnose and treat diseases, injuries, or malformations of teeth, gums, or related oral structures and provide preventive or corrective services.', 'Write prescriptions for antibiotics or other medications.', 'Advise or instruct patients regarding preventive dental care, the causes and treatment of dental problems, or oral health care services.', 'Design, make, or fit prosthodontic appliances, such as space maintainers, bridges, or dentu

In [None]:
## occupational specific technology skills:

techskills_con = content.find('div', class_="reportsection", id='TechnologySkills')
techskills_divs_all = techskills_con.find_all('div', class_="order-2 flex-grow-1")

# Each entry is either "Category" or "Category — product; product; ...".
# The result is a list of (category, [products]) tuples.
techskills_list = []
for td in techskills_divs_all:
  # partition() splits on the FIRST em dash only, so an entry that contains
  # more than one em dash no longer fails with "too many values to unpack".
  title, dash, desc = td.text.partition('—')

  sw_list = []
  if dash:
    # Fragments containing "more" are dropped (presumably the "... and N more"
    # link text - TODO confirm against the page markup).
    sw_list = [sw.strip() for sw in desc.split(';') if 'more' not in sw]

  techskills_list.append((title.strip(), sw_list))

print(techskills_list)

[('Accounting software', []), ('Internet browser software', ['Web browser software']), ('Medical software', ['AlphaDent', 'eClinicalWorks EHR software', 'Henry Schein Dentrix', 'Windent SQL']), ('Office suite software', ['Microsoft Office software']), ('Spreadsheet software', ['Microsoft Excel']), ('Word processing software', [])]


In [None]:
## worker requirements skills

worker_skills_con = content.find('div', class_="reportsection", id='Skills')
worker_skills_divs_all = worker_skills_con.find_all('div', class_="order-2 flex-grow-1")

# Each entry reads "Title — description"; the result is a list of
# (title, description) tuples.
worker_skills_list = []
for sd in worker_skills_divs_all:
  # partition() never fails: an entry without an em dash yields an empty
  # description instead of crashing on None.strip(), and only the first dash
  # is treated as the separator.
  title, dash, desc = sd.text.partition('—')
  worker_skills_list.append((title.strip(), desc.strip()))

print(worker_skills_list)

[('Critical Thinking', 'Using logic and reasoning to identify the strengths and weaknesses of alternative solutions, conclusions, or approaches to problems.'), ('Judgment and Decision Making', 'Considering the relative costs and benefits of potential actions to choose the most appropriate one.'), ('Active Listening', 'Giving full attention to what other people are saying, taking time to understand the points being made, asking questions as appropriate, and not interrupting at inappropriate times.'), ('Complex Problem Solving', 'Identifying complex problems and reviewing related information to develop and evaluate options and implement solutions.'), ('Monitoring', 'Monitoring/Assessing performance of yourself, other individuals, or organizations to make improvements or take corrective action.'), ('Reading Comprehension', 'Understanding written sentences and paragraphs in work-related documents.'), ('Speaking', 'Talking to others to convey information effectively.'), ('Active Learning', 

In [None]:
## worker knowledge

knowledge_con = content.find('div', class_="reportsection", id='Knowledge')
# Search inside the Knowledge section itself. The previous version searched
# `worker_skills_con`, so this cell re-printed the Skills entries.
knowledge_divs_all = knowledge_con.find_all('div', class_="order-2 flex-grow-1")

# Each entry reads "Title — description"; the result is a list of
# (title, description) tuples.
knowledge_list = []
for kd in knowledge_divs_all:
  # partition() yields an empty description when there is no em dash, instead
  # of crashing on None.strip().
  title, dash, desc = kd.text.partition('—')
  knowledge_list.append((title.strip(), desc.strip()))

print(knowledge_list)

[('Critical Thinking', 'Using logic and reasoning to identify the strengths and weaknesses of alternative solutions, conclusions, or approaches to problems.'), ('Judgment and Decision Making', 'Considering the relative costs and benefits of potential actions to choose the most appropriate one.'), ('Active Listening', 'Giving full attention to what other people are saying, taking time to understand the points being made, asking questions as appropriate, and not interrupting at inappropriate times.'), ('Complex Problem Solving', 'Identifying complex problems and reviewing related information to develop and evaluate options and implement solutions.'), ('Monitoring', 'Monitoring/Assessing performance of yourself, other individuals, or organizations to make improvements or take corrective action.'), ('Reading Comprehension', 'Understanding written sentences and paragraphs in work-related documents.'), ('Speaking', 'Talking to others to convey information effectively.'), ('Active Learning', 

In [None]:
## education level

education_con = content.find('div', class_="reportsection", id='Education')
education_divs_all = education_con.find_all('li', class_="d-flex flex-nowrap mb-1")

education_list = []
for entry in education_divs_all:
  spans = entry.find_all('span')

  # The education level is the span right after the first 'responded:' label
  # that still has a span following it; None when there is no such label.
  education_level = next(
      (spans[pos + 1].text.replace('more info', '').strip()
       for pos, label in enumerate(spans)
       if pos + 1 < len(spans) and label.text.strip() == 'responded:'),
      None,
  )

  # The first span of the entry is expected to hold the percentage.
  percentage = entry.find('span').text.strip()

  # Keep only entries that have both a level and a '%' value.
  if education_level is None or '%' not in percentage:
    continue
  education_list.append((education_level.strip('\n'), percentage))

education_list

[('Doctoral degree required', '81%'),
 ('Some college, no degree required', '7%'),
 ('Associate’s degree required', '6%')]

In [None]:
## work styles

style_con = content.find('div', class_="reportsection", id='WorkStyles')
style_divs_all = style_con.find_all('div', class_="order-2 flex-grow-1")

# Each entry reads "Title — description"; the result is a list of
# (title, description) tuples.
style_list = []
for sd in style_divs_all:
  # partition() yields an empty description when there is no em dash, instead
  # of crashing on None.strip(), and splits on the first dash only.
  title, dash, desc = sd.text.partition('—')
  style_list.append((title.strip(), desc.strip()))

print(style_list)

[('Attention to Detail', 'Job requires being careful about detail and thorough in completing work tasks.'), ('Integrity', 'Job requires being honest and ethical.'), ('Dependability', 'Job requires being reliable, responsible, and dependable, and fulfilling obligations.'), ('Concern for Others', "Job requires being sensitive to others' needs and feelings and being understanding and helpful on the job."), ('Self-Control', 'Job requires maintaining composure, keeping emotions in check, controlling anger, and avoiding aggressive behavior, even in very difficult situations.'), ('Stress Tolerance', 'Job requires accepting criticism and dealing calmly and effectively with high-stress situations.'), ('Leadership', 'Job requires a willingness to lead, take charge, and offer opinions and direction.'), ('Cooperation', 'Job requires being pleasant with others on the job and displaying a good-natured, cooperative attitude.'), ('Persistence', 'Job requires persistence in the face of obstacles.'), ('

### Scrape a sample of all professions on the website:

In [None]:
from tqdm import tqdm
import json

# Number of occupation codes collected from the listing page.
len(codes)

1016

In [None]:
def _section_texts(content, section_id):
  """Return the raw text of every entry in the report section `section_id`.

  Returns an empty list when the page has no such section.
  """
  section = content.find('div', class_="reportsection", id=section_id)
  if section is None:
    return []
  return [div.text for div in section.find_all('div', class_="order-2 flex-grow-1")]


def _split_title_desc(text):
  """Split 'Title — description' on the first em dash.

  The description is '' when the entry has no em dash.
  """
  title, _, desc = text.partition('—')
  return title.strip(), desc.strip()


def _parse_tech_skill(text):
  """Split 'Category — product; product' into (category, [products])."""
  title, dash, desc = text.partition('—')
  # Fragments containing "more" are dropped (presumably the "... and N more"
  # link text - TODO confirm against the page markup).
  sw_list = [sw.strip() for sw in desc.split(';') if 'more' not in sw] if dash else []
  return title.strip(), sw_list


def _parse_education(content):
  """Return [(education level, percentage)] from the Education section, [] if absent."""
  education_con = content.find('div', class_="reportsection", id='Education')
  if education_con is None:
    return []

  education_list = []
  for ed in education_con.find_all('li', class_="d-flex flex-nowrap mb-1"):
    spans = ed.find_all('span')
    if not spans:
      continue

    # The level is the span that follows the 'responded:' label.
    education_level = None
    for index, span in enumerate(spans[:-1]):
      if span.text.strip() == 'responded:':
        education_level = spans[index + 1].text.replace('more info', '').strip()
        break

    # The first span of the entry is expected to hold the percentage.
    precentage = spans[0].text.strip()
    if '%' in precentage and education_level is not None:
      education_list.append((education_level.strip('\n'), precentage))
  return education_list


# One dict per profession, built from the first 300 occupation codes.
prof_dicts = []

for code in tqdm(codes[0:300]):

  url = 'https://www.onetonline.org/link/summary/' + code
  response = get(url, headers=headers)
  html_soup = BeautifulSoup(response.text, 'html.parser')
  content = html_soup.find('div', id='content')
  if content is None:
    # Page without the expected layout: skip it, keep scraping the rest.
    continue

  ## profession name:
  name_div = content.find('span', class_='main')
  if name_div is None:
    # Skip only this profession. The previous `break` silently dropped every
    # remaining code as soon as one page had no name.
    continue
  name = name_div.text

  ## wage:
  # Every field starts from its own default on each iteration. The previous
  # `value if not None else default` was always `value`, so a missing section
  # reused the previous profession's data (or raised NameError on the first one).
  wage = ''
  wage_div = content.find('div', id='WagesEmployment')
  if wage_div is not None:
    wage_dd = wage_div.find('dd', class_="col-sm-9 col-form-label pt-xso-0")
    if wage_dd is not None:
      wage = wage_dd.text

  ## occupational specific tasks:
  tasks_list = [text.strip() for text in _section_texts(content, 'Tasks')]

  ## occupational specific technology skills:
  techskills_list = [_parse_tech_skill(text) for text in _section_texts(content, 'TechnologySkills')]

  ## worker requirements skills
  worker_skills_list = [_split_title_desc(text) for text in _section_texts(content, 'Skills')]

  ## worker knowledge
  # Read from the Knowledge section itself (the previous version iterated the
  # Skills container, duplicating the skills).
  knowledge_list = [_split_title_desc(text) for text in _section_texts(content, 'Knowledge')]

  ## education level
  education_list = _parse_education(content)

  ## work styles
  style_list = [_split_title_desc(text) for text in _section_texts(content, 'WorkStyles')]

  ## save all data:
  profession_dict = {
      "name": name,
      "wage": wage,
      "tasks": tasks_list,
      "skills": worker_skills_list,
      "tech_skills": techskills_list,
      "knowledge": knowledge_list,
      "education": education_list,
      "work_style": style_list
  }
  prof_dicts.append(profession_dict)

100%|██████████| 300/300 [04:17<00:00,  1.16it/s]


In [None]:
len(prof_dicts)

300

In [None]:
# save the collected data:

import os

file_path = 'data/professions_data_300.json'
# Create the target folder first so open() does not fail with
# FileNotFoundError when 'data/' does not exist yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Explicit encoding keeps the file identical across platforms.
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(prof_dicts, json_file, indent=4)

# Task 2: Scrape Examples of Interview Questions

This data is from 'MockQuestions.com' which is a website that provides examples of interview questions, specifically by company.

### Example for one page - Amazon:

In [None]:
# Fetch the Amazon company page and keep the container that the next cell
# searches for question entries.
url = 'https://www.mockquestions.com/company/Amazon/'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}
response = get(url,headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

page_div = html_soup.find('div', class_="float w100 answers")

In [None]:
company_name = "Amazon"
questions_list = []

# CSS class of the <p> element that carries a question's category label.
qtype_class = "float w100 title-font tfont600 color-B2B2B1 text16 lh1-9"

# The first question uses its own list-item class ("mt18"); the remaining
# ones use "mt50" and are handled in the loop below.
q1_con = page_div.find('li', class_="float w100 list-none mt18 list-answers")
q1type = q1_con.find('div').find('p', class_=qtype_class).text
q1 = q1_con.find('h2').text.strip()
questions_list.append((q1, q1type))

# Collect (question, category) for every remaining entry.
question_con = page_div.find_all('li', class_="float w100 list-none mt50 list-answers")
for item in question_con:
  category = item.find('p', class_=qtype_class).text
  question = item.find('h2').text.strip()
  questions_list.append((question, category))

questions_list

[('1. Tell me about your greatest work-related accomplishment.',
  'Accomplishment'),
 ('2. What questions do you have for me?', 'Ask The Interviewer'),
 ('3. What makes you a good fit for Amazon?', 'Behavioral'),
 ('4. Tell me about someone you admire and why you admire them.',
  'Behavioral'),
 ("5. When starting at a new position, how do you earn your coworkers' trust?",
  'Behavioral'),
 ('6. Do you consider yourself a calculated risk-taker?', 'Behavioral'),
 ('7. When have you had to change a major component of your project due to new information?',
  'Behavioral'),
 ('8. Culture fit is important to us at Amazon. How would you describe your personality?',
  'Behavioral'),
 ('9. Do you think it is important to have fun at work?', 'Behavioral'),
 ('10. Can you discuss a time you had to be frugal in the workplace?',
  'Behavioral'),
 ('11. We seek to hire highly ambitious people. Where would you like your career with Amazon to take you?',
  'Career Goals'),
 ('12. Why do you want to 

## Find all companies on the MockQuestions website:

In [None]:
# Fetch the "Newest" companies listing and keep the container that the next
# cell searches.
url = 'https://www.mockquestions.com/companies/Newest/'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

response = get(url,headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')

page_div = html_soup.find('div', class_="float w30")

In [None]:
# Sample markup of one company entry, kept for reference:
# <div class="float w100 mt4 pp-mt8 pl-mt8 h52 tll-h40 pp-h34 pp-h28"><h4 class="title-font tfont900 dark lh1-5 lt-lh1-3 pp-lh1-3 pl-lh1-3 text22 lt-text18 tll-text16 tlp-text16 pp-text12 pl-text12 lh1-7 tll-lh1-3 tlp-lh1-3 pp-lh1-3 pl-lh1-3">UBS</h4></div>
# NOTE(review): company_list is created here but never filled in this cell;
# everything below is appended to questions_list instead.
company_list = []

# NOTE(review): the sample markup above shows this class on a <div>, while the
# lookup below searches for an <li> - confirm the selector.
comps = page_div.find('li', class_="float w100 mt4 pp-mt8 pl-mt8 h52 tll-h40 pp-h34 pp-h28")
# NOTE(review): this <p> class is the question-category selector used in the
# Amazon questions cell - confirm it is meant for the companies page.
q1type = comps.find('div').find('p', class_="float w100 title-font tfont600 color-B2B2B1 text16 lh1-9").text
q1 = comps.find('h4').text.strip()
questions_list.append((q1, q1type))

# Same question-entry loop as in the Amazon questions cell, run here against
# the companies-page container.
question_con = page_div.find_all('li', class_="float w100 list-none mt50 list-answers")
for q_con in question_con:
  qtype = q_con.find('p', class_="float w100 title-font tfont600 color-B2B2B1 text16 lh1-9").text
  q = q_con.find('h2').text.strip()
  questions_list.append((q, qtype))

questions_list

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the "Newest" companies page
url = "https://www.mockquestions.com/companies/Newest/"
response = requests.get(url)

if response.status_code != 200:
    # Anything other than HTTP 200 is reported as a failure
    print("Failed to retrieve the webpage.")
else:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Print the text of every <h4> element on the page
    for heading in soup.find_all('h4'):
        print(heading.text)

Bristol-Myers Squibb Company
Airbnb, Inc.
Kaiser Permanente
Jacobs
UBS
Goldman Sachs
Amazon
NSA (National Security Agency)
Facebook
BNSF Railway Company
Huron Consulting Group
IQVIA


In [None]:
import requests
from bs4 import BeautifulSoup

categories = []

# Fetch the "Newest" companies page
url = "https://www.mockquestions.com/companies/Newest/"
response = requests.get(url)

if response.status_code != 200:
    print("Failed to retrieve the webpage. Status code:", response.status_code)
else:
    soup = BeautifulSoup(response.content, "html.parser")

    # The category names are the link texts inside this <nav> element
    navigation_list = soup.find("nav", class_="float w100 mt44 pp-mt32 mb24")
    if not navigation_list:
        print("Navigation list not found on the page.")
    else:
        categories.extend(link.text.strip() for link in navigation_list.find_all("a"))

# Drop the first collected entry
categories = categories[1:]

In [None]:
# Display the collected category names.
categories

['Newest',
 'Featured',
 'Popular',
 'Business',
 'Consulting',
 'Energy',
 'Engineering',
 'Finance',
 'Food',
 'Agriculture',
 'Beverages',
 'Breweries',
 'Candy',
 'Casual Dining',
 'Coffee Shops',
 'Dairy Farmers',
 'Delivery Services',
 'Food Service',
 'Liquors',
 'Manufacturer',
 'Meats',
 'Producer',
 'Restaurant Management',
 'Restaurants (Asian)',
 'Restaurants (Bakery & Restaurant)',
 'Restaurants (Bar & Grill)',
 'Restaurants (Breakfast)',
 'Restaurants (Burgers)',
 'Restaurants (Chicken)',
 'Restaurants (Fast Food)',
 'Restaurants (Italian)',
 'Restaurants (Mexican)',
 'Government',
 'Healthcare',
 'Law',
 'Logistics',
 'Media',
 'Products',
 'Real Estate',
 'Retail',
 'Services',
 'Tech',
 'Travel',
 'Careers',
 'Schools',
 'Topics',
 'Industries',
 'Aptitude Tests',
 'Articles']

In [None]:
import re
def remove_symbols(input_string):
    """Return `input_string` without any character other than a-z, A-Z, 0-9 or whitespace."""
    # The character class matches anything that is not a letter, a number, or
    # whitespace; every match is replaced with an empty string.
    return re.sub(r'[^a-zA-Z0-9\s]', '', input_string)

remove_symbols('ajsv 45 sdhc ##$vf')

'ajsv 45 sdhc vf'

In [None]:
import requests
from bs4 import BeautifulSoup

# Names found in the <h4> elements of every sub-category page.
comps = set()

# Built once: the top-level category names do not change while crawling.
known_categories = set(categories)

for cat in categories:
  # Fetch the category page
  url = f"https://www.mockquestions.com/companies/{cat}/"
  response = requests.get(url)

  if response.status_code != 200:
    print("Failed to fetch the webpage")
    continue

  soup = BeautifulSoup(response.content, "html.parser")

  # A page without the navigation block is reported and skipped instead of
  # aborting the whole crawl with an AttributeError on None.
  nav = soup.find("nav", class_="float")
  if nav is None:
    print(f"Navigation list not found on {url}")
    continue

  for nav_element in nav.find_all("a"):
    # Only follow links that are not top-level categories, i.e. the
    # sub-categories of the current page.
    if nav_element.text.strip() in known_categories:
      continue

    nav_url = nav_element.get("href")
    if not nav_url:
      # Anchor without a target: nothing to fetch.
      continue
    print(nav_url)

    nav_response = requests.get(nav_url)
    if nav_response.status_code == 200:
      nav_soup = BeautifulSoup(nav_response.content, "html.parser")

      # Collect the text of every <h4> element on the sub-category page
      for h4 in nav_soup.find_all("h4"):
        comps.add(h4.text)
        print(h4.text)

In [None]:
import pandas as pd
# Write the cleaned company names to a one-column CSV.
# NOTE: comps2 is created in the cell that follows this one in the notebook,
# so that cell has to be run first.
df = pd.DataFrame(comps2, columns=['companies'])
df.to_csv('companies_floop.csv')

In [None]:
print(len(comps))
# Keep each name without its first character and its last three characters
# (presumably whitespace around the scraped <h4> text - TODO confirm).
comps2 = [comp[1:-3] for comp in comps]
comps2

In [None]:
# Print every raw name as a quoted, comma-terminated line.
for comp in comps:
  print(f'"{comp}",')

### Get Questions For All 33 Companies in website & data

In [None]:
# The 33 companies whose interview questions are scraped below.
chosen_comps = [
    'American Water',
    'K&L Gates',
    'Guidehouse',
    'Scholastic',
    'Harbor Freight Tools',
    'U.S. Bank',
    'UC Health',
    'Kohler Co.',
    'Care Ambulance Service Inc.',
    'BECU',
    'American Electric Power',
    'Merck',
    'VyStar Credit Union',
    'Nucor Corporation',
    'Kaiser Permanente',
    'AAA',
    'Northwestern Mutual',
    'Duke Energy Corporation',
    'Ipsos North America',
    'City National Bank',
    'Rockwell Collins',
    'Legacy Health System',
    'Western Digital',
    'State Farm',
    'Honeywell',
    'Northbay Healthcare Group',
    'Officeworks',
    'Amazon',
    'JCPenney',
    'Whirlpool Corporation',
    'Magellan Health',
    'Ripple',
    'iHeartMedia',
]

In [None]:
url = 'https://www.mockquestions.com/company/'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

# Rows of (company, category, question); `columns` names them for the DataFrame.
data = []
columns = ['company', 'category', 'question']
# Counts the companies whose FIRST question carries a category label.
type_counter = 0

# CSS class of the <p> element that carries a question's category label.
qtype_class = "float w100 title-font tfont600 color-B2B2B1 text16 lh1-9"

for comp in chosen_comps:

    # Company pages use '+' in place of spaces in the company name.
    cur_url = url + comp.replace(" ", "+") + '/'
    response = get(cur_url, headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    page_div = html_soup.find('div', class_="float w100 answers")
    if page_div is None:
        # A page without the question container is reported and skipped
        # instead of aborting the run with an AttributeError on None.
        print(f'No question list found for {comp!r} ({cur_url}); skipping.')
        continue

    # The first question uses its own list-item class ("mt18").
    q1_con = page_div.find('li', class_="float w100 list-none mt18 list-answers")
    q1_head = q1_con.find('h2') if q1_con is not None else None
    if q1_head is not None:
        q1_div = q1_con.find('div')
        q1type = q1_div.find('p', class_=qtype_class) if q1_div is not None else None
        if q1type is not None:
            q1type = q1type.text
            type_counter += 1
        data.append((comp, q1type, q1_head.text.strip()))

    # The remaining questions use the "mt50" list-item class.
    question_con = page_div.find_all('li', class_="float w100 list-none mt50 list-answers")
    for q_con in question_con:
        q_head = q_con.find('h2')
        if q_head is None:
            # Entry without a question heading: nothing to record.
            continue
        qtype = q_con.find('p', class_=qtype_class)
        qtype = qtype.text if qtype is not None else None
        data.append((comp, qtype, q_head.text.strip()))

In [None]:
# Number of companies whose first question had a category label.
type_counter

7

In [None]:
# One row per scraped question: company, category, question.
q_df = pd.DataFrame(data, columns=columns)

In [None]:
# Display the questions table.
q_df

Unnamed: 0,company,category,question
0,American Water,,1. The utilities industry is highly regulated....
1,American Water,,2. Our industry is highly competitive. In your...
2,American Water,,3. Have you personally used the services of Am...
3,American Water,,4. American Water has many different departmen...
4,American Water,,5. Why do you want to work for American Water?
...,...,...,...
1027,iHeartMedia,,"26. In a streaming-centric time, how can iHear..."
1028,iHeartMedia,,27. If you only had social media platforms for...
1029,iHeartMedia,,28. iHeartMedia produces many events every yea...
1030,iHeartMedia,,29. iHeartMedia is behind some of the world's ...


## Questions Per Industry

In [None]:
# Maps each meta-industry name to a list of company names; None marks an
# industry with no company assigned.
leading_company_meta_industry = {
    "Miscellaneous": ["Nestle", "McDonalds"],
    "Services": ["U.S. Bank", "Holiday Inn Club Vacations"],
    "Transportation and Logistics": ["Rockwell Collins"],
    "Retail and Consumer Goods": ["Honeywell"],
    "Healthcare and Medical": ["Kaiser Permanente"],
    "Government and Public Policy": ["NASA"],
    "Education and Training": ["University Administrator"],
    "Technology": [
        "Amazon",
        "Apple",
        "Facebook",
    ],
    "Real Estate and Construction": None,
    "Media and Entertainment": ["Netflix"],
    "Manufacturing": ["Northwestern Mutual"],
    "Financial and Investment": None,
}

# Task 3: LinkedIn position search

This data is from LinkedIn, retrieved using the Bright Data API. It lets us connect a company with the positions it offers, as a proof of concept for our product (which will be located on the position page and have access to this information).

This data is not used to train the model, but to show the potential of the product.

In [None]:
# LinkedIn job-search URL per chosen company. Five of the 33 chosen companies
# are commented out below and therefore have no entry in this dict.
# The URLs differ only in their currentJobId and f_C query parameters.

urls_dict = {
 'American Water': 'https://www.linkedin.com/jobs/search/?currentJobId=3865337616&f_C=9226&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'K&L Gates': 'https://www.linkedin.com/jobs/search/?currentJobId=3800445166&f_C=164569&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Guidehouse': 'https://www.linkedin.com/jobs/search/?currentJobId=3856001387&f_C=28974109&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Scholastic': 'https://www.linkedin.com/jobs/search/?currentJobId=3874214594&f_C=4505&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Harbor Freight Tools': 'https://www.linkedin.com/jobs/search/?currentJobId=3830098864&f_C=28923&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'U.S. Bank': 'https://www.linkedin.com/jobs/search/?currentJobId=3856723627&f_C=2532&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'UC Health': 'https://www.linkedin.com/jobs/search/?currentJobId=3872522579&f_C=1252954&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Kohler Co.': 'https://www.linkedin.com/jobs/search/?currentJobId=3846973039&f_C=5670&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
#  'Care Ambulance Service Inc.',
 'BECU': 'https://www.linkedin.com/jobs/search/?currentJobId=3874093526&f_C=13985&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'American Electric Power': 'https://www.linkedin.com/jobs/search/?currentJobId=3853857226&f_C=162419&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Merck': 'https://www.linkedin.com/jobs/search/?currentJobId=3871368541&f_C=1486&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'VyStar Credit Union': 'https://www.linkedin.com/jobs/search/?currentJobId=3875468457&f_C=31580&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Nucor Corporation': 'https://www.linkedin.com/jobs/search/?currentJobId=3875768012&f_C=165257&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Kaiser Permanente': 'https://www.linkedin.com/jobs/search/?currentJobId=3860228467&f_C=1550&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'AAA': 'https://www.linkedin.com/jobs/search/?currentJobId=3626729635&f_C=2987&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R', # only 2 results
 'Northwestern Mutual': 'https://www.linkedin.com/jobs/search/?currentJobId=3741594331&f_C=2445&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Duke Energy Corporation': 'https://www.linkedin.com/jobs/search/?currentJobId=3874097583&f_C=4066&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Ipsos North America': 'https://www.linkedin.com/jobs/search/?currentJobId=3831510379&f_C=2562378&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'City National Bank': 'https://www.linkedin.com/jobs/search/?currentJobId=3871346956&f_C=163069&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
#  'Rockwell Collins',
#  'Legacy Health System',
 'Western Digital': 'https://www.linkedin.com/jobs/search/?currentJobId=3863210866&f_C=4593&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'State Farm': 'https://www.linkedin.com/jobs/search/?currentJobId=3870750858&f_C=2381&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Honeywell': 'https://www.linkedin.com/jobs/search/?currentJobId=3780202641&f_C=1344&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
#  'Northbay Healthcare Group',
#  'Officeworks',
 'Amazon': 'https://www.linkedin.com/jobs/search/?currentJobId=3860870660&f_C=1586&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'JCPenney': 'https://www.linkedin.com/jobs/search/?currentJobId=3621010710&f_C=3688&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Whirlpool Corporation': 'https://www.linkedin.com/jobs/search/?currentJobId=3768491914&f_C=2551&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Magellan Health': 'https://www.linkedin.com/jobs/search/?currentJobId=3867309964&f_C=5438&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'Ripple': 'https://www.linkedin.com/jobs/search/?currentJobId=3872221890&f_C=3249443&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
 'iHeartMedia': 'https://www.linkedin.com/jobs/search/?currentJobId=3832625720&f_C=2539&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R'
}

In [None]:
def get_url(jobID, f_C):
  """Build the LinkedIn job-search URL for job `jobID` and company id `f_C`.

  Both arguments are strings; the other query parameters are fixed.
  """
  pieces = [
      'https://www.linkedin.com/jobs/search/?currentJobId=',
      jobID,
      '&f_C=',
      f_C,
      '&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R',
  ]
  return ''.join(pieces)

In [None]:
# Example call of the two-argument get_url(jobID, f_C) defined above.
# NOTE: a later cell redefines get_url with a different signature (og_url, job_id).
get_url('3871252360', '1586')

'https://www.linkedin.com/jobs/search/?currentJobId=3871252360&f_C=1586&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R'

In [None]:
# Get list of job ids
# NOTE(review): `html` is not assigned anywhere in the visible part of the
# notebook - it has to hold the page source before this cell is run.
html_soup = BeautifulSoup(html, 'html.parser')
nav_list = html_soup.find('ul', class_="scaffold-layout__list-container")

# Class string of one job card in the results list.
job_card_class = 'ember-view jobs-search-results__job-card-search--generic-occludable-area jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'
job_elements = nav_list.find_all('li', class_=job_card_class)

# The job id is stored in each card's data-occludable-job-id attribute.
job_ids = [card.get('data-occludable-job-id') for card in job_elements]

In [None]:
# get job title
# The title is the text of the <span> carrying the job-title-link class.
job_title = html_soup.find('span', class_="job-details-jobs-unified-top-card__job-title-link").text

'Paid Intern'

In [None]:
# get job details
def get_txt(detail):
  """Return the ' · '-separated parts of `detail.text` as a list of strings.

  The text is stripped first, and the whitespace run '\\n\\n \\n' is treated
  as one more separator.
  """
  normalized = detail.text.strip().replace('\n\n \n', ' · ')
  return normalized.split(' · ')

# NOTE(review): `det` is not used again in this cell.
det = html_soup.find('div', class_='mt3 mb2')
details = html_soup.find_all('li', class_='job-details-jobs-unified-top-card__job-insight')

# First insight item: kept as the full list of its parts.
suitcase = get_txt(details[0])
# Second insight item: must split into exactly two parts, otherwise the
# unpacking raises ValueError.
size, type_ = get_txt(details[1])

In [None]:
# get about the job
# Container element holding the job description text.
about_con = html_soup.find('div', class_='jobs-box__html-content jobs-description-content__text t-14 t-normal jobs-description-content__text--stretch')

In [None]:
# Extract all the text about the job, one text node per line, then undo the
# line breaks that get_text() put around punctuation. The replacements are
# applied in this exact order.
all_text = about_con.get_text(separator="\n")
for old, new in (
    (",\n", ","),
    (";\n", ";"),
    ('\n \n', ' '),
    (', \n', ', '),
    ('\n,', ','),
    ('\n\n', '\n'),
):
  all_text = all_text.replace(old, new)

### Scrape with Bright Data

In [None]:
def get_url(og_url, job_id):
  """Return ``og_url`` with its ``currentJobId`` query value set to ``job_id``.

  Args:
    og_url: a search URL, normally containing a ``currentJobId=...`` parameter.
    job_id: the job id to put into the URL (string or integer).

  Returns:
    The new URL. Everything before ``currentJobId=`` and everything from the
    following '&' onwards is kept. If ``og_url`` has no ``currentJobId``
    parameter, one is appended.
  """
  marker = "currentJobId="
  job_id = str(job_id)

  prefix, found, rest = og_url.partition(marker)
  if not found:
    # No existing parameter to replace: append it with the right separator.
    if og_url.endswith(('?', '&')):
      separator = ''
    else:
      separator = '&' if '?' in og_url else '?'
    return og_url + separator + marker + job_id

  # Keep the remaining parameters. When the old id was the last parameter
  # there is no '&' (find returns -1) and nothing must be appended.
  amp_index = rest.find("&")
  tail = rest[amp_index:] if amp_index != -1 else ''

  return prefix + marker + job_id + tail

def get_txt(detail):
  """Return the pieces of ``detail.text`` that are separated by ' · '.

  Surrounding whitespace is stripped first, and each
  newline-newline-space-newline run counts as a separator as well.
  """
  separator = ' · '
  normalized = detail.text.strip().replace('\n\n \n', separator)
  return normalized.split(separator)

import os
import warnings

# Browser-like User-Agent sent with the first search-page request.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

# The Bright Data proxy URL embeds the account user name and password, so it
# is read from the environment instead of being stored in the notebook.
# Expected form: http://<username>:<password>@brd.superproxy.io:22225
_proxy_url = os.environ.get('BRIGHTDATA_PROXY_URL')
if _proxy_url:
  # The same proxy endpoint serves both plain and TLS traffic.
  proxies = {
     'http':  _proxy_url,
     'https': _proxy_url,
  }
else:
  warnings.warn('BRIGHTDATA_PROXY_URL is not set; requests will be sent without the Bright Data proxy.')
  proxies = None

# Scraped rows and their column names, consumed by pd.DataFrame later on.
data = []
columns = ['Company', 'Job title', 'Suitcase', 'Size', 'Sub Company', 'About the job']

# Company name -> LinkedIn search URL to start from.
urls_dict = {'Amazon': 'https://www.linkedin.com/jobs/search/?currentJobId=3860870660&f_C=1586&f_CR=103644278&geoId=92000000&origin=JOB_SEARCH_PAGE_JOB_FILTER&sortBy=R'}

In [None]:
# Fetch the search page of each company through the proxy and parse it.
# NOTE(review): the unconditional `break` below leaves the loop right after the
# first page has been parsed, so everything after it in the loop body never
# runs and `data` is not filled by this cell. Only `res`, `res_body` and
# `html_soup` of the first company are set.
for comp_name, url in urls_dict.items():

  # get first page
  res = get(url, proxies=proxies, headers=headers, verify=True)
  res.raise_for_status()
  res_body = res.text
  html_soup = BeautifulSoup(res_body, "html.parser")
  break

  # Get list of job ids
  # (unreachable because of the break above)
  nav_list = html_soup.find('ul', class_="scaffold-layout__list-container")
  job_elements = nav_list.find_all('li', class_='ember-view jobs-search-results__job-card-search--generic-occludable-area jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item')
  job_ids = [el.get('data-occludable-job-id') for el in job_elements]

  # Only the first three job ids are visited.
  for job_id in job_ids[:3]:

    # get html for current job id
    # NOTE(review): unlike the first request, this one sends no headers and
    # disables certificate verification.
    cur_url = get_url(url, job_id)
    cur_res = get(cur_url, proxies=proxies, verify=False)
    cur_res.raise_for_status()
    cur_res_body = cur_res.text
    cur_html_soup = BeautifulSoup(cur_res_body, "html.parser")

    # get job title
    job_title = cur_html_soup.find('span', class_="job-details-jobs-unified-top-card__job-title-link").text

    # get job details
    # The second insight item must split into exactly two pieces, otherwise
    # the tuple unpacking raises.
    details = cur_html_soup.find_all('li', class_='job-details-jobs-unified-top-card__job-insight')
    suitcase = get_txt(details[0])
    size, sub_comp = get_txt(details[1])

    # get about the job
    about_con = cur_html_soup.find('div', class_='jobs-box__html-content jobs-description-content__text t-14 t-normal jobs-description-content__text--stretch')

    # Extract all the text about the job
    # (one line per text node, followed by a fixed sequence of whitespace clean-ups)
    about = about_con.get_text(separator="\n").replace(",\n", ",").replace(";\n", ";").replace('\n \n', ' ').replace(', \n', ', ').replace('\n,', ',').replace('\n\n', '\n')

    # One row per job, in the order of `columns`.
    data.append((comp_name, job_title, suitcase, size, sub_comp, about))

In [None]:
# Turn the collected rows into a table; the bare `df` makes it the cell's value.
df = pd.DataFrame(data, columns=columns)
df

In [None]:
# Look for the results list under the class name `jobs-search__results-list`.
nav_list = html_soup.find('ul', class_='jobs-search__results-list')

In [None]:
# Job cards are the <li> items carrying this exact class string.
card_class = 'ember-view jobs-search-results__job-card-search--generic-occludable-area jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'
job_elements = nav_list.find_all('li', class_=card_class)

# Collect each card's data-occludable-job-id attribute (None when absent).
job_ids = []
for el in job_elements:
  job_ids.append(el.get('data-occludable-job-id'))

In [None]:
# Show the matched job cards as the cell's value.
job_elements

[]

In [None]:
# Get list of job ids
nav_list = html_soup.find('ul', class_="scaffold-layout__list-container")
job_elements = nav_list.find_all('li', class_='ember-view jobs-search-results__job-card-search--generic-occludable-area jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item')
job_ids = [el.get('data-occludable-job-id') for el in job_elements]

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Save the HTML content to a file
with open('output.html', 'w') as f:
    f.write(html_soup.prettify())

### Jobs with info

In [None]:
import os
import warnings

# The Bright Data proxy URL embeds the account user name and password, so it
# is read from the environment instead of being stored in the notebook.
# Expected form: http://<username>:<password>@brd.superproxy.io:22225
_proxy_url = os.environ.get('BRIGHTDATA_PROXY_URL')
if _proxy_url:
  # The same proxy endpoint serves both plain and TLS traffic.
  proxies = {
     'http':  _proxy_url,
     'https': _proxy_url,
  }
else:
  warnings.warn('BRIGHTDATA_PROXY_URL is not set; requests will be sent without the Bright Data proxy.')
  proxies = None

# Scraped rows and their column names, consumed by pd.DataFrame later on.
data = []
columns = ['Company', 'Job title', 'Seniority level', 'Employment type', 'Job function', 'Industries']
# Company name -> LinkedIn search URL to start from.
urls_dict = {'State Farm': 'https://www.linkedin.com/jobs/search?location=United%2BStates&geoId=103644278&f_C=2381&pageNum=0&currentJobId=3850442241&position=1'}

In [None]:
# For every company: fetch its search page, take the first three postings and
# record title plus the four job-criteria values of each one. Missing page
# elements are skipped or recorded as None so that one odd page does not abort
# the whole run.
for comp_name, url in urls_dict.items():

  # get first page
  res = get(url, proxies=proxies, verify=False)
  res.raise_for_status()
  res_body = res.text
  html_soup = BeautifulSoup(res_body, "html.parser")

  # Get list of job ids
  nav_list = html_soup.find('ul', class_="jobs-search__results-list")
  if nav_list is None:
    # find() returns None when the page carries no results list.
    print(comp_name, 'no results list found, skipping')
    continue
  job_elements = nav_list.find_all('li')
  job_ids = []
  for el in job_elements:
    card = el.find('div')
    urn = card.get('data-entity-urn') if card is not None else None
    if urn:
      # The id is whatever follows the first 18 characters of the attribute
      # (presumably a fixed 'urn:li:jobPosting:' prefix -- TODO confirm).
      job_ids.append(urn[18:])

  for job_id in job_ids[:3]:

    # get html for current job id
    # NOTE: this call uses the two-argument get_url(og_url, job_id); the cell
    # after this one redefines get_url with a third parameter.
    cur_url = get_url(url, job_id)
    cur_res = get(cur_url, proxies=proxies, verify=False)
    cur_res.raise_for_status()
    cur_res_body = cur_res.text
    cur_html_soup = BeautifulSoup(cur_res_body, "html.parser")

    # get job title
    title_link = cur_html_soup.find('a', class_="topcard__link")
    title_heading = title_link.find('h2') if title_link is not None else None
    if title_heading is None:
      print(comp_name, job_id, 'no job title found, skipping')
      continue
    job_title = title_heading.text

    # get job details
    description_con = cur_html_soup.find('ul', class_='description__job-criteria-list')
    details = description_con.find_all('li') if description_con is not None else []
    criteria = []
    for det in details:
      span = det.find('span')
      criteria.append(span.text if span is not None else None)
    # Exactly four values are stored; pad with None when the page lists
    # fewer criteria and ignore any extra ones.
    sen_level, emp_type, job_func, industries = (criteria + [None] * 4)[:4]

    data.append((comp_name, job_title, sen_level, emp_type, job_func, industries))



In [None]:
def get_url(og_url, job_id, position_num):
  """Build the URL of one posting from a search URL.

  Everything in ``og_url`` from ``currentJobId=`` onwards is dropped and
  replaced by ``currentJobId=<job_id>&position=<position_num>``.

  Args:
    og_url: the search URL to start from.
    job_id: job id to select (string or integer).
    position_num: value for the ``position`` parameter.

  Returns:
    The new URL as a string. If ``og_url`` has no ``currentJobId`` parameter,
    the two parameters are appended with the proper '?' or '&' separator.
  """
  marker = "currentJobId="

  # Keep only the part of the URL that precedes the currentJobId parameter.
  prefix, found, _ = og_url.partition(marker)

  if not found and not prefix.endswith(('?', '&')):
    # Nothing was cut off, so the URL does not end in a separator yet.
    prefix += '&' if '?' in prefix else '?'

  return f'{prefix}{marker}{job_id}&position={position_num}'

In [None]:
# Turn the collected rows into a table; the bare `df` makes it the cell's value.
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,Company,Job title,Seniority level,Employment type,Job function,Industries


# Final Scraping for Positions from LinkedIn & Interview questions

In [None]:
# Companies to scrape. Each key is the company name that is inserted into the
# mockquestions.com URL; each value is a tuple of
# (company name stored alongside it in the output tables, LinkedIn job-search
# URL for that company -- the URLs differ in their f_C value).
urls_dict = {
'American Water': ('American Water', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=9226&f_TPR=&position=1&pageNum=0'),
'K&L Gates': ('K&L Gates', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_TPR=&f_C=164569&position=1&pageNum=0'),
'Guidehouse': ('Guidehouse', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=28974109&f_TPR=&position=1&pageNum=0'),
'Scholastic': ('Scholastic', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4505&f_TPR=&position=1&pageNum=0'),
'Harbor Freight Tools': ('Harbor Freight Tools', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=28923&f_TPR=&position=1&pageNum=0'),
'U.S. Bank': ('U.S. Bank', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2532&f_TPR=&position=1&pageNum=0'),
'UC Health': ('UC Health', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1252954&f_TPR=&position=1&pageNum=0'),
'Kohler Co.': ('Kohler Co.', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=5670&f_TPR=&position=1&pageNum=0'),
'BECU': ('BECU', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=13985&f_TPR=&position=1&pageNum=0'),
'American Electric Power': ('American Electric Power', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=162419&f_TPR=&position=1&pageNum=0'),
'Merck': ('Merck', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1486&f_TPR=&position=1&pageNum=0'),
'VyStar Credit Union': ('VyStar Credit Union', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=31580&f_TPR=&position=1&pageNum=0'),
'Nucor Corporation': ('Nucor Corporation', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=165257&f_TPR=&position=1&pageNum=0'),
'Kaiser Permanente': ('Kaiser Permanente', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1550&f_TPR=&position=1&pageNum=0'),
'Northwestern Mutual': ('Northwestern Mutual', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2445&f_TPR=&position=1&pageNum=0'),
'Duke Energy Corporation': ('Duke Energy Corporation', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4066&f_TPR=&position=1&pageNum=0'),
'Ipsos North America': ('Ipsos North America', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2562378&f_TPR=&position=1&pageNum=0'),
'City National Bank': ('City National Bank', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=163069&f_TPR=&position=1&pageNum=0'),
'Western Digital': ('Western Digital', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4593&f_TPR=&position=1&pageNum=0'),
'State Farm': ('State Farm', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2381&f_TPR=&position=1&pageNum=0'),
'Honeywell': ('Honeywell', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1344&f_TPR=&position=1&pageNum=0'),
'Amazon': ('Amazon', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1586&f_TPR=&position=1&pageNum=0'),
'JCPenney': ('JCPenney', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3688&f_TPR=&position=1&pageNum=0'),
'Whirlpool Corporation': ('Whirlpool Corporation', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2551&f_TPR=&position=1&pageNum=0'),
'Magellan Health': ('Magellan Health', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=5438&f_TPR=&position=1&pageNum=0'),
'Ripple': ('Ripple', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3249443&f_TPR=&position=1&pageNum=0'),
'iHeartMedia': ('iHeartMedia', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2539&f_TPR=&position=1&pageNum=0'),
'Wal-Mart': ('Walmart', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2646&f_TPR=&position=1&pageNum=0'),
'USPS': ('United States Postal Service', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3694&f_TPR=&position=1&pageNum=0'),
'Wells Fargo': ('Wells Fargo', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1235&f_TPR=&position=1&pageNum=0'),
'AT&T Inc.': ('AT&T', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1052&f_TPR=&position=1&pageNum=0'),
'Bank of America': ('Bank of America', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1123&f_TPR=&position=1&pageNum=0'),
'Target': ('Target', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1512&f_TPR=&position=1&pageNum=0'),
'Microsoft': ('Microsoft', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1035&f_TPR=&position=1&pageNum=0'),
'Google': ('Google', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1441&f_TPR=&position=1&pageNum=0'),
# NOTE(review): this URL uses the host linkedin.com without the "www." prefix of the other entries.
'Boeing': ('Boeing', 'https://linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1384&f_TPR=&position=1&pageNum=0'),
'Lockheed Martin': ('Lockheed Martin', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1319&f_TPR=&position=1&pageNum=0'),
# 'Apple': ('Apple', ),
'JPMorgan Chase': ('JPMorgan Chase & Co.', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1068&f_TPR=&position=1&pageNum=0'),
'Deloitte': ('Deloitte', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1038&f_TPR=&position=1&pageNum=0'),
'Department of Veterans Affairs': ('U.S. Department of Veterans Affairs', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=23789&f_TPR=&position=1&pageNum=0'),
'UPS': ('UPS', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1523&f_TPR=&position=1&pageNum=0'),
'General Motors': ('General Motors', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1472&f_TPR=&position=1&pageNum=0'),
'Home Depot': ('The Home Depot', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1534&f_TPR=&position=1&pageNum=0'),
'IBM': ('IBM', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1009&f_TPR=&position=1&pageNum=0'),
'Northrop Grumman': ('Northrop Grumman', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1412&f_TPR=&position=1&pageNum=0'),
'Walgreens': ('Walgreens', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3589&f_TPR=&position=1&pageNum=0'),
'Ford Motor Company': ('Ford Motor Company', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1483&f_TPR=&position=1&pageNum=0'),
'Starbucks': ('Starbucks', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2271&f_TPR=&position=1&pageNum=0'),
'Accenture': ('Accenture', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1033&f_TPR=&position=1&pageNum=0'),
'PwC': ('PwC', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1044&f_TPR=&position=1&pageNum=0'),
'EY': ('EY', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1073&f_TPR=&position=1&pageNum=0'),
'American Airlines': ('American Airlines', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2640&f_TPR=&position=1&pageNum=0'),
'Capital One Bank': ('Capital One', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1419&f_TPR=&position=1&pageNum=0'),
'Intel Corporation': ('Intel Corporation', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1053&f_TPR=&position=1&pageNum=0'),
'UnitedHealth Group': ('UnitedHealth Group', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1720&f_TPR=&position=1&pageNum=0'),
'PNC Bank': ('PNC', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4167&f_TPR=&position=1&pageNum=0'),
'Kroger': ('Kroger', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4914&f_TPR=&position=1&pageNum=0'),
'T-Mobile': ('T-Mobile', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1392&f_TPR=&position=1&pageNum=0'),
'Mayo Clinic': ('Mayo Clinic', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4725&f_TPR=&position=1&pageNum=0'),
'Oracle': ('Oracle', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1028&f_TPR=&position=1&pageNum=0'),
'Costco': ('Costco Wholesale', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=163225&f_TPR=&position=1&pageNum=0'),
'Best Buy': ('Best Buy', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2127&f_TPR=&position=1&pageNum=0'),
'Facebook': ('Meta', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=10667&f_TPR=&position=1&pageNum=0'),
'Cleveland Clinic': ('Cleveland Clinic', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=5656&f_TPR=&position=1&pageNum=0'),
'Farmers Insurance': ('Farmers Insurance', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3240&f_TPR=&position=1&pageNum=0'),
'United Airlines': ('United Airlines', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2380&f_TPR=&position=1&pageNum=0'),
'Allstate': ('Allstate', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1835&f_TPR=&position=1&pageNum=0'),
'Medtronic': ('Medtronic', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1841&f_TPR=&position=1&pageNum=0'),
'Comcast': ('Comcast', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1703&f_TPR=&position=1&pageNum=0'),
'Humana': ('Humana', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4257&f_TPR=&position=1&pageNum=0'),
'Tesla Motors Inc.': ('Tesla', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=15564&f_TPR=&position=1&pageNum=0'),
'Southwest Airlines': ('Southwest Airlines', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4599&f_TPR=&position=1&pageNum=0'),
'USAA': ('USAA', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4501&f_TPR=&position=1&pageNum=0'),
'Optum, Inc.': ('Optum', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3617422&f_TPR=&position=1&pageNum=0'),
'Edward Jones': ('Edward Jones', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3677&f_TPR=&position=1&pageNum=0'),
'Cisco': ('Cisco', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1063&f_TPR=&position=1&pageNum=0'),
'H&R Block Inc.': ('H&R Block', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3671&f_TPR=&position=1&pageNum=0'),
'Morgan Stanley': ('Morgan Stanley', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=497017&f_TPR=&position=1&pageNum=0'),
'Delta Air Lines': ('Delta Air Lines', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2272&f_TPR=&position=1&pageNum=0'),
'PepsiCo': ('PepsiCo', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1431&f_TPR=&position=1&pageNum=0'),
'McDonalds': ("McDonald's", 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2677&f_TPR=&position=1&pageNum=0'),
'IRS': ('Internal Revenue Service', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=4922&f_TPR=&position=1&pageNum=0'),
# NOTE(review): this URL uses the host linkedin.com without the "www." prefix of the other entries.
'Nordstrom': ('Nordstrom', 'https://linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3379&f_TPR=&position=1&pageNum=0'),
'Whole Foods': ('Whole Foods Market', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=157353&f_TPR=&position=1&pageNum=0'),
'Progressive': ('Progressive Insurance', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=3264&f_TPR=&position=1&pageNum=0'),
'Marriott': ('Marriott International', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2311&f_TPR=&position=1&pageNum=0'),
'Raytheon': ('Raytheon', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1505&f_TPR=&position=1&pageNum=0'),
'Pfizer': ('Pfizer', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1185&f_TPR=&position=1&pageNum=0'),
'Publix': ('Publix Super Markets', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=7242&f_TPR=&position=1&pageNum=0'),
'Liberty Mutual': ('Liberty Mutual Insurance', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=2701&f_TPR=&position=1&pageNum=0'),
'Procter and Gamble': ('Procter & Gamble', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1116&f_TPR=&position=1&pageNum=0'),
'3M': ('3M', 'https://www.linkedin.com/jobs/search?keywords=&location=United%20States&geoId=103644278&f_C=1864&f_TPR=&position=1&pageNum=0')
}

## Questions

In [None]:
# Scrape the interview questions listed on each company's mockquestions.com
# page into `data`, one row per question, in the order of `columns`.
url = 'https://www.mockquestions.com/company/'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}

data = []
columns = ['mock_name', 'data_name', 'category', 'question']
# Number of companies whose first question carries a category label.
type_counter = 0

# Class string of the <p> that holds a question's category label.
category_class = "float w100 title-font tfont600 color-B2B2B1 text16 lh1-9"


def _question_text(container):
    """Return the stripped <h2> text of a question item, or None without an <h2>."""
    heading = container.find('h2')
    return heading.text.strip() if heading is not None else None


for mock_name, value in urls_dict.items():

    # Company page URL: spaces in the name become '+'.
    cur_url = url + mock_name.replace(" ", "+") + '/'
    response = get(cur_url,headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    page_div = html_soup.find('div', class_="float w100 answers")

    # Progress output: True means the page has no questions container.
    print(mock_name, page_div is None)

    if page_div is None:
        continue

    # The first question uses its own list-item class (mt18) and keeps its
    # category label inside the item's first <div>.
    q1_con = page_div.find('li', class_="float w100 list-none mt18 list-answers")
    if q1_con is not None:
        q1_div = q1_con.find('div')
        q1type = q1_div.find('p', class_=category_class) if q1_div is not None else None
        if q1type is not None:
            q1type = q1type.text
            type_counter += 1
        q1 = _question_text(q1_con)
        if q1 is not None:
            data.append((mock_name, value[0], q1type, q1))

    # All remaining questions share the mt50 list-item class.
    question_con = page_div.find_all('li', class_="float w100 list-none mt50 list-answers")
    for q_con in question_con:
        qtype = q_con.find('p', class_=category_class)
        if qtype is not None:
            qtype = qtype.text
        q = _question_text(q_con)
        if q is None:
            # An item without a heading carries no question text to record.
            continue
        data.append((mock_name, value[0], qtype, q))

In [None]:
# One row per scraped question, using the column names defined with `data`.
q_df = pd.DataFrame(data, columns=columns)

In [None]:
# Write the questions table to questions.csv in the working directory.
# With pandas' defaults the row index is written as an unnamed first column.
q_df.to_csv('questions.csv')

## Job titles

In [None]:
import os
import warnings

# The Bright Data proxy URL embeds the account user name and password, so it
# is read from the environment instead of being stored in the notebook.
# Expected form: http://<username>:<password>@brd.superproxy.io:22225
_proxy_url = os.environ.get('BRIGHTDATA_PROXY_URL')
if _proxy_url:
  # The same proxy endpoint serves both plain and TLS traffic.
  proxies = {
     'http':  _proxy_url,
     'https': _proxy_url,
  }
else:
  warnings.warn('BRIGHTDATA_PROXY_URL is not set; requests will be sent without the Bright Data proxy.')
  proxies = None

# Scraped rows and their column names, consumed by pd.DataFrame later on.
data = []
columns = ['company', 'company_in_data', 'job']

# For every company, fetch its LinkedIn search page through the proxy and
# record the job names listed on it, one row per job.
for mock_name, (data_name, url) in urls_dict.items():

  # get page
  res = get(url, proxies=proxies, verify=False)
  res.raise_for_status()
  res_body = res.text
  html_soup = BeautifulSoup(res_body, "html.parser")

  # Get list of job names
  nav_list = html_soup.find('ul', class_="jobs-search__results-list")
  if nav_list is None:
    # find() returns None when the page carries no results list; skip this
    # company instead of aborting the whole run with AttributeError.
    print(mock_name, 'no results list found, skipping')
    continue
  job_elements = nav_list.find_all('li')
  job_names = []
  for el in job_elements:
    # The job name is the text of the first <span> inside the list item.
    name_span = el.find('span')
    if name_span is not None:
      job_names.append(name_span.text.strip())

  for job in job_names:
    data.append((mock_name, data_name, job))

In [None]:
# One row per (company, company_in_data, job); written to jobs_final.csv in
# the working directory. With pandas' defaults the row index is written as an
# unnamed first column.
jobs_df = pd.DataFrame(data, columns=columns)
jobs_df.to_csv('jobs_final.csv')