In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import re

In [2]:
URL = 'https://realpython.github.io/fake-jobs/'

In [3]:
response = requests.get(URL)

In [4]:
type(response)

requests.models.Response

In [48]:
response.text

'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <title>Fake Python</title>\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css">\n  </head>\n  <body>\n  <section class="section">\n    <div class="container mb-5">\n      <h1 class="title is-1">\n        Fake Python\n      </h1>\n      <p class="subtitle is-3">\n        Fake Jobs for Your Web Scraping Journey\n      </p>\n    </div>\n    <div class="container">\n    <div id="ResultsContainer" class="columns is-multiline">\n    <div class="column is-half">\n<div class="card">\n  <div class="card-content">\n    <div class="media">\n      <div class="media-left">\n        <figure class="image is-48x48">\n          <img src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1" alt="Real Python Logo">\n        </figure>\n      </div>\n      <div class="media-content"

In [6]:
soup = BS(response.content, 'html.parser')

In [7]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="container">
    <div class="columns is-multiline" id="ResultsContainer">
     <div class="column is-half">
      <div class="card">
       <div class="card-content">
        <div class="media">
         <div class="media-left">
          <figure class="image is-48x48">
           <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
          </figure>
         </div>
         <div class="media-content">
          <h2 c

In [8]:
soup.find('title')

<title>Fake Python</title>

In [9]:
soup.find('h2', class_='title')

<h2 class="title is-5">Senior Python Developer</h2>

In [10]:
first_job = soup.find('h2', class_='title')
first_job_title = first_job.text.strip()
first_job_title

'Senior Python Developer'

In [11]:
job_findall = soup.find_all('h2', class_='title')
job_titles = [job.text.strip() for job in job_findall]
job_titles

['Senior Python Developer',
 'Energy engineer',
 'Legal executive',
 'Fitness centre manager',
 'Product manager',
 'Medical technical officer',
 'Physiological scientist',
 'Textile designer',
 'Television floor manager',
 'Waste management officer',
 'Software Engineer (Python)',
 'Interpreter',
 'Architect',
 'Meteorologist',
 'Audiological scientist',
 'English as a second language teacher',
 'Surgeon',
 'Equities trader',
 'Newspaper journalist',
 'Materials engineer',
 'Python Programmer (Entry-Level)',
 'Product/process development scientist',
 'Scientist, research (maths)',
 'Ecologist',
 'Materials engineer',
 'Historic buildings inspector/conservation officer',
 'Data scientist',
 'Psychiatrist',
 'Structural engineer',
 'Immigration officer',
 'Python Programmer (Entry-Level)',
 'Neurosurgeon',
 'Broadcast engineer',
 'Make',
 'Nurse, adult',
 'Air broker',
 'Editor, film/video',
 'Production assistant, radio',
 'Engineer, communications',
 'Sales executive',
 'Software Deve

In [12]:
job_cards = soup.find_all('div', class_='card-content')

companies = []
locations = []
dates = []

for card in job_cards:
    company = card.find('h3', class_='company').text.strip()
    location = card.find('p', class_='location').text.strip()
    date = card.find('time')['datetime'].strip()

    companies.append(company)
    locations.append(location)
    dates.append(date)

print(companies[:3], locations[:3], dates[:3])


['Payne, Roberts and Davis', 'Vasquez-Davidson', 'Jackson, Chambers and Levy'] ['Stewartbury, AA', 'Christopherville, AA', 'Port Ericaburgh, AA'] ['2021-04-08', '2021-04-08', '2021-04-08']


In [13]:
jobs = pd.DataFrame({
    'Job Title': job_titles,
    'Company': companies,
    'Location': locations,
    'Date Posted': dates})


In [14]:
jobs

Unnamed: 0,Job Title,Company,Location,Date Posted
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08
...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08


In [15]:
apply_links = []

for card in job_cards:
   apply_tag = card.find_next('a', string='Apply')
   apply_links.append(apply_tag['href'])
print(apply_links)

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html', 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html', 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html', 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html', 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html', 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html', 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html', 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html', 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html', 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html', 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html', 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html', 'https://realpython.github.io/fake-jobs/jobs/architect-12.html', 'https://realpython.github.io/fake-

In [50]:
jobs = pd.DataFrame({
    'Job Title': job_titles,
    'Company': companies,
    'Location': locations,
    'Date Posted': dates,
    'Apply URL': apply_links})

jobs.head()

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...


In [17]:
job_cards = soup.find_all('div', class_='card-content')

job_titles = []
companies = []
locations = []
dates = []
apply_urls = []

for card in job_cards:
    title = card.find('h2', class_='title').text.strip()
    company = card.find('h3', class_='company').text.strip()
    location = card.find('p', class_='location').text.strip()
    date = card.find('time')['datetime'].strip()
    
    
    apply_link = card.find_next('a', string='Apply')['href']
    
    job_titles.append(title)
    companies.append(company)
    locations.append(location)
    dates.append(date)
    apply_urls.append(apply_link)

jobs = pd.DataFrame({
    'Job Title': job_titles,
    'Company': companies,
    'Location': locations,
    'Date Posted': dates,
    'Apply URL': apply_urls
})

jobs.head()

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...


In [18]:
print(jobs['Apply URL'])

0     https://realpython.github.io/fake-jobs/jobs/se...
1     https://realpython.github.io/fake-jobs/jobs/en...
2     https://realpython.github.io/fake-jobs/jobs/le...
3     https://realpython.github.io/fake-jobs/jobs/fi...
4     https://realpython.github.io/fake-jobs/jobs/pr...
                            ...                        
95    https://realpython.github.io/fake-jobs/jobs/mu...
96    https://realpython.github.io/fake-jobs/jobs/ra...
97    https://realpython.github.io/fake-jobs/jobs/da...
98    https://realpython.github.io/fake-jobs/jobs/fu...
99    https://realpython.github.io/fake-jobs/jobs/sh...
Name: Apply URL, Length: 100, dtype: object


In [19]:
description = soup.find('div', class_='content').find('p').text.strip()

In [61]:
def get_description(url):
    response = requests.get(url)
    soup = BS(response.content, 'html.parser')
    content = soup.find('div', class_='content')
    if content:
        p = content.find('p')
        if p:
            return p.text.strip()

jobs['Description'] = jobs['Apply URL'].apply(get_description)

In [64]:
jobs[['Job Title', 'Apply URL', 'Description']].head()

Unnamed: 0,Job Title,Apply URL,Description
0,Senior Python Developer,https://realpython.github.io/fake-jobs/jobs/se...,Professional asset web application environment...
1,Energy engineer,https://realpython.github.io/fake-jobs/jobs/en...,Party prevent live. Quickly candidate change a...
2,Legal executive,https://realpython.github.io/fake-jobs/jobs/le...,Administration even relate head color. Staff b...
3,Fitness centre manager,https://realpython.github.io/fake-jobs/jobs/fi...,Tv program actually race tonight themselves tr...
4,Product manager,https://realpython.github.io/fake-jobs/jobs/pr...,Traditional page a although for study anyone. ...
