In [5]:
import requests
from bs4 import BeautifulSoup

url = 'https://realpython.github.io/fake-jobs/'
resp = requests.get(url)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, 'html.parser')

1a. Find the first job title
We know job titles live in an <h2> with class title.

In [6]:
first_title_tag = soup.find('h2', class_='title')
first_title     = first_title_tag.get_text(strip=True)
print(first_title)
# → "Senior Python Developer"

Senior Python Developer


1b. Extract all job titles into a list

In [7]:
title_tags = soup.find_all('h2', class_='title')
titles     = [tag.get_text(strip=True) for tag in title_tags]
print(titles[:5])  # show first five

['Senior Python Developer', 'Energy engineer', 'Legal executive', 'Fitness centre manager', 'Product manager']


1c. Extract companies, locations, and posting dates

On this page:

Company is in <h3> with class company
Location is in <p> with class location
Date is in <time> (use its datetime attribute)

In [8]:
company_tags  = soup.find_all('h3', class_='company')
location_tags = soup.find_all('p', class_='location')
time_tags     = soup.find_all('time')

companies = [tag.get_text(strip=True) for tag in company_tags]
locations = [tag.get_text(strip=True) for tag in location_tags]
dates     = [tag['datetime'] for tag in time_tags]

# quick sanity check
print(companies[0], locations[0], dates[0])
# → Payne, Roberts and Davis Stewartbury, AA 2021-04-08

Payne, Roberts and Davis Stewartbury, AA 2021-04-08


1d. Combine into a pandas DataFrame

In [9]:
import pandas as pd

df = pd.DataFrame({
    'title':       titles,
    'company':     companies,
    'location':    locations,
    'date_posted': dates
})

print(df.head())

                     title                     company              location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     East Seanview, AP   
4          Product manager                 Ramirez Inc   North Jamieview, AP   

  date_posted  
0  2021-04-08  
1  2021-04-08  
2  2021-04-08  
3  2021-04-08  
4  2021-04-08  


In [13]:
from urllib.parse import urljoin

base_url   = 'https://realpython.github.io/fake-jobs/'
# assume you already have: job_cards = soup.find_all('div', class_='card')
# and df has one row per card

# Method A: loop over each card
apply_urls = [
    urljoin(base_url,
            card.find('a', class_='card-footer-item')['href'])
    for card in job_cards
]

df['apply_url_bs'] = apply_urls

# Quick sanity check:
print(len(df), len(apply_urls))
# should both print 100 (or however many cards you have)


100 100


In [15]:
import re
from urllib.parse import urljoin

base_url = 'https://realpython.github.io/fake-jobs/'

def make_slug(title: str) -> str:
    s = title.lower()
    s = s.replace('&', 'and')                    # “&” → “and”
    s = re.sub(r'[()\[\],/?:!\'"]+', '', s)       # strip punctuation
    s = re.sub(r'[^a-z0-9]+', '-', s)             # non-alnum → dash
    return s.strip('-')

# 1) turn each title into its “raw” slug
raw_slugs = [make_slug(t) for t in titles]

# 2) assign the -0, -1, … suffix based on first/second occurrence
counts = {}
constructed_hrefs = []
for slug in raw_slugs:
    n = counts.get(slug, 0)
    counts[slug] = n + 1
    constructed_hrefs.append(f'jobs/{slug}-{n}.html')

# 3) build absolute URLs
apply_urls_calc = [urljoin(base_url, href) for href in constructed_hrefs]

# 4) stick ’em in the DataFrame
df['apply_url_calc'] = apply_urls_calc

# sanity-check: these should all be True
print((df['apply_url_bs'] == df['apply_url_calc']).all())

# example: inspect the two tricky titles
for t in ["Software Engineer (Python)", "Scientist, research (maths)"]:
    print(t, "→", make_slug(t))


False
Software Engineer (Python) → software-engineer-python
Scientist, research (maths) → scientist-research-maths


Number 3

In [16]:
import requests
from bs4 import BeautifulSoup

# 1) Fetch & parse the Senior Python Developer page
job_url  = 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'
resp     = requests.get(job_url)
resp.raise_for_status()
job_soup = BeautifulSoup(resp.text, 'html.parser')

# 2a) Method 1: grab the first <p> inside the main “content” container
content_div     = job_soup.find('div', class_='content')
first_paragraph = content_div.find('p').get_text(strip=True)
print(first_paragraph)


# 2b) Method 2: find the company header, then grab its very next <p> sibling
company_h3      = job_soup.find('h3', class_='subtitle')
first_paragraph = company_h3.find_next_sibling('p').get_text(strip=True)
print(first_paragraph)


Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.


AttributeError: 'NoneType' object has no attribute 'find_next_sibling'

In [17]:
import requests
from bs4 import BeautifulSoup

def get_job_description(url: str) -> str:
    """
    Fetches the given fake-jobs URL and returns the first
    paragraph of the job description (stripped of extra whitespace).
    """
    resp = requests.get(url)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, 'html.parser')
    # the description lives in the first <p> inside <div class="content">
    content_div = soup.find('div', class_='content')
    if not content_div:
        return ''
    p = content_div.find('p')
    return p.get_text(strip=True) if p else ''

# Example
url = "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html"
description = get_job_description(url)
print(description)
# → "At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along."


At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along.
