In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [7]:
# 1.	Start by performing a GET request on the url above and convert the response into a BeautifulSoup object. 

URL = 'https://realpython.github.io/fake-jobs/'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36"
}

response = requests.get(URL, headers=headers)

soup = BeautifulSoup(response.content, "html.parser")

# a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you
# find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this
# title. 
first_job_title = soup.find("h2", class_="title is-5")
print(first_job_title.text)

Senior Python Developer


In [8]:
# b. Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results
# in a list. 
job_title_tags = soup.find_all("h2", class_="title is-5")

# Get the text from each tag
job_titles = [tag.text for tag in job_title_tags]

print(f"Found {len(job_titles)} job titles")
print(job_titles[:10])  # preview the first 1

Found 100 job titles
['Senior Python Developer', 'Energy engineer', 'Legal executive', 'Fitness centre manager', 'Product manager', 'Medical technical officer', 'Physiological scientist', 'Textile designer', 'Television floor manager', 'Waste management officer']


In [17]:
# c. Finally, extract the companies, locations, and posting dates for each job.
# Example: first job has Company = "Payne, Roberts and Davis",
# Location = "Stewartbury, AA", Date = "2021-04-08"

# Find all company tags
company_tags = soup.find_all("h3", class_="subtitle is-6 company")
companies = [tag.text.strip() for tag in company_tags]

# Find all location tags
location_tags = soup.find_all("p", class_="location")
locations = [tag.text.strip() for tag in location_tags]

# Find all posting date tags (no class needed)
date_tags = soup.find_all("time")
dates = [tag.text.strip() for tag in date_tags]

# Make sure we only loop over the shortest list length
min_len = min(len(companies), len(locations), len(dates))

for i in range(min_len):
    print(f"Company: {companies[i]}, Location: {locations[i]}, Date: {dates[i]}")

Company: Payne, Roberts and Davis, Location: Stewartbury, AA, Date: 2021-04-08
Company: Vasquez-Davidson, Location: Christopherville, AA, Date: 2021-04-08
Company: Jackson, Chambers and Levy, Location: Port Ericaburgh, AA, Date: 2021-04-08
Company: Savage-Bradley, Location: East Seanview, AP, Date: 2021-04-08
Company: Ramirez Inc, Location: North Jamieview, AP, Date: 2021-04-08
Company: Rogers-Yates, Location: Davidville, AP, Date: 2021-04-08
Company: Kramer-Klein, Location: South Christopher, AE, Date: 2021-04-08
Company: Meyers-Johnson, Location: Port Jonathan, AE, Date: 2021-04-08
Company: Hughes-Williams, Location: Osbornetown, AE, Date: 2021-04-08
Company: Jones, Williams and Villa, Location: Scotttown, AP, Date: 2021-04-08
Company: Garcia PLC, Location: Ericberg, AE, Date: 2021-04-08
Company: Gregory and Sons, Location: Ramireztown, AE, Date: 2021-04-08
Company: Clark, Garcia and Sosa, Location: Figueroaview, AA, Date: 2021-04-08
Company: Bush PLC, Location: Kelseystad, AA, Date:

In [19]:
# d. Take the lists that you have created and combine them into a pandas DataFrame.
jobs_df = pd.DataFrame({
    "Title": job_titles,
    "Company": companies,
    "Location": locations,
    "Date": dates
})

print(jobs_df.head())
print(f"DataFrame shape: {jobs_df.shape}")

                     Title                     Company              Location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     East Seanview, AP   
4          Product manager                 Ramirez Inc   North Jamieview, AP   

         Date  
0  2021-04-08  
1  2021-04-08  
2  2021-04-08  
3  2021-04-08  
4  2021-04-08  
DataFrame shape: (100, 4)


In [20]:
# 2.	Next, add a column that contains the url for the "Apply" button. Try this in two ways.  

# a. First, use the BeautifulSoup find_all method to extract the urls. 
apply_tags = soup.find_all("a", class_="card-footer-item")
apply_urls = [tag["href"].strip() for tag in apply_tags]

print(f"Found {len(apply_urls)} apply URLs")
print(apply_urls[:5])  # preview first 5


Found 200 apply URLs
['https://www.realpython.com', 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html', 'https://www.realpython.com', 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html', 'https://www.realpython.com']


In [23]:
# b. Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they 
# are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you 
# created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and 
# prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)"
# job and the "Scientist, research (maths)" job.
base_url = "https://realpython.github.io/fake-jobs/jobs/"

# Build slugs from job titles (slugs = URL friendly version of a job title)
job_slugs = []
for i, title in enumerate(job_titles):
    # Lowercase
    slug = title.lower()
    # Replace spaces with hyphens
    slug = slug.replace(" ", "-")
    # Remove parentheses, commas, and other punctuation
    slug = re.sub(r"[^\w\-]", "", slug)
    # Append index
    slug = f"{slug}-{i}.html"
    job_slugs.append(base_url + slug)

# Compare with scraped apply_urls
print(job_slugs[:5])
print(apply_urls[:5])

# Verify they match
print(job_slugs == apply_urls)


['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html', 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html', 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html', 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html', 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html']
['https://www.realpython.com', 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html', 'https://www.realpython.com', 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html', 'https://www.realpython.com']
False


In [29]:
# 3. Finally, we want to get the job description text for each job. 

# a. Start by looking at the page for the first job, 
# https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job
# description paragraph.  
url = "https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)
soup_job = BeautifulSoup(response.content, "html.parser")

# Find all paragraphs in the job description section
paragraphs = soup_job.find_all("p")
description = " ".join([p.text.strip() for p in paragraphs])

print(description)


Fake Jobs for Your Web Scraping Journey Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset. Location: Stewartbury, AA Posted: 2021-04-08


In [36]:
# b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the
# description text on that page. For example, if you input
# "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html" into your function, it should return
# the string "At be than always different American address. Former claim chance prevent why measure too. Almost before
# some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical
# effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank
# recognize special good along.".

headers = {"User-Agent": "Mozilla/5.0"}

# Function to fetch job description from a job detail page
def get_job_description(url):
    response = requests.get(url, headers=headers)
    soup_job = BeautifulSoup(response.content, "html.parser")
    paragraphs = soup_job.find_all("p")
    return " ".join([p.text.strip() for p in paragraphs])

# First, scrape only the "Apply" button URLs (avoid "More Info")
apply_tags = soup.find_all("a", class_="card-footer-item", string="Apply")
apply_urls = [tag["href"].strip() for tag in apply_tags]

print(len(apply_urls))  # should be 100

# Add Apply_URL column before using it
jobs_df["Apply_URL"] = apply_urls

# c. Use the .apply method on the url column you created above to retrieve the description text for all of the jobs.

# Now apply the function to each Apply_URL
jobs_df["Description"] = jobs_df["Apply_URL"].apply(get_job_description)

# Preview the first few rows
print(jobs_df.head())

# Optional: check if any descriptions are missing
print(jobs_df["Description"].isna().sum(), "missing descriptions")


100
                     Title                     Company              Location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     East Seanview, AP   
4          Product manager                 Ramirez Inc   North Jamieview, AP   

         Date                                          Apply_URL  \
0  2021-04-08  https://realpython.github.io/fake-jobs/jobs/se...   
1  2021-04-08  https://realpython.github.io/fake-jobs/jobs/en...   
2  2021-04-08  https://realpython.github.io/fake-jobs/jobs/le...   
3  2021-04-08  https://realpython.github.io/fake-jobs/jobs/fi...   
4  2021-04-08  https://realpython.github.io/fake-jobs/jobs/pr...   

                                         Description  
0  Fake Jobs for Your Web Scraping Journey Profes..

In [24]:
# Webscraping
# In this exercise, you'll practice using BeautifulSoup to parse the content of a web page. The page that you'll be
# scraping, https://realpython.github.io/fake-jobs/, contains job listings. Your job is to extract the data on each 
# job and convert into a pandas DataFrame.

# 1.	Start by performing a GET request on the url above and convert the response into a BeautifulSoup object. 

# a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you
# find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this
# title.  

# b. Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results
# in a list.  

# c. Finally, extract the companies, locations, and posting dates for each job. For example, the first job has a 
# company of "Payne, Roberts and Davis", a location of "Stewartbury, AA", and a posting date of "2021-04-08". Ensure
# that the text that you extract is clean, meaning no extra spaces or other characters at the beginning or end.  

# d. Take the lists that you have created and combine them into a pandas DataFrame.

# 2.	Next, add a column that contains the url for the "Apply" button. Try this in two ways.  

# a. First, use the BeautifulSoup find_all method to extract the urls.  

# b. Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they 
# are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you 
# created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and 
# prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)"
# job and the "Scientist, research (maths)" job.

# 3.	Finally, we want to get the job description text for each job.  

# a. Start by looking at the page for the first job, 
# https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job
# description paragraph.  

# b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the
# description text on that page. For example, if you input
# "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html" into your function, it should return
# the string "At be than always different American address. Former claim chance prevent why measure too. Almost before
# some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical
# effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank
# recognize special good along.".

# c. Use the .apply method on the url column you created above to retrieve the description text for all of the jobs.