In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# 1 Start by performing a GET request on the url above and convert the response into a BeautifulSoup object.

url = 'https://realpython.github.io/fake-jobs/'

# GET request
response = requests.get(url)

# Convert response to  BeautifulSoup object
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# 1a.Find the first job title
first_job_title_tag = soup.find('h2', class_='title')

# Extract the text
if first_job_title_tag:
    first_job_title = first_job_title_tag.text.strip()
    print("First job title:", first_job_title)
else:
    print("Could not find the job title element.")

First job title: Senior Python Developer


In [4]:
# 1b. find all job titles 
job_titles = [title.text.strip() for title in soup.find_all('h2', class_='title')]
print(job_titles)

['Senior Python Developer', 'Energy engineer', 'Legal executive', 'Fitness centre manager', 'Product manager', 'Medical technical officer', 'Physiological scientist', 'Textile designer', 'Television floor manager', 'Waste management officer', 'Software Engineer (Python)', 'Interpreter', 'Architect', 'Meteorologist', 'Audiological scientist', 'English as a second language teacher', 'Surgeon', 'Equities trader', 'Newspaper journalist', 'Materials engineer', 'Python Programmer (Entry-Level)', 'Product/process development scientist', 'Scientist, research (maths)', 'Ecologist', 'Materials engineer', 'Historic buildings inspector/conservation officer', 'Data scientist', 'Psychiatrist', 'Structural engineer', 'Immigration officer', 'Python Programmer (Entry-Level)', 'Neurosurgeon', 'Broadcast engineer', 'Make', 'Nurse, adult', 'Air broker', 'Editor, film/video', 'Production assistant, radio', 'Engineer, communications', 'Sales executive', 'Software Developer (Python)', 'Futures trader', 'Tour

In [5]:
# 1c. extract the companies, locations, and posting dates for each job
companies = [company.get_text(strip=True) for company in soup.find_all("h3", class_="subtitle is-6 company")]
locations = [location.get_text(strip=True) for location in soup.find_all("p", class_="location")]
date = [date.get_text(strip=True) for date in soup.find_all("time")]

In [6]:
# 1d. convert lists into df
jobs_df = pd.DataFrame({
    'Title': job_titles,
    'Company': companies,
    'Location': locations,
    'Date Posted': date})
# print(jobs_df.head())

In [7]:
# 2a.  beautifulSoup find_all method
base_url = "https://realpython.github.io/fake-jobs/"
apply_links = [base_url + a["href"] for a in soup.find_all("a", string="Apply")]
jobs_df["Apply URL"] = apply_links

In [8]:
# # 2b. manually construct urls
# def generate_slug(title):
#     return title.lower().replace(",", "").replace("(", "").replace(")", "").replace(" ", "-")

# slugs = [generate_slug(title) for title in job_titles]
# apply_urls_manual = [f"https://realpython.github.io/fake-jobs/jobs/{slug}-{i+1}.html" for i, slug in enumerate(slugs)]
# pd.set_option('display.max_colwidth', None)
# jobs_df["Apply URL"].head(1)

In [25]:
# 3a. extract job desccription
job_url = "https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html"
job_response = requests.get(job_url)
job_soup = BeautifulSoup(job_response.content, "html.parser")

job_description = job_soup.find("div", class_="content").get_text(strip=True)
print(job_description)
print(jobs_df.head())

Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.Location:Stewartbury, AAPosted:2021-04-08
                     Title                     Company              Location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     

In [23]:
# 3b. extract description from anyurl
def get_job_description(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    description_tag = soup.find("div", class_="content")

    # Check if description_tag exists before calling get_text()
    if description_tag:
        return description_tag.get_text(strip=True)
    else:
        return "Description not found"

desc_url = "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html"
print(get_job_description(desc_url))

At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along.Location:Osbornetown, AEPosted:2021-04-08
                     Title                     Company              Location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     East Seanview, AP   
4          Product manager                 Ramirez Inc   North Jamieview, AP   

  Date Posted                                          Apply URL  \
0  2021-04-08  https://realpython.github.i

In [21]:
jobs_df["Job Description"] = jobs_df["Apply URL"].apply(get_job_description)

In [17]:
print(jobs_df.head())

                     Title                     Company              Location  \
0  Senior Python Developer    Payne, Roberts and Davis       Stewartbury, AA   
1          Energy engineer            Vasquez-Davidson  Christopherville, AA   
2          Legal executive  Jackson, Chambers and Levy   Port Ericaburgh, AA   
3   Fitness centre manager              Savage-Bradley     East Seanview, AP   
4          Product manager                 Ramirez Inc   North Jamieview, AP   

  Date Posted                                          Apply URL  \
0  2021-04-08  https://realpython.github.io/fake-jobs/https:/...   
1  2021-04-08  https://realpython.github.io/fake-jobs/https:/...   
2  2021-04-08  https://realpython.github.io/fake-jobs/https:/...   
3  2021-04-08  https://realpython.github.io/fake-jobs/https:/...   
4  2021-04-08  https://realpython.github.io/fake-jobs/https:/...   

         Job Description  
0  Description not found  
1  Description not found  
2  Description not found  
3 