In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

#### 1. Start by performing a GET request on https://realpython.github.io/fake-jobs/ and convert the response into a BeautifulSoup object.


In [3]:
# Fetch the job listing page. The timeout stops the cell from hanging forever
# if the server never answers.
response = requests.get('https://realpython.github.io/fake-jobs/', timeout=10)

In [4]:
# Check what kind of object requests.get handed back.
type(response)

requests.models.Response

In [5]:
# A 200 status code is the standard response for a successful request.
# Codes in the 400s (such as 400 or 404) mean the request failed.
response.status_code

200

In [6]:
# The body of the response: the page's raw HTML as one long string.
response.text

'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <title>Fake Python</title>\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css">\n  </head>\n  <body>\n  <section class="section">\n    <div class="container mb-5">\n      <h1 class="title is-1">\n        Fake Python\n      </h1>\n      <p class="subtitle is-3">\n        Fake Jobs for Your Web Scraping Journey\n      </p>\n    </div>\n    <div class="container">\n    <div id="ResultsContainer" class="columns is-multiline">\n    <div class="column is-half">\n<div class="card">\n  <div class="card-content">\n    <div class="media">\n      <div class="media-left">\n        <figure class="image is-48x48">\n          <img src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1" alt="Real Python Logo">\n        </figure>\n      </div>\n      <div class="media-content"

In [7]:
# Parse the raw HTML so it can be searched by tag and class. Naming the parser
# explicitly avoids bs4 guessing one (and warning about it) and keeps the
# parse the same on machines with different parser libraries installed.
soup = BS(response.text, 'html.parser')
print(soup)

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Fake Python</title>
<link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
</head>
<body>
<section class="section">
<div class="container mb-5">
<h1 class="title is-1">
        Fake Python
      </h1>
<p class="subtitle is-3">
        Fake Jobs for Your Web Scraping Journey
      </p>
</div>
<div class="container">
<div class="columns is-multiline" id="ResultsContainer">
<div class="column is-half">
<div class="card">
<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>


In [8]:
# now make it even prettier: prettify() gives the markup back as a string,
# indented by nesting depth with one tag per line
pretty_soup = soup.prettify()
print(pretty_soup)

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="container">
    <div class="columns is-multiline" id="ResultsContainer">
     <div class="column is-half">
      <div class="card">
       <div class="card-content">
        <div class="media">
         <div class="media-left">
          <figure class="image is-48x48">
           <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
          </figure>
         </div>
         <div class="media-content">
          <h2 c

In [9]:
# find() returns only the first tag that matches - here the page's <title>
soup.find('title')

<title>Fake Python</title>

#### 1a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this title.  


In [11]:
# Job titles are the <h2> headings, so the first <h2> is the first job title.
soup.find('h2')

<h2 class="title is-5">Senior Python Developer</h2>

In [12]:
# .text drops the surrounding tag and keeps only the title string.
soup.find('h2').text

'Senior Python Developer'

#### 1b. Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results in a list.  


In [14]:
# Collect every <h2> heading (one per job card). find_all is the current
# method name; findAll is a deprecated alias kept for old code.
jobs = soup.find_all('h2')
jobs[0:4]

[<h2 class="title is-5">Senior Python Developer</h2>,
 <h2 class="title is-5">Energy engineer</h2>,
 <h2 class="title is-5">Legal executive</h2>,
 <h2 class="title is-5">Fitness centre manager</h2>]

In [15]:
# The search result is a bs4 ResultSet of tags, not yet a list of strings.
type(jobs)

bs4.element.ResultSet

In [16]:
# Keep only the title text of each <h2> tag.
job_list = [heading.text for heading in jobs]
job_list[0:4]

['Senior Python Developer',
 'Energy engineer',
 'Legal executive',
 'Fitness centre manager']

In [17]:
# After the comprehension the titles are held in a plain Python list.
type(job_list)

list

In [18]:
# Alternative method, kept for reference only: the same list of titles built
# with an explicit loop and append instead of a list comprehension.
#job_list_2 = []
#for x in jobs:
#    job_list_2.append(x.text)
#job_list_2

#### 1c. Finally, extract the companies, locations, and posting dates for each job. For example, the first job has a company of "Payne, Roberts and Davis", a location of "Stewartbury, AA", and a posting date of "2021-04-08". Ensure that the text that you extract is clean, meaning no extra spaces or other characters at the beginning or end.  


In [20]:
# Company names are the <h3> headings. find_all is the current method name;
# findAll is a deprecated alias.
companies = soup.find_all('h3')
companies[0:4]

[<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>,
 <h3 class="subtitle is-6 company">Vasquez-Davidson</h3>,
 <h3 class="subtitle is-6 company">Jackson, Chambers and Levy</h3>,
 <h3 class="subtitle is-6 company">Savage-Bradley</h3>]

In [21]:
# Text of the first company heading (find_all replaces the deprecated findAll).
soup.find_all('h3')[0].text

'Payne, Roberts and Davis'

In [22]:
# Keep only the company name from each <h3> tag.
companies_list = [heading.text for heading in companies]
companies_list[0:4]

['Payne, Roberts and Davis',
 'Vasquez-Davidson',
 'Jackson, Chambers and Levy',
 'Savage-Bradley']

In [23]:
# First attempt: the first <p> on the page is the site subtitle, not a job
# location, so searching by tag name alone is not specific enough here.
locations = soup.find_all('p')[0].text
locations

'\n        Fake Jobs for Your Web Scraping Journey\n      '

In [24]:
# Narrow the search to the paragraphs that carry the "location" class.
locations = soup.find_all('p', class_='location')
locations[0:4]

[<p class="location">
         Stewartbury, AA
       </p>,
 <p class="location">
         Christopherville, AA
       </p>,
 <p class="location">
         Port Ericaburgh, AA
       </p>,
 <p class="location">
         East Seanview, AP
       </p>]

In [25]:
# The raw text still carries the newlines and indentation from the HTML source.
locations[0].text

'\n        Stewartbury, AA\n      '

In [26]:
# strip() removes the leading and trailing whitespace, leaving a clean value.
locations[0].text.strip()

'Stewartbury, AA'

In [27]:
# Clean text of every location paragraph, with surrounding whitespace removed.
locations_list = [paragraph.text.strip() for paragraph in locations]
locations_list[0:4]

['Stewartbury, AA',
 'Christopherville, AA',
 'Port Ericaburgh, AA',
 'East Seanview, AP']

In [28]:
# Each posting date sits in its own <time> tag. find_all is the current method
# name; findAll is a deprecated alias.
dates = soup.find_all('time')
dates[0:4]

[<time datetime="2021-04-08">2021-04-08</time>,
 <time datetime="2021-04-08">2021-04-08</time>,
 <time datetime="2021-04-08">2021-04-08</time>,
 <time datetime="2021-04-08">2021-04-08</time>]

In [29]:
# Keep only the date string from each <time> tag.
dates_list = [stamp.text for stamp in dates]
dates_list[0:4]

['2021-04-08', '2021-04-08', '2021-04-08', '2021-04-08']

#### 1d. Take the lists that you have created and combine them into a pandas DataFrame. 


In [31]:
# Pair each column header with the list scraped for it; the dict keys become
# the DataFrame's column names, in this order.
data = dict(zip(
    ['job title', 'company', 'location', 'date posted'],
    [job_list, companies_list, locations_list, dates_list],
))

jobs_df = pd.DataFrame(data)
jobs_df.head()

Unnamed: 0,job title,company,location,date posted
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08


#### 2. Next, add a column that contains the url for the "Apply" button. Try this in two ways.   


#### 2a. First, use the BeautifulSoup find_all method to extract the urls.  


In [34]:
# string='Apply' keeps only the "Apply" links and leaves out the "Learn" ones.
# (`string` is the current name of the argument that used to be called `text`.)
urls = soup.find_all('a', string='Apply')
urls[0].get('href')

'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

In [118]:
# Pull the link target out of every "Apply" anchor.
urls_list = [anchor.get('href') for anchor in urls]
urls_list[0:25]

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

#### 2b. Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job.


In [37]:
# Abandoned first attempt, kept for reference. It stacked up on every run
# because the assignment writes its result back into job_list[0]: each re-run
# then built a new url out of the url left behind by the previous run.
# Re-running the cell that creates job_list resets it; writing the urls into a
# separate list avoids the problem altogether.
#job_list[0] = 'https://realpython.github.io/fake-jobs/jobs/' + job_list[0].lower().replace(' ', '-') + '-' + 'number' + '.html'  #+ job_list[0].index
#job_list[0]

In [122]:
# Rebuild each "Apply" url from the job title and its position on the page:
#   <base url> + <title slug> + '-' + <position> + '.html'
# The slug is the lower-cased title with every run of characters that is not a
# letter or digit (spaces, commas, slashes, parentheses, ...) collapsed into a
# single hyphen, so "Scientist, research (maths)" becomes
# "scientist-research-maths".
# To confirm the pattern, compare the result with the scraped urls_list.
jobs_base_url = 'https://realpython.github.io/fake-jobs/jobs/'
job_list_url = []
for i, job in enumerate(job_list):
    # strip('-') drops the hyphen left behind by a trailing ")" or similar.
    slug = re.sub(r'[^a-z0-9]+', '-', job.lower()).strip('-')
    job_list_url.append(jobs_base_url + slug + '-' + str(i) + '.html')
job_list_url[0:25]

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [230]:
# now to actually add the urls to the dataframe
# (these are the urls scraped from the "Apply" links, i.e. urls_list)
jobs_df['url'] = urls_list
jobs_df.head()

Unnamed: 0,job title,company,location,date posted,url
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...


#### 3. Finally, we want to get the job description text for each job.  


#### 3a. Start by looking at the page for the first job, https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job description paragraph.  


In [126]:
# Fetch the detail page of the first job. The timeout stops the cell from
# hanging forever if the server never answers.
job_1_response = requests.get(
    'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
    timeout=10,
)

In [128]:
# Check that the detail page was fetched successfully before parsing it.
job_1_response.status_code

200

In [130]:
# Raw HTML of the detail page as one long string.
job_1_response.text

'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="utf-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <title>Fake Python</title>\n    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css">\n  </head>\n  <body>\n  <section class="section">\n    <div class="container mb-5">\n      <h1 class="title is-1">\n        Fake Python\n      </h1>\n      <p class="subtitle is-3">\n        Fake Jobs for Your Web Scraping Journey\n      </p>\n    </div>\n    <div class="container">\n    <div id="ResultsContainer" class="columns is-multiline">\n    <div class="box">\n<h1 class="title is-2">Senior Python Developer</h1>\n<h2 class="subtitle is-4 company">Payne, Roberts and Davis</h2>\n<div class="content">\n    <p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web applicat

In [186]:
# Parse the detail page. Naming the parser explicitly avoids bs4 guessing one
# and warning about it.
job_1_soup = BS(job_1_response.text, 'html.parser')

In [184]:
# The description is the first <p> inside the page's <div class="content">.
# find() returns the first match directly, so no [0] indexing is needed.
description = job_1_soup.find('div', class_='content').find('p').text
print(description)

Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.


#### 3b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the description text on that page. For example, if you input "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html" into your function, it should return the string "At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along.".  


In [206]:
def description_function(url):
    """Return the job description text found on a single job page.

    Args:
        url: Address of the job page to download, e.g. one of the "Apply"
            links scraped from the listing page.

    Returns:
        The text of the first <p> inside the page's first
        <div class="content">.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no description paragraph.
    """
    # The timeout keeps one unresponsive page from stalling a whole batch of
    # calls (for example when this function is used with Series.apply).
    job_response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    job_response.raise_for_status()
    job_response_soup = BS(job_response.text, 'html.parser')
    content = job_response_soup.find('div', class_='content')
    paragraph = content.find('p') if content is not None else None
    if paragraph is None:
        raise ValueError(f'No job description found at {url}')
    return paragraph.text

In [210]:
# Try the function on a single job page before applying it to every url.
description_function('https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html')

'Collaborate discussions responsible tech growth opportunity dashboard. Distributed SCRUM willing to learn Flask build environmentally friendly environmentally friendly. Python distributed developer teamwork inclusive Flask. Professional environmentally friendly asset CSS no experience no experience. Grit oversea Java detail-oriented collaborate relocation. Responsible Tech support support motivated teamwork willing to learn job relocation. Environmentally Friendly collaborate role. Willing To Learn oversea asset role asset relocation web application. Agile responsible tech build oversee coordinate.'

#### 3c. Use the [.apply method](https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html) on the url column you created above to retrieve the description text for all of the jobs.

In [234]:
# Run description_function on every url in the column. Each call downloads
# one page, so this cell makes one HTTP request per row.
all_descriptions = jobs_df['url'].apply(description_function)
all_descriptions[0:5]

0    Professional asset web application environment...
1    Party prevent live. Quickly candidate change a...
2    Administration even relate head color. Staff b...
3    Tv program actually race tonight themselves tr...
4    Traditional page a although for study anyone. ...
Name: url, dtype: object