In [123]:
import requests
from bs4 import BeautifulSoup as BS
from IPython.core.display import HTML
import pandas as pd
import numpy as np

## Webscraping

In this exercise, you'll practice using BeautifulSoup to parse the content of a web page. The page that you'll be scraping, https://realpython.github.io/fake-jobs/, contains job listings. Your job is to extract the data on each job and convert into a pandas DataFrame.

1. Start by performing a GET request on the url above and convert the response into a BeautifulSoup object.  
a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this title.  
b. Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results in a list.  
c. Finally, extract the companies, locations, and posting dates for each job. For example, the first job has a company of "Payne, Roberts and Davis", a location of "Stewartbury, AA", and a posting date of "2021-04-08". Ensure that the text that you extract is clean, meaning no extra spaces or other characters at the beginning or end.  
d. Take the lists that you have created and combine them into a pandas DataFrame. 

2. Next, add a column that contains the url for the "Apply" button. Try this in two ways.   
    a. First, use the BeautifulSoup find_all method to extract the urls.  
    b. Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job.
    
3. Finally, we want to get the job description text for each job.  
    a. Start by looking at the page for the first job, https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job description paragraph.  
    b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the description text on that page. For example, if you input "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html" into your function, it should return the string "At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along.".  
    c. Use the [.apply method](https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html) on the url column you created above to retrieve the description text for all of the jobs.

In [124]:
##1. Start by performing a GET request on the url above and convert the response into a BeautifulSoup object. 
##initial get request
##also connect to url in browser (right click...inspect), review page, attributes, html

URL = 'https://realpython.github.io/fake-jobs/'

response = requests.get(URL)

In [125]:
##checking object 
type(response)

requests.models.Response

In [126]:
##check response code

response.status_code

200

##200 status code is the standard response for a successful request / 400 and 500 codes are bad requests

In [127]:
#check response text

soup = BS(response.text)

In [128]:
#print response text / can also use print(name.prettify()) / review returned info
print(type(soup))
print(soup)

<class 'bs4.BeautifulSoup'>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Fake Python</title>
<link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
</head>
<body>
<section class="section">
<div class="container mb-5">
<h1 class="title is-1">
        Fake Python
      </h1>
<p class="subtitle is-3">
        Fake Jobs for Your Web Scraping Journey
      </p>
</div>
<div class="container">
<div class="columns is-multiline" id="ResultsContainer">
<div class="column is-half">
<div class="card">
<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts a

In [129]:
##1a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). 
##Hint: can you find a tag type and/or a class that could be helpful for extracting this information? 
##Extract the text from this title.
soup.find('h2')

<h2 class="title is-5">Senior Python Developer</h2>

In [130]:
soup.find('h2').text

'Senior Python Developer'

In [131]:
##1b. Now, use what you did for the first title, but extract the job title for all jobs on this page. 
##Store the results in a list. 
##scrape h2 attribute
jobs = soup.findAll('h2') 
print(type(jobs))
jobs

<class 'bs4.element.ResultSet'>


[<h2 class="title is-5">Senior Python Developer</h2>,
 <h2 class="title is-5">Energy engineer</h2>,
 <h2 class="title is-5">Legal executive</h2>,
 <h2 class="title is-5">Fitness centre manager</h2>,
 <h2 class="title is-5">Product manager</h2>,
 <h2 class="title is-5">Medical technical officer</h2>,
 <h2 class="title is-5">Physiological scientist</h2>,
 <h2 class="title is-5">Textile designer</h2>,
 <h2 class="title is-5">Television floor manager</h2>,
 <h2 class="title is-5">Waste management officer</h2>,
 <h2 class="title is-5">Software Engineer (Python)</h2>,
 <h2 class="title is-5">Interpreter</h2>,
 <h2 class="title is-5">Architect</h2>,
 <h2 class="title is-5">Meteorologist</h2>,
 <h2 class="title is-5">Audiological scientist</h2>,
 <h2 class="title is-5">English as a second language teacher</h2>,
 <h2 class="title is-5">Surgeon</h2>,
 <h2 class="title is-5">Equities trader</h2>,
 <h2 class="title is-5">Newspaper journalist</h2>,
 <h2 class="title is-5">Materials engineer</h2>,
 

In [132]:
##same as above written slightly different
jobs2 = soup.findAll('h2', attrs={'class' : 'title is-5'})
jobs2

[<h2 class="title is-5">Senior Python Developer</h2>,
 <h2 class="title is-5">Energy engineer</h2>,
 <h2 class="title is-5">Legal executive</h2>,
 <h2 class="title is-5">Fitness centre manager</h2>,
 <h2 class="title is-5">Product manager</h2>,
 <h2 class="title is-5">Medical technical officer</h2>,
 <h2 class="title is-5">Physiological scientist</h2>,
 <h2 class="title is-5">Textile designer</h2>,
 <h2 class="title is-5">Television floor manager</h2>,
 <h2 class="title is-5">Waste management officer</h2>,
 <h2 class="title is-5">Software Engineer (Python)</h2>,
 <h2 class="title is-5">Interpreter</h2>,
 <h2 class="title is-5">Architect</h2>,
 <h2 class="title is-5">Meteorologist</h2>,
 <h2 class="title is-5">Audiological scientist</h2>,
 <h2 class="title is-5">English as a second language teacher</h2>,
 <h2 class="title is-5">Surgeon</h2>,
 <h2 class="title is-5">Equities trader</h2>,
 <h2 class="title is-5">Newspaper journalist</h2>,
 <h2 class="title is-5">Materials engineer</h2>,
 

In [133]:
##checking results / pull single job title
first_job = jobs2[0]
print(type(first_job))
print(first_job)


<class 'bs4.element.Tag'>
<h2 class="title is-5">Senior Python Developer</h2>


In [134]:
##extract single job title
first_job.text

'Senior Python Developer'

In [135]:
##for loop to extract all job titles (in list format) from result set
jobs_extract = []
for jobs in jobs:
    jobs_extract.append(jobs.text)
    

In [136]:
##display results
jobs_extract

['Senior Python Developer',
 'Energy engineer',
 'Legal executive',
 'Fitness centre manager',
 'Product manager',
 'Medical technical officer',
 'Physiological scientist',
 'Textile designer',
 'Television floor manager',
 'Waste management officer',
 'Software Engineer (Python)',
 'Interpreter',
 'Architect',
 'Meteorologist',
 'Audiological scientist',
 'English as a second language teacher',
 'Surgeon',
 'Equities trader',
 'Newspaper journalist',
 'Materials engineer',
 'Python Programmer (Entry-Level)',
 'Product/process development scientist',
 'Scientist, research (maths)',
 'Ecologist',
 'Materials engineer',
 'Historic buildings inspector/conservation officer',
 'Data scientist',
 'Psychiatrist',
 'Structural engineer',
 'Immigration officer',
 'Python Programmer (Entry-Level)',
 'Neurosurgeon',
 'Broadcast engineer',
 'Make',
 'Nurse, adult',
 'Air broker',
 'Editor, film/video',
 'Production assistant, radio',
 'Engineer, communications',
 'Sales executive',
 'Software Deve

In [15]:
##1c Finally, extract the companies, locations, and posting dates for each job. 
##For example, the first job has a company of "Payne, Roberts and Davis", a location of "Stewartbury, AA", and a posting date of "2021-04-08". 

##soup.find_all(['h1', 'a', 'p']) / https://www.edureka.co/community/42701/how-to-get-two-tags-in-findall-using-beautifulsoup


##multiple iterations trying to figure it out commented out
##jobs_detail = soup.find_all(attrs={'class': 'title is-5'}) + soup.find_all(attrs={'class': 'subtitle is-6 company'}) + soup.find_all(attrs={'class':'location'})
##jobs_detail

##jobs_detail1 = soup.find_all('div', attrs={'class': 'title is-5'})
##jobs_detail1

##jobs_detail2 = soup.find_all('h3', attrs={'class': 'subtitle is-6 company'}, 'h2', attrs={'class': 'title is-5'})
##jobs_detail2

##jobs_detail3 = soup.find_all('div', attrs={'class': 'card-content'})
##jobs_detail3

##jobs_detail4 = soup.find_all(['h2', 'h3', 'p', 'time'])
##jobs_detail4 

In [137]:
#company extract
company = soup.findAll('h3', attrs={'class' : 'subtitle is-6 company'})
company_extract = []
for company in company:
    company_extract.append(company.text)
#company_extract

In [138]:
#location_extract
location = soup.findAll('p', attrs={'class' : 'location'})
location_extract = []
for location in location:
    location_extract.append(location.text)
#location_extract

In [139]:
#time_extract
time = soup.findAll('time')
time_extract = []
for time in time:
    time_extract.append(time.text)
#time_extract

In [140]:
##1d Ensure that the text that you extract is clean, meaning no extra spaces or other characters at the beginning or end.
##multiple lists into dataframe/ https://stackoverflow.com/questions/30522724/take-multiple-lists-into-dataframe
job_posting_extract = pd.DataFrame(list(zip(jobs_extract, company_extract, location_extract, time_extract)), columns=['Job Title', 'Company', 'Location', 'Date Posted'])
print(type(job_posting_extract))
job_posting_extract                

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Job Title,Company,Location,Date Posted
0,Senior Python Developer,"Payne, Roberts and Davis","\n Stewartbury, AA\n",2021-04-08
1,Energy engineer,Vasquez-Davidson,"\n Christopherville, AA\n",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","\n Port Ericaburgh, AA\n",2021-04-08
3,Fitness centre manager,Savage-Bradley,"\n East Seanview, AP\n",2021-04-08
4,Product manager,Ramirez Inc,"\n North Jamieview, AP\n",2021-04-08
...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","\n Lake Abigail, AE\n",2021-04-08
96,"Radiographer, diagnostic",Holder LLC,"\n Jacobshire, AP\n",2021-04-08
97,Database administrator,Yates-Ferguson,"\n Port Susan, AE\n",2021-04-08
98,Furniture designer,Ortega-Lawrence,"\n North Tiffany, AA\n",2021-04-08


In [141]:
#clean up columns / https://www.geeksforgeeks.org/pandas-strip-whitespace-from-entire-dataframe/
#remove newline / https://stackoverflow.com/questions/44227748/removing-newlines-from-messy-strings-in-pandas-dataframe-cells
job_posting_extract['Job Title'].str.strip
job_posting_extract['Company'].str.strip
job_posting_extract['Location'].str.strip
job_posting_extract['Date Posted'].str.strip
job_posting_extract = job_posting_extract.replace('\n','', regex=True)
job_posting_extract

Unnamed: 0,Job Title,Company,Location,Date Posted
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08
...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08


#2. Next, add a column that contains the url for the "Apply" button. Try this in two ways.   
#a. First, use the BeautifulSoup find_all method to extract the urls.  
#b. Next, get those same urls in a different way. 
#Examine the urls and see if you can spot the pattern of how they are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job.

In [142]:
#1st attempt to retrieve urls
#apply_url = soup.findAll('footer', attrs={'class' : 'card-footer'})
apply_url = soup.findAll('a')
apply_url

[<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>,
 <a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html" target="_blank">Apply</a>,
 <a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html" target="_blank">Apply</a>,
 <a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html" target="_blank">Apply</a>,
 <a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card

In [143]:
#scrape urls / returns urls for both learn and apply
url_href = [x.get('href') for x in apply_url]
print(type(url_href))
url_href

<class 'list'>


['https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://www.realpython.com',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://www.realpython.com',
 'https:

In [144]:
#separate out url for apply
#slice indices / https://bobbyhadz.com/blog/python-list-get-only-even-indices
apply_url_href = url_href[1::2]
apply_url_href

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [145]:
#add list of apply urls / https://stackoverflow.com/questions/26666919/add-column-in-dataframe-from-list
job_posting_extract['Apply URL'] = apply_url_href
job_posting_extract

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...


In [25]:
#2 #b. Next, get those same urls in a different way. 
#Examine the urls and see if you can spot the pattern of how they are constructed. 
#Then, build the url using the elements you have already extracted. 
#Ensure that the urls that you created match those that you extracted using BeautifulSoup. 
#Warning: You will need to do some string cleaning and prep in constructing the urls this way. 
#For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job


    

In [146]:
##had trouble loading list[] to pandas df...changed code to produce series () and was able to load to pandas df
##code for build_url still requires refinement...to operate as intended in instructions
build_url = ('https://realpython.github.io/fake-jobs/jobs/' + job_posting_extract['Job Title'] + '-' + str(job_posting_extract.index) + '.html')
print(type(build_url))
build_url

<class 'pandas.core.series.Series'>


0     https://realpython.github.io/fake-jobs/jobs/Se...
1     https://realpython.github.io/fake-jobs/jobs/En...
2     https://realpython.github.io/fake-jobs/jobs/Le...
3     https://realpython.github.io/fake-jobs/jobs/Fi...
4     https://realpython.github.io/fake-jobs/jobs/Pr...
                            ...                        
95    https://realpython.github.io/fake-jobs/jobs/Mu...
96    https://realpython.github.io/fake-jobs/jobs/Ra...
97    https://realpython.github.io/fake-jobs/jobs/Da...
98    https://realpython.github.io/fake-jobs/jobs/Fu...
99    https://realpython.github.io/fake-jobs/jobs/Sh...
Name: Job Title, Length: 100, dtype: object

In [147]:
print(type(build_url))

<class 'pandas.core.series.Series'>


In [148]:

job_posting_extract['Build URL'] = build_url
job_posting_extract

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL,Build URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,https://realpython.github.io/fake-jobs/jobs/Se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,https://realpython.github.io/fake-jobs/jobs/En...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,https://realpython.github.io/fake-jobs/jobs/Le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,https://realpython.github.io/fake-jobs/jobs/Fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,https://realpython.github.io/fake-jobs/jobs/Pr...
...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,https://realpython.github.io/fake-jobs/jobs/Mu...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,https://realpython.github.io/fake-jobs/jobs/Ra...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,https://realpython.github.io/fake-jobs/jobs/Da...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,https://realpython.github.io/fake-jobs/jobs/Fu...


In [149]:
job_posting_extract['Build URL']

0     https://realpython.github.io/fake-jobs/jobs/Se...
1     https://realpython.github.io/fake-jobs/jobs/En...
2     https://realpython.github.io/fake-jobs/jobs/Le...
3     https://realpython.github.io/fake-jobs/jobs/Fi...
4     https://realpython.github.io/fake-jobs/jobs/Pr...
                            ...                        
95    https://realpython.github.io/fake-jobs/jobs/Mu...
96    https://realpython.github.io/fake-jobs/jobs/Ra...
97    https://realpython.github.io/fake-jobs/jobs/Da...
98    https://realpython.github.io/fake-jobs/jobs/Fu...
99    https://realpython.github.io/fake-jobs/jobs/Sh...
Name: Build URL, Length: 100, dtype: object

##3. Finally, we want to get the job description text for each job.
a. Start by looking at the page for the first job, https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job description paragraph.
b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the description text on that page. For example, if you input "https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html" into your function, it should return the string "At be than always different American address. Former claim chance prevent why measure too. Almost before some military outside baby interview. Face top individual win suddenly. Parent do ten after those scientist. Medical effort assume teacher wall. Significant his himself clearly very. Expert stop area along individual. Three own bank recognize special good along.".
c. Use the .apply method on the url column you created above to retrieve the description text for all of the jobs.

In [150]:
##3 a. Start by looking at the page for the first job, 
##https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. 
##Using BeautifulSoup, extract the job description paragraph.

URL = 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

response1 = requests.get(URL)

In [151]:
type(response1)

requests.models.Response

In [152]:
response1.status_code

200

In [153]:
soup1 = BS(response1.text)
print(soup1)

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Fake Python</title>
<link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
</head>
<body>
<section class="section">
<div class="container mb-5">
<h1 class="title is-1">
        Fake Python
      </h1>
<p class="subtitle is-3">
        Fake Jobs for Your Web Scraping Journey
      </p>
</div>
<div class="container">
<div class="columns is-multiline" id="ResultsContainer">
<div class="box">
<h1 class="title is-2">Senior Python Developer</h1>
<h2 class="subtitle is-4 company">Payne, Roberts and Davis</h2>
<div class="content">
<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities incl

In [154]:
paragraph = soup1.find('div', attrs={'class' : 'content'})
paragraph_extract = []
for paragraph in paragraph:
    paragraph_extract.append(paragraph.text)
paragraph_extract

['\n',
 'Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 '\n',
 'Location: Stewartbury, AA',
 '\n',
 'Posted: 2021-04-08',
 '\n']

In [155]:
##slice indices to remove unwanted info / https://bobbyhadz.com/blog/python-list-get-only-even-indices
paragraph_extract1 = paragraph_extract[1::2]
paragraph_extract1 = paragraph_extract1[::2]
paragraph_extract1 = paragraph_extract1[::2]
##paragraph_extract = paragraph_extract.replace('\n','', regex=True)
paragraph_extract1

['Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.']

##3b. We want to be able to do this for all pages. Write a function which takes as input a url and returns the description text on that page.

In [156]:
#did not use this list
url_list = [job_posting_extract['Apply URL']]
url_list


[0     https://realpython.github.io/fake-jobs/jobs/se...
 1     https://realpython.github.io/fake-jobs/jobs/en...
 2     https://realpython.github.io/fake-jobs/jobs/le...
 3     https://realpython.github.io/fake-jobs/jobs/fi...
 4     https://realpython.github.io/fake-jobs/jobs/pr...
                             ...                        
 95    https://realpython.github.io/fake-jobs/jobs/mu...
 96    https://realpython.github.io/fake-jobs/jobs/ra...
 97    https://realpython.github.io/fake-jobs/jobs/da...
 98    https://realpython.github.io/fake-jobs/jobs/fu...
 99    https://realpython.github.io/fake-jobs/jobs/sh...
 Name: Apply URL, Length: 100, dtype: object]

In [157]:
#url_list1 applied in for loop below
url_list1 = job_posting_extract['Apply URL']
url_list1

0     https://realpython.github.io/fake-jobs/jobs/se...
1     https://realpython.github.io/fake-jobs/jobs/en...
2     https://realpython.github.io/fake-jobs/jobs/le...
3     https://realpython.github.io/fake-jobs/jobs/fi...
4     https://realpython.github.io/fake-jobs/jobs/pr...
                            ...                        
95    https://realpython.github.io/fake-jobs/jobs/mu...
96    https://realpython.github.io/fake-jobs/jobs/ra...
97    https://realpython.github.io/fake-jobs/jobs/da...
98    https://realpython.github.io/fake-jobs/jobs/fu...
99    https://realpython.github.io/fake-jobs/jobs/sh...
Name: Apply URL, Length: 100, dtype: object

In [158]:
##incorrect for loop....returns "list" cannot extract details w/ beautiful soup
mass_paragraph_extract = []

for link in url_list1:
    response2 = requests.get(link)
    soup2 = BS(response2.text)
    mass_paragraph_extract.append(soup2)

In [159]:
##incorrect for loop above....returns "list" cannot extract details w/ beautiful soup
print(type(mass_paragraph_extract))
print(mass_paragraph_extract)

<class 'list'>
[<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Fake Python</title>
<link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
</head>
<body>
<section class="section">
<div class="container mb-5">
<h1 class="title is-1">
        Fake Python
      </h1>
<p class="subtitle is-3">
        Fake Jobs for Your Web Scraping Journey
      </p>
</div>
<div class="container">
<div class="columns is-multiline" id="ResultsContainer">
<div class="box">
<h1 class="title is-2">Senior Python Developer</h1>
<h2 class="subtitle is-4 company">Payne, Roberts and Davis</h2>
<div class="content">
<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing 

In [160]:
##for loop to scrape pages in url_list1 put in bs4.BeautifulSoup format
new_soup = BS()

for link in url_list1:
    response3 = requests.get(link)
    soup3 = BS(response3.text)
    new_soup.append(soup3)

In [161]:
#verify class type bs4 and returned info
print(type(new_soup))
print(new_soup)

<class 'bs4.BeautifulSoup'>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Fake Python</title>
<link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
</head>
<body>
<section class="section">
<div class="container mb-5">
<h1 class="title is-1">
        Fake Python
      </h1>
<p class="subtitle is-3">
        Fake Jobs for Your Web Scraping Journey
      </p>
</div>
<div class="container">
<div class="columns is-multiline" id="ResultsContainer">
<div class="box">
<h1 class="title is-2">Senior Python Developer</h1>
<h2 class="subtitle is-4 company">Payne, Roberts and Davis</h2>
<div class="content">
<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application f

In [162]:
##extract content from tag / https://stackoverflow.com/questions/5999407/extract-content-within-a-tag-with-beautifulsoup
new_soup_test = new_soup.findAll('p')
new_soup_test_extract = []
for paragraph in new_soup_test:
    new_soup_test_extract.append(paragraph.text)
new_soup_test_extract

['\n        Fake Jobs for Your Web Scraping Journey\n      ',
 'Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 'Location: Stewartbury, AA',
 'Posted: 2021-04-08',
 '\n        Fake Jobs for Your Web Scraping Journey\n      ',
 'Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.',
 'Location: Christopherville, AA',
 'Posted: 2021-04-08',
 '\n        Fake Jobs for Your Web Scraping 

In [163]:
##slice indices to remove unwanted info / https://bobbyhadz.com/blog/python-list-get-only-even-indices
ns_test_pull = new_soup_test_extract[1::2]
ns_test_pull = ns_test_pull[::2]
    
ns_test_pull 

['Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 'Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.',
 'Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record hospital employee

In [164]:
#add job descriptions to dataframe
job_posting_extract['Job Description'] = ns_test_pull
#del job_posting_extract['Apply URL']
job_posting_extract

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL,Build URL,Job Description
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,https://realpython.github.io/fake-jobs/jobs/Se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,https://realpython.github.io/fake-jobs/jobs/En...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,https://realpython.github.io/fake-jobs/jobs/Le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,https://realpython.github.io/fake-jobs/jobs/Fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,https://realpython.github.io/fake-jobs/jobs/Pr...,Traditional page a although for study anyone. ...
...,...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,https://realpython.github.io/fake-jobs/jobs/Mu...,Paper age physical current note. There reality...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,https://realpython.github.io/fake-jobs/jobs/Ra...,Able such right culture. Wrong pick structure ...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,https://realpython.github.io/fake-jobs/jobs/Da...,Create day party decade high clear. Past trade...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,https://realpython.github.io/fake-jobs/jobs/Fu...,Pressure under rock next week. Recognize so re...


#3c. Use the .apply method on the url column you created above to retrieve the description text for all of the jobs.


In [165]:
job_posting_apply = job_posting_extract
job_posting_apply

Unnamed: 0,Job Title,Company,Location,Date Posted,Apply URL,Build URL,Job Description
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,https://realpython.github.io/fake-jobs/jobs/Se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,https://realpython.github.io/fake-jobs/jobs/En...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,https://realpython.github.io/fake-jobs/jobs/Le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,https://realpython.github.io/fake-jobs/jobs/Fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,https://realpython.github.io/fake-jobs/jobs/Pr...,Traditional page a although for study anyone. ...
...,...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,https://realpython.github.io/fake-jobs/jobs/Mu...,Paper age physical current note. There reality...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,https://realpython.github.io/fake-jobs/jobs/Ra...,Able such right culture. Wrong pick structure ...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,https://realpython.github.io/fake-jobs/jobs/Da...,Create day party decade high clear. Past trade...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,https://realpython.github.io/fake-jobs/jobs/Fu...,Pressure under rock next week. Recognize so re...


apply example
def custom_rating(genre,rating):
    if 'Thriller' in genre:
        return min(10,rating+1)
    elif 'Comedy' in genre:
        return max(0,rating-1)
    else:
        return rating
        
df['CustomRating'] = df.apply(lambda x: custom_rating(x['Genre'],x['Rating']),axis=1)

##incorrect application of apply() function....rethink / rework
##rename variables in next iteration.....I am making unintended changes to cells above.

##job_posting_apply['Apply URL'] = ""

def apply_url(applyurl):
    new_soup3c = BS()
    for link in job_posting_apply['']:
        response4 = requests.get(link)
        soup4 = BS(response4.text)
        new_soup3c.append(soup4)
    
    new_soup_test3c = new_soup3c.findAll('p')
    
    new_soup_test_extract3c = []
    
    for paragraph in new_soup_test3c:
        new_soup_test_extract3c.append(paragraph.text)
     
    ns_test_pull3c = new_soup_test_extract3c[1::2]
    ns_test_pull3c = ns_test_pull3c[::2]  
  
    return ns_test_pull3c

job_posting_apply['Apply URL'] = job_posting_apply.apply(job_apply['Apply URL'], axis=1)

##job_posting_apply['Apply URL'] = job_posting_apply.apply(job_apply, axis=1)

##x.get('src') for x in images]

##job_posting_apply['Apply URL'] = job_posting_apply.apply(lambda x: job_apply(x['Apply URL']), axis=1)

##job_posting_apply = job_posting_apply.apply(lambda x: job_apply(x['Apply URL']), axis=1)

