# **Web Scraping Solo Project**

## 1. Start by performing a GET request on the url above and convert the response into a BeautifulSoup object.

In [437]:
import requests
from bs4 import BeautifulSoup as BS

In [11]:
URL = 'https://realpython.github.io/fake-jobs/'

# # Not always needed
# headers = {
#     "User-Agent": "MyPythonScript/1.0 (contact@example.com)"
# }

response = requests.get(URL) # (URL, headers = headers) if headers needed

In [12]:
type(response)

requests.models.Response

In [13]:
response.status_code

200

In [14]:
requests.get('https://realpython.github.io/fake-jobs/') # (, headers = headers) if needed

<Response [200]>

In [15]:
soup = BS(response.text)

In [78]:
# print(soup.prettify())

#### Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this title.

In [31]:
# Inspect element to find 'Senior Python Developer' is in tag type 'h2' class 'title is-5'. Using 'title' alone will
# return the desired result. *** Take note: class_ *** class does not work

soup.find('h2', class_ = ['title', 'is-5']).text

'Senior Python Developer'

#### Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results in a list.

In [324]:
# jobtitles = soup.findAll('h2', class_ = ['title', 'is-5'])
# print(type(jobtitles))
# jobtitles

In [323]:
### This returns a list of cleaned title strings but you lose access to HTML Tag objects

# titles = [job.text.strip() for job in soup.find_all("h2", class_="title is-5")]
# type(titles)
# titles

In [374]:
soup.find('h2')
title_list = soup.findAll('h2')
title_list

title_text = []
for title_element in title_list:
    title_text.append(title_element.text)

# title_text

#### Finally, extract the companies, locations, and posting dates for each job. For example, the first job has a company of "Payne, Roberts and Davis", a location of "Stewartbury, AA", and a posting date of "2021-04-08". Ensure that the text that you extract is clean, meaning no extra spaces or other characters at the beginning or end.

In [None]:
# h3 class_ = ['subtitle'] ; p class_ = ['location'] ; p class_ = ['is-small']

In [377]:
title_text_list = [title.text for title in title_list]
# title_text_list

In [378]:
companies_list = soup.findAll('h3')
companies_text_list = [company.text for company in companies_list]
# companies_text_list

In [379]:
location_list = soup.findAll('p', class_= 'location')
location_text_list = [location_element.text.strip() for location_element in location_list]
# location_text_list

In [380]:
date_list = soup.findAll('time')
date_text_list = [date.text for date in date_list]
# date_text_list

In [381]:
dict = {'Title': title_text_list, 'Company': companies_text_list, 'Location': location_text_list, 'Date': date_text_list}

jobs = pd.DataFrame(dict)

# jobs

In [382]:
import pandas as pd

jobcards = soup.findAll('div', class_='card-content')

data = []

for card in jobcards:
    title = card.find('h2', class_ = ['title']).text
    company = card.find('h3', class_ = ['subtitle']).text
    location = card.find('p', class_ = ['location']).text
    posting_date = card.find('p', class_ = ['is-small']).text

    jobinfo = {
        'Title': title.strip(), 
        'Company': company.strip(),
        'Location': location.strip(),
        'Date Posted': posting_date.strip()
    }

    data.append(jobinfo)

jobpostinfo = pd.DataFrame(data)
# jobpostinfo.head()

## 2. Next, add a column that contains the url for the "Apply" button. Try this in two ways.

####  First, use the BeautifulSoup find_all method to extract the urls.

In [171]:
# Note: w/o string = 'Apply" (see target = ...) output will include URLs for 'Learn' button.

# apply = soup.findAll('a')
# print(type(apply))
# apply

In [308]:
# apply = soup.find_all('a')
# apply

In [314]:
jobcards = soup.findAll('div', class_='card-content')

data = []

for card in jobcards:
    title = card.find('h2', class_=['title', 'is-5'])
    company = card.find('h3', class_='subtitle')
    location = card.find('p', class_='location')
    date = card.find('p', class_='is-small')
    
    apply = card.findAll('a')[1] 
    applyurl = apply.get('href', 'N/A')

    jobpost = {
        'Title': title.text.strip(),
        'Company': company.text.strip(),
        'Location': location.text.strip(),
        'Date Posted': date.text.strip(),
        'Apply URL': applyurl.strip()
    }

    data.append(jobpost)

# Check Check

# print(title)
# print(company)
# print(location)
# print(date)
# print(apply)
# print(applyurl)

In [334]:
apply

<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/ship-broker-99.html" target="_blank">Applyhttps://realpython.github.io/fake-jobs/jobs/ship-broker-99.html</a>

In [315]:
jobpost = pd.DataFrame(data)
jobpost.head()

Unnamed: 0,Title,Company,Location,Date Posted,Apply URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...


#### Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job.

In [386]:
start = 'https://realpython.github.io/fake-jobs/jobs/'
end = '.html'

test = start + jobs.Title[0].replace(' ', '-').lower() + '-0' + end
print(test)

i = 0
clean_link = []
for link in title_text_list:
    link = link.replace(' ', '-')
    link = link.replace('(', '')
    link = link.replace(')', '')
    link = link.replace(',', '')
    link = link.replace('/', '-')
    link = link.lower()
    clean_link.append(start + link.lower() + '-'+ str(i) + end)
    i+=1
clean_link

https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html


['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

## 3. Finally, we want to get the job description text for each job.

#### Start by looking at the page for the first job, https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html. Using BeautifulSoup, extract the job description paragraph.

In [237]:
URL2 = 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'
response2 = requests.get(URL2)

In [238]:
response2.status_code

200

In [235]:
requests.get('https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html')

<Response [200]>

In [262]:
soup2 = BS(response.text)

In [263]:
descards = soup2.find('div', class_ = 'content')
descards = descards.findAll('p') if descards else []
descards = [p for p in descards if not p.has_attr('id')]
descards = descards[0].text.strip()
descards

'Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.'

#### We want to be able to do this for all pages. Write a function which takes as input a url and returns the description text on that page.

In [313]:
jobcards = soup.findAll('div', class_='card-content')

data = []

for card in jobcards:
    title = card.find('h2', class_=['title', 'is-5'])
    company = card.find('h3', class_='subtitle')
    location = card.find('p', class_='location')
    date = card.find('p', class_='is-small')

# Adding Apply
    apply = card.findAll('a')[1] 
    applyurl = apply.get('href', 'N/A')

# Without this, only the description for the first post is returned
    response = requests.get(applyurl)
    soup2 = BS(response.content)

#Adding Job Description

    jobdescription = soup2.find('div', class_ = 'content')
    jobdescription = jobdescription.findAll('p') if jobdescription else []
    jobdescription = [p for p in jobdescription if not p.has_attr('id')]
    jobdescription = jobdescription[0].text.strip()
    

    jobpostinfo = {
        'Title': title.text.strip(),
        'Company': company.text.strip(),
        'Location': location.text.strip(),
        'Date Posted': date.text.strip(),
        'Job Description': jobdescription,
        'Apply URL': applyurl.strip()
    }

    data.append(jobpostinfo)

jobpostinfo = pd.DataFrame(data)
jobpostinfo


### Wondering if this code can be condensed????

Unnamed: 0,Title,Company,Location,Date Posted,Job Description,Apply URL
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,Professional asset web application environment...,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,Party prevent live. Quickly candidate change a...,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,Administration even relate head color. Staff b...,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,Tv program actually race tonight themselves tr...,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,Traditional page a although for study anyone. ...,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,Paper age physical current note. There reality...,https://realpython.github.io/fake-jobs/jobs/mu...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,Able such right culture. Wrong pick structure ...,https://realpython.github.io/fake-jobs/jobs/ra...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,Create day party decade high clear. Past trade...,https://realpython.github.io/fake-jobs/jobs/da...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,Pressure under rock next week. Recognize so re...,https://realpython.github.io/fake-jobs/jobs/fu...


In [306]:
# jobpostinfo.to_csv('results/jobpostinfo.csv', index=False)

In [307]:
# jobpostinfo['Apply URL'] = jobpostinfo['Apply URL'].apply(
#     lambda url: f'<a href="{url}" target="_blank">Apply</a>' if url != 'N/A' else 'N/A'
# )

# jobpostinfo['Apply URL'] = jobpostinfo['Apply URL'].apply(
#     lambda url: url if url.startswith('<a href=') or url == 'N/A' else f'<a href="{url}" target="_blank">Apply</a>'
# )

# jobpostinfo.to_html('results/jobpostinfo.html', escape=False, index=False)


#### Putting the pipeline together into one cell
> Would like to figure out how to remove any redundancy and make this more streamlined

In [369]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd

URL = 'https://realpython.github.io/fake-jobs/'

# # Not always needed
# headers = {
#     "User-Agent": "MyPythonScript/1.0 (contact@example.com)"
# }

response = requests.get(URL) # (URL, headers = headers) if headers needed
soup = BS(response.text)

URL2 = 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'
response2 = requests.get(URL2)
soup2 = BS(response.text)

jobcards = soup.findAll('div', class_='card-content')

data = []

for card in jobcards:
    title = card.find('h2', class_=['title', 'is-5'])
    company = card.find('h3', class_='subtitle')
    location = card.find('p', class_='location')
    date = card.find('p', class_='is-small')

# Adding Apply
    apply = card.findAll('a')[1] 
    applyurl = apply.get('href', 'N/A')

# Without this, only the description for the first post is returned
    response = requests.get(applyurl)
    soup2 = BS(response.content)

#Adding Job Description

    jobdescription = soup2.find('div', class_ = 'content')
    jobdescription = jobdescription.findAll('p') if jobdescription else []
    jobdescription = [p for p in jobdescription if not p.has_attr('id')]
    jobdescription = jobdescription[0].text.strip()
    

    jobpostinfo = {
        'Title': title.text.strip(),
        'Company': company.text.strip(),
        'Location': location.text.strip(),
        'Date Posted': date.text.strip(),
        'Job Description': jobdescription,
        'Apply URL': applyurl.strip()
    }

    data.append(jobpostinfo)

   
jobpostinfo = pd.DataFrame(data)

# Normalize Title, Company, Location columns
# .apply(lambda col: col.str.upper()) allows upper to be applied to all three columns at once
jobpostinfo[['Title', 'Company', 'Location']] = jobpostinfo[['Title', 'Company', 'Location']].apply(lambda col: col.str.upper())
jobpostinfo

Unnamed: 0,Title,Company,Location,Date Posted,Job Description,Apply URL
0,SENIOR PYTHON DEVELOPER,"PAYNE, ROBERTS AND DAVIS","STEWARTBURY, AA",2021-04-08,Professional asset web application environment...,https://realpython.github.io/fake-jobs/jobs/se...
1,ENERGY ENGINEER,VASQUEZ-DAVIDSON,"CHRISTOPHERVILLE, AA",2021-04-08,Party prevent live. Quickly candidate change a...,https://realpython.github.io/fake-jobs/jobs/en...
2,LEGAL EXECUTIVE,"JACKSON, CHAMBERS AND LEVY","PORT ERICABURGH, AA",2021-04-08,Administration even relate head color. Staff b...,https://realpython.github.io/fake-jobs/jobs/le...
3,FITNESS CENTRE MANAGER,SAVAGE-BRADLEY,"EAST SEANVIEW, AP",2021-04-08,Tv program actually race tonight themselves tr...,https://realpython.github.io/fake-jobs/jobs/fi...
4,PRODUCT MANAGER,RAMIREZ INC,"NORTH JAMIEVIEW, AP",2021-04-08,Traditional page a although for study anyone. ...,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...,...
95,MUSEUM/GALLERY EXHIBITIONS OFFICER,"NGUYEN, YODER AND PETTY","LAKE ABIGAIL, AE",2021-04-08,Paper age physical current note. There reality...,https://realpython.github.io/fake-jobs/jobs/mu...
96,"RADIOGRAPHER, DIAGNOSTIC",HOLDER LLC,"JACOBSHIRE, AP",2021-04-08,Able such right culture. Wrong pick structure ...,https://realpython.github.io/fake-jobs/jobs/ra...
97,DATABASE ADMINISTRATOR,YATES-FERGUSON,"PORT SUSAN, AE",2021-04-08,Create day party decade high clear. Past trade...,https://realpython.github.io/fake-jobs/jobs/da...
98,FURNITURE DESIGNER,ORTEGA-LAWRENCE,"NORTH TIFFANY, AA",2021-04-08,Pressure under rock next week. Recognize so re...,https://realpython.github.io/fake-jobs/jobs/fu...


#### If a save is performed and changes are made to the dataset, the main code has to be run through again or nesting occurs on the Apply URL. Recommended code for fixing this bug does not work. 

In [370]:
# Hyperlink Apply URL, save as HTML, save as CSV
jobpostinfo['Apply URL'] = jobpostinfo['Apply URL'].apply(
    lambda url: f'<a href="{url}" target="_blank">Apply</a>' if url != 'N/A' else 'N/A'
)

# This was a recommended fix for a nesting bug that breaks the URL with subsequent saves; however, testing has proven this is not a solution
# Best practice is to run the code above again before saving. Any additional changes made to the code or dataframe should be performed above this cell to prevent nesting.
jobpostinfo['Apply URL'] = jobpostinfo['Apply URL'].apply(
    lambda url: url if url.startswith('<a href=') or url == 'N/A' else f'<a href="{url}" target="_blank">Apply</a>'
)

jobpostinfo.to_html('results/jobpostinfo.html', escape=False, index=False)

jobpostinfo.to_csv('results/jobpostinfo.csv', index=False)

#### Refactoring to use functions instead of a loop using information from https://jkropko.github.io/surfing-the-data-pipeline/ch5.html
#### Functions are more optimized for maintainability and scalability. 

In [444]:
baseurl = 'https://realpython.github.io/fake-jobs/'

In [393]:
response = requests.get(baseurl)
soup = BS(response.text)

In [394]:
response.status_code

200

In [455]:
# print(response.text).prettify

In [431]:
jobtitle = soup.h2.text.strip()
print(jobtitle)

company = soup.h3.text.strip()
print(company)

location = soup.find('p', class_ = 'location').text.strip()
print(location)

apply = soup.findAll('a')[1].get('href').strip()
print(apply)

Senior Python Developer
Payne, Roberts and Davis
Stewartbury, AA
https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html


In [461]:
# Fetch HTML content, parse via BeautifulSoup
def getsoup(url):
    response = requests.get(url)
    return BS(response.text, 'html.parser')

In [462]:
# Extract job posting cards from parsed HTML
def jobcard(soup):
    return soup.findAll('div', class_ = 'card-content')

In [None]:
# Fetch and extract job description from the job's detail page
def getjobdescription(url):
    soup = getsoup(url)
    content = soup.find('div', class_ = 'content')
    if not content:
        return ''
    paragraphs = content.findAll('p')
    paragraphs = [p for p in paragraphs if not p.has_attr('id')]
    return paragraphs[0].text.strip() if paragraphs else ''

In [471]:
# Extract detailed info from job card
def jobdetails(card):
    title = card.h2.text.strip()
    company = card.h3.text.strip()
    location = card.find('p', class_ = 'location').text.strip()
    dateposted = card.find('p', class_ = 'is-small').text.strip()
   
    applylink = card.findAll('a')[1].get('href', 'N/A').strip()
    jobdescription = getjobdescription(applylink) # fetch job description from the job detail page via applylink

    return { # return dictionary with extracted job info
        'Title': title,
        'Company': company, 
        'Location': location,
        'Date Posted': dateposted,
        'Job Description': jobdescription,
        'Apply URL': applylink
    }

In [465]:
# Normalize selected columns to uppercase. Can be done later if preferred.
def normalizecolumns(df, columns):
    df[columns] = df[columns].apply(lambda col: col.str.upper())
    return df

In [468]:
# Combines entire scraping process into a df
def main():
    baseurl = 'https://realpython.github.io/fake-jobs/'
    soup = getsoup(baseurl)
    jobcards = jobcard(soup)
    details = [jobdetails(card) for card in jobcards]

    df = pd.DataFrame(details)
    df = normalizecolumns(df, ['Title', 'Company', 'Location'])
    return df

In [469]:
jobpostinfo = main()
jobpostinfo

Unnamed: 0,Title,Company,Location,Date Posted,Job Description,Apply URL
0,SENIOR PYTHON DEVELOPER,"PAYNE, ROBERTS AND DAVIS","STEWARTBURY, AA",2021-04-08,Professional asset web application environment...,https://realpython.github.io/fake-jobs/jobs/se...
1,ENERGY ENGINEER,VASQUEZ-DAVIDSON,"CHRISTOPHERVILLE, AA",2021-04-08,Party prevent live. Quickly candidate change a...,https://realpython.github.io/fake-jobs/jobs/en...
2,LEGAL EXECUTIVE,"JACKSON, CHAMBERS AND LEVY","PORT ERICABURGH, AA",2021-04-08,Administration even relate head color. Staff b...,https://realpython.github.io/fake-jobs/jobs/le...
3,FITNESS CENTRE MANAGER,SAVAGE-BRADLEY,"EAST SEANVIEW, AP",2021-04-08,Tv program actually race tonight themselves tr...,https://realpython.github.io/fake-jobs/jobs/fi...
4,PRODUCT MANAGER,RAMIREZ INC,"NORTH JAMIEVIEW, AP",2021-04-08,Traditional page a although for study anyone. ...,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...,...
95,MUSEUM/GALLERY EXHIBITIONS OFFICER,"NGUYEN, YODER AND PETTY","LAKE ABIGAIL, AE",2021-04-08,Paper age physical current note. There reality...,https://realpython.github.io/fake-jobs/jobs/mu...
96,"RADIOGRAPHER, DIAGNOSTIC",HOLDER LLC,"JACOBSHIRE, AP",2021-04-08,Able such right culture. Wrong pick structure ...,https://realpython.github.io/fake-jobs/jobs/ra...
97,DATABASE ADMINISTRATOR,YATES-FERGUSON,"PORT SUSAN, AE",2021-04-08,Create day party decade high clear. Past trade...,https://realpython.github.io/fake-jobs/jobs/da...
98,FURNITURE DESIGNER,ORTEGA-LAWRENCE,"NORTH TIFFANY, AA",2021-04-08,Pressure under rock next week. Recognize so re...,https://realpython.github.io/fake-jobs/jobs/fu...


#### Put it all together

In [472]:
# Fetch HTML content, parse via BeautifulSoup
def getsoup(url):
    response = requests.get(url)
    return BS(response.text, 'html.parser')

def jobcard(soup):
    return soup.findAll('div', class_ = 'card-content')

def getjobdescription(url):
    soup = getsoup(url)
    content = soup.find('div', class_ = 'content')
    if not content:
        return ''
    paragraphs = content.findAll('p')
    paragraphs = [p for p in paragraphs if not p.has_attr('id')]
    return paragraphs[0].text.strip() if paragraphs else ''

def jobdetails(card):
    title = card.h2.text.strip()
    company = card.h3.text.strip()
    location = card.find('p', class_ = 'location').text.strip()
    dateposted = card.find('p', class_ = 'is-small').text.strip()
   
    applylink = card.findAll('a')[1].get('href', 'N/A').strip()
    jobdescription = getjobdescription(applylink)

    return {
        'Title': title,
        'Company': company, 
        'Location': location,
        'Date Posted': dateposted,
        'Job Description': jobdescription,
        'Apply URL': applylink
    }

def normalizecolumns(df, columns):
    df[columns] = df[columns].apply(lambda col: col.str.upper())
    return df

def main():
    baseurl = 'https://realpython.github.io/fake-jobs/'
    soup = getsoup(baseurl)
    jobcards = jobcard(soup)
    details = [jobdetails(card) for card in jobcards]
    
    df = pd.DataFrame(details)
    df = normalizecolumns(df, ['Title', 'Company', 'Location'])
    return df

jobpostinfo = main()
jobpostinfo

Unnamed: 0,Title,Company,Location,Date Posted,Job Description,Apply URL
0,SENIOR PYTHON DEVELOPER,"PAYNE, ROBERTS AND DAVIS","STEWARTBURY, AA",2021-04-08,Professional asset web application environment...,https://realpython.github.io/fake-jobs/jobs/se...
1,ENERGY ENGINEER,VASQUEZ-DAVIDSON,"CHRISTOPHERVILLE, AA",2021-04-08,Party prevent live. Quickly candidate change a...,https://realpython.github.io/fake-jobs/jobs/en...
2,LEGAL EXECUTIVE,"JACKSON, CHAMBERS AND LEVY","PORT ERICABURGH, AA",2021-04-08,Administration even relate head color. Staff b...,https://realpython.github.io/fake-jobs/jobs/le...
3,FITNESS CENTRE MANAGER,SAVAGE-BRADLEY,"EAST SEANVIEW, AP",2021-04-08,Tv program actually race tonight themselves tr...,https://realpython.github.io/fake-jobs/jobs/fi...
4,PRODUCT MANAGER,RAMIREZ INC,"NORTH JAMIEVIEW, AP",2021-04-08,Traditional page a although for study anyone. ...,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...,...
95,MUSEUM/GALLERY EXHIBITIONS OFFICER,"NGUYEN, YODER AND PETTY","LAKE ABIGAIL, AE",2021-04-08,Paper age physical current note. There reality...,https://realpython.github.io/fake-jobs/jobs/mu...
96,"RADIOGRAPHER, DIAGNOSTIC",HOLDER LLC,"JACOBSHIRE, AP",2021-04-08,Able such right culture. Wrong pick structure ...,https://realpython.github.io/fake-jobs/jobs/ra...
97,DATABASE ADMINISTRATOR,YATES-FERGUSON,"PORT SUSAN, AE",2021-04-08,Create day party decade high clear. Past trade...,https://realpython.github.io/fake-jobs/jobs/da...
98,FURNITURE DESIGNER,ORTEGA-LAWRENCE,"NORTH TIFFANY, AA",2021-04-08,Pressure under rock next week. Recognize so re...,https://realpython.github.io/fake-jobs/jobs/fu...


## Review Workflow

In [473]:
import pandas as pd
from bs4 import BeautifulSoup as BS
import requests

url = "http://realpython.github.io/fake-jobs/"
response = requests.get(url)
soup = BS(response.text)

In [475]:
first_title = soup.find('h2', attrs = {'class': 'title'})

In [479]:
titles_soup_object = soup.findAll('h2')

In [481]:
title_list = [each_title.text for each_title in titles_soup_object]

In [483]:
companies_soup_object = soup.findAll('h3')

In [484]:
companies_list = [each_company.text for each_company in companies_soup_object]

In [486]:
location_soup_object = soup.findAll('p', attrs = {'class': 'location'})

In [498]:
location_list = [each_location.text.strip() for each_location in location_soup_object]

In [499]:
time_soup_object = soup.findAll('time')

In [505]:
date_list = [each_date.text for each_date in time_soup_object]

In [556]:
pd.DataFrame({'title': title_list, 'company': companies_list, 'location': location_list, 'date': date_list})

Unnamed: 0,title,company,location,date
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08
...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08


In [557]:
a_tags_soup_object = soup.findAll('a')

In [558]:
dirty_links = [each_a['href'] for each_a in a_tags_soup_object]

In [559]:
counter = 0

for link in dirty_links:
    counter+=1
    if counter%2 == 0:
        print(link)

https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html
https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html
https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html
https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html
https://realpython.github.io/fake-jobs/jobs/product-manager-4.html
https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html
https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html
https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html
https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html
https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html
https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html
https://realpython.github.io/fake-jobs/jobs/interpreter-11.html
https://realpython.github.io/fake-jobs/jobs/architect-12.html
https://realpython.github.io/fake-jobs/jobs/meteorologist-13.html
https://r

In [516]:
#Another Way
link_list = []

for link in dirty_links:
    counter+=1
    if counter%2 == 0:
        link_list.append(link)

link_list

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [523]:
# Another Way
[link for int, link in enumerate(dirty_links) if int%2 != 0]

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [560]:
link_start = 'https://realpython.github.io/fake-jobs/jobs/'
link_end = '.html'

In [561]:
i = 0 # counter

clean_link_list = []
for each_title in title_list:
    each_title = each_title.replace(' ', '-')
    each_title = each_title.replace('(', '')
    each_title = each_title.replace(')', '')
    each_title = each_title.replace("'", '')
    each_title = each_title.replace('/', '-')
    each_title = each_title.replace(',', '')
    clean_link = link_start + each_title.lower() + '-' + str(i) + link_end
    clean_link_list.append(clean_link)
    i+=1 # remember i is the counter

clean_link_list

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [554]:
clean_link_list2 = []

for i, each_title in enumerate(title_list):
    each_title = each_title.replace(' ', '-')
    each_title = each_title.replace('(', '')
    each_title = each_title.replace(')', '')
    each_title = each_title.replace("'", '')
    each_title = each_title.replace('/', '-')
    each_title = each_title.replace(',', '')
    clean_link = link_start + each_title.lower() + '-' + str(i) + link_end
    clean_link_list.append(clean_link)


clean_link_list2

[]

In [562]:
jobs['Apply'] = clean_link_list
jobs

Unnamed: 0,Title,Company,Location,Date,apply link,Apply
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,https://realpython.github.io/fake-jobs/jobs/pr...
...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,https://realpython.github.io/fake-jobs/jobs/mu...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,https://realpython.github.io/fake-jobs/jobs/ra...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,https://realpython.github.io/fake-jobs/jobs/da...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,https://realpython.github.io/fake-jobs/jobs/fu...


In [547]:
singleURL = 'http://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

In [548]:
singleResponse = requests.get(singleURL)

In [549]:
singleResponse

<Response [200]>

In [550]:
singleSoup = BS(singleResponse.text)

In [551]:
singleSoup.findAll('p')[1]

<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.</p>

In [552]:
jobs_descriptions = []

for each_url in clean_link_list:
    each_url = requests.get(each_url)
    soup_url = BS(each_url.text)
    text = soup_url.findAll('p')[1].text
    jobs_descriptions.append(text)

jobs_descriptions

['Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 'Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.',
 'Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record hospital employee

In [555]:
def get_desc(singleURL):
    request = requests.get(singleURL)
    soup = BS(request.text)
    return soup.findAll('p')[1].text

In [563]:
description_list = jobs.Apply.apply(get_desc)

In [564]:
jobs['description'] = description_list

In [565]:
jobs

Unnamed: 0,Title,Company,Location,Date,apply link,Apply,description
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,https://realpython.github.io/fake-jobs/jobs/se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,https://realpython.github.io/fake-jobs/jobs/en...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,https://realpython.github.io/fake-jobs/jobs/le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,https://realpython.github.io/fake-jobs/jobs/fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,https://realpython.github.io/fake-jobs/jobs/pr...,Traditional page a although for study anyone. ...
...,...,...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty","Lake Abigail, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,https://realpython.github.io/fake-jobs/jobs/mu...,Paper age physical current note. There reality...
96,"Radiographer, diagnostic",Holder LLC,"Jacobshire, AP",2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,https://realpython.github.io/fake-jobs/jobs/ra...,Able such right culture. Wrong pick structure ...
97,Database administrator,Yates-Ferguson,"Port Susan, AE",2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,https://realpython.github.io/fake-jobs/jobs/da...,Create day party decade high clear. Past trade...
98,Furniture designer,Ortega-Lawrence,"North Tiffany, AA",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,https://realpython.github.io/fake-jobs/jobs/fu...,Pressure under rock next week. Recognize so re...


In [566]:
new_jobs = jobs[['Title', 'Company', 'Date', 'Apply', 'description']]

In [567]:
new_jobs

Unnamed: 0,Title,Company,Date,Apply,description
0,Senior Python Developer,"Payne, Roberts and Davis",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,Traditional page a although for study anyone. ...
...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,Paper age physical current note. There reality...
96,"Radiographer, diagnostic",Holder LLC,2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,Able such right culture. Wrong pick structure ...
97,Database administrator,Yates-Ferguson,2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,Create day party decade high clear. Past trade...
98,Furniture designer,Ortega-Lawrence,2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,Pressure under rock next week. Recognize so re...


In [568]:
new_jobs = new_jobs.drop('description', axis = 1)

In [569]:
new_jobs['Description'] = description_list

In [570]:
new_jobs

Unnamed: 0,Title,Company,Date,Apply,Description
0,Senior Python Developer,"Payne, Roberts and Davis",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...,Traditional page a although for study anyone. ...
...,...,...,...,...,...
95,Museum/gallery exhibitions officer,"Nguyen, Yoder and Petty",2021-04-08,https://realpython.github.io/fake-jobs/jobs/mu...,Paper age physical current note. There reality...
96,"Radiographer, diagnostic",Holder LLC,2021-04-08,https://realpython.github.io/fake-jobs/jobs/ra...,Able such right culture. Wrong pick structure ...
97,Database administrator,Yates-Ferguson,2021-04-08,https://realpython.github.io/fake-jobs/jobs/da...,Create day party decade high clear. Past trade...
98,Furniture designer,Ortega-Lawrence,2021-04-08,https://realpython.github.io/fake-jobs/jobs/fu...,Pressure under rock next week. Recognize so re...
