In [1]:
import requests as r
import pandas as pd
from bs4 import BeautifulSoup as BS

### 1. Start by performing a GET request on the url above and convert the response into a BeautifulSoup object.  

In [2]:
URL = 'https://realpython.github.io/fake-jobs/'

# A timeout (seconds) keeps this cell from hanging the kernel forever if the
# site never answers; requests waits indefinitely when none is given.
response = r.get(URL, timeout=10)

# Shown as the cell output: 200 means the listing page was fetched.
response.status_code

200

In [3]:
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree could differ between machines.
soup = BS(response.text, 'html.parser')

In [4]:
# Dump the parsed document with indentation to inspect its tag structure.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="container">
    <div class="columns is-multiline" id="ResultsContainer">
     <div class="column is-half">
      <div class="card">
       <div class="card-content">
        <div class="media">
         <div class="media-left">
          <figure class="image is-48x48">
           <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
          </figure>
         </div>
         <div class="media-content">
          <h2 c

### a. Use the .find method to find the tag containing the first job title ("Senior Python Developer"). Hint: can you find a tag type and/or a class that could be helpful for extracting this information? Extract the text from this title. 

In [5]:
# The first <h2> whose class attribute is exactly "title is-5" holds the
# first job title; get_text() returns its text content.
soup.find('h2', class_='title is-5').get_text()

'Senior Python Developer'

### b. Now, use what you did for the first title, but extract the job title for all jobs on this page. Store the results in a list. 

In [6]:
# find_all is the supported spelling; findAll is a deprecated alias.
# The result holds the matching <h2> tags (not yet their text).
job_title = soup.find_all('h2', attrs={'class': 'title is-5'})

In [7]:
# Spot-check the second match; the elements are still tags, not plain text.
job_title[1]

<h2 class="title is-5">Energy engineer</h2>

In [8]:
# Every job posting on the page lives in its own <div class="card">.
cards = soup.find_all("div", class_="card")

# Parallel lists: position i in each list describes the same posting.
titles = []
companies = []
locations = []
dates = []


def _card_text(card, name, **kwargs):
    """Return the stripped text of the first matching tag in `card`, or None.

    Guarding the lookup matters: `card.find(...)` returns None when a card
    lacks the element, and calling `.get_text` on None would raise before the
    "N/A" fallback below could ever be used.
    """
    tag = card.find(name, **kwargs)
    return tag.get_text(strip=True) if tag is not None else None


for card in cards:
    title = _card_text(card, "h2", class_="title")
    company = _card_text(card, "h3", class_="company")
    location = _card_text(card, "p", class_="location")
    date = _card_text(card, "time")

    titles.append(title)
    companies.append(company)
    locations.append(location)
    dates.append(date)

    # Missing or empty fields are shown as "N/A" in the printed summary only;
    # the lists keep the raw value (None for a missing tag).
    print("Title:", title or "N/A")
    print("Company:", company or "N/A")
    print("Location:", location or "N/A")
    print("Date Posted:", date or "N/A")
    print("-" * 40)


Title: Senior Python Developer
Company: Payne, Roberts and Davis
Location: Stewartbury, AA
Date Posted: 2021-04-08
----------------------------------------
Title: Energy engineer
Company: Vasquez-Davidson
Location: Christopherville, AA
Date Posted: 2021-04-08
----------------------------------------
Title: Legal executive
Company: Jackson, Chambers and Levy
Location: Port Ericaburgh, AA
Date Posted: 2021-04-08
----------------------------------------
Title: Fitness centre manager
Company: Savage-Bradley
Location: East Seanview, AP
Date Posted: 2021-04-08
----------------------------------------
Title: Product manager
Company: Ramirez Inc
Location: North Jamieview, AP
Date Posted: 2021-04-08
----------------------------------------
Title: Medical technical officer
Company: Rogers-Yates
Location: Davidville, AP
Date Posted: 2021-04-08
----------------------------------------
Title: Physiological scientist
Company: Kramer-Klein
Location: South Christopher, AE
Date Posted: 2021-04-08
-----

In [9]:
# Stitch the parallel lists into one (title, company, location, date) tuple
# per posting.
lst_jobs = [*zip(titles, companies, locations, dates)]

In [10]:
# Peek at the first record: (title, company, location, date).
lst_jobs[0]

('Senior Python Developer',
 'Payne, Roberts and Davis',
 'Stewartbury, AA',
 '2021-04-08')

In [11]:
# One row per posting; the column order follows the tuple order in lst_jobs.
jobs_df = pd.DataFrame(
    data=lst_jobs,
    columns=["title", "company", "location", "date_posted"],
)

In [12]:
# Preview the first rows to confirm the columns lined up as expected.
jobs_df.head()

Unnamed: 0,title,company,location,date_posted
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08


### 2. Next, add a column that contains the url for the "Apply" button. Try this in two ways.   

    a. First, use the BeautifulSoup find_all method to extract the urls.  

In [13]:
# find_all is the supported spelling; findAll is a deprecated alias.
links = soup.find_all('a', class_='card-footer-item')

# Each card footer has more than one link; keep only the ones that point at a
# fake-jobs detail page. Plain Python strings use `in` for substring checks
# (the pandas equivalent would be .str.contains()). The '' default keeps an
# anchor without an href from raising a TypeError.
apply_links = [link.get('href') for link in links if 'fake-jobs' in link.get('href', '')]

In [14]:
# Display the extracted apply URLs to check them by eye.
apply_links

['https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'https://realpython.github.io/fake-jobs/jobs/energy-engineer-1.html',
 'https://realpython.github.io/fake-jobs/jobs/legal-executive-2.html',
 'https://realpython.github.io/fake-jobs/jobs/fitness-centre-manager-3.html',
 'https://realpython.github.io/fake-jobs/jobs/product-manager-4.html',
 'https://realpython.github.io/fake-jobs/jobs/medical-technical-officer-5.html',
 'https://realpython.github.io/fake-jobs/jobs/physiological-scientist-6.html',
 'https://realpython.github.io/fake-jobs/jobs/textile-designer-7.html',
 'https://realpython.github.io/fake-jobs/jobs/television-floor-manager-8.html',
 'https://realpython.github.io/fake-jobs/jobs/waste-management-officer-9.html',
 'https://realpython.github.io/fake-jobs/jobs/software-engineer-python-10.html',
 'https://realpython.github.io/fake-jobs/jobs/interpreter-11.html',
 'https://realpython.github.io/fake-jobs/jobs/architect-12.html',
 'https://realpython.gi

In [15]:
# Reuse the list built above instead of repeating the comprehension: the copy
# that was here called link.get('href') without a default, so any anchor
# lacking an href would raise TypeError on the `in` check.
jobs_df['url'] = apply_links

#### b. Next, get those same urls in a different way. Examine the urls and see if you can spot the pattern of how they are constructed. Then, build the url using the elements you have already extracted. Ensure that the urls that you created match those that you extracted using BeautifulSoup. Warning: You will need to do some string cleaning and prep in constructing the urls this way. For example, look carefully at the urls for the "Software Engineer (Python)" job and the "Scientist, research (maths)" job.

In [16]:
fj_url = 'https://realpython.github.io/fake-jobs/jobs/'

# Slug rules found by comparing against the extracted URLs: spaces and
# slashes become hyphens; parentheses and commas are dropped.
slug_table = str.maketrans({' ': '-', '/': '-', '(': None, ')': None, ',': None})

# Build the whole column in one pass rather than writing cell-by-cell with
# iterrows/.at. Each URL is <base><slugged lowercase title>-<row label>.html.
jobs_df['const_url'] = [
    f'{fj_url}{title.lower().translate(slug_table)}-{idx}.html'
    for idx, title in jobs_df['title'].items()
]

In [17]:
# Rows where the constructed URL differs from the extracted one; printing
# them helped refine the cleaning rules in the previous cell.
is_same = jobs_df['url'].eq(jobs_df['const_url'])

incor_urls = jobs_df.loc[~is_same]

for url in incor_urls['const_url']:
    print(url)

In [18]:
# Take the URL in the row labelled 0 to prototype the detail-page scrape on
# a single posting.
url = jobs_df.loc[0, 'url']

url

'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

In [19]:
# A timeout (seconds) keeps this cell from hanging the kernel forever if the
# site never answers; requests waits indefinitely when none is given.
response = r.get(url, timeout=10)

# Shown as the cell output: 200 means the detail page was fetched.
response.status_code

200

In [20]:
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree could differ between machines.
job_soup = BS(response.text, 'html.parser')

In [21]:
# The description is the first <p> inside the page's "content" div.
desc = (
    job_soup
    .find('div', class_='content')
    .find('p')
    .get_text()
)

desc

'Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.'

In [27]:
def get_job_desc(url, timeout=10):
    """Fetch a job detail page and return the text of its description.

    The description is the first <p> inside the page's <div class="content">.

    Parameters
    ----------
    url : str
        Address of the job detail page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so one unresponsive
        page cannot stall a whole `.apply` over the URL column.

    Returns
    -------
    str or None
        The description text, or None when the response status is not 200
        (a message is printed) or the page has no content div / paragraph.

    Raises
    ------
    Whatever `r.get` raises for network failures (including timeouts); those
    are not caught here.
    """
    res = r.get(url, timeout=timeout)

    if res.status_code != 200:
        print(f'There was an error: {res.status_code}')
        return None

    # Explicit parser so the result does not depend on which parsers are installed.
    job_page = BS(res.text, 'html.parser')

    # Compare against None explicitly: find() returns None when nothing
    # matches, and chaining straight through would raise AttributeError.
    content = job_page.find('div', class_='content')
    if content is None:
        return None

    paragraph = content.find('p')
    if paragraph is None:
        return None

    return paragraph.text

In [28]:
# Fetch every detail page in turn (one HTTP request per row) and store the
# returned description text alongside its posting.
jobs_df['description'] = jobs_df['url'].map(get_job_desc)

In [29]:
# Verify that the descriptions loaded correctly by printing each one,
# followed by a blank line as a separator.
for desc in jobs_df['description']:
    print(f'{desc} \n')

Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset. 

Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often. 

Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record hospital employee towar

### Sources:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text
<br>Get index of a list: https://www.programiz.com/python-programming/methods/list/index
<br>How to use at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.at.html