## Introduction to Beautiful Soup for web scrapping example
As seen at https://realpython.com/beautiful-soup-web-scraper-python/

In [1]:
import requests

In [2]:
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
results = soup.find(id='ResultsContainer')

In [5]:
print(results.prettify())

<div class="mux-custom-scroll" data-extend="left" data-mux="customScroll" data-target="html" id="ResultsContainer">
 <div class="scrollable" id="ResultsScrollable">
  <script type="application/ld+json">
   {"@context":"https://schema.org","@type":"ItemList","mainEntityOfPage":{
            "@type":"CollectionPage","@id":"https://www.monster.com/jobs/search/?q=Software-Developer&amp;where=Australia"
            }
            ,"itemListElement":[

                 {"@type":"ListItem","position":1,"url":"https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-4606-a1bc-793e8a588bc0"}
                    ,
                 {"@type":"ListItem","position":2,"url":""}
                    ,
                 {"@type":"ListItem","position":3,"url":"https://job-openings.monster.com/senior-lead-software-engineer-browser-sunnyvale-ca-plantation-fl-hq-austin

In [6]:
job_elems = results.find_all('section', class_='card-content')

In [7]:
for job_elem in job_elems:
    #print(job_elem, end='\n'*2)
    title_elem = job_elem.find('h2', class_='title')
    if title_elem is None:
        continue
    print(title_elem.text)

Lead Performance Engineer, Software Systems

Senior/Lead Software Engineer, Browser

Service Consultant REST

Solution Delivery Manager

Customer Experience Technical Analyst - Sydney, New South Wales

Runner

Associate Strategist

Strategist

IT Service Management Platform Developer



In [8]:
eng = results.find_all('h2', string=lambda text: 'engineer' in text.lower())

In [9]:
for r in eng:
    link = r.find('a')['href']
    print(link)
    break

https://job-openings.monster.com/lead-performance-engineer-software-systems-plantation-fl-sunnyvale-ca-culver-new-york-city-ca-seattle-wa-austin-tx-toronto-ny-us-magic-leap-inc/e5aff8c3-a1ff-4606-a1bc-793e8a588bc0


## NBA Web Scraper

In [10]:
import requests
from bs4 import BeautifulSoup

In [11]:
URL = 'https://www.nba.com/stats/leaders/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

In [12]:
soup.find('nba-stat-table')

<nba-stat-table filters="filters" ng-if="!isLoading &amp;&amp; !noData" params="params" rows="playerStats" template="players/players-leaders"></nba-stat-table>

In [13]:
from selenium import webdriver
import time

In [17]:
url = "https://www.basketball-reference.com/leagues/NBA_2017_standings.html"
browser = webdriver.Chrome('/Users/stephenbrock/chromedriver')

browser.get(url)
time.sleep(3)
html = browser.page_source
soup = BeautifulSoup(html, "lxml")

print(len(soup.find_all("table")))
print(soup.find("table", {"id": "expanded_standings"}))

browser.close()
browser.quit()

14
<table class="sortable stats_table now_sortable" data-cols-to-freeze="2" id="expanded_standings"><caption>Expanded Standings Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr class="over_header">
<th aria-label="" class=" over_header center" colspan="3" data-stat=""></th>
<th aria-label="" class=" over_header center" colspan="2" data-stat="header_place">Place</th>
<th aria-label="" class=" over_header center" colspan="2" data-stat="header_conf">Conference</th>
<th aria-label="" class=" over_header center" colspan="6" data-stat="header_div">Division</th>
<th aria-label="" class=" over_header center" colspan="2" data-stat="header_all_star">All-Star</th>
<th aria-label="" class=" over_header center" colspan="2" data-stat="header_diff">Margin</th>
<th aria-label="" class=" over_header center" colspan="7" data-stat="header_month">Month</th>
</tr>

In [22]:
url = "https://www.nba.com/stats/leaders/"
browser = webdriver.Chrome('/Users/stephenbrock/chromedriver')

browser.get(url)
time.sleep(3)
html = browser.page_source
soup = BeautifulSoup(html, "lxml")

table = soup.find_all("table")[0]

# get header
header_html = table.find('tr')
columns = list()
columns_html = header_html.find_all('th')
for c in columns_html:
    columns.append(c.text)
print('columns:',columns)


players = table.find_all('td', class_='player')
print(players[0])

print(len(soup.find_all("table")))
print(soup.find("table"))

browser.close()
browser.quit()

columns: ['#', '\nPlayer\n          ', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'EFF']
<td class="player">
<a href="/player/201935/traditional/">James Harden</a>
</td>
2
<table>
<thead>
<tr>
<th cf="" data-field="RANK">#</th>
<th cf="" class="player" data-field="PLAYER">
<!-- -->Player
          </th>
<th cf="" data-field="GP">GP</th>
<!-- --><th cf="" data-field="MIN" ng-if="params.PerMode!=='Per48'">MIN</th><!-- -->
<th cf="" class="" data-field="PTS">PTS</th>
<th cf="" data-field="FGM">FGM</th>
<th cf="" data-field="FGA">FGA</th>
<th cf="" data-field="FG_PCT">FG%</th>
<th cf="" data-field="FG3M">3PM</th>
<th cf="" data-field="FG3A">3PA</th>
<th cf="" data-field="FG3_PCT">3P%</th>
<th cf="" data-field="FTM">FTM</th>
<th cf="" data-field="FTA">FTA</th>
<th cf="" data-field="FT_PCT">FT%</th>
<th cf="" data-field="OREB">OREB</th>
<th cf="" data-field="DREB">DREB</th>
<th cf="" data-field="REB">R