## Webscraping

#### Basic HTML page

```
<!DOCTYPE html>
<html>
<head>
    <title>Web Page!</title>
    <!-- inline CSS: rules that apply to this page only -->
    <style>
        body {background-color: powderblue;}
        h1   {color: blue;}
        p    {color: red;}
    </style>
    <!-- external CSS: rules loaded from a separate file -->
    <link rel="stylesheet" href="styles.css">
    <script>
        // This script lives in <head>, so the <body> elements do not exist yet
        // when it is first evaluated. Wait for the DOM before touching it,
        // otherwise getElementById returns null and the assignment throws.
        document.addEventListener("DOMContentLoaded", function () {
            document.getElementById("demo").innerHTML = "Hello JavaScript!";
        });
    </script>
</head>
<body>
    <h1>A Very Bold Header</h1>
    <div style="background-color:lightblue">
        <p>This is a paragraph.</p>
        <!-- target element filled in by the script above -->
        <p id="demo"></p>
    </div>
</body>
</html>
```

### NYC weather history

http://w1.weather.gov/data/obhistory/KNYC.html

In [1]:
# URL of the observation-history page scraped in the cells below (station KNYC)
knyc_link = "http://w1.weather.gov/data/obhistory/KNYC.html"

In [2]:
import requests

# Download the page. The timeout keeps a stalled connection from hanging the
# kernel forever (requests waits indefinitely by default), and
# raise_for_status() fails loudly on a 4xx/5xx reply instead of letting an
# error page flow into the parsing cells.
knyc_page = requests.get(knyc_link, timeout=30)
knyc_page.raise_for_status()
knyc_page

<Response [200]>

In [3]:
# peek at the raw response body: .content is bytes, so this is the first 1000 bytes
raw_head = knyc_page.content[:1000]
print(raw_head)

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\r\n\t\t\t\t\t\t\t<html><meta name="Author" content="Leon Minton"><head><title>\r\n\t\t\t\t\t\t\tNational Weather Service : Observed Weather for past 3 Days : New York City, Central Park</title>\r\n\t\t\t\t\t\t\t<link rel="STYLESHEET" type="text/css" href="/images/weather/fcicons/main.css"></head>\r\n\t\t\t\t\t\t\t<body bgcolor="#ffffff" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" background="/images/weather/fcicons/gray_background.gif">\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670" background="/images/weather/fcicons/topbanner.jpg">\r\n\t\t\t\t\t\t\t<tr><td align="right" height="19"><a href="http://weather.gov"><span class="nwslink">weather.gov</span></a>&nbsp;&nbsp;&nbsp;</td></tr></table>\r\n\t\t\t\t\t\t\t<table cellspacing="0" cellpadding="0" border="0" width="670"><tr valign="top">\r\n\t\t\t\t\t\t\t<td rowspan="2"><a href="http://www.noaa.gov"><img src="/images/weathe

In [4]:
# need to parse some html!
from bs4 import BeautifulSoup

In [5]:
# Name the parser explicitly. Without it bs4 picks whichever parser happens to be
# installed (and emits GuessedAtParserWarning), so the parse tree can differ
# between machines.
# NOTE(review): the recorded prettify() output below matches lxml's tree (the stray
# <meta> is moved into <head>, no <tbody> is inserted) — confirm lxml is installed.
knyc_soup = BeautifulSoup(knyc_page.content, 'lxml')

In [6]:
# same peek as before, but re-indented by BeautifulSoup so the nesting is readable
knyc_pretty = knyc_soup.prettify()
print(knyc_pretty[:1000])

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
 <head>
  <meta content="Leon Minton" name="Author"/>
  <title>
   National Weather Service : Observed Weather for past 3 Days : New York City, Central Park
  </title>
  <link href="/images/weather/fcicons/main.css" rel="STYLESHEET" type="text/css"/>
 </head>
 <body background="/images/weather/fcicons/gray_background.gif" bgcolor="#ffffff" leftmargin="0" marginheight="0" marginwidth="0" topmargin="0">
  <table background="/images/weather/fcicons/topbanner.jpg" border="0" cellpadding="0" cellspacing="0" width="670">
   <tr>
    <td align="right" height="19">
     <a href="http://weather.gov">
      <span class="nwslink">
       weather.gov
      </span>
     </a>
    </td>
   </tr>
  </table>
  <table border="0" cellpadding="0" cellspacing="0" width="670">
   <tr valign="top">
    <td rowspan="2">
     <a href="http://www.noaa.gov">
      <img alt="NOAA logo - Click to go to the NOAA homepage" border="0" height="78" s

In [7]:
# print the 4th table in the page (index 3, counting from zero)
all_tables = knyc_soup.find_all('table')
print(all_tables[3])

<table border="0" cellpadding="2" cellspacing="3" width="670"><tr align="center" bgcolor="#b0c4de"><th rowspan="3" width="17">D<br/>a<br/>t<br/>e</th><th rowspan="3" width="32">Time<br/>(edt)</th>
<th rowspan="3" width="80">Wind<br/>(mph)</th><th rowspan="3" width="40">Vis.<br/>(mi.)</th><th rowspan="3" width="80">Weather</th><th rowspan="3" width="65">Sky Cond.</th>
<th colspan="4">Temperature (ºF)</th><th rowspan="3" width="65">Relative<br/>Humidity</th><th rowspan="3" width="80">Wind<br/>Chill<br/>(°F)</th><th rowspan="3" width="80">Heat<br/>Index<br/>(°F)</th><th colspan="2">Pressure</th><th colspan="3">Precipitation (in.)</th></tr>
<tr align="center" bgcolor="#b0c4de"><th rowspan="2" width="45">Air</th><th rowspan="2" width="26">Dwpt</th><th colspan="2">6 hour</th>
<th rowspan="2" width="40">altimeter<br/>(in)</th><th rowspan="2" width="40">sea level<br/>(mb)</th><th rowspan="2" width="24">1 hr</th>
<th rowspan="2" width="24">3 hr</th><th rowspan="2" width="30">6 hr</th></tr>
<tr 

In [8]:
# extract data from the 4th table in the page into a dataframe

import pandas as pd

data_table = knyc_soup.find_all('table')[3]

table_rows = data_table.find_all('tr') # get rows from table

N_HEADER_ROWS = 3                      # leading <tr> rows that make up the header

data = []
for tr in table_rows[N_HEADER_ROWS:]:  # skip header rows
    td = tr.find_all('td')             # get table cells
    if not td:
        # A row without any <td> cell (for example another block of <th>
        # header cells) holds no observation; appending it would add an
        # all-empty row to the dataframe.
        continue
    data.append([elem.text for elem in td])   # pull text from cells, add to dataset

pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,4,15:51,W 14 G 37,10.0,Overcast,OVC080,60,35,,,39%,,,29.76,1006.7,,,
1,4,14:51,Vrbl 7 G 26,10.0,Mostly Cloudy,BKN110,63,35,,,35%,,,29.74,1006.0,,,
2,4,13:51,Vrbl 5,10.0,Overcast,OVC090,66,36,75.0,64.0,33%,,,29.7,1004.7,,,
3,4,12:51,Vrbl 3,10.0,Overcast,FEW080 SCT090 OVC110,70,36,,,29%,,,29.67,1003.7,,,
4,4,11:51,,10.0,Mostly Cloudy,SCT080 BKN095,70,37,,,30%,,,29.68,1004.0,,,


#### LaGuardia Weather History Summary
https://www.wunderground.com/history/daily/us/ny/new-york-city/KNYC/date/2018-12-3?cm_ven=localwx_history

In [9]:
# Weather Underground daily-history page for station KLGA
wu_link = "https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA"

<img src='./images/wunderground_table.png'>

In [10]:
# get the page
import requests

# timeout: requests waits indefinitely by default, which would hang the kernel
# on a stalled connection; raise_for_status(): stop here on a 4xx/5xx reply
# rather than parsing an error page further down.
wu_page = requests.get(wu_link, timeout=30)
wu_page.raise_for_status()
wu_page

<Response [200]>

In [11]:
from bs4 import BeautifulSoup

# Pass the parser explicitly so the parse tree does not depend on which parsers
# are installed (and to avoid bs4's GuessedAtParserWarning).
# NOTE(review): 'lxml' is chosen because the recorded KNYC output earlier in this
# notebook matches lxml's tree — confirm lxml is installed in the target environment.
wu_soup = BeautifulSoup(wu_page.content, 'lxml')

In [12]:
# pretty-print the parsed page and show its first 1000 characters
wu_pretty = wu_soup.prettify()
print(wu_pretty[:1000])

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/Organization" lang="en" prefix="og: http://ogp.me/ns#">
 <head itemscope="" itemtype="http://schema.org/WebSite">
  <meta charset="utf-8"/>
  <title>
   New York City, NY Weather History | Weather Underground
  </title>
  <base href="/"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <link href="//widgets.outbrain.com" rel="dns-prefetch"/>
  <link href="//odb.outbrain.com" rel="dns-prefetch"/>
  <link href="//c.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//s.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//aax.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//partner.googleadservices.com" rel="dns-prefetch"/>
  <link href="//tpc.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//pagead2.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//h.nexac.com" rel="dns-prefetch"/>
  <link href="/

In [13]:
# the table we want doesn't exist! culprit: javascript
def _starts_with_mat_table(css_class):
    """Class filter for find_all: truthy when the class value begins with "mat-table"."""
    return css_class and css_class.startswith("mat-table")

wu_soup.find_all('table', class_=_starts_with_mat_table)

[]

In [14]:
import re

# Pull the text out of the page, trim the ends, then squeeze every run of
# consecutive newlines down to a single newline.
newline_runs = re.compile(r'\n+')
wu_text = newline_runs.sub('\n', wu_soup.get_text().strip())

# show the last 1000 characters of the cleaned text
print(wu_text[-1000:])

nTime=timing.domainLookupEnd-timing.domainLookupStart;api.connectTime=timing.connectEnd-timing.connectStart;api.requestTime=timing.responseEnd-timing.requestStart;api.initDomTreeTime=timing.domInteractive-timing.responseEnd;api.loadEventTime=timing.loadEventEnd-timing.loadEventStart}return api},printTable:function(opts){var table={};var data=this.getTimes(opts)||{};Object.keys(data).sort().forEach(function(k){table[k]={ms:data[k],s:+(data[k]/1e3).toFixed(2)}});console.table(table)},printSimpleTable:function(){this.printTable({simple:true})}};function isNumeric(n){return!isNaN(parseFloat(n))&&isFinite(n)}if(typeof module!=="undefined"&&module.exports){module.exports=window.timing}})(typeof window!=="undefined"?window:{});
      
      window.addEventListener("load",function(){setTimeout(function(){newrelic&&timing&&timing.getTimes()&&Object.keys(timing.getTimes()).forEach(function(i){newrelic.setCustomAttribute("timing"+i.charAt(0).toUpperCase()+i.slice(1),timing.getTimes()[i])})},0)});

### Need to actually render the page so that its scripts run!

In [15]:
# Prerequisite: a chromedriver binary must be installed, see
# https://sites.google.com/a/chromium.org/chromedriver/home
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# run Chrome without opening a window
chrome_options = Options()
chrome_options.add_argument("--headless")

# start the browser session that the following cells drive
driver = webdriver.Chrome(options=chrome_options)

In [16]:
# load the URL in the browser session; unlike requests.get, this actually renders the page
driver.get(url=wu_link)

In [18]:
# two ways to find the table we want
# The find_element_by_* helpers were deprecated in Selenium 4 and removed in 4.3;
# find_element(By.<strategy>, value) works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# The table is produced by the page's JavaScript, so it may not be in the DOM the
# moment driver.get() returns. Wait (up to 30 s) until it is present; a
# TimeoutException is raised if it never appears.
wu_table = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'mat-table'))
)
#wu_table = driver.find_element(By.ID, 'history-observation-table')

In [19]:
# text in the table
# (.text is read from the element located in the browser-rendered page; as the
#  last expression of the cell, the resulting string is displayed as its output)
wu_table.text

'Time\nTemperature\nDew Point\nHumidity\nWind\nWind Speed\nWind Gust\nPressure\nPrecip.\nCondition\n12:51 AM 71 F 55 F 57 % SW 10 mph 0 mph 29.59 in 0.0 in Cloudy\n1:51 AM 69 F 54 F 58 % WNW 9 mph 0 mph 29.57 in 0.0 in Mostly Cloudy\n2:51 AM 67 F 44 F 44 % N 10 mph 0 mph 29.58 in 0.0 in Cloudy\n3:51 AM 66 F 43 F 43 % NNE 10 mph 0 mph 29.59 in 0.0 in Fair\n4:51 AM 64 F 43 F 46 % ESE 8 mph 0 mph 29.61 in 0.0 in Cloudy\n5:51 AM 62 F 44 F 52 % N 3 mph 0 mph 29.63 in 0.0 in Cloudy\n6:51 AM 61 F 44 F 54 % NE 6 mph 0 mph 29.65 in 0.0 in Mostly Cloudy\n7:51 AM 65 F 37 F 36 % NNE 7 mph 0 mph 29.65 in 0.0 in Mostly Cloudy\n8:51 AM 65 F 36 F 34 % E 7 mph 0 mph 29.65 in 0.0 in Partly Cloudy\n9:51 AM 67 F 35 F 31 % NE 5 mph 0 mph 29.64 in 0.0 in Fair\n10:51 AM 68 F 38 F 33 % WNW 9 mph 0 mph 29.64 in 0.0 in Fair\n11:51 AM 71 F 36 F 28 % WNW 12 mph 0 mph 29.62 in 0.0 in Partly Cloudy\n12:51 PM 69 F 37 F 31 % NW 12 mph 0 mph 29.61 in 0.0 in Mostly Cloudy\n1:51 PM 66 F 36 F 33 % NW 20 mph 30 mph 29.63 

In [24]:
# extracting text into a dataframe
# find_elements_by_css_selector was deprecated in Selenium 4 and removed in 4.3;
# find_elements(By.CSS_SELECTOR, ...) works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By

wu_data = []
for tr in wu_table.find_elements(By.CSS_SELECTOR, 'tr'):
    # keep the original ordering: header cells (<th>) first, then data cells (<td>)
    header_cells = tr.find_elements(By.CSS_SELECTOR, 'th')
    body_cells = tr.find_elements(By.CSS_SELECTOR, 'td')
    wu_data.append([cell.text.strip() for cell in header_cells + body_cells])

# Fail with a readable message instead of an IndexError on wu_data[0]
# when the table had no rows at all.
assert wu_data, "no <tr> rows found in the table"

# the first scraped row supplies the column names, the remaining rows are the data
df_wu = pd.DataFrame(wu_data[1:],columns=wu_data[0])
df_wu.head()

Unnamed: 0,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition
0,12:51 AM,71 F,55 F,57 %,SW,10 mph,0 mph,29.59 in,0.0 in,Cloudy
1,1:51 AM,69 F,54 F,58 %,WNW,9 mph,0 mph,29.57 in,0.0 in,Mostly Cloudy
2,2:51 AM,67 F,44 F,44 %,N,10 mph,0 mph,29.58 in,0.0 in,Cloudy
3,3:51 AM,66 F,43 F,43 %,NNE,10 mph,0 mph,29.59 in,0.0 in,Fair
4,4:51 AM,64 F,43 F,46 %,ESE,8 mph,0 mph,29.61 in,0.0 in,Cloudy


In [25]:
# Save a picture of the table element as rendered by the browser.
# The rendered table is still missing some stuff, need to debug.
screenshot_path = './images/test1.png'
wu_table.screenshot(screenshot_path)

True

<img src='./images/test1.png'>