In [1]:
# Weather.gov has a dynamic table produced after the HTML is loaded.  To extract that information, I installed selenium.
# With selenium I can use Python to control the browser and extract the rendered HTML after the tables are generated.
# (I tried first with requests-html, but didn't have luck... but that installation made the selenium installation easier because
# it installed the required Chrome web driver as well)

# Outputs to ../data/weather_gov_20250327_20250425.csv

#pip install requests-html
#pip install selenium

In [23]:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import pandas as pd

In [4]:
url = 'https://www.weather.gov/wrh/timeseries?site=KBNA&hours=720&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20250327&end=20250425&plot='
# note that I am using the start and end dates of start=20250327&end=20250425 to tie in with the Active Dispatch data.

In [7]:
browser = webdriver.Chrome()

In [8]:
browser.get(url)
time.sleep(10)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')

In [25]:
df = pd.DataFrame(columns = ['date_time','temp','wind_direction','wind_speed','wind_gust','visibility_miles','weather','clouds','prcp_1_hr'])

In [None]:
    code = row['incident_type_code']
    match = re.search(r'^(\d+)', code)    # search for a digit or multiple, followed by a space, followed by any number of any characters
    if match :
        code = match.group(1)
    else :
        print('Missed a regex match in a row... look into this and try again')

In [77]:
i = 0
for row in soup.find('table', attrs={'id' : 'OBS_DATA'}).find_all('tr') :
    data = row.findAll('td')
    if len(data) > 0 :
        df.loc[i,'date_time'] = data[0].text	# break this down more!
        df.loc[i,'temp'] = data[1].text
        df.loc[i,'wind_direction'] = data[6].text

        # wind speeds display as '7', '9', '10', '0', and can include gusts like '20G43'
        wind_data = re.search(r'^(\d+)\D*(\d*$)', data[7].text)

        if wind_data :
            df.loc[i,'wind_speed'] = wind_data.group(1) 
            df.loc[i,'wind_gust'] = wind_data.group(2)
        else :
            df.loc[i,'wind_speed'] = '0' 
            df.loc[i,'wind_gust'] = '0'

        df.loc[i,'visibility_miles'] = data[8].text
        df.loc[i,'weather'] = data[9].text
        df.loc[i,'clouds'] = data[10].text
        df.loc[i,'prcp_1_hr'] = data[14].text
        i += 1
    else :
        print('skipped row.  blank?  first row only?')

skipped row.  blank?  first row only?


In [79]:
df.head(10)

Unnamed: 0,date_time,temp,wind_direction,wind_speed,wind_gust,visibility_miles,weather,clouds,prcp_1_hr
0,"Apr 11, 2:53 pm",60,NNW,14,,10.0,,SCT029 BKN050 BKN120,
1,"Apr 11, 2:33 pm",60,NNW,10,,10.0,,FEW029 BKN047 BKN120,
2,"Apr 11, 1:53 pm",57,N,10,,10.0,,BKN020 BKN037 BKN110,
3,"Apr 11, 1:33 pm",57,N,7,,10.0,,FEW014 BKN047 BKN075,
4,"Apr 11, 1:03 pm",56,N,12,,10.0,,SCT010 BKN016 OVC060,
5,"Apr 11, 12:53 pm",55,N,8,,10.0,,SCT008 BKN045 BKN055,0.11
6,"Apr 11, 12:36 pm",54,N,9,,10.0,,SCT008 BKN050 OVC070,0.11
7,"Apr 11, 12:19 pm",51,N,9,,6.0,"Lt rain, Mist",FEW019 BKN035 OVC050,0.11
8,"Apr 11, 12:04 pm",54,N,22,37.0,1.5,Lt rain,FEW020 BKN027 OVC042,T
9,"Apr 11, 11:53 am",55,NW,7,,10.0,Lt rain,SCT020 BKN035 OVC045,T


In [None]:
# Next, break down the time/date and figure out if any other columns need cleaning.

In [None]:
# And figure out how to combine multiple records.
# What if there are multiple per hour?  Use the max values.
# What if hours are missing?  Copy previous values maybe?  Look at the results and see if anything is missing!
    # Nope!  It looks like there are extra observations in addition to the hourly observations, but every hour has at least one observation.
# But I need to test for null values, and decide how to handle those if they exist.


In [81]:
df.to_csv('../data/weather_gov_20250327_20250425.csv', index=False)

In [None]:
# Looking at the first and last rows, skip index 0, scrape from 1 to max.

In [11]:
print( len(soup.findAll('table', attrs={'id' : 'OBS_DATA'})[0].find_all("tr")) )

473


In [15]:
print( soup.findAll('table', attrs={'id' : 'OBS_DATA'})[0].find_all("tr")[472] )

<tr><td bgcolor="yellow"><strong>Mar 26, 7:53 pm</strong></td><td>61</td><td>36</td><td>39</td><td> </td><td> </td><td>E</td><td>6</td><td>10.00</td><td> </td><td>CLR <font color="#FF00FF"></font> <font color="#FF00FF"></font></td><td> </td><td>29.56</td><td>30.21</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>


In [19]:
print( soup.findAll('table', attrs={'id' : 'OBS_DATA'})[0].find_all("tr")[1] )

<tr><td>Apr 11, 2:53 pm</td><td>60</td><td>45</td><td>57</td><td> </td><td> </td><td>NNW</td><td>14</td><td>10.00</td><td> </td><td>SCT029 <font color="orange">BKN050</font> BKN120</td><td>1019.00</td><td>29.46</td><td>30.10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>


In [15]:
print(len(soup.find_all("table")))
#print(soup.find("table", {"id": "expanded_standings"}))

4


In [19]:
print(soup.find_all("table")[0])

<table id="icons" width="100%">
<tbody>
<tr>
<td><select class="plot_select"><option selected="selected" value="">Select Graph ......</option><option value="temperature">Temperature</option><option value="dewpt">Dew Point</option><option value="rh">Relative Humidity</option><option value="heat_index">Heat Index</option><option value="wind_chill">Wind Chill</option><option value="wind_dir">Wind Direction</option><option value="wind_speedgust">Wind Speed &amp; Gusts</option><option value="vsby">Visibility</option><option value="slp">Sea Level Pressure</option><option value="alstg">Altimeter Setting</option><option value="one_hour_pcpn">1 Hour Precip</option><option value="three_hour_pcpn">3 Hour Precip</option><option value="six_hour_pcpn">6 Hour Precip</option><option value="twentyfour_hour_pcpn">24 Hour Precip</option></select></td>
<td class="highlight" id="hoursToggle" title="Click to switch between 3 and 7 day data sets">3 Days</td>
<td class="highlight" id="obsToggle" title="Click 

In [51]:
print(soup.findAll('table', attrs={'id' : 'OBS_DATA'})[0]) #.findAll('tr')[4].text)
# , attrs={'name' : 'DC.date.created'})[0]['content']
# OBS_DATA

<table id="OBS_DATA" style="font-size: 12px;"><thead><tr id="HEADER"><th>Date/Time<br/> <br/>(L)</th><th class="zoom" id="temperature" onclick="makeLineChart('2025-03-26T19:53:00-0500,2025-03-26T20:53:00-0500,2025-03-26T21:53:00-0500,2025-03-26T22:53:00-0500,2025-03-26T23:53:00-0500,2025-03-27T00:53:00-0500,2025-03-27T01:53:00-0500,2025-03-27T02:53:00-0500,2025-03-27T03:53:00-0500,2025-03-27T04:53:00-0500,2025-03-27T05:53:00-0500,2025-03-27T06:53:00-0500,2025-03-27T07:53:00-0500,2025-03-27T08:53:00-0500,2025-03-27T09:53:00-0500,2025-03-27T10:53:00-0500,2025-03-27T11:53:00-0500,2025-03-27T12:53:00-0500,2025-03-27T13:53:00-0500,2025-03-27T14:53:00-0500,2025-03-27T15:53:00-0500,2025-03-27T16:53:00-0500,2025-03-27T17:53:00-0500,2025-03-27T18:53:00-0500,2025-03-27T19:53:00-0500,2025-03-27T20:53:00-0500,2025-03-27T21:53:00-0500,2025-03-27T22:53:00-0500,2025-03-27T23:53:00-0500,2025-03-28T00:53:00-0500,2025-03-28T01:53:00-0500,2025-03-28T02:53:00-0500,2025-03-28T03:53:00-0500,2025-03-28T04:53

In [101]:
print( soup.findAll('table')[1].findAll('td')[0].text )

1:53 AM


In [91]:
import pandas as pd

In [93]:
thedata = soup.findAll('table')[1]

In [95]:
pd.read_html(thedata)

TypeError: 'NoneType' object is not callable

In [81]:
#pip install requests-html
#pip uninstall requests-html

In this notebook, we will be using a **GET** request. This is a request for data from a specified resource.  

Another common type or request is a **POST** request. POST submits data to be processed (e.g., from an HTML form) to the identified resource. The data is included in the body of the request. This may result in the creation of a new resource or the updates of existing resources or both.

To perform a GET request, use `requests.get()` and pass in the desired url.

In [3]:
URL = 'https://www.wunderground.com/history/daily/us/tn/nashville/KBNA/date/2024-3-5'

response = requests.get(URL)

Let's see what kind of object we get.

In [5]:
type(response)

requests.models.Response

We can check the status code using the `status_code` attribute.

In [7]:
response.status_code

200

A 200 status code is the standard response for a successful request.  

Other common status codes:
 * 400: Bad Request
 * 404: Not Found

Let's see what happens if we request a non-existent URL.

In [11]:
requests.get('https://en.wikipedia.org/wiki/Tuning_Award')

<Response [404]>

**Back to the good correct request**, let's see what this request returned.

In [49]:
response.text



In [55]:
response.text.findAll('table')

AttributeError: 'str' object has no attribute 'findAll'

It is very hard to decipher the above text. Luckily for us, the [_Beautiful Soup_](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library comes to the rescue. This library assists us in parsing HTML into something usable.

In [11]:
from bs4 import BeautifulSoup as BS

First, we can soupify our response text. Since we are working with HTML, we can specify that we need the html parser.

In [13]:
soup = BS(response.text)

Now, we can print it out in a slightly more readable form.

In [47]:
print(soup)

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/Organization" lang="en" prefix="og: http://ogp.me/ns#"><head itemscope="" itemtype="http://schema.org/WebSite">
<meta charset="utf-8"/>
<title>Nashville, TN Weather History | Weather Underground</title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<link href="//widgets.outbrain.com" rel="dns-prefetch"/>
<link href="//odb.outbrain.com" rel="dns-prefetch"/>
<link href="//c.amazon-adsystem.com" rel="dns-prefetch"/>
<link href="//s.amazon-adsystem.com" rel="dns-prefetch"/>
<link href="//aax.amazon-adsystem.com" rel="dns-prefetch"/>
<link href="//partner.googleadservices.com" rel="dns-prefetch"/>
<link href="//tpc.googlesyndication.com" rel="dns-prefetch"/>
<link href="//pagead2.googlesyndication.com" rel="dns-prefetch"/>
<link href="//h.nexac.com" rel="dns-prefetch"/>
<link href="//js-sec.indexww.com" rel="dns-prefetch"/>
<link href="//secu

In [15]:
print(soup.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/Organization" lang="en" prefix="og: http://ogp.me/ns#">
 <head itemscope="" itemtype="http://schema.org/WebSite">
  <meta charset="utf-8"/>
  <title>
   Nashville, TN Weather History | Weather Underground
  </title>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <link href="//widgets.outbrain.com" rel="dns-prefetch"/>
  <link href="//odb.outbrain.com" rel="dns-prefetch"/>
  <link href="//c.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//s.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//aax.amazon-adsystem.com" rel="dns-prefetch"/>
  <link href="//partner.googleadservices.com" rel="dns-prefetch"/>
  <link href="//tpc.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//pagead2.googlesyndication.com" rel="dns-prefetch"/>
  <link href="//h.nexac.com" rel="dns-prefetch"/>
  <link href="//js-sec.indexww.com" re

What we are looking at is the HTML for this page. This is rendered by your browser into the Wikipedia page that you see.

<img src="assets/html.png">


If you navigate to this page in your browser, you can view page source or inspect elements to see the underlying HTML.

If you are using Safari, this may not by avaiable and you'll need to activate it. According to [this](https://www.socialmeteor.com/2013/03/04/how-to-view-html-source-in-safari-web-browser/) website, you can activate this by following these steps:


1. Open Safari.
2. Select ‘Preferences’ from the ‘Safari’ menu.
3. In the ‘Advanced’ section and select ‘Show Develop menu’ in menu bar.’
4. Visit the web page you want to view HTML source for.
5. Select ‘Show Page Source’ from the ‘Develop’ menu that has been added to Safari.


Beautiful Soup lets us search through this HTML and extract out the contents we want by tag.  

Say we wanted to find the title of this page. We can accomplish this by using the `.find` method on our soup, telling it that we want to find the first `title` tag.

In [17]:
soup.find('title')

<title>Nashville, TN Weather History | Weather Underground</title>

Notice that this returns a bs4 Tag object.

In [19]:
type(soup.find('title'))

bs4.element.Tag

To extract out the text, you can use the `.text` attribute.

In [21]:
soup.find('title').text

'Nashville, TN Weather History | Weather Underground'

The `.find` method find the first matching tag. 

We can find _all_ elements with a particular tag using the `.findAll(<tag>)` method. Say we want to find all images. We'll look for the `img` tag.

In [23]:
images = soup.findAll('img')
print(type(images))
images

<class 'bs4.element.ResultSet'>


[<img _ngcontent-sc190="" alt="icon" class="station-condition" src="//www.wunderground.com/static/i/c/v4/30.svg"/>,
 <img _ngcontent-sc286="" alt="Access Logo" data-lazy="//www.wunderground.com/static/i/logos/ea-logo-desktop.svg"/>,
 <img _ngcontent-sc286="" alt="The Weather Company Logo" src="//www.wunderground.com/static/i/misc/twc-white.svg" width="60px"/>,
 <img _ngcontent-sc286="" alt="The Weather Channel Logo" src="//www.wunderground.com/static/i/misc/twc-knockout.svg" width="40px"/>,
 <img _ngcontent-sc286="" alt="Weather Underground Logo" src="//www.wunderground.com/static/i/misc/wu-knockout.svg" width="60px"/>,
 <img _ngcontent-sc286="" alt="Storm Radar Logo" src="//www.wunderground.com/static/i/misc/sr-knockout.png" width="65px"/>]

Let's look closer at the first image.

In [25]:
first_image = images[0]
print(type(first_image))
first_image

<class 'bs4.element.Tag'>


<img _ngcontent-sc190="" alt="icon" class="station-condition" src="//www.wunderground.com/static/i/c/v4/30.svg"/>

You can access attributes of a Tag object in the same way that you would access values from a dictionary.

In [27]:
first_image['src']

'//www.wunderground.com/static/i/c/v4/30.svg'

You can also safely access attributes using `.get`. This might be useful if, for example, you aren't sure if a particular Tag or all tags had a certain attribute.

In [29]:
# Non-safe
first_image['class']

['station-condition']

In [31]:
# Safe
first_image.get('class')

['station-condition']

You can also specify a default value when using `get`.

In [33]:
first_image.get('class', default = 'No Class')

['station-condition']

If you want to grab a particular attribute for all images, an easy way to do so is with a list comprehension.

In [35]:
image_srcs = [x.get('src') for x in images]

In [37]:
image_srcs

['//www.wunderground.com/static/i/c/v4/30.svg',
 None,
 '//www.wunderground.com/static/i/misc/twc-white.svg',
 '//www.wunderground.com/static/i/misc/twc-knockout.svg',
 '//www.wunderground.com/static/i/misc/wu-knockout.svg',
 '//www.wunderground.com/static/i/misc/sr-knockout.png']

We can further navigate the html tree to extract out other bits of information.

When scraping from a web page, you should make use of "View Page Source" and/or "Inspect Element" in your web browswer.

For example, let's say we want to look at the second header on the page.

In [39]:
soup.findAll('header')[1]

IndexError: list index out of range

Similar to using `find` and `findall` in the full soup, we can use the `.find` method just within a Tag.

In [41]:
soup.findAll('header')[1].find('h1').get('id')

IndexError: list index out of range

In [43]:
soup.findAll('header')[1].find('h1').text

IndexError: list index out of range

Now, let's look for the table containing the Turing Award winners.

Using `.findAll` reveals that there are multiple tables on the page.

In [53]:
soup.findAll('table')
response.findAll('table')

AttributeError: 'Response' object has no attribute 'findAll'

If we know a bit more about what we are looking for, we can include an `attrs` argument and pass a dictionary. 

Go to the Turing award page in your browser, right click on the top of the table and choose "Inspect". You will notice that this table is defined with tag `<table class="wikitable">.` Armed with this information, we can narrow down our search.

In [50]:
soup.find('table', attrs={'class' : 'wikitable'})

<table class="wikitable sortable">
<caption>Recipients of the ACM Turing award
</caption>
<tbody><tr>
<th scope="col">Year
</th>
<th scope="col">Recipient(s)
</th>
<th class="unsortable" scope="col">Photo
</th>
<th class="unsortable" scope="col">Rationale
</th>
<th scope="col">Affiliated institute(s)
</th></tr>
<tr>
<th scope="row">1966
</th>
<td><span data-sort-value="Perlis, Alan"><span class="vcard"><span class="fn"><a href="/wiki/Alan_Perlis" title="Alan Perlis">Alan Perlis</a></span></span></span>
</td>
<td align="center"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:No_image.svg"><img class="mw-file-element" data-file-height="1" data-file-width="1" decoding="async" height="80" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/No_image.svg/80px-No_image.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/No_image.svg/120px-No_image.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1d/No_image.svg/160px-No_image.svg.png 

If we want to interact with the table, we can use the _pandas_ `read_html` method.

In [52]:
import pandas as pd

In [53]:
pd.read_html(str(soup.find('table', attrs={'class' : 'wikitable'})))[0]

  pd.read_html(str(soup.find('table', attrs={'class' : 'wikitable'})))[0]


Unnamed: 0,Year,Recipient(s),Photo,Rationale,Affiliated institute(s)
0,1966,Alan Perlis,,"""For his influence in the area of advanced com...",Carnegie Mellon University
1,1967,Maurice Wilkes,,"""Wilkes is best known as the builder and desig...",University of Cambridge
2,1968,Richard Hamming,,"""For his work on numerical methods, automatic ...",Bell Labs
3,1969,Marvin Minsky,,"""For his central role in creating, shaping, pr...",Massachusetts Institute of Technology
4,1970,James H. Wilkinson,,"""For his research in numerical analysis to fac...",National Physical Laboratory
...,...,...,...,...,...
74,2021,Jack Dongarra,,"""For pioneering contributions to numerical alg...",Argonne National Laboratory Oak Ridge National...
75,2022,Robert Metcalfe,,"""For the invention, standardization, and comme...","Massachusetts Institute of Technology, Harvard..."
76,2023,Avi Wigderson,,"""For reshaping our understanding of the role o...","Institute for Advanced Study, Princeton Univer..."
77,2024,Andrew Barto,,"""For developing the conceptual and algorithmic...",University of Massachusetts (Amherst)


In [57]:
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from time import sleep
    
    def get_dynamic_html(url):
        driver = webdriver.Chrome()  # Or another browser driver
        driver.get(url)
        sleep(3)  # Wait for content to load
        html = driver.page_source
        driver.quit()
        return html
    
    url = "https://www.wunderground.com/history/daily/us/tn/nashville/KBNA/date/2024-3-5"
    html = get_dynamic_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')  # Or use a more specific selector
    
    if table:
        # Process the table
        print(table)
    else:
        print("Table not found.")

ModuleNotFoundError: No module named 'selenium'