In [1]:
# Weather.gov has a dynamic table produced after the HTML is loaded.  To extract that information, I installed selenium.
# With selenium I can use Python to control the browser and extract the rendered HTML after the tables are generated.
# (I tried first with requests-html, but didn't have luck... but that installation made the selenium installation easier because
# it installed the required Chrome web driver as well)

# Outputs one csv per half-month chunk to ../data/weather_gov/weather_gov[startdatechunk]-[enddatechunk].csv (dates as yyyymmdd)

#pip install requests-html
#pip install selenium

In [3]:
from bs4 import BeautifulSoup
import time
from datetime import datetime as dt
import datetime
from dateutil.relativedelta import relativedelta
from selenium import webdriver
import re
import pandas as pd

In [5]:
# Path prefix shared by every output csv.  The chunk loop appends the chunk's date range and '.csv'
# directly to this string (no separator is inserted between the prefix and the dates).
filepath = '../data/weather_gov/weather_gov' # this will be followed by yyyymmdd-yyyymmdd.csv

In [7]:
# Date range to download.  The url-generation cell walks from startdate one month at a time
# and clips the final request at enddate.
startdate = datetime.date(2018,1,1) # note that this NEEDS TO BE THE FIRST OF THE MONTH for my later chunk loop to function
enddate = datetime.date(2025,4,11)

In [9]:
# url will be like https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20180101&end=20180120&plot=
# The url is assembled as url_pt1 + <start yyyymmdd> + url_pt2 + <end yyyymmdd> + url_pt3,
# so only the start= and end= query values change between requests.
url_pt1 = 'https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start='
url_pt2 = '&end='
url_pt3 = '&plot='

In [27]:
# request half a month at a time
# generate the urls

def build_half_month_chunks(first_day, last_day) :
    """Split the date range first_day..last_day into half-month download chunks.

    Every calendar month touched by the range is cut into two halves (days 1-15 and
    day 16 through the last day of the month).  Each half is clipped to the requested
    range and is emitted whenever at least one day of it remains, so a range that ends
    exactly on the 16th still gets its final one-day chunk.

    Parameters
    ----------
    first_day, last_day : datetime.date
        First and last day to request.

    Returns
    -------
    list of dict
        One dict per chunk, in chronological order, with the keys
        'url'             - request url built from url_pt1/url_pt2/url_pt3 and the chunk dates
        'year_month'      - 'yyyy-mm-' prefix of the chunk's month
        'prev_year_month' - 'yyyy-mm-' prefix of the month before it
        'month'           - month number of the chunk's month
        'filename'        - 'yyyymmdd-yyyymmdd' for the chunk's start and end dates
    """
    chunks = []
    month_start = first_day.replace(day=1)

    while month_start <= last_day :
        next_month_start = month_start + relativedelta(months=1)
        halves = (
            (month_start, month_start + datetime.timedelta(days=14)),
            (month_start + datetime.timedelta(days=15), next_month_start - datetime.timedelta(days=1)),
        )

        for half_start, half_end in halves :
            # clip the half to the requested range; skip it when nothing is left
            chunk_start = max(half_start, first_day)
            chunk_end = min(half_end, last_day)
            if chunk_start > chunk_end :
                continue

            start_text = chunk_start.isoformat().replace("-", "")
            end_text = chunk_end.isoformat().replace("-", "")
            prev_month_start = month_start - relativedelta(months=1)

            chunks.append({
                'url' : url_pt1 + start_text + url_pt2 + end_text + url_pt3,
                'year_month' : month_start.isoformat()[0:8],            # yyyy-mm- for later (I'll need it!)
                'prev_year_month' : prev_month_start.isoformat()[0:8],
                'month' : month_start.month,
                'filename' : start_text + '-' + end_text,
            })

        month_start = next_month_start

    return chunks


chunks = build_half_month_chunks(startdate, enddate)

# Parallel lists, one entry per chunk, indexed together by the download loop.
urls = [chunk['url'] for chunk in chunks]
year_month_part = [chunk['year_month'] for chunk in chunks]
prev_year_month_part = [chunk['prev_year_month'] for chunk in chunks] # The report request for a month includes several hours from the last day of the previous month.  This should categorize those.
expectedmon = [chunk['month'] for chunk in chunks]                    # This is the array to compare to.  If the month is not the expected month, go with the previous_year_month instead.
filenames = [chunk['filename'] for chunk in chunks]

In [29]:
# Sanity check: show the first few chunks (at most five) and the very last url.
# Slicing and the emptiness guard keep this cell from raising when the date range is short.
preview = zip(urls[:5], year_month_part[:5], expectedmon[:5], prev_year_month_part[:5], filenames[:5])
for chunk_url, chunk_year_month, chunk_month, chunk_prev_year_month, chunk_filename in preview :
    print(chunk_url)
    print(chunk_year_month)
    print(chunk_month)
    print(chunk_prev_year_month)
    print(chunk_filename)
if urls :
    print(urls[-1])

https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20180101&end=20180115&plot=
2018-01-
1
2017-12-
20180101-20180115
https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20180116&end=20180131&plot=
2018-01-
1
2017-12-
20180116-20180131
https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20180201&end=20180215&plot=
2018-02-
2
2018-01-
20180201-20180215
https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20180216&end=20180228&plot=
2018-02-
2
2018-01-
20180216-20180228
https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=ta

In [15]:
# Open one selenium-controlled Chrome session; the download loop reuses it for every url.
browser = webdriver.Chrome()

In [19]:
# process the dates from something like 'Dec 9, 7:48 pm' into dates and times like [2020-12-30] & 07:48

def extract_dateparts(extract_from) :
    """Split a timestamp such as 'Dec 9, 7:48 pm' into month, day, time and hour.

    Parameters
    ----------
    extract_from : str
        Text holding a 3-letter month, a day number, and a 12-hour time with am/pm.

    Returns
    -------
    tuple or None
        (month number as int, zero-padded day 'dd', 24-hour time 'HH:MM', hour 'HH').
        When the text does not match, a warning is printed and None is returned.
    """
    match = re.search(r'([A-Z][a-z][a-z])\s(\d+), (\d+:\d+\d+ .m)', extract_from)   # search for the 3-letter month (Aug), 9 and 7:48 pm
    if not match :
        print('Missed a regex match in a row... look into this and try again')
        return None

    monthnum = datetime.datetime.strptime(match.group(1), "%b").month
    daypart = match.group(2).zfill(2)                                          # '9' -> '09'
    timepart = dt.strptime(match.group(3), '%I:%M %p').strftime('%H:%M')       # 12-hour clock -> 24-hour clock
    hourpart = timepart[0:2]
    return monthnum, daypart, timepart, hourpart

# Spot-check the parser on a morning, an evening, a midnight-hour and a noon-hour timestamp.
for sample_stamp in ('Apr 29, 5:48 am', 'Aug 19, 7:48 pm', 'Dec 9, 12:48 am', 'Sep 1, 12:48 pm') :
    print(extract_dateparts(sample_stamp))

(4, '29', '05:48', '05')
(8, '19', '19:48', '19')
(12, '09', '00:48', '00')
(9, '01', '12:48', '12')


In [31]:
# processing chunk X of Y: url
#
# For every chunk url: load the page in the selenium browser, wait for the table to be generated,
# parse the rendered OBS_DATA table and write its rows to one csv per chunk.
# A chunk whose table is missing is reported and skipped instead of aborting the whole run.

obs_columns = ['date','time','hr','temp','wind_direction','wind_speed','wind_gust','visibility_miles','weather','clouds','prcp_1_hr']
total_chunks = len(urls)

for j, url in enumerate(urls) :   # j also indexes year_month_part, prev_year_month_part, expectedmon and filenames
    print(f"\rprocessing chunk {j + 1} of {total_chunks}: {url}                  ", end="")
    browser.get(url)
    time.sleep(15)   # fixed pause so the dynamically generated table is present in page_source
    soup = BeautifulSoup(browser.page_source, 'lxml')

    # look the table up once and make sure it (and its header row) actually exist
    obs_table = soup.find('table', attrs={'id' : 'OBS_DATA'})
    header_row = obs_table.find('tr') if obs_table is not None else None
    if header_row is None :
        print()
        print('error.  OBS_DATA table not found, no csv written.  In url ' + url)
        print()
        continue

    # if the heat index and/or wind chill is in there, the columns get shifted:
    # wind direction and everything after it move right by one per extra column
    header_text = header_row.text
    heatshift = 1 if 'Heat' in header_text else 0
    chillshift = 1 if 'Chill' in header_text else 0
    shift = heatshift + chillshift

    header_cells = header_row.find_all('th')
    if len(header_cells) <= 7 + shift or header_cells[7 + shift].text.strip() != 'Weather' :
        print()
        print('error.  "Weather" header is not where expected.  In url ' + url)         # check these later?
        print()

    records = []   # one dict per observation row; turned into a DataFrame in one go

    for row in obs_table.find_all('tr') :
        data = row.find_all('td')
        if len(data) > 0 :
            dateparts = extract_dateparts(data[0].text)
            if dateparts is None :
                print('error.  Unparseable timestamp, row skipped.  In url ' + url)
                print()
                continue
            monthnum, daypart, timepart, hr = dateparts

            # rows from the last day of the previous month get the previous month's yyyy-mm- prefix
            if monthnum == expectedmon[j] :
                year_month = year_month_part[j]
            else :
                year_month = prev_year_month_part[j]

            # wind speeds display as '7', '9', '10', '0', and can include gusts like '20G43'
            wind_data = re.search(r'^(\d+)\D*(\d*$)', data[5 + shift].text)
            if wind_data :
                wind_speed, wind_gust = wind_data.group(1), wind_data.group(2)
            else :
                wind_speed, wind_gust = '0', '0'

            records.append({
                'date' : year_month + daypart,
                'time' : timepart,
                'hr' : hr,
                'temp' : data[1].text,
                'wind_direction' : data[4 + shift].text,
                'wind_speed' : wind_speed,
                'wind_gust' : wind_gust,
                'visibility_miles' : data[6 + shift].text,
                'weather' : data[7 + shift].text,
                'clouds' : data[8 + shift].text,
                'prcp_1_hr' : data[12 + shift].text,
            })
        elif records :
            print('error.  Unexpected blank row (more than just first row).  In url ' + url)
            print()

    df = pd.DataFrame(records, columns=obs_columns)
    df.to_csv(filepath + filenames[j] + '.csv', index=False)

print()
print('done!')

processing chunk 175 of 175: https://www.weather.gov/wrh/timeseries?site=KBNA&hours=500&units=english&chart=off&headers=on&obs=tabular&hourly=true&pview=standard&font=12&history=yes&start=20250401&end=20250411&plot=                  
done!
