In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from selenium import webdriver

In [2]:
### SCRAPE SETTINGS
# Location slug that is inserted into the site's URL path
city = 'santa-monica'
# Year bounds for the scrape
start_yr, end_yr = 2010, 2016

### URL pieces -- a full URL is url_root + <month> + yr_root + <year>
url_root = 'https://www.timeanddate.com/weather/usa/%s/historic?month=' % city
yr_root = '&year='

In [3]:
#### FUNCTIONS

# NAME: process_page
# @param driver: webdriver object
# @param u: url to process as string
# @param y: year as numeric
def process_page(driver, u, y):
    """Scrape the hourly table of every day linked from one monthly page.

    Loads the monthly page at ``u``, then clicks each daily link found in the
    ``weatherLinks`` div and parses the ``wt-his`` table displayed afterwards.

    Args:
        driver: Selenium webdriver object used to load and click through the page.
        u: URL of the monthly page, as a string.
        y: Year; copied unchanged into the first field of every result tuple.

    Returns:
        A list of ``(y, link_text, timestamp, temp, humidity)`` tuples, one per
        row of each daily table. ``timestamp`` is the part of the row header
        up to and including "am"/"pm", ``temp`` is the first run of 2-3 digits
        in the second cell, and ``humidity`` is the text of the sixth cell.
        All scraped fields are strings.

    Raises:
        ValueError: if the page contains no ``weatherLinks`` div.
    """
    # Get page and identify all daily links for the month
    driver.get(u)
    month_soup = BeautifulSoup(driver.page_source, 'html.parser')
    link_box = month_soup.find('div', {'class': 'weatherLinks'})
    if link_box is None:
        raise ValueError('no weatherLinks div found at %s' % u)

    # Create empty object to hold results
    results = list()

    # For each daily page, parse table to get hourly temp & humidity.
    # Only <a> elements are visited: iterating the div directly would also
    # yield whitespace text nodes, which cannot be located by link text.
    for link in link_box.find_all('a'):
        l = link.get_text()
        # Generic locator form ('link text' is the value of By.LINK_TEXT);
        # the find_element_by_* helpers were removed in Selenium 4.
        driver.find_element('link text', l).click()

        # Re-read the page source AFTER the click; parsing the source captured
        # before the click would return the same table for every day.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', {'id': 'wt-his'})
        table = table.find('tbody')
        rows = table.find_all('tr')

        for row in rows:
            headers = row.find_all('th')
            timestamp = re.search(r'.*(am|pm)', headers[0].get_text()).group(0)
            cells = row.find_all('td')
            temp = re.search(r'\d{2,3}', cells[1].get_text()).group(0)
            humidity = cells[5].get_text()
            results.append((y, l, timestamp, temp, humidity))

    return results

In [5]:
# Build year & month objects for iteration
# NOTE: range() excludes its stop value, so end_yr itself is NOT scraped
yrs = range(start_yr, end_yr)
mnths = range(1, 13)

# Process each year/month and write one CSV per month
# Restart webdriver between months in order to minimize crashes
for y in yrs:
    for m in mnths:
        url = url_root + str(m) + yr_root + str(y)
        print(url)

        # Launch webdriver; the finally clause guarantees the browser is shut
        # down even when scraping this month raises, so no process is leaked
        driver = webdriver.Firefox()
        try:
            results = process_page(driver, url, y)
        finally:
            driver.quit()

        df = pd.DataFrame(results, columns = ('year', 'date', 'hour', 'temp', 'humidity'))
        df.to_csv('Data/' + str(y) + '_' + str(m) + '.csv')

https://www.timeanddate.com/weather/usa/santa-monica/historic?month=1&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=2&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=3&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=4&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=5&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=6&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=7&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=8&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=9&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=10&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=11&year=2016
https://www.timeanddate.com/weather/usa/santa-monica/historic?month=12&year=2016
