# Weather data collection

In [96]:
# Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By 

import numpy as np

import time
from datetime import date, timedelta

In [56]:
# Common prefix of every Weather Underground daily-history page requested below.
baseURL = "https://www.wunderground.com/history/daily/"

# Station-specific path segments that are appended to baseURL; a date string
# is appended after them to address a single day.
sfAddonURL = "us/ca/san-bruno/KSFO/date/"        # used for the San Francisco data
sbAddonURL = "us/ca/san-bernardino/KSBD/date/"   # used for the San Bernardino data

In [None]:
# Two independent, initially empty lists: one per location.
sf_dailies, sb_dailies = [], []

In [57]:
def getDateRange(start_date, end_date):
    """Yield every day from ``start_date`` up to, but not including, ``end_date``.

    The values produced are date objects (``start_date`` plus a whole number
    of days), not formatted strings.  Nothing is yielded when ``end_date`` is
    on or before ``start_date``.
    """
    day_count = (end_date - start_date).days
    yield from (start_date + timedelta(days=offset) for offset in range(day_count))

# Date fragments that are appended to a station URL, written as
# year-month-day with no zero padding (e.g. "2018-1-5").
#
# The fields are formatted by hand instead of with strftime("%Y-%-m-%-d"):
# the "%-m" / "%-d" flags are a platform-specific extension of the C library
# and are not available everywhere (e.g. Windows), whereas this form produces
# the same text on every platform.

# San Francisco: 2017-12-31 through 2018-03-01 (the end date is exclusive).
sfDateURLs = [
    f"{day.year}-{day.month}-{day.day}"
    for day in getDateRange(date(2017, 12, 31), date(2018, 3, 2))
]

# San Bernardino: 2016-06-30 through 2016-09-01 (the end date is exclusive).
sbDateURLs = [
    f"{day.year}-{day.month}-{day.day}"
    for day in getDateRange(date(2016, 6, 30), date(2016, 9, 2))
]

In [108]:
def buildURLForDate(base, loc, date):
    """Return the page URL formed by joining ``base``, ``loc`` and ``date`` in order.

    The three strings are concatenated as-is; no separators are inserted.
    """
    pieces = (base, loc, date)
    return "".join(pieces)

# Module-level accumulator: extractData adds the 'Condition' value of every
# scraped row to this set, so it ends up holding each distinct condition seen
# across all extractData calls.
possible_conditions = set()

def extractData(base, loc, dates):
    """Scrape the observation table of every requested day and return all rows.

    A single Chrome window, driven by the chromedriver binary at
    ``./chromedriver``, is opened and reused for every page.  The browser is
    always shut down when the function exits, including when a page load or
    element lookup raises.

    Args:
        base: URL prefix shared by all pages.
        loc: Station-specific path segment appended to ``base``.
        dates: Iterable of date strings; each one is appended to
            ``base + loc`` to form the URL of one day's page.

    Returns:
        A 2-D numpy object array.  The first row is the header; each further
        row holds the date string, the text of the six scraped table columns,
        and the visibility value read from that day's page.

    Side effects:
        Adds each row's condition text to the module-level
        ``possible_conditions`` set and prints every row as a progress line
        that overwrites the previous one.
    """
    PATH = './chromedriver'

    # NOTE(review): 'Humdity' is misspelled; it is kept unchanged here so the
    # header written to already-produced CSV files stays the same.
    data = [['Date', 'Time', 'Temperature (F)', 'Humdity', 'Wind Speed (mph)',
             'Rainfall (in)', 'Condition', 'Visibility']]

    # CSS selectors of the table columns to extract, in output order.
    relevant_selectors = ['.cdk-cell.mat-column-dateString',
                          '.cdk-cell.mat-column-temperature',
                          '.cdk-cell.mat-column-humidity',
                          '.cdk-cell.mat-column-windSpeed',
                          '.cdk-cell.mat-column-precipRate',
                          '.cdk-cell.mat-column-condition']
    visibility_selector = ('.ng-star-inserted:nth-child(8) '
                           '.ng-star-inserted+ .ng-star-inserted '
                           'th+ .ng-star-inserted')

    driver = webdriver.Chrome(service=Service(PATH))
    try:
        for day in dates:
            driver.get(buildURLForDate(base, loc, day))
            # Fixed pause before querying the page; presumably gives the
            # dynamically rendered table time to appear.
            time.sleep(5)

            # One visibility value per page, repeated on every row of that day.
            vis = driver.find_element(By.CSS_SELECTOR, visibility_selector).text

            table = driver.find_element(By.CSS_SELECTOR, '.observation-table')

            # The page is read column by column: one list of cell texts per
            # selector.
            columns = [
                [cell.text for cell in table.find_elements(By.CSS_SELECTOR, selector)]
                for selector in relevant_selectors
            ]

            # Transpose the columns into rows.  zip stops at the shortest
            # column, so a partially rendered table yields fewer rows instead
            # of raising IndexError part-way through the scrape.
            for cells in zip(*columns):
                row = [day, *cells, vis]
                print(row, end='\r')
                data.append(row)
                # Second-to-last field is the condition column.
                possible_conditions.add(row[-2])
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process); close() would only close the current window.
        driver.quit()

    return np.array(data, dtype='object')

# San Francisco: scrape, then write the CSV before the next scrape starts.
sf_data = extractData(base=baseURL, loc=sfAddonURL, dates=sfDateURLs)
np.savetxt(fname='sf-weather.csv', X=sf_data, fmt='%s', delimiter=',')

# San Bernardino: same two steps with its own station path and date list.
sb_data = extractData(base=baseURL, loc=sbAddonURL, dates=sbDateURLs)
np.savetxt(fname='sb-weather.csv', X=sb_data, fmt='%s', delimiter=',')

# Show every distinct condition string collected by the two scrapes.
print(possible_conditions)

{'Rain', 'Cloudy / Windy', 'Mostly Cloudy', 'Light Rain', 'Patches of Fog', 'Light Rain / Windy', 'Haze', 'Partly Cloudy', 'Rain / Windy', 'Fair / Windy', 'Heavy Rain / Windy', 'Mist', 'Fair', 'Partly Cloudy / Windy', 'Heavy Rain', 'Mostly Cloudy / Windy', 'Fog', 'Cloudy', 'Blowing Sand'}
