# Generate Dataframe of Weather Information 
- Scrape historical weather information from https://www.wunderground.com
- Location: KBWI

### 1. Web Scraping

In [2]:
import datetime
import pickle
from datetime import datetime as dt
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [3]:
# Scraping window: first and last calendar day passed to bdate_range
start_date, end_date = dt(2022, 2, 14), dt(2023, 2, 12)

# Time-of-day bounds; the values equal minutes after midnight for the clock times noted
start_time = 5 * 60 + 30  # 05:30:00
end_time = 21 * 60  # 21:00:00

# Business days in the window as an array of datetime.date objects (260 business dates)
busi_date = pd.bdate_range(start=start_date, end=end_date).date

In [56]:
# The tables on the page may not be fully loaded when they are read,
# so dates whose scrape failed on the first attempt have to be revisited.
# For that reason this helper function is not called directly; the next code block
# uses a modified inline version of it to ensure weather info of all dates is captured.
def get_weather_df(date):
    """Scrape one day of KBWI weather observations from wunderground.com.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Day to fetch; it is formatted as ``YYYY-MM-DD`` into the history URL.

    Returns
    -------
    pandas.DataFrame
        The second ``<table>`` element of the page parsed with ``pd.read_html``,
        with rows whose ``Time`` is NaN removed and a ``Date`` column set to ``date``.

    Raises
    ------
    IndexError
        If fewer than two tables are present (the page was not fully loaded).
    Exception
        Any error raised by the browser driver or the explicit wait
        (for example a timeout) propagates unchanged.
    """
    date_str = date.strftime('%Y-%m-%d')
    url = f'https://www.wunderground.com/history/daily/KBWI/date/{date_str}'
    driver = webdriver.Safari()
    try:
        driver.get(url)
        tables = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table"))
        )
        if len(tables) < 2:
            raise IndexError(
                f"expected at least 2 tables for {date_str}, found {len(tables)}"
            )
        # Grab the markup while the browser session is still alive.
        html = tables[1].get_attribute('outerHTML')
    finally:
        # Always release the browser session, even when the wait or lookup fails.
        driver.quit()

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    result = pd.read_html(StringIO(html))[0]
    # .copy() so that adding a column does not write into a view of the parsed frame.
    result = result[~result.Time.isna()].copy()
    result["Date"] = date
    return result

In [63]:
# Upper bound on retry rounds so that a date which can never be scraped
# (e.g. a changed page layout) cannot keep the loop running forever.
MAX_SCRAPE_ROUNDS = 10

dates = list(busi_date)
list_df_weather = []
rounds_left = MAX_SCRAPE_ROUNDS
while len(dates) > 0:
    if rounds_left == 0:
        raise RuntimeError(
            f"Gave up after {MAX_SCRAPE_ROUNDS} rounds; still missing: {dates}"
        )
    rounds_left -= 1

    # keep scraping until there's no date missed
    missed_dates = []
    for date in tqdm(dates):
        date_str = date.strftime('%Y-%m-%d')
        url = f'https://www.wunderground.com/history/daily/KBWI/date/{date_str}'

        result = None  # stays None when this attempt fails and the date must be retried
        driver = webdriver.Safari()
        try:
            driver.get(url)
            tables = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table"))
            )
            # tables may not be fully loaded; only parse when the second table exists
            if len(tables) >= 2:
                html = tables[1].get_attribute('outerHTML')
                # StringIO: passing literal HTML to read_html is deprecated
                parsed = pd.read_html(StringIO(html))[0]
                # .copy() so the new column is not written into a view of `parsed`
                result = parsed[~parsed.Time.isna()].copy()
                result["Date"] = date
        except Exception:
            # may encounter overtime (timeout) exception; `except Exception` rather than
            # a bare except so that a kernel interrupt still stops the loop
            result = None
        finally:
            # exactly one quit per browser session, on success and on failure
            driver.quit()

        if result is None:
            missed_dates.append(date)
        else:
            list_df_weather.append(result)
    print(f"Remaining (missed): {len(missed_dates)} dates!")
    dates = missed_dates
    


100%|██████████| 260/260 [13:59<00:00,  3.23s/it]


Remaining (missed): 31 dates!


100%|██████████| 31/31 [01:28<00:00,  2.85s/it]


Remaining (missed): 1 dates!


100%|██████████| 1/1 [00:02<00:00,  2.25s/it]

Remaining (missed): 0 dates!





In [64]:
# Stack the per-date frames row-wise into a single table.
# Recorded result: shape (7241, 11), covers all of 260 business dates.
df_weather = pd.concat(objs=list_df_weather, axis="index")

In [67]:
# Persist the scraped frame. The context manager closes the file handle,
# so the pickle is fully flushed to disk before the cell finishes.
with open("./TSMO_df_weather.pkl", "wb") as pkl_file:
    pickle.dump(df_weather, pkl_file)

### 2. Process Weather Dataframe
- Clean date and time to focus on business hours with 5-min granularity 
- Impute missing data
- Incorporate time embedding

In [4]:
# Reload the scraped frame saved in section 1; the context manager closes the handle.
# NOTE: pickle.load can execute arbitrary code - only load a file written by this notebook.
with open("./TSMO_df_weather.pkl", "rb") as pkl_file:
    df_weather = pickle.load(pkl_file)

In [5]:
# Display the loaded weather frame (the notebook renders a truncated head/tail preview).
df_weather

Unnamed: 0,Time,Temperature,Dew Point,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Precip.,Condition,Date
0,1:54 AM,27 °F,17 °F,66 °%,W,3 °mph,0 °mph,29.91 °in,0.0 °in,Mostly Cloudy,2022-02-14
1,2:54 AM,27 °F,17 °F,66 °%,WSW,6 °mph,0 °mph,29.92 °in,0.0 °in,Cloudy,2022-02-14
2,3:54 AM,27 °F,16 °F,63 °%,NE,3 °mph,0 °mph,29.92 °in,0.0 °in,Light Snow,2022-02-14
3,4:25 AM,26 °F,15 °F,63 °%,NE,5 °mph,0 °mph,29.92 °in,0.0 °in,Partly Cloudy,2022-02-14
4,4:54 AM,25 °F,15 °F,66 °%,NE,6 °mph,0 °mph,29.92 °in,0.0 °in,Fair,2022-02-14
...,...,...,...,...,...,...,...,...,...,...,...
19,8:54 PM,51 °F,39 °F,63 °%,SSW,3 °mph,0 °mph,29.94 °in,0.0 °in,Fair,2023-01-30
20,9:54 PM,45 °F,39 °F,80 °%,CALM,0 °mph,0 °mph,29.93 °in,0.0 °in,Fair,2023-01-30
21,10:54 PM,47 °F,39 °F,74 °%,CALM,0 °mph,0 °mph,29.93 °in,0.0 °in,Partly Cloudy,2023-01-30
22,11:54 PM,46 °F,39 °F,76 °%,CALM,0 °mph,0 °mph,29.93 °in,0.0 °in,Partly Cloudy,2023-01-30
