# Collect data

* Import packages

In [1]:
import json
import time
import pandas as pd
from datetime import datetime, timedelta
from datetime import date
from selenium import webdriver

* The `collect_one_day_hist` function parses the page's HTML to get the weather data for a given date; the data contains one row for every 30 minutes of that day. Each row is saved as a dictionary and then appended to a list named `data`. Because we cannot use the `requests` package for this site, we crawl the data with `selenium` instead.

* Parameters:

1. `dt`: the date to crawl data for (YYYY-MM-DD)
2. `data`: the list that the row dictionaries are appended to
3. `driver`: the Chrome webdriver

In [24]:
def collect_one_day_hist(dt, data, driver, wait_seconds=10):
    """Scrape one day of observations from Weather Underground into ``data``.

    Loads the daily-history page for ``dt``, waits for it to render, then
    turns every matching table row into a dictionary of strings.

    Parameters
    ----------
    dt : str
        Date to crawl (YYYY-MM-DD); interpolated verbatim into the URL and
        stored in each record under ``'date'``.
    data : list
        List the row dictionaries are appended to (mutated in place).
    driver : selenium WebDriver
        Browser session used to load the page and locate the table rows.
    wait_seconds : float, optional
        Seconds to sleep after navigation before reading the table.
        Defaults to 10, the delay that was previously hard-coded.

    Returns
    -------
    list
        The same ``data`` list, with one dictionary added per parsed row.
    """
    url = f'https://www.wunderground.com/history/daily/vn/quận-tân-bình/VVTS/date/{dt}'
    driver.get(url)
    # Fixed pause so the table has time to appear before it is queried.
    time.sleep(wait_seconds)
    # The generic find_elements(by, value) call is used because the
    # find_elements_by_xpath shortcut no longer exists in current Selenium
    # releases; 'xpath' is the locator strategy string (selenium's By.XPATH).
    rows = driver.find_elements('xpath', '//tr[@class="mat-row cdk-row ng-star-inserted"]')
    for row in rows:
        tokens = row.text.split()
        # The highest fixed index read below is 15; a shorter row cannot be
        # parsed and would otherwise abort the whole crawl with IndexError.
        if len(tokens) < 16:
            continue
        # Positions 3, 5, 7, 10, 12, 14 and 16 are skipped on purpose --
        # presumably unit labels following each reading; confirm against the page.
        data.append({
            'date': dt,
            'time': ' '.join(tokens[:2]),
            'temperature': tokens[2],
            'dew_point': tokens[4],
            'humidity': tokens[6],
            'wind': tokens[8],
            'wind_speed': tokens[9],
            'wind_gust': tokens[11],
            'pressure': tokens[13],
            'precip.': tokens[15],
            'condition': ' '.join(tokens[17:]),
        })
    return data

* The `collect_weather_hist` function invokes `collect_one_day_hist` once per day, starting at `start_date` and stopping after `end_date`, so we get the weather data from `start_date` to `end_date` (inclusive).

* Parameters:

1. `start_date`: the first date to get data for (YYYY-MM-DD)
2. `end_date`: the last date to get data for, inclusive (YYYY-MM-DD)

In [25]:
def collect_weather_hist(start_date, end_date):
    """Collect weather records for every day from ``start_date`` to ``end_date``.

    Both dates are parsed before the browser is started, so a malformed date
    raises ``ValueError`` without launching Chrome. One Chrome session is
    reused for all days and is always shut down, even when a day's crawl fails.

    Parameters
    ----------
    start_date : str
        First day to crawl, in ISO format (YYYY-MM-DD).
    end_date : str
        Last day to crawl, inclusive, in ISO format (YYYY-MM-DD).

    Returns
    -------
    list
        The row dictionaries produced by ``collect_one_day_hist`` for all
        days, in crawl order. Empty when ``start_date`` is after ``end_date``.
    """
    data = []
    date_s, date_e = date.fromisoformat(start_date), date.fromisoformat(end_date)
    one_day = timedelta(days=1)
    # NOTE(review): passing the chromedriver path positionally is the
    # Selenium 3 calling convention; newer Selenium releases expect a Service
    # object instead -- confirm against the installed Selenium version.
    driver = webdriver.Chrome('./chromedriver')
    try:
        while date_s <= date_e:
            data = collect_one_day_hist(str(date_s), data, driver)
            date_s += one_day
    finally:
        # Runs on success and on error, so the browser process is not leaked.
        driver.quit()
    return data

In [33]:
# Crawl window in YYYY-MM-DD form; both ends are included in the crawl.
start_date, end_date = '2021-07-01', '2021-09-04'

* Invoke `collect_weather_hist` function and store the data into `data_weather`.

In [34]:
# Crawl every day in the configured window; the result is a list of row dictionaries.
data_weather = collect_weather_hist(start_date=start_date, end_date=end_date)

* Convert the list of dictionaries to a pandas DataFrame

In [36]:
# One DataFrame row per scraped record; columns come from the dictionary keys.
df = pd.DataFrame(data=data_weather)

In [37]:
# Preview the first five records.
df.head(n=5)

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip.,condition
0,2021-07-01,12:00 AM,81,79,94,WSW,6,0,29.76,0.0,Partly Cloudy
1,2021-07-01,12:30 AM,81,79,94,WSW,7,0,29.76,0.0,Partly Cloudy
2,2021-07-01,1:00 AM,82,79,89,SW,6,0,29.76,0.0,Fair
3,2021-07-01,1:30 AM,81,79,94,SW,6,0,29.76,0.0,Fair
4,2021-07-01,2:00 AM,81,79,94,SSW,7,0,29.73,0.0,Fair


* Shape of dataframe

In [38]:
# (number of rows, number of columns) of the collected table.
df.shape

(3162, 11)

* Save the DataFrame to an Excel file (`weather_hist.xlsx`)

In [42]:
# index=False keeps the DataFrame's row index out of the written spreadsheet.
df.to_excel('./weather_hist.xlsx',index=False)