In [214]:
import requests
from bs4 import BeautifulSoup
import csv
from os import sys
from selenium import webdriver
import re
from datetime import date, timedelta
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os

In [169]:
# City name -> weather-station code. The code is substituted into the
# wunderground history URL built by get_max_temp.
# NOTE(review): the codes look like ICAO airport identifiers -- confirm.
station_code_dic = {
    "TaiPei": "RCSS",
    "ShenZhen": "ZGSZ",
    "Teheran": "OIII",
    "HangZhou": "ZSHC",
    "BeiJing": "ZBAA",
}

In [217]:
def get_max_temp(city_name : str, _date : date, station_code_dic : dict = station_code_dic) -> int:
    """Scrape the highest temperature listed for a city on one day.

    Loads the wunderground daily-history page for the city's station in a
    headless Chrome session, waits for the observation table to appear, and
    returns the largest integer found in the second column of that table.

    Args:
        city_name: Key into ``station_code_dic``.
        _date: Day to look up.
        station_code_dic: Mapping of city name -> station code used in the URL.

    Returns:
        The maximum temperature as displayed on the page.
        NOTE(review): the caller in this file treats the value as degrees
        Fahrenheit -- confirm the page's unit setting.

    Raises:
        KeyError: ``city_name`` is not in ``station_code_dic``.
        ValueError: no temperature value could be extracted from the page.
        selenium.common.exceptions.TimeoutException: the observation table
            did not appear within 10 seconds.
    """
    code = station_code_dic[city_name]

    url = f"https://www.wunderground.com/history/daily/{code}/date/{_date.year}-{_date.month}-{_date.day}"
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    # Everything that touches the browser lives inside the try block so the
    # Chrome process is always shut down, even if navigation itself fails.
    # The page is requested exactly once.
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "observation-table")))
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('div', class_='observation-table')

    vals = []
    if tables:
        for row in tables[0].find_all('tr'):
            cols = row.find_all('td')
            if len(cols) < 2:
                continue
            temp = cols[1].find('span', class_="wu-value wu-value-to")
            if temp is None:
                continue
            # Read the span's text instead of regex-matching the serialised
            # tag: this does not depend on attribute order and also accepts
            # negative readings, which a digits-only pattern silently drops.
            text = temp.get_text(strip=True)
            if re.fullmatch(r'-?\d+', text):
                vals.append(int(text))

    if not vals:
        raise ValueError(f"No temperature data found for {city_name} on {_date}")
    return max(vals)
    

In [218]:
def fill_csv_with_weather_data(station_code_dic : dict = station_code_dic, specific_cities : list[str] = None, limited_days = False, days_to_cover : int = -1, start_date : date = date(2005, 1, 1), end_date : date = date(2025, 4, 15)) -> None:
    """Scrape daily maximum temperatures and write one CSV file per city.

    For every city a file ``output/{city}_weather_{start_date}_to_{end_date}.csv``
    is (re)created with the header ``Date, Max Temperature`` followed by one
    row per day. The value returned by ``get_max_temp`` is converted with
    ``(t - 32) * 5 / 9`` and rounded to two decimals. Days whose scrape fails
    are logged to stdout and written with ``"-"`` so the date sequence has no
    gaps.

    Args:
        station_code_dic: Mapping of city name -> station code; forwarded to
            ``get_max_temp``.
        specific_cities: Cities to process. ``None`` (the default) means every
            key of ``station_code_dic``.
        limited_days: When true, stop each city after ``days_to_cover`` days.
        days_to_cover: Day limit, only used when ``limited_days`` is true.
        start_date: First day to scrape (inclusive).
        end_date: Last day to scrape (inclusive).
    """
    # Resolve the default against the mapping that was actually passed in,
    # rather than the module-level one captured when the function was defined.
    cities = station_code_dic.keys() if specific_cities is None else specific_cities

    for city in cities:
        os.makedirs("output", exist_ok=True)
        file_name = f"output/{city}_weather_{start_date}_to_{end_date}.csv"

        current_date = start_date
        nDays = 0

        # A single handle per city; every row is flushed so that an
        # interrupted multi-hour run still leaves all completed days on disk.
        with open(file_name, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Date", "Max Temperature"])

            while current_date <= end_date:
                nDays += 1
                try:
                    # Forward the mapping so a custom station_code_dic is honoured.
                    max_temp = get_max_temp(city, current_date, station_code_dic)
                    max_temp = round((max_temp - 32) * 5 / 9, 2)
                except Exception as e:
                    # Deliberate best-effort: one bad day must not abort the run.
                    print(f"Error on {city} - {current_date}: {e}")
                    max_temp = "-"

                writer.writerow([current_date, max_temp])
                f.flush()

                current_date += timedelta(days=1)

                if limited_days and nDays >= days_to_cover:
                    print(f"Breaking scrapping for city {city} after successfully scrapping {nDays} days")
                    break
                

In [221]:
# Time a short sample scrape for one city and extrapolate the total runtime.
days = 5
start = time.time()
fill_csv_with_weather_data(specific_cities=["TaiPei"], limited_days=True, days_to_cover=days)
end = time.time()
elapsed = end - start

# Scale the sample up to 365 * 20 days.
# NOTE(review): the sample covers a single city, so this estimate does not
# account for the other cities in station_code_dic -- confirm intent.
more_iterations = int(365 / days * 20)
estimation_seconds = int(elapsed * more_iterations)
estimation_hours = int(estimation_seconds / 3600)

print(f"The whole process will take approximately {estimation_hours} hours")

Breaking scrapping for city TaiPei after successfully scrapping 5 days
The whole process will take approximately 39 hours
