In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from os import sys
from selenium import webdriver
import re
from datetime import date, timedelta
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
from dateutil.relativedelta import relativedelta
import calendar



In [4]:
# Maps a human-readable city name to the station code that is interpolated
# into the Weather Underground history URL. Entries that are commented out
# are disabled stations, kept here for reference.
station_code_dic = {
    "TaiPei": "RCSS",
    # "ShenZhen": "ZGSZ",
    # "Teheran": "OIII",
    # "HangZhou": "ZSHC",
    "BeiJing": "ZBAA",
    "Seoul": "RKSS",
    "Vladivostok": "UHWW",
    "Doha": "OTHH",
}

# Matches a <td> whose only content is an (optionally negative) integer.
_TD_INT_RE = re.compile(r'<td[^>]*>\s*(-?\d+)\s*</td>')

# Seconds to wait for the observation table to appear in the rendered page.
_PAGE_LOAD_TIMEOUT_S = 20


def _extract_max_temps(html : str) -> list:
    """Pull the first-column integers out of the monthly observation table.

    Returns an empty list when the expected table structure is absent, so the
    caller can report a single, uniform "no data" error.
    """
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all('div', class_='observation-table')
    if not containers:
        return []

    whole_table = containers[0].find("table")
    if whole_table is None:
        return []

    nested_tables = whole_table.find_all("table")
    if len(nested_tables) < 2:
        return []

    # The second nested table is the one the scraper reads values from.
    relevant_table = nested_tables[1]
    last_index = len(relevant_table) - 1

    temps = []
    for index, row in enumerate(relevant_table):
        # The first and last children of the table are never data rows.
        if index == 0 or index == last_index:
            continue
        try:
            cols = row.find_all("td")
        except AttributeError:
            # Text/comment nodes between rows have no find_all(); skip them
            # instead of re-reading the previous row's cells.
            continue
        if not cols:
            continue
        match = _TD_INT_RE.search(str(cols[0]))
        if match:
            temps.append(int(match.group(1)))
        # NOTE(review): a row whose first cell is not an integer is dropped, so
        # later values shift one day earlier when written - confirm whether
        # the page can contain blank cells.
    return temps


def get_max_temp(city_name : str, year : int , month : int, file_name : str,   station_code_dic : dict = station_code_dic) -> None:
    """Scrape one month of daily maximum temperatures and append them to a CSV.

    Loads the Weather Underground monthly history page for the station mapped
    to ``city_name`` in a headless Chrome session, reads the first column of
    the temperature table, and appends one ``[ISO date, value]`` row per day
    to ``file_name``.

    Args:
        city_name: Key into ``station_code_dic``.
        year: Calendar year of the month to fetch.
        month: Calendar month (1-12) to fetch.
        file_name: Path of the CSV file to append to (created if missing).
        station_code_dic: Mapping of city name to station code.

    Raises:
        KeyError: ``city_name`` is not in ``station_code_dic``.
        ValueError: The page contained no parsable temperature values.
        selenium.common.exceptions.TimeoutException: The observation table did
            not appear within the page-load timeout.
    """
    code = station_code_dic[city_name]
    url = f"https://www.wunderground.com/history/monthly/{code}/date/{year}-{month}"

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        # Navigate exactly once, inside the try so the browser is always closed.
        driver.get(url)
        WebDriverWait(driver, _PAGE_LOAD_TIMEOUT_S).until(
            EC.presence_of_element_located((By.CLASS_NAME, "observation-table"))
        )
        html = driver.page_source
    finally:
        driver.quit()

    vals = _extract_max_temps(html)
    if not vals:
        raise ValueError(f"No temperature data found for {city_name} on {year}-{month:02d}")

    # Never emit more rows than the month has days (e.g. 30 for April).
    num_days = calendar.monthrange(year, month)[1]

    with open(file_name, "a", newline='') as f:
        writer = csv.writer(f)
        for day, value in enumerate(vals[:num_days], start=1):
            writer.writerow([date(year, month, day).isoformat(), value])


In [5]:
def fill_csv_with_weather_data(station_code_dic : dict = station_code_dic, specific_cities : "list[str] | None" = None, start_date : date = date(2005, 1, 1), end_date : date = date(2025, 3, 31)) -> None:
    """Write one CSV of daily maximum temperatures per city.

    For every requested city a file
    ``output/{city}_weather_{start_date}_to_{end_date}.csv`` is (re)created
    with a header row, then filled month by month via ``get_max_temp``. A
    month that fails is reported on stdout and skipped, so one bad page does
    not abort the whole run.

    Args:
        station_code_dic: Mapping of city name to station code; forwarded to
            ``get_max_temp``.
        specific_cities: Cities to process. ``None`` (the default) means every
            key of ``station_code_dic``.
        start_date: First month to fetch (the day component is ignored when
            stepping through months).
        end_date: Last month to fetch, inclusive.
    """
    if specific_cities is None:
        # Resolve at call time so a custom station_code_dic is honoured.
        specific_cities = station_code_dic.keys()

    os.makedirs("output", exist_ok=True)

    for city in specific_cities:
        file_name = f"output/{city}_weather_{start_date}_to_{end_date}.csv"

        # Start each city from a fresh file containing only the header.
        with open(file_name, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Date", "Max Temperature"])

        # Step from the first of the month so the final month is not skipped
        # when start_date falls later in its month than end_date does.
        current_date = start_date.replace(day=1)
        while current_date <= end_date:
            try:
                get_max_temp(
                    city,
                    current_date.year,
                    current_date.month,
                    file_name,
                    station_code_dic=station_code_dic,
                )
            except Exception as e:
                print(f"Error on {city} - {current_date}: {e}")

            current_date += relativedelta(months=1)
                

In [None]:
# Run the scraper with its defaults: every city in station_code_dic gets its
# own CSV under output/. Each city-month opens a fresh headless Chrome session,
# so a full run performs many page loads.
fill_csv_with_weather_data()