In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from os import sys
from selenium import webdriver
import re
from datetime import date, timedelta
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
from dateutil.relativedelta import relativedelta
import calendar

In [48]:
# City name -> station code used in the Weather Underground history URL.
# Commented-out entries are stations that are currently disabled.
station_code_dic = {
    "TaiPei": "RCSS",
    # "ShenZhen": "ZGSZ",
    # "Teheran": "OIII",
    # "HangZhou": "ZSHC",
    "BeiJing": "ZBAA",
    "Seoul": "RKSS",
    "Vladivostok": "UHWW",
    "Doha": "OTHH",
}

def get_max_temp(city_name: str, year: int, month: int, file_name: str, station_code_dic: dict = station_code_dic) -> None:
    """Scrape one month of daily maximum temperatures and append them to a CSV.

    Loads the Weather Underground monthly-history page for the station mapped
    to ``city_name`` in a headless Chrome session, takes the second nested
    table inside the ``observation-table`` element, and reads the first ``td``
    of each of its rows. Each parsed integer is appended to ``file_name`` as a
    ``[ISO date, value]`` row, dated sequentially from the 1st of the month.

    Args:
        city_name: Key into ``station_code_dic``.
        year: Calendar year to fetch.
        month: Calendar month (1-12) to fetch.
        file_name: Path of the CSV file to append to.
        station_code_dic: Mapping of city name to station code.

    Raises:
        KeyError: If ``city_name`` is not a key of ``station_code_dic``.
        ValueError: If no temperature value could be parsed from the page.
        Exception: Errors from Selenium (e.g. the 20 s wait expiring) or from
            an unexpected page structure propagate unchanged.
    """
    code = station_code_dic[city_name]
    url = f"https://www.wunderground.com/history/monthly/{code}/date/{year}-{month}"

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    # Load the page exactly once, inside the try block, so the browser is
    # always shut down even when navigation itself fails.
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "observation-table")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find_all('div', class_='observation-table')
    finally:
        driver.quit()

    vals = []
    if table:
        whole_table = table[0].find("table")
        relevant_table = whole_table.find_all("table")[1]
        # The first and last child nodes are skipped — presumably a header row
        # and a trailing non-data node; TODO confirm against the live page.
        for row in relevant_table.contents[1:-1]:
            try:
                cols = row.find_all("td")
            except AttributeError:
                # Non-tag children (text/comment nodes) have no find_all.
                # Skip them instead of re-reading the previous row's cells.
                print("Error finding td")
                continue
            if not cols:
                continue
            # Accept an optional leading minus so sub-zero readings are kept.
            match = re.search(r'<td[^>]*>\s*(-?\d+)\s*</td>', str(cols[0]))
            if match:
                vals.append(int(match.group(1)))

    if not vals:
        raise ValueError(f"No temperature data found for {city_name} on {year}-{month:02d}")

    num_days = calendar.monthrange(year, month)[1]  # e.g., 30 for April
    with open(file_name, "a", newline='') as f:
        writer = csv.writer(f)
        # Values are dated by position; anything beyond the month's length is dropped.
        for day, value in enumerate(vals[:num_days], start=1):
            writer.writerow([date(year, month, day).isoformat(), value])


In [49]:
def fill_csv_with_weather_data(station_code_dic : dict = station_code_dic, specific_cities : list[str] = station_code_dic.keys()) -> None:
    """Write one max-temperature CSV per city for 2025-01-01 through 2025-03-31.

    For each city a file ``output/{city}_weather_{start}_to_{end}.csv`` is
    (re)created with a header row, then ``get_max_temp`` is called once per
    calendar month to append that month's rows.

    Args:
        station_code_dic: Mapping of city name to station code; forwarded to
            ``get_max_temp`` so a caller-supplied mapping is actually used.
        specific_cities: Cities to process; each must be a key of
            ``station_code_dic``.

    A failure in one month is printed and skipped so the remaining months and
    cities are still processed (deliberate best-effort).
    """
    start_date = date(2025, 1, 1)
    end_date = date(2025, 3, 31)
    os.makedirs("output", exist_ok=True)

    for city in specific_cities:
        file_name = f"output/{city}_weather_{start_date}_to_{end_date}.csv"

        # Truncate any previous file and write the header; data rows are
        # appended month by month by get_max_temp.
        with open(file_name, 'w', newline='') as f:
            csv.writer(f).writerow(["Date", "Max Temperature"])

        current_date = start_date
        while current_date <= end_date:
            try:
                get_max_temp(city, current_date.year, current_date.month, file_name, station_code_dic)
            except Exception as e:
                print(f"Error on {city} - {current_date}: {e}")

            current_date += relativedelta(months=1)
                

In [50]:
# Run the scraper for every city in station_code_dic; writes one CSV per city
# under output/. Slow: each city-month starts its own headless Chrome session.
fill_csv_with_weather_data()

KeyboardInterrupt: 

In [66]:
# Station codes keyed by city name, used to build the history-page URL.
# Disabled stations: ShenZhen (ZGSZ), Teheran (OIII), HangZhou (ZSHC).
station_code_dic = dict(
    TaiPei="RCSS",
    BeiJing="ZBAA",
    Seoul="RKSS",
    Vladivostok="UHWW",
    Doha="OTHH",
)

def get_weather_data(city_name: str, year: int, month: int, file_name: str, station_code_dic: dict = station_code_dic) -> None:
    """Scrape one month of daily weather statistics and append them to a CSV.

    Loads the Weather Underground monthly-history page for the station mapped
    to ``city_name`` in a headless Chrome session and parses nested tables 1-5
    of the ``observation-table`` element as temperature, dew point, humidity,
    wind speed and pressure (max/avg/min each). One row per day is appended to
    ``file_name``: the ISO date followed by the 15 values.

    Rows are matched to days by position. A row that cannot be parsed keeps
    its position and is written as blank cells, so later days stay aligned
    and a short or partly broken table cannot abort the write half-way.

    Args:
        city_name: Key into ``station_code_dic``.
        year: Calendar year to fetch.
        month: Calendar month (1-12) to fetch.
        file_name: Path of the CSV file to append to.
        station_code_dic: Mapping of city name to station code.

    Raises:
        KeyError: If ``city_name`` is not a key of ``station_code_dic``.
        ValueError: If no temperature row could be parsed from the page.
        Exception: Errors from Selenium (e.g. the 20 s wait expiring) or from
            an unexpected page structure (e.g. fewer than six nested tables)
            propagate unchanged.
    """
    code = station_code_dic[city_name]
    print(f"\n=== Processing {city_name} - {year}-{month:02d} ===")
    url = f"https://www.wunderground.com/history/monthly/{code}/date/{year}-{month}"

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "observation-table")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find_all('div', class_='observation-table')
    finally:
        # Always release the browser, even when loading or waiting fails.
        driver.quit()

    temp_vals, dew_vals, humidity_vals, wind_vals, pressure_vals = [], [], [], [], []

    if table:
        whole_table = table[0].find("table")
        relevant_tables = whole_table.find_all("table")

        temp_vals = _parse_stat_table(relevant_tables[1], "temperature")
        dew_vals = _parse_stat_table(relevant_tables[2], "dew point")
        humidity_vals = _parse_stat_table(relevant_tables[3], "humidity")
        wind_vals = _parse_stat_table(relevant_tables[4], "wind speed")
        pressure_vals = _parse_stat_table(relevant_tables[5], "pressure")

    if not any(entry is not None for entry in temp_vals):
        raise ValueError(f"No temperature data found for {city_name} on {year}-{month:02d}")

    num_days = calendar.monthrange(year, month)[1]
    all_series = (temp_vals, dew_vals, humidity_vals, wind_vals, pressure_vals)

    with open(file_name, "a", newline='') as f:
        writer = csv.writer(f)
        # The temperature table decides how many days are written, capped at
        # the real length of the month.
        for i in range(min(num_days, len(temp_vals))):
            row = [date(year, month, i + 1).isoformat()]
            for series in all_series:
                row.extend(_stat_cells(series, i))
            writer.writerow(row)


def _parse_stat_table(stat_table, label: str) -> list:
    """Parse one max/avg/min table into a list with one entry per data row.

    Each entry is a dict with float values under 'max', 'avg' and 'min', or
    ``None`` when the row could not be parsed (the error is printed).
    """
    children = stat_table.contents
    last_index = len(children) - 1
    parsed = []
    for id, row in enumerate(children):
        # The first and last child nodes are skipped — presumably a header row
        # and a trailing non-data node; TODO confirm against the live page.
        if id == 0 or id == last_index:
            continue
        try:
            cols = row.find_all("td")
            parsed.append({
                'max': float(cols[0].get_text().strip()),
                'avg': float(cols[1].get_text().strip()),
                'min': float(cols[2].get_text().strip())
            })
        except Exception as e:
            print(f"Error processing {label} row {id}: {e}")
            # Keep the slot so later rows stay on their own day.
            parsed.append(None)
    return parsed


def _stat_cells(series: list, index: int) -> list:
    """Return the [max, avg, min] CSV cells for one day, blank when missing."""
    entry = series[index] if index < len(series) else None
    if entry is None:
        return ["", "", ""]
    return [entry['max'], entry['avg'], entry['min']]

In [67]:
def fill_csv_with_weather_data(station_code_dic: dict = station_code_dic, specific_cities: list[str] = station_code_dic.keys()) -> None:
    """Write one full-statistics CSV per city for 2005-01-01 through 2025-03-31.

    For each city a file ``output/{city}_weather_{start}_to_{end}.csv`` is
    (re)created with a 16-column header, then ``get_weather_data`` is called
    once per calendar month to append that month's rows.

    Args:
        station_code_dic: Mapping of city name to station code; forwarded to
            ``get_weather_data`` so a caller-supplied mapping is actually used.
        specific_cities: Cities to process; each must be a key of
            ``station_code_dic``.

    A failure in one month is printed and skipped so the remaining months and
    cities are still processed (deliberate best-effort).
    """
    start_date = date(2005, 1, 1)
    end_date = date(2025, 3, 31)
    os.makedirs("output", exist_ok=True)

    header = [
        "Date",
        "Max Temp", "Avg Temp", "Min Temp",
        "Max Dew", "Avg Dew", "Min Dew",
        "Max Humid", "Avg Humid", "Min Humid",
        "Max Wind", "Avg Wind", "Min Wind",
        "Max Press", "Avg Press", "Min Press"
    ]

    for city in specific_cities:
        file_name = f"output/{city}_weather_{start_date}_to_{end_date}.csv"

        # Truncate any previous file and write the header; data rows are
        # appended month by month by get_weather_data.
        with open(file_name, 'w', newline='') as f:
            csv.writer(f).writerow(header)

        current_date = start_date
        while current_date <= end_date:
            try:
                get_weather_data(city, current_date.year, current_date.month, file_name, station_code_dic)
            except Exception as e:
                print(f"Error on {city} - {current_date}: {e}")

            current_date += relativedelta(months=1)
                

In [68]:
# Run the full scrape: every city in station_code_dic, month by month over the
# start_date..end_date range set in fill_csv_with_weather_data (2005-01 to 2025-03).
fill_csv_with_weather_data()


=== Processing TaiPei - 2005-01 ===

=== Processing TaiPei - 2005-02 ===

=== Processing TaiPei - 2005-03 ===

=== Processing TaiPei - 2005-04 ===

=== Processing TaiPei - 2005-05 ===

=== Processing TaiPei - 2005-06 ===

=== Processing TaiPei - 2005-07 ===

=== Processing TaiPei - 2005-08 ===

=== Processing TaiPei - 2005-09 ===

=== Processing TaiPei - 2005-10 ===

=== Processing TaiPei - 2005-11 ===

=== Processing TaiPei - 2005-12 ===

=== Processing TaiPei - 2006-01 ===

=== Processing TaiPei - 2006-02 ===

=== Processing TaiPei - 2006-03 ===

=== Processing TaiPei - 2006-04 ===

=== Processing TaiPei - 2006-05 ===

=== Processing TaiPei - 2006-06 ===

=== Processing TaiPei - 2006-07 ===

=== Processing TaiPei - 2006-08 ===

=== Processing TaiPei - 2006-09 ===

=== Processing TaiPei - 2006-10 ===

=== Processing TaiPei - 2006-11 ===

=== Processing TaiPei - 2006-12 ===

=== Processing TaiPei - 2007-01 ===

=== Processing TaiPei - 2007-02 ===

=== Processing TaiPei - 2007-03 ===

