In [1]:
import requests
from bs4 import BeautifulSoup
import pathlib

pathlib.Path('data').mkdir(exist_ok=True)
pathlib.Path('data/maple').mkdir(exist_ok=True)

# 楓葉紅葉日期
if not pathlib.Path('data/maple/maple_foliage_2024.csv').exists():
    response = requests.get('https://www.data.jma.go.jp/sakura/phn_014.html')
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')
    with open('data/maple/maple_foliage_2024.csv', 'w', encoding='utf-8') as file:
        file.write('location,date,avg_year_diff,last_year_diff\n')
        for row in rows:
            columns = row.find_all('th')
            if len(columns) != 0:
                continue
            columns = row.find_all('td')
            location_name = columns[0].text.replace(' ', '').strip()
            foliage_date = columns[4].text.replace(' ', '').strip()
            avg_year_diff = columns[5].text.replace(' ', '').strip()
            last_year_diff = columns[6].text.replace(' ', '').strip()
            file.write(f'{location_name},{foliage_date},{avg_year_diff},{last_year_diff}\n')

# 楓葉落葉日期
if not pathlib.Path('data/maple/maple_shedding_2024.csv').exists():
    response = requests.get('https://www.data.jma.go.jp/sakura/phn_015.html')
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')
    with open('data/maple/maple_shedding_2024.csv', 'w', encoding='utf-8') as file:
        file.write('location,date,avg_year_diff,last_year_diff\n')
        for row in rows:
            columns = row.find_all('th')
            if len(columns) != 0:
                continue
            columns = row.find_all('td')
            location_name = columns[0].text.replace(' ', '').strip()
            foliage_date = columns[4].text.replace(' ', '').strip()
            avg_year_diff = columns[5].text.replace(' ', '').strip()
            last_year_diff = columns[6].text.replace(' ', '').strip()
            file.write(f'{location_name},{foliage_date},{avg_year_diff},{last_year_diff}\n')

In [2]:
import pandas as pd

foliage_data = pd.read_csv('data/maple/maple_foliage_2024.csv', encoding='utf-8')
shedding_data = pd.read_csv('data/maple/maple_shedding_2024.csv', encoding='utf-8')

foliage_data.head()

Unnamed: 0,location,date,avg_year_diff,last_year_diff
0,旭川,10月25日,2,-2
1,札幌,11月11日,14,1
2,帯広,10月29日,9,-3
3,釧路,10月19日,3,-4
4,室蘭,11月17日,10,-3


In [3]:
location_names = foliage_data['location'].unique()
location_names

array(['旭川', '札幌', '帯広', '釧路', '室蘭', '函館', '青森', '秋田', '盛岡', '山形', '仙台',
       '福島', '新潟', '金沢', '富山', '長野', '宇都宮', '福井', '前橋', '熊谷', '水戸', '岐阜',
       '名古屋', '甲府', '銚子', '津', '静岡', '東京', '横浜', '松江', '鳥取', '京都', '彦根',
       '下関', '広島', '岡山', '神戸', '大阪', '和歌山', '奈良', '福岡', '佐賀', '大分', '長崎',
       '熊本', '鹿児島', '宮崎', '松山', '高松', '高知', '徳島'], dtype=object)

In [4]:
from time import sleep

def fetch_history_weather_data(latitude, longtitude, start_date, end_date, variables):
    url = (
        f'https://archive-api.open-meteo.com/v1/archive?'
        f'latitude={latitude}&'
        f'&longitude={longtitude}&'
        f'start_date={start_date}&'
        f'end_date={end_date}&'
        f'{variables}&'
        'timezone=Asia/Tokyo'
    )
    for _ in range(3):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.Timeout:
            print('Timeout, retrying...')
            sleep(5)
        except requests.exceptions.RequestException as e:
            print(f'Request failed: {e}')
            break
    return None

In [5]:
from tqdm import tqdm
import json

def download_weather_data(geolocator, location_names, directory):
    for location_name in (pbar := tqdm(location_names)):
        pbar.set_description('Downloading')
        pbar.set_postfix_str(location_name)
        
        pathlib.Path(f'{directory}/{location_name}').mkdir(exist_ok=True, parents=True)

        location = geolocator.geocode(location_name)
        if location is None:
            print(f'{location_name} not found')
            continue

        year = 2024

        if pathlib.Path(f'{directory}/{location_name}/{year}.json').exists():
            continue
        
        response = fetch_history_weather_data(
            location.latitude, 
            location.longitude, 
            f'{year}-09-01', 
            f'{year}-12-28', # 還沒到 12/31, 所以設定為 28
            'daily=temperature_2m_max,temperature_2m_min,temperature_2m_mean,daylight_duration,precipitation_sum'
        )

        if response is None:
            print(f'Failed to fetch data for {location_name} in {year}')
            return

        # 確認是否有錯誤 ( 如: API 每小時的限制 )
        if response.get('error'):
            print(response.get('reason'))
            return

        with open(f'{directory}/{location_name}/{year}.json', 'w', encoding='utf-8') as file:
            file.write(json.dumps(response))
        
        sleep(1)

In [6]:
from geopy.geocoders import Nominatim
import numpy as np


geolocator = Nominatim(user_agent="kaede")

download_completed = True

if not download_completed:
    download_weather_data(geolocator, location_names, 'data/weather')

In [7]:
from datetime import datetime

data = pd.DataFrame()
for location_name_index, location_name in enumerate(tqdm(location_names)):  
    year = 2024
    if not pathlib.Path(f'data/weather/{location_name}/{year}.json').exists():
        continue

    with open(f'data/weather/{location_name}/{year}.json', 'r', encoding='utf-8') as file:
        weather_data = json.load(file)

    foliage_date = foliage_data['date'].iloc[location_name_index]
    shedding_date = shedding_data['date'].iloc[location_name_index]

    status = 0
    for time_index, time in enumerate(weather_data['daily']['time']):
        time = datetime.strptime(time, '%Y-%m-%d')

        # 跳過可能沒有資料的日期
        if time.month == 12 and time.day > 26: 
            break
        
        if foliage_date == '-':
            continue

        def str_to_date(str):
            if str == '-':
                return None
            month = str.split('月')[0]
            day = str.split('月')[1].split('日')[0]
            return datetime.strptime(f'{year}-{month}-{day}', '%Y-%m-%d')

        if time == str_to_date(foliage_date):
            status = 1
        elif time == str_to_date(shedding_date):
            status = 2
            
        new_row = pd.DataFrame({
            'year': [year],
            'month': [time.month],
            'day': [time.day],
            'latitude': [weather_data['latitude']],
            'longitude': [weather_data['longitude']],
            'elevation': [weather_data['elevation']],
            'max_temperature': [weather_data['daily']['temperature_2m_max'][time_index]],
            'min_temperature': [weather_data['daily']['temperature_2m_min'][time_index]],
            'mean_temperature': [weather_data['daily']['temperature_2m_mean'][time_index]],
            'daylight_duration': [weather_data['daily']['daylight_duration'][time_index]],
            'precipitation_sum': [weather_data['daily']['precipitation_sum'][time_index]],
            'location_name': [location_name],
            'status': [status],
        })
        data = pd.concat([data, new_row], ignore_index=True)

data.to_csv('data/test_data.csv', index=False)

100%|██████████| 51/51 [00:02<00:00, 23.87it/s]


In [8]:
from lightgbm import Booster
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

model = Booster(model_file='./data/lgbm_model.txt')

data = pd.read_csv('data/test_data.csv')

X = data.drop(columns=['status', 'location_name'])
y = data['status']

y_pred_proba = model.predict(X)
y_pred = np.argmax(y_pred_proba, axis=1)

f1 = f1_score(y, y_pred, average='macro')
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_pred_proba, multi_class='ovr')

print(f'F1 Score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

F1 Score: 0.7670
Accuracy: 0.9014
ROC AUC: 0.9752


In [9]:
real_data = pd.read_csv('data/maple/maple_foliage_2024.csv')

table = pd.DataFrame()
result = []
for i, status in enumerate(y_pred):
    prev_status = y_pred[i-1] if i > 0 else None
    prev_location = data['location_name'].iloc[i-1] if i > 0 else None
    
    location = data['location_name'].iloc[i]

    if prev_location != location:
        if result:
            date = result[-1] 
            real_date = real_data[real_data['location'] == prev_location]['date'].values[0]
            real_date = datetime.strptime(f'2024年{real_date}', '%Y年%m月%d日')
            diff_days = (real_date - date).days
            diff_days_str = f'+{diff_days}' if diff_days > 0 else str(diff_days)
            new_row = pd.DataFrame({
                '地點': [prev_location],
                '預測日期': [datetime.strftime(date, '%m/%d')],
                '真實日期': [datetime.strftime(real_date, '%m/%d')],
                '相差天數': [diff_days_str],
                '平年差': [real_data[real_data['location'] == prev_location]['avg_year_diff'].values[0]],
                '昨年差': [real_data[real_data['location'] == prev_location]['last_year_diff'].values[0]],
            })
            table = pd.concat([table, new_row], ignore_index=True)
        result = []
    
    if status == 1 and prev_status == 0:
        date = datetime(data['year'].iloc[i], data['month'].iloc[i], data['day'].iloc[i])
        result.append(date)

table

Unnamed: 0,地點,預測日期,真實日期,相差天數,平年差,昨年差
0,旭川,10/27,10/25,-2,2,-2
1,札幌,11/09,11/11,2,14,+1
2,帯広,10/31,10/29,-2,9,-3
3,室蘭,11/17,11/17,0,10,-3
4,函館,11/12,11/14,2,12,+3
5,青森,11/17,11/15,-2,2,-4
6,秋田,11/22,11/17,-5,5,-11
7,盛岡,11/19,11/27,8,14,+3
8,山形,11/29,12/04,5,9,-2
9,仙台,11/27,11/22,-5,1,-2


In [15]:
# 平均誤差天數
table['相差天數'] = table['相差天數'].apply(lambda x: int(x))
print(f'平均相差天數: {table['相差天數'].mean().item()}')

# 平均平年差
table['平年差'] = table['平年差'].apply(lambda x: int(x))
print(f'平均平年差: {table['平年差'].mean().item()}')

# 平均昨年差
table['昨年差'] = table['昨年差'].apply(lambda x: int(x) if x != '///' else np.nan)
print(f'平均昨年差: {table['昨年差'].mean().item()}')

平均相差天數: 4.666666666666667
平均平年差: 7.583333333333333
平均昨年差: 4.170212765957447
