In [32]:
import requests
import pathlib

pathlib.Path('data').mkdir(exist_ok=True)

# 楓葉紅葉日期
if not pathlib.Path('data/maple/maple_foliage.csv').exists():
    response = requests.get('https://www.data.jma.go.jp/sakura/ruinenchi/015.csv')
    response.encoding = 'shift-jis'
    with open('data/maple/maple_foliage.csv', 'w', encoding='utf-8') as file:
        file.write(response.text)


# 楓葉落葉日期
if not pathlib.Path('data/maple/maple_fall.csv').exists():
    response = requests.get('https://www.data.jma.go.jp/sakura/ruinenchi/016.csv')
    response.encoding = 'shift-jis'
    with open('data/maple/maple_shedding.csv', 'w', encoding='utf-8') as file:
        file.write(response.text)

In [33]:
import pandas as pd

foliage_data = pd.read_csv('data/maple/maple_foliage.csv', skiprows=1, encoding='utf-8')

foliage_data.head()

Unnamed: 0,番号,地点名,1953,rm,1954,rm.1,1955,rm.2,1956,rm.3,...,2032,rm.79,平年値,rm.80,最早値,rm.81,最早年,最晩値,rm.82,最晩年
0,401,稚内,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,406,留萌,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,407,旭川,0,0,0,0,0,0,0,0,...,0,0,1023,6,1008,6,1993,1105,6,2002
3,409,網走,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,412,札幌,1008,6,1015,6,1017,6,1024,6,...,0,0,1028,6,926,6,1963,1110,6,2023


In [34]:
def drop_unused_columns(df):
    columns = ['番号', '平年値', '最早値', '最早年', '最晩値', '最晩年']
    remark_columns = [col for col in df.columns if col.startswith('rm')]
    return df.drop(columns=columns + remark_columns)

foliage_data = drop_unused_columns(foliage_data)

foliage_data.head()


Unnamed: 0,地点名,1953,1954,1955,1956,1957,1958,1959,1960,1961,...,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032
0,稚内,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,留萌,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,旭川,0,0,0,0,0,0,0,0,0,...,1027,0,0,0,0,0,0,0,0,0
3,網走,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,札幌,1008,1015,1017,1024,1020,1012,0,0,1013,...,1110,0,0,0,0,0,0,0,0,0


In [24]:
foliage_data['地点名'] = foliage_data['地点名'].str.strip()
location_names = foliage_data['地点名'].unique()
pd.Series(location_names)

0        稚内
1        留萌
2        旭川
3        網走
4        札幌
       ... 
97      宮古島
98      久米島
99       那覇
100      名護
101    南大東島
Length: 102, dtype: object

In [25]:
from time import sleep

def fetch_history_weather_data(latitude, longtitude, start_date, end_date, variables):
    url = (
        f'https://archive-api.open-meteo.com/v1/archive?'
        f'latitude={latitude}&'
        f'&longitude={longtitude}&'
        f'start_date={start_date}&'
        f'end_date={end_date}&'
        f'{variables}&'
        'timezone=Asia/Tokyo'
    )
    for _ in range(3):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.Timeout:
            print('Timeout, retrying...')
            sleep(5)
        except requests.exceptions.RequestException as e:
            print(f'Request failed: {e}')
            break
    return None
    

In [26]:
from tqdm import tqdm
import json

def download_weather_data(geolocator, location_names, directory):
    for location_name in (pbar := tqdm(location_names)):
        pbar.set_description('Downloading')
        pbar.set_postfix_str(location_name)
        
        pathlib.Path(f'{directory}/{location_name}').mkdir(exist_ok=True, parents=True)

        location = geolocator.geocode(location_name)
        if location is None:
            print(f'{location_name} not found')
            continue

        for year in range(1953, 2024):
            if pathlib.Path(f'{directory}/{location_name}/{year}.json').exists():
                continue
            response = fetch_history_weather_data(
                location.latitude, 
                location.longitude, 
                f'{year}-09-01', 
                f'{year}-12-31', 
                'daily=temperature_2m_max,temperature_2m_min,temperature_2m_mean,daylight_duration,precipitation_sum'
            )

            if response is None:
                print(f'Failed to fetch data for {location_name} in {year}')
                return

            # 確認是否有錯誤 ( 如: API 每小時的限制 )
            if response.get('error'):
                print(response.get('reason'))
                return

            with open(f'{directory}/{location_name}/{year}.json', 'w', encoding='utf-8') as file:
                file.write(json.dumps(response))
            
            sleep(1)

In [27]:
from geopy.geocoders import Nominatim
import numpy as np


geolocator = Nominatim(user_agent="kaede")

download_completed = True

location_names = np.array_split(location_names, 4)

id = 2
if not download_completed:
    download_weather_data(geolocator, location_names[id], 'data/weather')

In [28]:
def get_date_index(year, date):
    """
    獲取日期在某段時間的索引
    """
    start_date = f'{year}-09-01'
    end_date = f'{year}-12-31'
    date_range = pd.date_range(start_date, end_date)

    return date_range.get_loc(date)

def number_to_date(num):
    """
    將數字轉換為日期格式
    """
    return f'{num // 100:02d}-{num % 100:02d}'

def get_weather_data(data, metric, index, offset, mode=None):
    """
    取得天氣資料中某個指標的指定範圍平均值、最大值或最小值

    Args:
        data (dict): 天氣資料
        metric (str): 指標名稱
        index (int): 日期索引
        offset (int): 日期偏移量
        mode (str): 計算模式 (mean, max, min)
    Returns:
        float: 計算結果
    """
    # 確認是否有足夠資料
    if index < offset:
        return None
    
    if offset != 0: # 如果偏移量不為 0 則進行不同模式的計算
        if mode == 'mean':
            return sum(data['daily'][metric][index-offset:index]) / (offset + 1)
        elif mode == 'max':
            return max(data['daily'][metric][index-offset:index])
        elif mode == 'min':
            return min(data['daily'][metric][index-offset:index])
    else:
        return data['daily'][metric][index]

In [29]:
# 天氣指標
metrics = ['temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'daylight_duration', 'precipitation_sum']

# 將天氣指標轉換為縮寫
metric_mapping = {
    'temperature_2m_max': 't',
    'temperature_2m_min': 't',
    'temperature_2m_mean': 't',
    'daylight_duration': 'd',
    'precipitation_sum': 'p'
}

# 週期所對應的日數 (offset)
periods = {
    '': 0,
    '1w': 6,
    '2w': 13
}

# 計算模式
mode_mapping = {
    'temperature_2m_max': ['max'],
    'temperature_2m_min': ['min'],
    'temperature_2m_mean': ['mean'],
    'daylight_duration': ['max', 'mean', 'min'],
    'precipitation_sum': ['max', 'mean', 'min']

}

In [31]:
data_column = [
    'latitude',
    'd', 'p', 't_max', 't_min', 't_mean',
    't_1w_max', 't_1w_min', 't_1w_mean',
    't_2w_max', 't_2w_min', 't_2w_mean',
    'd_1w_max', 'd_1w_min', 'd_1w_mean',
    'd_2w_max', 'd_2w_min', 'd_2w_mean',
    'p_1w_max', 'p_1w_min', 'p_1w_mean',
    'p_2w_max', 'p_2w_min', 'p_2w_mean'
]

data = pd.DataFrame(columns=data_column)

record_data = foliage_data.to_dict(orient='records')
for record in (pbar := tqdm(record_data)):
    pbar.set_postfix_str(record['地点名'])
    location_name = record['地点名']
    location = geolocator.geocode(location_name)

    for year in range(1953, 2024):
        foliage_day = record[str(year)]
        if foliage_day == 0 or len(str(foliage_day)) == 3: # 沒有資料
            continue

        data_index = get_date_index(year, f'{year}-{number_to_date(foliage_day)}')

        # 讀取天氣資料
        with open(f'data/weather/{location_name}/{year}.json', 'r', encoding='utf-8') as file:
            weather_data = json.load(file)
        
        # 取紅葉當天的資料當作 true，前 6 天的資料當作 false
        for index in range(data_index - 6, data_index + 1):
            new_row = { 
                'latitude': location.latitude, # 緯度
                'is_foliage': 1 if index == data_index else 0, # 是否為紅葉當天
            }

            # 對於所有的組合 (metric, period, mode)
            for metric in metrics:
                for period, offset in periods.items():
                    for mode in mode_mapping[metric]:
                        # 跳過沒有用的組合 Ex: d_max, d_min
                        if metric in ['daylight_duration', 'precipitation_sum'] and mode in ['max', 'min'] and offset == 0:
                            continue

                        # 設定欄位名稱
                        if offset == 0 and metric in ['daylight_duration', 'precipitation_sum']:
                            key = f'{metric_mapping[metric]}' # Ex: d, p
                        elif offset == 0:
                            key = f'{metric_mapping[metric]}_{mode}' # Ex: t_max, t_min, t_mean
                        else:
                            key = f'{metric_mapping[metric]}_{period}_{mode}'
                        
                        # 取得資料
                        value = get_weather_data(weather_data, metric, index, offset, mode)
                        if value is None:
                            continue
                        new_row[key] = value

            new_row = pd.DataFrame(new_row, index=[0])
            data = pd.concat([data, new_row], ignore_index=True)
        
data.to_csv('data/data.csv', index=False)

  data = pd.concat([data, new_row], ignore_index=True)
100%|██████████| 102/102 [01:50<00:00,  1.08s/it, 南大東島]
