In [1]:
import os
import pandas as pd
import numpy as np
import re

In [2]:
file_path = "2024_AQI_dataset.csv"
data = pd.read_csv(file_path, low_memory = False)
sorted_data = data.sort_values(by = ['sitename', 'datacreationdate'])

In [3]:
# 臺東對應是知本（水試所）測站
# 花蓮對應吉安光華測站
spec_data = sorted_data[(sorted_data['sitename'].isin(['二林', '中壢', '竹山', '臺東', '橋頭', '花蓮', '斗六', '新港', '新營', '永和', '三重']))]

In [4]:
spec_data = spec_data.drop(columns = ['longitude', 'latitude', 'siteid', 'unit', 'county', 'pollutant', 'status', 'windspeed', 'winddirec'])

In [5]:
# 將 datacreationdate 轉為 datetime 格式
spec_data['datacreationdate'] = pd.to_datetime(spec_data['datacreationdate'])
spec_data_january = spec_data[spec_data['datacreationdate'].dt.month == 1]

In [6]:
# 定義一個函數，處理每個 sitename 的數據
def process_group(group):
    # 建立完整的時間序列
    full_time_range = pd.date_range(start=group['datacreationdate'].min(), end=group['datacreationdate'].max(), freq='h')
    # 重設索引並對齊時間序列
    group = group.set_index('datacreationdate').reindex(full_time_range).reset_index()
    group.rename(columns={"index": "datacreationdate"}, inplace=True)
    # 補充 sitename 並插值
    group['sitename'] = group['sitename'].ffill()  # 填補 sitename
    return group

In [7]:
spec_data_january_full = spec_data_january.groupby('sitename', group_keys=False).apply(process_group).reset_index(drop=True)

  spec_data_january_full = spec_data_january.groupby('sitename', group_keys=False).apply(process_group).reset_index(drop=True)


In [8]:
for column in spec_data_january_full.columns:
    if column != "datacreationdate":  
        if spec_data_january_full[column].isna().any():
            spec_data_january_full[column] = spec_data_january_full[column].fillna(spec_data_january_full[column].rolling(window=24, min_periods=1, center=True).mean().round(1))
        columns_to_round_first = ['aqi', 'pm10', 'pm2.5', 'pm10_avg', 'so2_avg'] 
        for column in columns_to_round_first:
             spec_data_january_full[column] = spec_data_january_full[column].round().astype(float)

In [9]:
spec_data_january_full

Unnamed: 0,datacreationdate,sitename,aqi,so2,co,o3,o3_8hr,pm10,pm2.5,no2,nox,no,co_8hr,pm2.5_avg,pm10_avg,so2_avg
0,2024-01-01 00:00:00,三重,105.0,1.0,0.7,32.9,35.0,59.0,30.0,25.1,41.2,16.0,0.9,37.2,65.0,0.0
1,2024-01-01 01:00:00,三重,110.0,1.1,0.63,35.3,34.6,70.0,44.0,18.4,29.5,11.0,0.8,39.0,67.0,0.0
2,2024-01-01 02:00:00,三重,111.0,1.1,0.72,34.0,33.9,56.0,39.0,18.6,27.1,8.5,0.8,39.5,66.0,0.0
3,2024-01-01 03:00:00,三重,110.0,0.9,0.62,31.1,33.2,59.0,37.0,19.8,32.6,12.8,0.7,39.1,64.0,0.0
4,2024-01-01 04:00:00,三重,109.0,0.9,0.6,34.5,32.8,52.0,34.0,15.7,26.2,10.4,0.7,38.7,62.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8179,2024-01-31 19:00:00,花蓮,33.0,0.8,0.43,27.9,36.6,16.0,12.0,10.5,11.5,1.0,0.2,9.7,19.0,0.0
8180,2024-01-31 20:00:00,花蓮,34.0,0.9,0.53,16.4,34.9,26.0,14.0,16.0,17.3,1.2,0.3,10.4,20.0,0.0
8181,2024-01-31 21:00:00,花蓮,36.0,0.8,0.4,14.2,31.8,18.0,13.0,12.7,13.9,1.1,0.3,11.2,20.0,0.0
8182,2024-01-31 22:00:00,花蓮,36.0,0.7,0.25,24.7,29.6,12.0,10.0,4.4,4.9,0.5,0.3,11.0,18.0,0.0


In [10]:
spec_data_january_full.isna().mean()

datacreationdate    0.0
sitename            0.0
aqi                 0.0
so2                 0.0
co                  0.0
o3                  0.0
o3_8hr              0.0
pm10                0.0
pm2.5               0.0
no2                 0.0
nox                 0.0
no                  0.0
co_8hr              0.0
pm2.5_avg           0.0
pm10_avg            0.0
so2_avg             0.0
dtype: float64

In [11]:
base_dir = "Weather_History_Data"

def merge_weather_data(aqi_df, base_dir):
    for sitename in os.listdir(base_dir):
        sitename_path = os.path.join(base_dir, sitename)
        if os.path.isdir(sitename_path):
            for csv_file in os.listdir(sitename_path):
                if csv_file.endswith(".csv"):
                    # 提取日期，假設檔名格式為 `C0AI30-2024-01-01.csv`
                    split = csv_file.split("-")
                    year_str, month_str, date_str = split[1], split[2], split[3].replace(".csv", "")

                    csv_path = os.path.join(sitename_path, csv_file)
                    weather_df = pd.read_csv(csv_path)
                    weather_df = weather_df.drop(index=0).reset_index(drop=True)
                    weather_df['觀測時間(hour)'] = weather_df['觀測時間(hour)'].astype(int)
                    
                    for _, row in weather_df.iterrows():
                        # 構建 datacreationdate 的對應值
                        obs_time = row['觀測時間(hour)']
                        if obs_time != 24:
                            target_datetime = pd.to_datetime(f"{year_str}-{month_str}-{date_str} {obs_time:02}:00:00")
                        
                        # 處理 ObsTime 為 24 的情況，轉到下一天
                        if obs_time == 24:
                            target_datetime += pd.Timedelta(days=1)
                            target_datetime = target_datetime.replace(hour=0)

                        # 在 AQI 資料集中匹配 sitename 和 datacreationdate
                        mask = (aqi_df['sitename'] == sitename) & (aqi_df['datacreationdate'] == target_datetime)
                        for col in row.index:
                            if col not in ["觀測時間(hour)"]:
                                aqi_df.loc[mask, col] = row[col]

    return aqi_df


In [12]:
merge_data = merge_weather_data(spec_data_january_full, base_dir)

In [13]:
# 將各測站 2024-01-01 00:00:00 的資料刪除
merge_data = merge_data.groupby('sitename').apply(lambda group: group.iloc[1:]).reset_index(drop=True)

  merge_data = merge_data.groupby('sitename').apply(lambda group: group.iloc[1:]).reset_index(drop=True)


In [14]:
def replace_special_chars_with_nan(value):
    if re.search(r'[\/&]', str(value)):  # 檢查是否包含 / 或 &
        return np.nan
    return value
merge_data_cleaned = merge_data.map(replace_special_chars_with_nan)
merge_data_cleaned.isna().mean()

datacreationdate      0.000000
sitename              0.000000
aqi                   0.000000
so2                   0.000000
co                    0.000000
o3                    0.000000
o3_8hr                0.000000
pm10                  0.000000
pm2.5                 0.000000
no2                   0.000000
nox                   0.000000
no                    0.000000
co_8hr                0.000000
pm2.5_avg             0.000000
pm10_avg              0.000000
so2_avg               0.000000
測站氣壓(hPa)             0.007831
氣溫(℃)                 0.007831
相對溼度(%)               0.007831
風速(m/s)               0.007831
風向(360degree)         0.007831
最大瞬間風(m/s)            0.000000
最大瞬間風風向(360degree)    0.000000
降水量(mm)               0.011012
dtype: float64

In [15]:
columns_to_convert = merge_data_cleaned.columns[2:]  # 第三列及之後的欄位
merge_data_cleaned[columns_to_convert] = merge_data_cleaned[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [16]:
for column in columns_to_convert:
    merge_data_cleaned[column] = merge_data_cleaned[column].fillna(merge_data_cleaned[column].rolling(window=24, min_periods=1, center=True).mean().round(1))

In [17]:
merge_data_cleaned.isna().mean()

datacreationdate      0.0
sitename              0.0
aqi                   0.0
so2                   0.0
co                    0.0
o3                    0.0
o3_8hr                0.0
pm10                  0.0
pm2.5                 0.0
no2                   0.0
nox                   0.0
no                    0.0
co_8hr                0.0
pm2.5_avg             0.0
pm10_avg              0.0
so2_avg               0.0
測站氣壓(hPa)             0.0
氣溫(℃)                 0.0
相對溼度(%)               0.0
風速(m/s)               0.0
風向(360degree)         0.0
最大瞬間風(m/s)            0.0
最大瞬間風風向(360degree)    0.0
降水量(mm)               0.0
dtype: float64

In [18]:
merge_data_cleaned.dtypes

datacreationdate      datetime64[ns]
sitename                      object
aqi                          float64
so2                          float64
co                           float64
o3                           float64
o3_8hr                       float64
pm10                         float64
pm2.5                        float64
no2                          float64
nox                          float64
no                           float64
co_8hr                       float64
pm2.5_avg                    float64
pm10_avg                     float64
so2_avg                      float64
測站氣壓(hPa)                    float64
氣溫(℃)                        float64
相對溼度(%)                      float64
風速(m/s)                      float64
風向(360degree)                float64
最大瞬間風(m/s)                   float64
最大瞬間風風向(360degree)           float64
降水量(mm)                      float64
dtype: object

In [19]:
merge_data_cleaned

Unnamed: 0,datacreationdate,sitename,aqi,so2,co,o3,o3_8hr,pm10,pm2.5,no2,...,pm10_avg,so2_avg,測站氣壓(hPa),氣溫(℃),相對溼度(%),風速(m/s),風向(360degree),最大瞬間風(m/s),最大瞬間風風向(360degree),降水量(mm)
0,2024-01-01 01:00:00,三重,110.0,1.1,0.63,35.3,34.6,70.0,44.0,18.4,...,67.0,0.0,1021.2,17.2,62.0,8.4,71.0,13.2,58.0,0.0
1,2024-01-01 02:00:00,三重,111.0,1.1,0.72,34.0,33.9,56.0,39.0,18.6,...,66.0,0.0,1020.6,17.1,64.0,6.6,79.0,12.6,63.0,0.0
2,2024-01-01 03:00:00,三重,110.0,0.9,0.62,31.1,33.2,59.0,37.0,19.8,...,64.0,0.0,1020.2,17.0,66.0,5.9,73.0,12.1,82.0,0.0
3,2024-01-01 04:00:00,三重,109.0,0.9,0.60,34.5,32.8,52.0,34.0,15.7,...,62.0,0.0,1020.0,16.9,66.0,6.1,72.0,11.0,88.0,0.0
4,2024-01-01 05:00:00,三重,106.0,0.9,0.54,33.8,33.2,56.0,40.0,15.5,...,60.0,0.0,1020.4,17.0,66.0,5.6,78.0,10.4,72.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8168,2024-01-31 19:00:00,花蓮,33.0,0.8,0.43,27.9,36.6,16.0,12.0,10.5,...,19.0,0.0,1017.9,22.1,85.0,1.0,230.0,2.8,295.0,0.0
8169,2024-01-31 20:00:00,花蓮,34.0,0.9,0.53,16.4,34.9,26.0,14.0,16.0,...,20.0,0.0,1018.2,22.1,85.0,0.8,233.0,1.8,274.0,0.0
8170,2024-01-31 21:00:00,花蓮,36.0,0.8,0.40,14.2,31.8,18.0,13.0,12.7,...,20.0,0.0,1018.0,21.6,89.0,2.3,225.0,3.8,202.0,0.0
8171,2024-01-31 22:00:00,花蓮,36.0,0.7,0.25,24.7,29.6,12.0,10.0,4.4,...,18.0,0.0,1018.2,21.6,89.0,1.6,228.0,3.4,237.0,0.0


In [20]:
# 將 DataFrame 輸出為 CSV 檔案
output_path = "2024_01_AQI_Weather_dataset.csv"
merge_data_cleaned.to_csv(output_path, index=False, encoding="utf-8")