In [1]:
import os
import numpy as np
import csv
import pandas as pd
import math

In [2]:
def get_year_list(dir = "data"):
    """Return the names of the entries directly under `dir`, in ascending order.

    Callers treat the last element as the most recent year and iterate over the
    list to concatenate yearly data, so the order has to be deterministic.
    `os.listdir` gives no ordering guarantee, hence the explicit sort.

    Args:
        dir: Directory to list.

    Returns:
        A sorted list of entry names (strings).

    Raises:
        OSError: If `dir` cannot be listed (for example, it does not exist).
    """
    return sorted(os.listdir(dir))

# Display the entries found under the default "data" directory.
get_year_list()


['2018', '2019', '2020', '2021', '2022', '2023']

In [3]:
def get_station_list(year = 'homo' , dir = "data"):
    """Return the station names that have a CSV file for `year`, sorted ascending.

    Station files are expected to be named `<station>_<year>.csv` inside
    `<dir>/<year>/`; the station name is the file name with that suffix removed.
    Entries that do not end with the suffix are ignored, so stray files in the
    folder do not turn into bogus station names.

    The list is sorted because callers address stations by position and
    `os.listdir` gives no ordering guarantee.

    Args:
        year: Year folder name, or 'homo' to use the greatest name found under
            `dir` (the latest year when folders are named by year).
        dir: Root data directory.

    Returns:
        A sorted list of station names (strings).

    Raises:
        OSError: If the year folder cannot be listed.
        ValueError: If `year` is 'homo' and `dir` contains no entries.
    """
    # max() rather than [-1] so the result does not depend on listing order.
    data = year if (year != 'homo') else max(get_year_list(dir))
    suffix = '_' + data + '.csv'
    return sorted(
        entry[:-len(suffix)]
        for entry in os.listdir(dir + "/" + data)
        if entry.endswith(suffix)
    )
# Print the station names derived from the files of the 2022 folder.
print(get_station_list('2022'))

['三義', '三重', '中壢', '中山', '二林', '仁武', '冬山', '前金', '前鎮', '南投', '古亭', '善化', '嘉義', '土城', '埔里', '基隆', '士林', '大同', '大園', '大寮', '大里', '安南', '宜蘭', '富貴角', '小港', '屏東', '崙背', '左營', '平鎮', '彰化', '復興', '忠明', '恆春', '斗六', '新店', '新港', '新營', '新竹', '新莊', '朴子', '松山', '板橋', '林口', '林園', '桃園', '楠梓', '橋頭', '永和', '汐止', '沙鹿', '淡水', '湖口', '潮州', '竹山', '竹東', '線西', '美濃', '臺南', '臺東', '臺西', '花蓮', '苗栗', '菜寮', '萬華', '萬里', '西屯', '觀音', '豐原', '金門', '關山', '陽明', '頭份', '馬公', '馬祖', '鳳山', '麥寮', '龍潭']


In [4]:
def get_parameters(station = 'homo' , year='homo', dir="data"):
    """Return the parameter names of one station file, in order of first appearance.

    The file `<dir>/<year>/<station>_<year>.csv` is read after skipping its
    first (header) line. The parameter name is taken from the third column of
    each row. Reading stops at the first blank row or as soon as a name
    repeats, i.e. once the first full cycle of parameters has been seen.

    Args:
        station: Station name, or 'homo' to use the last station listed for the
            resolved year.
        year: Year folder name, or 'homo' to use the greatest name under `dir`.
        dir: Root data directory.

    Returns:
        A list of parameter names (strings); empty if the file has no data rows.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer than three columns.
    """
    # max() keeps the "latest year" choice independent of directory listing order.
    year = year if (year != 'homo') else max(get_year_list(dir))
    # Resolve the default station against the same year and root directory the
    # file path below is built from; otherwise the path may not exist.
    station = station if (station != 'homo') else get_station_list(year, dir)[-1]

    file = dir + '/' + year + '/' + station + '_' + year + '.csv'
    param_list = []
    with open(file, newline='' , encoding = 'utf-8-sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header line; tolerate a completely empty file
        for row in reader:
            if not row:
                # Blank line: treated as end of data, as process_file does.
                break
            name = row[2]
            if name in param_list:
                break
            param_list.append(name)
    return param_list

# Print the parameter names read from the 2018 file of station '新竹'.
print(get_parameters('新竹' , '2018'))

['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10', 'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC', 'WIND_SPEED', 'WS_HR']


In [5]:
# Each data row holds the parameter name in column 2 followed by 24 hourly cells.
_HOURS_PER_ROW = 24
_FIRST_HOUR_COLUMN = 3


def _mean_of_previous(series, end, window):
    """Arithmetic mean of the `window` values immediately before index `end`."""
    return sum(series[end - window:end]) / window


def _pm_blend(series, end):
    """Equal-weight blend of the previous-12 and previous-4 value means."""
    mean_12_prev = _mean_of_previous(series, end, 12)
    mean_4_prev = _mean_of_previous(series, end, 4)
    return (mean_12_prev * 0.5) + (mean_4_prev * 0.5)


def _co_mean(series, end):
    """Mean of the previous 8 values."""
    return _mean_of_previous(series, end, 8)


def _next_hour_target(series, fill):
    """Shift `series` one step earlier; the last slot has no successor and gets `fill`."""
    return (series[1:] + [fill])[:len(series)]


def _moving_average_target(series, fill, window, combine):
    """Build a target column from `combine(series, end)` for end = window .. len-1.

    The first `window - 1` slots and the final slot are set to `fill`. The
    result is cut to the length of `series` so that a series shorter than the
    window still yields a column of matching length.
    """
    column = [fill] * (window - 1)
    column.extend(round(combine(series, end), 1) for end in range(window, len(series)))
    column.append(fill)
    return column[:len(series)]


# (target column name, source parameter, (window, combine) or None for a plain shift)
_TARGETS = (
    ('next_O3', 'O3', None),
    ('next_PM2.5', 'PM2.5', (12, _pm_blend)),
    ('next_PM10', 'PM10', (12, _pm_blend)),
    ('next_CO', 'CO', (8, _co_mean)),
    ('next_SO2', 'SO2', None),
    ('next_NO2', 'NO2', None),
)


def process_file(station_id, year, dir="data"):
    """Load one station-year CSV and return it as a table with target columns.

    The file `<dir>/<year>/<station>_<year>.csv` is read after skipping its
    header line. Every row contributes 24 hourly cells to the series of the
    parameter named in its third column. Cells that are not finite numbers
    (unparsable text, `nan`, `inf`, or cells missing from a short row) count as
    missing and are replaced by that parameter's mean, rounded to one decimal.
    A parameter without a single valid reading is filled with NaN.

    Six target columns are appended after the parameter columns:
      * next_O3 / next_SO2 / next_NO2: the source series shifted one step
        earlier, with the last slot set to the parameter mean.
      * next_PM2.5 / next_PM10: 0.5 * mean(previous 12) + 0.5 * mean(previous 4),
        rounded to one decimal; the first 11 slots and the last slot hold the
        parameter mean.
      * next_CO: mean(previous 8), rounded to one decimal; the first 7 slots and
        the last slot hold the parameter mean.

    Args:
        station_id: Position of the station in `get_station_list(year, dir)`.
        year: Year folder name.
        dir: Root data directory.

    Returns:
        `np.array(content).T`: row 0 holds the column names and each following
        row holds one hour. Names and numbers are mixed in one array, so NumPy
        stores every cell as a string.

    Raises:
        KeyError: If a row names a parameter that `get_parameters` did not
            report, or if a source parameter of a target column is absent.
        IndexError: If `station_id` is out of range or a non-blank row has
            fewer than three columns.
    """
    # Forward `dir` so the station is looked up in the same tree the file is read from.
    station = get_station_list(year, dir)[station_id]
    param_list = get_parameters(station, year, dir)

    readings = {p: [] for p in param_list}  # per parameter: floats, None where missing
    totals = {p: 0 for p in param_list}
    counts = {p: 0 for p in param_list}

    file = dir + '/' + year + '/' + station + '_' + year + '.csv'
    with open(file, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)
        for row in reader:
            if len(row) == 0:
                break
            param = row[2]
            cells = row[_FIRST_HOUR_COLUMN:_FIRST_HOUR_COLUMN + _HOURS_PER_ROW]
            # Pad short rows so every row still contributes 24 slots.
            cells += [''] * (_HOURS_PER_ROW - len(cells))
            for cell in cells:
                try:
                    value = float(cell)
                except ValueError:
                    value = None
                else:
                    # "nan"/"inf" parse as floats but would poison the mean.
                    if not math.isfinite(value):
                        value = None
                if value is not None:
                    totals[param] += value
                    counts[param] += 1
                readings[param].append(value)

    averages = {}
    filled = {}
    content = []
    for param in param_list:
        if counts[param]:
            averages[param] = round(totals[param] / counts[param], 1)
        else:
            # No valid reading at all: there is no mean to impute with.
            averages[param] = float('nan')
        fill = averages[param]
        filled[param] = [fill if value is None else value for value in readings[param]]
        content.append([param] + filled[param])

    for target, source, rule in _TARGETS:
        series = filled[source]
        fill = averages[source]
        if rule is None:
            column = _next_hour_target(series, fill)
        else:
            window, combine = rule
            column = _moving_average_target(series, fill, window, combine)
        content.append([target] + column)

    return np.array(content).T

# process_file(37, '2023')

In [6]:
def SplitData(data, split_ratio):
    """Cut `data` into a leading part and the remainder.

    The cut position is `int(len(data) * split_ratio)`, so the leading part
    receives that many items and the rest goes into the second part. Order is
    preserved; nothing is shuffled.

    Args:
        data: Any sliceable sequence with a length.
        split_ratio: Fraction of items that go into the first part.

    Returns:
        A tuple `(first_part, second_part)`.
    """
    cut = int(len(data) * split_ratio)
    return data[:cut], data[cut:]

In [7]:
# Position of the exported station in the station listing of the latest year.
STATION_INDEX = 37
# Fraction of rows, counted from the start, written to the training file.
TRAIN_RATIO = 0.8

head = np.array([['AMB_TEMP', 'CH4', 'CO', 'NMHC', 'NO', 'NO2', 'NOx', 'O3', 'PM10',
       'PM2.5', 'RAINFALL', 'RH', 'SO2', 'THC', 'WD_HR', 'WIND_DIREC',
       'WIND_SPEED', 'WS_HR', 'next_O3', 'next_PM2.5', 'next_PM10',
       'next_CO', 'next_SO2', 'next_NO2']])


def _write_csv(path, header, rows):
    """Write the `header` rows followed by the data `rows` to `path` as CSV."""
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(header)
        writer.writerows(rows)


# The split below is positional, so the years must be concatenated in a fixed,
# ascending order; a raw directory listing does not guarantee one.
years = sorted(get_year_list())

# Resolve the station name once and look it up by name in every year. Reusing
# one numeric index across years would silently pick a different station
# whenever the per-year listings differ.
station_name = get_station_list(years[-1])[STATION_INDEX] if years else None

yearly_rows = []
for year in years:
    stations = get_station_list(year)
    if station_name not in stations:
        raise ValueError("station %r has no file for year %s" % (station_name, year))
    # [1:] drops the per-file header row; `head` is written once instead.
    yearly_rows.append(process_file(stations.index(station_name), year)[1:])

# Stack once instead of growing the array inside the loop; `head[:0]` is an
# empty table with the right number of columns for the "no data" case.
content = np.vstack(yearly_rows) if yearly_rows else head[:0]
# np.random.shuffle(content)
train_data,validation_data = SplitData(content , TRAIN_RATIO)

_write_csv('output.csv', head, content)
_write_csv('data_train.csv', head, train_data)
_write_csv('data_validation.csv', head, validation_data)