In [20]:
import json
import os
import csv

In [37]:
# for released days
def Load_datelist(mode):
    '''
    Load the sorted list of dates belonging to one dataset split.

    `mode`: should be a `str` in `['train', 'val', 'test', 'release']`.

    Returns a `list` of `str`: the whitespace-separated tokens read from
    `data_datelist_{mode}.txt` (relative to the working directory), sorted in
    ascending string order, e.g. `['20231106', '20231107', ...]`.

    Raises `ValueError` if `mode` is not one of the accepted values, and
    whatever `open` raises if the date-list file cannot be read.
    '''
    mode_list = ['train', 'val', 'test', 'release']
    # Validate first so a bad mode is reported before any file access.
    if mode not in mode_list:
        raise ValueError(f'wrong mode, mode should be a `str` in {mode_list}')

    with open(f"data_datelist_{mode}.txt", 'r') as f:
        return sorted(f.read().split())

In [38]:
# Quick check: display the date list of the 'val' split.
Load_datelist('val')

['20231106',
 '20231107',
 '20231108',
 '20231109',
 '20231110',
 '20231111',
 '20231112']

In [22]:
def Load_predict_stns(mode='predict'):
    '''
    Load a list of station ids.

    `mode`: `'predict'` (default) or `'all'`.
      - `'predict'`: whitespace-separated ids read from
        `html.2023.final.data/sno_test_set.txt`, sorted in ascending string order.
      - `'all'`: the top-level keys of `html.2023.final.data/demographic.json`,
        in the order they appear in the file.

    Returns a `list` of station ids.

    Raises `ValueError` if `mode` is neither `'all'` nor `'predict'`.
    '''
    if mode == 'all':
        with open("html.2023.final.data/demographic.json", "r") as f:
            return list(json.load(f).keys())

    if mode == 'predict':
        with open("html.2023.final.data/sno_test_set.txt", "r") as f:
            return sorted(f.read().split())

    raise ValueError('wrong mode, mode should be "all" or "predict"')

In [23]:
# load time list
def Load_time_list():
    '''
    Build the list of every minute of a day as zero-padded `"HH:MM"` strings.

    Returns a `list` of 1440 `str`, from `"00:00"` to `"23:59"` in
    chronological order.
    '''
    # `:02d` zero-pads to two digits; `minute` avoids shadowing the builtin `min`.
    return [
        f"{hour:02d}:{minute:02d}"
        for hour in range(24)
        for minute in range(60)
    ]

In [24]:
def Load_date_feature():
    '''
    Read the per-day features from `data_dayfeature.csv`.

    The file must have a header row containing at least the columns
    `date`, `weekday`, `workingday` and `holiday`; other columns are ignored.

    Returns a `dict` mapping each `date` value to a `dict` with the keys
    `'weekday'`, `'workingday'` and `'holiday'`. All values are kept as the
    `str` read from the file (no type conversion). If a date appears more
    than once, the last row wins.

    Raises `KeyError` if a required column is missing.
    '''
    feature_names = ('weekday', 'workingday', 'holiday')
    csv_file_path = "data_dayfeature.csv"
    # newline='' lets the csv module handle line endings itself, as its docs require.
    with open(csv_file_path, 'r', newline='') as csv_file:
        return {
            row["date"]: {name: row[name] for name in feature_names}
            for row in csv.DictReader(csv_file)
        }

In [25]:
def Load_stn_tot():
    '''Return the decoded contents of `./data_stn_tot.json`.'''
    with open("./data_stn_tot.json", 'r') as fh:
        return json.load(fh)

In [26]:
def Load_weather_data():
    '''Return the decoded contents of `./data_weather.json`.'''
    with open("./data_weather.json", "r") as fh:
        return json.load(fh)

In [27]:
# Generate the per-station data files
# Output defaults to txt files

def Gen_data(csv=False, mode='train', dirpath='./data'):
    '''
    Write one flat data file per predicted station for the given split.

    `csv`: `boolean` for file type. Falsy -> space-separated `.txt` without a
    header; truthy -> comma-separated `.csv` with a header row.
    (Inside this function the name `csv` is this flag, not the `csv` module.)
    `mode`: should be a `str` in `['train', 'val', 'test', 'release']`.
    `dirpath`: should be a `str`, directory path for storing data. If the path
    doesn't exist it is created, including missing parent directories.

    For every station from `Load_predict_stns()` the source file
    `./data_by_station/{stn}.json` is read and indexed as
    `source[time][date][feature]`. One row is written for every
    (date, time) pair, dates in the outer loop, to
    `{dirpath}/{stn}_{mode}.txt` or `.csv`:

        stn, date, time, tot, weekday, workingday, holiday, rain, sbi

    Progress is reported with `print`. Returns `None`.

    Raises `KeyError` if a time, date or feature is missing from a source file.
    '''
    predict_stns = Load_predict_stns()
    time_list = Load_time_list()
    date_list = Load_datelist(mode)

    print(f"predict_stns ({len(predict_stns)}):", predict_stns)
    print("date_list:", date_list)
    print("time_list:", time_list)

    if csv:
        print(f"\nGenerating {mode} csv data ...\n")
        split, extension = ',', 'csv'
    else:
        print(f"\nGenerating {mode} txt data ...\n")
        split, extension = ' ', 'txt'

    # makedirs handles nested paths and avoids the exists-then-mkdir race.
    os.makedirs(dirpath, exist_ok=True)

    # Columns written after stn/date/time. An earlier variant also emitted
    # "temp" and "moist" (between "rain" and "sbi"); they are currently left out.
    features = ("tot", "weekday", "workingday", "holiday", "rain", "sbi")

    for stn in predict_stns:
        # Read the source first so an unreadable source never truncates an
        # existing output file.
        with open(f"./data_by_station/{stn}.json", "r") as f:
            source_data = json.load(f)

        # `with` guarantees the output file is closed even if a lookup raises.
        with open(f"{dirpath}/{stn}_{mode}.{extension}", 'w') as data_file:
            if csv:
                data_file.write("stn,date,time,tot,weekday,workingday,holiday,rain,sbi\n")

            for date in date_list:
                print(stn, date)
                for time in time_list:
                    record = source_data[time][date]
                    fields = [stn, date, time]
                    fields.extend(str(record[feature]) for feature in features)
                    data_file.write(split.join(fields) + "\n")

    print(f"complete {mode} data gen")

In [28]:
# Generate the txt data files under ./data for each split.
for split_name in ('train', 'val', 'test', 'release'):
    Gen_data(csv=False, mode=split_name, dirpath='./data')

predict_stns (112): ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043', '500119044', '500119045', '500119046', '500119047', '500119048', '500119049', '500119050', '500119051', '500119052', '500119053', '500