In [1]:
import os
import json
import csv
from tqdm import tqdm

In [2]:
# load the date list of one data split ('train', 'val', 'test' or 'release')
def Load_datelist(mode):
    '''
    Load the sorted list of dates that belong to one data split.

    `mode`: should be a `str` in `['train', 'val', 'test', 'release']`

    Reads `data_datelist_{mode}.txt` (relative to the current working
    directory) and splits its content on whitespace.

    Returns the tokens as a `list` of `str`, sorted in ascending order.
    Raises `Exception` when `mode` is not one of the supported values.
    '''
    # Fixed: the list used to contain the typo 'trarn', so the documented
    # mode 'train' was always rejected.
    mode_list = ['train', 'val', 'test', 'release']
    if mode not in mode_list:
        raise Exception(f'wrong mode, mode should be a `str` in {mode_list}')

    with open(f"data_datelist_{mode}.txt", 'r') as f:
        datelist = f.read().split()
    datelist.sort()
    return datelist

In [3]:
def Load_stn_tot_dic():
    '''
    Return the parsed content of `./data_stn_tot.json`.

    The file is resolved relative to the current working directory.
    '''
    with open("./data_stn_tot.json", "r") as json_file:
        return json.load(json_file)

In [4]:
def Load_predict_stns(mode='predict'):
    '''
    Load a list of station identifiers.

    `mode`: should be a `str` in `['predict', 'all']`
      - `'predict'`: whitespace-separated tokens of
        `html.2023.final.data/sno_test_set.txt`, sorted ascending.
      - `'all'`: top-level keys of `html.2023.final.data/demographic.json`,
        in file order.

    Raises `Exception` for any other `mode`.
    '''
    if mode == 'predict':
        with open("html.2023.final.data/sno_test_set.txt", "r") as f:
            station_ids = sorted(f.read().split())
        return station_ids

    if mode == 'all':
        with open("html.2023.final.data/demographic.json", "r") as f:
            demographic = json.load(f)
        return list(demographic.keys())

    raise Exception('wrong mode, mode should be "all" or "predict"')

In [5]:
# load time list
def Load_time_list():
    '''
    Return every minute of a day as a zero-padded `"HH:MM"` string.

    The list has 24 * 60 entries, ordered from `"00:00"` to `"23:59"`.
    '''
    return [
        f"{hour:02d}:{minute:02d}"
        for hour in range(24)
        for minute in range(60)
    ]

In [6]:
def Load_day_feature_dic():
    '''
    Read `data_dayfeature.csv` into a nested dictionary.

    Returns `{date: {'weekday': ..., 'workingday': ..., 'holiday': ...}}`
    where `date` is the value of the CSV column `date`. All values are kept
    as the raw strings produced by `csv.DictReader`.
    '''
    feature_names = ('weekday', 'workingday', 'holiday')
    with open("data_dayfeature.csv", 'r') as csv_file:
        return {
            row["date"]: {name: row[name] for name in feature_names}
            for row in csv.DictReader(csv_file)
        }
    

In [7]:
def Load_weather_dic():
    '''
    Return the parsed content of `./data_weather.json`.

    The file is resolved relative to the current working directory.
    '''
    with open("./data_weather.json", "r") as json_file:
        return json.load(json_file)

In [11]:
def Create_datelist_file(mode, datelist_filename=None):
    '''
    Write a date list file, one date per line, sorted ascending.

    `mode`: should be a `str` in `['release', 'train']`
      - `'release'`: every entry of `html.2023.final.data/release` except
        `.DS_Store` and a fixed set of excluded dates.
      - `'train'`: the release dates minus the validation dates, both read
        through `Load_datelist`.
    `datelist_filename`: output path; defaults to `data_datelist_{mode}.txt`.

    Prints the resulting list. Raises `Exception` for any other `mode`.
    '''
    if datelist_filename is None:
        datelist_filename = f"data_datelist_{mode}.txt"

    if mode == 'release':
        html_release_dir = "html.2023.final.data/release"
        # .DS_Store plus the dates we do not want in the release split.
        excluded = {".DS_Store", "20231015", "20231130", "20231214", "20231215", "20231216"}
        # Filter on the listing itself rather than probing the filesystem a
        # second time for every excluded name.
        datelist = sorted(name for name in os.listdir(html_release_dir) if name not in excluded)
        label = "datelist_release:"

    elif mode == 'train':
        datelist_release = Load_datelist(mode='release')
        datelist_val = set(Load_datelist(mode='val'))
        datelist = sorted(date for date in datelist_release if date not in datelist_val)
        label = "datelist_train:"

    else:
        # Fixed: the message used to list "train", "val" or "test", which are
        # not the modes this function implements.
        raise Exception(f'wrong mode={mode}, mode should be "release" or "train"')

    with open(datelist_filename, 'w') as f:
        for date in datelist:
            f.write(f"{date}\n")
    print(label, datelist)
    

In [12]:
# Create_datelist_file(mode='release')
# Create_datelist_file(mode='train')

datelist_release: ['20231002', '20231003', '20231004', '20231005', '20231006', '20231007', '20231008', '20231009', '20231010', '20231011', '20231016', '20231017', '20231018', '20231019', '20231020', '20231025', '20231026', '20231027', '20231028', '20231029', '20231030', '20231031', '20231101', '20231102', '20231103', '20231104', '20231105', '20231106', '20231107', '20231108', '20231109', '20231110', '20231111', '20231112', '20231113', '20231114', '20231115', '20231116', '20231117', '20231118', '20231119', '20231120', '20231121', '20231122', '20231123', '20231124', '20231125', '20231126', '20231127', '20231128', '20231129', '20231201', '20231202', '20231203', '20231204', '20231205', '20231206', '20231207', '20231208', '20231209', '20231210', '20231211', '20231212', '20231213']


In [11]:
def Create_stn_tot_dic(output_path="./data_stn_tot.json"):
    '''
    Build the `{station: tot}` lookup and store it as JSON.

    For every station returned by `Load_predict_stns()`, reads
    `./data_by_station/{stn}.json` and takes the `"tot"` value recorded at
    time `"00:00"` on date `"20231002"`.

    `output_path`: where the JSON is written. The default is the file that
    `Load_stn_tot_dic` reads; the function previously wrote `./stn_tot.json`,
    which the loader never opened.
    '''
    stn_tot = {}
    predict_stns = Load_predict_stns()
    print("predict_stns:", predict_stns)

    for stn in predict_stns:
        with open(f"./data_by_station/{stn}.json") as f:
            stn_info = json.load(f)
        # NOTE(review): a single fixed sample is used as the station total;
        # presumably "tot" is constant per station - confirm.
        stn_tot[stn] = stn_info["00:00"]["20231002"]["tot"]

    with open(output_path, 'w') as f:
        json.dump(stn_tot, f, indent=2)

In [12]:
# Create_stn_tot_dic()

In [14]:
# fill holes in the raw data and store the result (./data_clear_release/{date}/{stn}.json)
def Fill_data_hole(datelist=None, datadir='./data_clear_release', sourcedir='./html.2023.final.data/release'):
    '''
    Fill empty records in the raw per-station files and store the result.

    For every date in `datelist` and every station from `Load_predict_stns()`,
    reads `{sourcedir}/{date}/{stn}.json`, replaces each entry equal to `{}`
    with the nearest earlier non-empty entry (entries before the first
    non-empty one take that first non-empty entry) and writes the result to
    `{datadir}/{date}/{stn}.json`.

    `datelist`: dates to process; `None` means `Load_datelist('release')`,
    resolved at call time.
    `datadir`: output root directory, created when missing.
    `sourcedir`: input root directory.

    Raises `ValueError` when a source file contains only empty entries.
    '''
    if datelist is None:
        # Resolved here instead of in the signature: a default expression is
        # evaluated once when the `def` runs, which made defining this function
        # depend on the date list file and froze its content.
        datelist = Load_datelist('release')
    print("datelist:", datelist)
    predict_stns = Load_predict_stns()

    for date in datelist:
        for stn in predict_stns:
            print(f"creating {date} {stn}...", end='\r')
            with open(f"{sourcedir}/{date}/{stn}.json", "r") as f:
                load = json.load(f)

            _fill_holes(load)

            # makedirs also creates `datadir` itself when it does not exist yet.
            os.makedirs(f"{datadir}/{date}", exist_ok=True)
            with open(f"{datadir}/{date}/{stn}.json", 'w') as f:
                json.dump(load, f, indent=2)

    print("\nComplete Fill data hole")


def _fill_holes(records):
    '''Replace every `{}` value of `records` in place, walking keys in order.'''
    # Seed with the first non-empty entry so that leading holes can be filled.
    # The previous implementation searched forward by incrementing the last
    # character of the key, which produced invalid keys such as "00:010".
    last_valid = next((value for value in records.values() if value != {}), None)
    if last_valid is None:
        if records:
            raise ValueError("cannot fill holes: every entry is empty")
        return records

    for key, value in records.items():
        if value == {}:
            records[key] = last_valid
        else:
            last_valid = value
    return records

In [15]:
# date = ["20231211", "20231212", "20231213"]
# Fill_data_hole(datelist=date)
# Fill_data_hole(datelist=["20231208"])
# Fill_data_hole()

datelist: ['20231211', '20231212', '20231213']
creating 20231213 500119091...
Complete Fill data hole


In [17]:
# weather json
def Create_weather_data(datelist=None, weather_data_filename="./data_weather.json", ori_weather_datadir='./weather_original'):
    '''
    Convert the hourly weather CSV files into one JSON file.

    `datelist`: dates to convert; `None` means every file in
    `ori_weather_datadir` except `.DS_Store`, using the first 8 characters of
    each file name as the date.
    `weather_data_filename`: JSON file to update. Existing content is kept and
    the processed dates are added or overwritten hour by hour; the file is
    created when it does not exist.
    `ori_weather_datadir`: directory holding one `{date}.csv` per day.

    The result maps `date -> str(hour) -> {'temp', 'rain', 'moist', 'wind'}`
    with `float` values. A cell equal to `'--'` or `'&'` takes the value of
    the previous hour of the same day (a `KeyError` is raised when that hour
    is not available).
    '''
    if datelist is None:
        # all weather data in ori_weather_datadir
        datelist = sorted(
            filename[:8] for filename in os.listdir(ori_weather_datadir) if filename != '.DS_Store'
        )
    print("datelist:", datelist, end="\n\n")

    # Load the existing data once. It used to be re-read for every date, which
    # threw away everything earlier dates had added during the same run, and
    # it failed outright when the output file did not exist yet.
    if os.path.exists(weather_data_filename):
        with open(weather_data_filename, 'r', encoding='utf-8') as f:
            weather_data = json.load(f)
    else:
        weather_data = {}

    # (CSV column, output key)
    features = [('氣溫(℃)', 'temp'), ('降水量(mm)', 'rain'), ('相對溼度(%)', 'moist'), ('最大瞬間風(m/s)', 'wind')]

    for date in datelist:
        print(date, end="\r")
        # A date that is not in the JSON yet used to raise KeyError.
        day_data = weather_data.setdefault(date, {})

        # utf-8-sig drops a leading BOM, so the header parses the same way
        # whether or not the file starts with one.
        with open(f"{ori_weather_datadir}/{date}.csv", 'r', encoding='utf-8-sig', newline='') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            if not csv_reader.fieldnames:
                continue  # empty file: nothing to convert
            # The observation hour is the first column (the one that carried
            # the BOM in the previously hard-coded key).
            hour_column = csv_reader.fieldnames[0]

            for row in csv_reader:
                # skip the second header row
                if row[hour_column] == 'ObsTime':
                    continue

                hour = int(row[hour_column])
                hour_data = day_data[str(hour)] = {}
                for column, feature in features:
                    raw = row[column]
                    if raw == '--' or raw == '&':
                        hour_data[feature] = float(day_data[str(hour - 1)][feature])
                    else:
                        hour_data[feature] = float(raw)

    with open(weather_data_filename, 'w', encoding='utf-8') as f:
        json.dump(weather_data, f, indent=2)

    print(f"\nComplete creating weather data to {weather_data_filename} in json type")
    print("weather_data:", weather_data)

In [19]:
# Create_weather_data(datelist=["20231208", "20231209", "20231210"])
# Create_weather_data()

datelist: ['20231002', '20231003', '20231004', '20231005', '20231006', '20231007', '20231008', '20231009', '20231010', '20231011', '20231015', '20231016', '20231017', '20231018', '20231019', '20231020', '20231021', '20231022', '20231023', '20231024', '20231025', '20231026', '20231027', '20231028', '20231029', '20231030', '20231031', '20231101', '20231102', '20231103', '20231104', '20231105', '20231106', '20231107', '20231108', '20231109', '20231110', '20231111', '20231112', '20231113', '20231114', '20231115', '20231116', '20231117', '20231118', '20231119', '20231120', '20231121', '20231122', '20231123', '20231124', '20231125', '20231126', '20231127', '20231128', '20231129', '20231130', '20231201', '20231202', '20231203', '20231204', '20231205', '20231206', '20231207', '20231208', '20231209', '20231210', '20231211', '20231212', '20231213']

20231213
Complete creating weather data to ./data_weather.json in json type
weather_data: {'20231002': {'1': {'temp': 27.4, 'rain': 0.0, 'moist': 76

In [20]:
# sort data by station
# station.json -> time: { date: {... (some features) } }
# features: "tot", "sbi", "bemp", "act"

def Create_data_by_station_release(datelist=None, newdata_dir="./data_by_station", source_data_dir="./data_clear_release"):
    '''
    Regroup the per-date station files into one file per station.

    For every station from `Load_predict_stns()`, merges
    `{source_data_dir}/{date}/{stn}.json` of each date in `datelist` into
    `{newdata_dir}/{stn}.json`, laid out as
    `time -> date -> {"tot", "sbi", "bemp", "act"}` with `int` values.
    An existing station file is updated, not replaced.

    `datelist`: dates to merge; `None` means `Load_datelist('release')`,
    resolved at call time.
    `newdata_dir`: output directory, created when missing.
    `source_data_dir`: directory holding one sub-directory per date.
    '''
    if datelist is None:
        # Resolved here instead of in the signature: a default expression is
        # evaluated once when the `def` runs, which made defining this function
        # depend on the date list file and froze its content.
        datelist = Load_datelist('release')

    # load stations which we want to predict
    predict_stns = Load_predict_stns()
    print("datelist:", datelist)
    print("predict_stns", predict_stns)

    print("\nCreating data by station for training days...\n")

    os.makedirs(newdata_dir, exist_ok=True)

    for stn in predict_stns:
        # file we want to write
        filename = f"{newdata_dir}/{stn}.json"
        if os.path.exists(filename):
            with open(filename, "r") as f:
                new_data = json.load(f)
        else:
            new_data = {}

        for date in datelist:
            print(stn, date, end="\r")

            # original file
            with open(f"{source_data_dir}/{date}/{stn}.json", "r") as f:
                ori_data = json.load(f)

            for time, record in ori_data.items():
                day_entry = new_data.setdefault(time, {}).setdefault(date, {})
                for feature in ["tot", "sbi", "bemp", "act"]:
                    day_entry[feature] = int(record[feature])

        with open(filename, "w") as f:
            json.dump(new_data, f, indent=2)

    print(f"\nComplete creating data by station to dir={newdata_dir} from source_dir={source_data_dir} for training days...\n")
    

In [21]:
# date = ["20231211", "20231212", "20231213"]
# Create_data_by_station_release(datelist=date)
# Create_data_by_station_release(datelist=["20231208"])
# Create_data_by_station_release()

datelist: ['20231211', '20231212', '20231213']
predict_stns ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043', '500119044', '500119045', '500119046', '500119047', '500119048', '500119049', '500119050', '50

In [22]:
def Create_data_by_station_test(datelist=None):
    '''
    Add placeholder records for the test dates to every station file.

    For each station from `Load_predict_stns()` and each date in `datelist`,
    every minute of the day in `data_by_station/{stn}.json` receives:
      - `"tot"` from `Load_stn_tot_dic()`,
      - `"sbi"` and `"bemp"` set to `-1`,
      - `"act"` copied from the same minute of `"20231020"` when the date is
        not later than `20231024`, otherwise from `"20231129"`,
      - `'weekday'`, `'workingday'`, `'holiday'` from `Load_day_feature_dic()`,
      - the weather features of hour `int(HH) + 1` from `Load_weather_dic()`,
        or `-1` for each of them when the date has no weather data.

    `datelist`: dates to add; `None` means `Load_datelist('test')`, resolved
    at call time.

    The station files must already exist and contain the two reference dates.
    '''
    if datelist is None:
        # Resolved here instead of in the signature: a default expression is
        # evaluated once when the `def` runs, which made defining this function
        # depend on the date list file and froze its content.
        datelist = Load_datelist('test')

    # load stations which we want to test
    predict_stns = Load_predict_stns()
    stn_tot_dic = Load_stn_tot_dic()
    time_list = Load_time_list()
    day_feature_dic = Load_day_feature_dic()
    weather_dic = Load_weather_dic()

    print("datelist:", datelist)
    print("predict_stns:", predict_stns)

    print("\nCreating data by station for testing days...\n")
    total_steps = len(predict_stns) * len(datelist)
    for stn_i, stn in enumerate(predict_stns):
        # file we want to write
        station_path = f"data_by_station/{stn}.json"
        with open(station_path, "r") as f:
            new_data = json.load(f)

        for date_i, date in enumerate(datelist):
            print(stn, date, f"{100 * (stn_i * len(datelist) + date_i) / total_steps:<.2f}%", end='\r')

            # Everything below depends on the date only, so it is looked up
            # once per date instead of once per minute.
            act_source_date = "20231020" if int(date) <= 20231024 else "20231129"
            # NOTE(review): these values stay strings here, whereas
            # Add_date_feature stores them as int - confirm which is wanted.
            day_features = day_feature_dic[date]
            has_weather = date in weather_dic

            for time in time_list:
                entry = new_data.setdefault(time, {}).setdefault(date, {})

                entry["tot"] = stn_tot_dic[stn]
                for feature in ["sbi", "bemp"]:
                    entry[feature] = -1

                entry["act"] = new_data[time][act_source_date]["act"]

                for feature in ['weekday', 'workingday', 'holiday']:
                    entry[feature] = day_features[feature]

                if has_weather:
                    # Minute "HH:MM" uses the weather entry keyed HH + 1.
                    hourly = weather_dic[date][str(int(time[:2]) + 1)]
                    for feature in hourly:
                        entry[feature] = hourly[feature]
                else:
                    # Feature names are taken from a known entry.
                    for feature in weather_dic["20231002"]["1"]:
                        entry[feature] = -1

        with open(station_path, "w") as f:
            json.dump(new_data, f, indent=2)

    print("\nComplete creating data by station for testing days\n")

In [23]:
# print(Load_datelist('test'))
# date = ['20231211', '20231212', '20231213', '20231214', '20231215', '20231216', '20231217']
# Create_data_by_station_test()

datelist: ['20231021', '20231022', '20231023', '20231024', '20231218', '20231219', '20231220', '20231221', '20231222', '20231223', '20231224']
predict_stns: ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043

In [24]:
# add "day feature" like weekday workingday holiday

def Add_date_feature(datelist=None):
    '''
    Write the day features into every station file.

    For each station from `Load_predict_stns()`, each date in `datelist` and
    each minute of the day, stores `'weekday'`, `'workingday'` and
    `'holiday'` from `Load_day_feature_dic()` as `int` in
    `data_by_station/{stn}.json`.

    `datelist`: dates to update; `None` means
    `Load_datelist('release') + Load_datelist('test')`, resolved at call time.

    Every `time -> date` entry must already exist in the station file;
    a missing one raises `KeyError`.
    '''
    if datelist is None:
        # Resolved here instead of in the signature: a default expression is
        # evaluated once when the `def` runs, which made defining this function
        # depend on both date list files and froze their content.
        datelist = Load_datelist('release') + Load_datelist('test')

    time_list = Load_time_list()
    predict_stns = Load_predict_stns()
    day_feature_dic = Load_day_feature_dic()

    print("datelist:", datelist)
    print("predict_stns:", predict_stns)
    print("day_feature_dic", day_feature_dic)

    # modify data.json file
    for stn in predict_stns:
        # file we want to write
        station_path = f"data_by_station/{stn}.json"
        with open(station_path, "r") as f:
            new_data = json.load(f)

        for date in datelist:
            print(stn, date, end='\r')

            for time in time_list:
                entry = new_data[time][date]
                for feature in ['weekday', 'workingday', 'holiday']:
                    entry[feature] = int(day_feature_dic[date][feature])

        with open(station_path, "w") as f:
            json.dump(new_data, f, indent=2)

In [26]:
# date = ["20231211", "20231212", "20231213", "20231218", "20231219", "20231220", "20231221", "20231222", "20231223", "20231224"]
# Add_date_feature(datelist=date)
# Add_date_feature()

datelist: ['20231211', '20231212', '20231213', '20231218', '20231219', '20231220', '20231221', '20231222', '20231223', '20231224']
predict_stns: ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043', '50011904

In [27]:
# add weather feature

def Add_weather_feature(datelist=None):
    '''
    Write the hourly weather features into every station file.

    For each station from `Load_predict_stns()`, each date in `datelist` and
    each minute `"HH:MM"` of the day, copies every feature of
    `weather_dic[date][str(int(HH) + 1)]` into `data_by_station/{stn}.json`.

    `datelist`: dates to update; `None` means
    `Load_datelist('release') + Load_datelist('test')`, resolved at call time.

    Every `time -> date` entry must already exist in the station file and
    every date must be present in the weather data; otherwise `KeyError`
    is raised.
    '''
    if datelist is None:
        # Resolved here instead of in the signature: a default expression is
        # evaluated once when the `def` runs, which made defining this function
        # depend on both date list files and froze their content.
        datelist = Load_datelist('release') + Load_datelist('test')

    time_list = Load_time_list()
    predict_stns = Load_predict_stns()
    weather_dic = Load_weather_dic()

    print("datelist:", datelist)
    print("predict_stns:", predict_stns)
    print("weather_dic", weather_dic)

    # Fixed: this used to announce completion before any work was done.
    print("\nAdding weather feature...\n")

    # modify data.json file
    for stn in predict_stns:
        # file we want to write
        station_path = f"data_by_station/{stn}.json"
        with open(station_path, "r") as f:
            new_data = json.load(f)

        for date in datelist:
            print(stn, date, end="\r")
            day_weather = weather_dic[date]

            for time in time_list:
                # Minute "HH:MM" uses the weather entry keyed HH + 1.
                hourly = day_weather[str(int(time[:2]) + 1)]
                entry = new_data[time][date]
                for feature in hourly:
                    entry[feature] = hourly[feature]

        with open(station_path, "w") as f:
            json.dump(new_data, f, indent=2)
    print("\nComplete Add weather feature...")

In [28]:
# Dates whose weather features are (re)written by this run: three consecutive
# days in mid-December followed by one full week.
date = [
    "20231211", "20231212", "20231213",
    "20231218", "20231219", "20231220", "20231221", "20231222", "20231223", "20231224",
]
Add_weather_feature(datelist=date)
# Add_weather_feature()


datelist: ['20231211', '20231212', '20231213', '20231218', '20231219', '20231220', '20231221', '20231222', '20231223', '20231224']
predict_stns: ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043', '50011904