In [1]:
import json
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from time import localtime, strftime

In [2]:
# Read the whitespace-separated station ids to predict for and keep them in
# sorted order so every later loop visits stations deterministically.
with open("./data_sno_test_set.txt", "r") as f:
    predict_stns = sorted(f.read().split())
print("predict_stns:", predict_stns)

predict_stns: ['500101001', '500101002', '500101003', '500101004', '500101005', '500101006', '500101007', '500101008', '500101009', '500101010', '500101013', '500101014', '500101015', '500101018', '500101019', '500101020', '500101021', '500101022', '500101023', '500101024', '500101025', '500101026', '500101027', '500101028', '500101029', '500101030', '500101031', '500101032', '500101033', '500101034', '500101035', '500101036', '500101037', '500101038', '500101039', '500101040', '500101041', '500101042', '500101091', '500101092', '500101093', '500101094', '500101114', '500101115', '500101123', '500101166', '500101175', '500101176', '500101181', '500101184', '500101185', '500101188', '500101189', '500101190', '500101191', '500101193', '500101199', '500101209', '500101216', '500101219', '500105066', '500106002', '500106003', '500106004', '500119043', '500119044', '500119045', '500119046', '500119047', '500119048', '500119049', '500119050', '500119051', '500119052', '500119053', '500119054

In [3]:
def Err_func(b_predict, b_truth, total):
    """Weighted absolute error of a single prediction.

    The absolute error is scaled by ``3 / total`` and multiplied by a weight
    equal to ``|b_truth/total - 1/3| + |b_truth/total - 2/3|``. That weight is
    constant (1/3) while the true value lies between one and two thirds of
    ``total`` and grows linearly outside that band.

    Args:
        b_predict: Predicted value.
        b_truth: Ground-truth value.
        total: Normalising total; must be non-zero.

    Returns:
        The weighted error as a float.
    """
    abs_gap = abs(b_predict - b_truth)
    # Distance of the true ratio from 1/3 and from 2/3 respectively.
    dist_one_third = abs((3 * b_truth - total) / (3 * total))
    dist_two_thirds = abs((3 * b_truth - 2 * total) / (3 * total))
    return 3 * abs_gap / total * (dist_one_third + dist_two_thirds)

In [4]:
def val(predic_y, y, total):
  """Mean ``Err_func`` score of a set of predictions.

  Args:
    predic_y: Indexable sequence of predictions; each one is cast to float.
    y: Indexable sequence of ground-truth values; its length drives the loop.
    total: Normalising total forwarded to ``Err_func`` for every sample.

  Returns:
    The accumulated error divided by ``len(y)``.
  """
  accumulated = 0
  for idx, truth in enumerate(y):
    accumulated += Err_func(float(predic_y[idx]), truth, total)

  return accumulated / len(y)

In [5]:
def Load_stn_tot():
  """Load ``./data_stn_tot.json`` and return its content as a ``dict``.

  The caller indexes the result by station id and converts the value with
  ``int``, so it is presumably a station-id -> total mapping (TODO confirm
  against the JSON file).
  """
  with open("./data_stn_tot.json", 'r') as json_file:
    return dict(json.load(json_file))

In [6]:
def read_data(stn, mode):
  """Parse ``./data/{stn}_{mode}.txt`` into raw rows, features and labels.

  Each non-blank line is split on whitespace. Columns 0-2 stay strings and
  every later column is converted to float. The columns the code relies on:

  * column 2: time of day, read as ``HH`` from the first two characters and
    minutes from the fourth character on;
  * column 3: denominator used to turn the label into a ratio;
  * column 4: weekday number, treated as 1-based (5 and 6/7 are special-cased);
  * columns 5-7: appended verbatim to the features; column 7 is treated as
    the rain amount;
  * column 8: the label, truncated to int.

  Feature vector per row: sin/cos of the minute of day (period 1440), sin/cos
  of the minute of week (period 7 * 1440), then columns 5, 6 and 7.

  In ``'train'`` mode rows are duplicated (oversampled): rows whose
  label/column-3 ratio lies further outside [1/3, 2/3] get more copies, and
  each of those copies is duplicated again for weekday 6/7 (once) and for
  non-zero rain (twice).

  Blank lines are skipped instead of raising ``IndexError``.

  Args:
    stn: Station identifier used to build the file name.
    mode: File suffix; only the value ``'train'`` enables oversampling.

  Returns:
    Tuple ``(data, x, y)`` of numpy arrays: the parsed rows (one per input
    line, not oversampled), the feature matrix and the label vector.
  """
  def sin_a(a, b):
    return np.sin(a * 2 * np.pi / b)

  def cos_a(a, b):
    return np.cos(a * 2 * np.pi / b)

  data = []
  x = []
  y = []
  path = f"./data/{stn}_{mode}.txt"
  k = 1440  # period of the daily cycle, in minutes
  N = 10    # oversampling strength

  with open(path, 'r') as f:
    for line in f:
      tmp_list = line.split()
      if not tmp_list:
        # A blank (e.g. trailing) line carries no record.
        continue
      for i in range(3, len(tmp_list)):
        tmp_list[i] = float(tmp_list[i])

      minute = int(tmp_list[2][:2]) * 60 + int(tmp_list[2][3:])
      weekday = int(tmp_list[4])
      all_min = 1440 * (weekday - 1) + minute
      # Weekday 5 after 18:00: column 5 is forced to 0.
      # NOTE(review): presumably a workday/holiday-style flag - confirm.
      if tmp_list[4] == 5 and minute > 1440 - 60 * 6:
        tmp_list[5] = 0

      rain = float(tmp_list[7])
      x_input = [
        sin_a(minute, k), cos_a(minute, k),
        sin_a(all_min, k * 7), cos_a(all_min, k * 7),
      ] + tmp_list[5:8]
      y_label = int(tmp_list[8])
      x.append(list(x_input))
      y.append(y_label)

      if mode == 'train':
        # The ratio is only needed for oversampling, so it is computed here;
        # other modes no longer divide by column 3.
        frac = y_label / tmp_list[3]
        # Same weight shape as Err_func: at most one extra copy inside the
        # [1/3, 2/3] band, more the further the ratio lies outside it.
        extra_copies = int(N * 3 * (abs(frac - 1/3) + abs(frac - 2/3))) - (N - 1)
        for _ in range(extra_copies):
          x.append(list(x_input))
          y.append(y_label)
          if weekday in [6, 7]:
            x.append(list(x_input))
            y.append(y_label)
          if rain != 0:
            for _ in range(2):
              x.append(list(x_input))
              y.append(y_label)

      data.append(tmp_list)

  data = np.array(data)
  x = np.array(x)
  y = np.array(y)
  return data, x, y

In [7]:

# Train one LightGBM regressor per station, log its train/validation error and
# write the test predictions to a timestamped CSV.
all_val_err = 0
all_train_err = 0
count = 0
stn_tot = Load_stn_tot()
val_json = {}

# Take the timestamp once so the CSV and the log always share the same suffix.
run_stamp = strftime('%m%d-%H-%M-%S', localtime())

# Hyper-parameters are the same for every station, so they are built once.
params = {
  'objective': 'regression',
  'metric': 'mse',
  'boosting_type': 'gbdt',
  'num_leaves': 31,
  'learning_rate': 0.05,
  'feature_fraction': 0.9,
  'verbose' : -1
}
num_round = 100

# To debug on a single station, set: predict_stns = ['500101001']
# The context manager closes both files even if a station fails mid-run.
with open(f"./lightGBM_predict_{run_stamp}.csv", 'w') as predict_file, \
     open(f"./lightGBM_predict_log_{run_stamp}.txt", "w") as predict_log_file:
  predict_file.write("id,sbi\n")

  for stn in predict_stns:
    count += 1
    data_train, x_train, y_train = read_data(stn, 'train')
    data_val, x_val, y_val = read_data(stn, 'val')
    data_test, x_test, y_test = read_data(stn, 'test')

    train_data = lgb.Dataset(x_train, label=y_train)
    val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)

    # Train the model
    model = lgb.train(params, train_data, num_round, valid_sets=[val_data])
    pre_train = model.predict(x_train)
    pre_val = model.predict(x_val)

    # Score with the station's own total as the normaliser.
    train_err = val(pre_train, y_train, int(stn_tot[stn]))
    val_err = val(pre_val, y_val, int(stn_tot[stn]))
    all_train_err += train_err
    all_val_err += val_err

    # Per-station errors plus the running averages over stations seen so far.
    log_info = f"{stn}: train: {train_err:<19}, \033[34mval: {val_err:<19}\033[0m, avg_train: {all_train_err / count:<19}, avg_val: {all_val_err/count:<19}"
    print(log_info, file=predict_log_file)
    print(log_info)

    val_json[stn] = {}
    val_json[stn]['val'] = val_err
    val_json[stn]['train'] = train_err

    pre_test = model.predict(x_test)
    for i in range(len(data_test)):
      # Only rows whose minute field is 00, 20 or 40 are written out.
      if data_test[i][2][3:] in ["00", "20", "40"]:
        # Predictions that are not positive are replaced by 0.
        pre_test[i] = pre_test[i] if pre_test[i] > 0 else 0
        row_id = f'{data_test[i][1]}_{int(data_test[i][0])}_{data_test[i][2]}'
        predict_file.write(f"{row_id},{pre_test[i]}\n")

  log_info = f"final: train: {all_train_err / len(predict_stns)}, val: {all_val_err / len(predict_stns)}"
  print(log_info, file=predict_log_file)
  print(log_info)

# Persist the per-station train/validation errors.
with open("./val_lightGBM.json", 'w') as f:
  json.dump(val_json, f, indent=2)

500101001: train: 0.19338767687897376, [34mval: 0.2241852264131448 [0m, avg_train: 0.19338767687897376, avg_val: 0.2241852264131448 
500101002: train: 0.16272887280448886, [34mval: 0.25908145497401575[0m, avg_train: 0.1780582748417313 , avg_val: 0.24163334069358028
500101003: train: 0.27623464887107035, [34mval: 0.3946498199975066 [0m, avg_train: 0.21078373285151097, avg_val: 0.29263883379488903
500101004: train: 0.36455005127800755, [34mval: 0.37496492364789547[0m, avg_train: 0.24922531245813512, avg_val: 0.31322035625814065
500101005: train: 0.27005884343940817, [34mval: 0.22216058234558378[0m, avg_train: 0.25339201865438976, avg_val: 0.29500840147562923
500101006: train: 0.2645270870194021 , [34mval: 0.16852325934295725[0m, avg_train: 0.2552478633818918 , avg_val: 0.27392754445351725
500101007: train: 0.2522307797449429 , [34mval: 0.1675967801293903 [0m, avg_train: 0.25481685143375626, avg_val: 0.2587374352643563 
500101008: train: 0.39864173126892793, [34mval: 0.5139