In [679]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import optuna
import joblib

import torch

from darts import TimeSeries
from darts.models import  RandomForest, LinearRegressionModel, LightGBMModel, \
                        CatBoostModel, XGBModel,  BlockRNNModel, NBEATSModel, NHiTSModel, \
                        TCNModel, TFTModel
from darts.dataprocessing.transformers import Scaler

from sklearn.svm import SVC
from sklearn.linear_model import PoissonRegressor


from darts.utils.likelihood_models import GaussianLikelihood

import warnings
warnings.filterwarnings('ignore')

# Configuration

In [460]:
prj_path = '../'
data_path = prj_path + "data/new_data/DH/squeezed/"
prj_path_opt= prj_path + "optimize_hyperparam/opt_results/"
output_process = prj_path + "data/new_data/DH/processed_data/"
output_featureselection = prj_path + "data/new_data/DH/feature_selection/"

In [737]:
all_cities = [
        'An Giang', 'BR Vũng Tàu', 'Bình Phước', 'Bình Thuận', 'Bình Định',
        'Bạc Liêu', 'Bắc Kạn', 'Bắc Giang', 'Cao Bằng', 'Cà Mau',
        'Cần Thơ', 'Gia Lai', 'Hà Giang', 'Hà Nội', 'Hà Tĩnh',
        'Hòa Bình','Hưng Yên', 'Hải Dương', 'Hải Phòng', 'Khánh Hòa', 'Kiên Giang',
        'Kon Tum', 'Lai Châu', 'Long An', 'Lào Cai', 'Lâm Đồng',
        'Lạng Sơn','Nam Định', 'Nghệ An', 'Ninh Bình', 'Ninh Thuận',
        'Phú Thọ', 'Phú Yên', 'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi',
        'Quảng Ninh', 'Quảng Trị', 'Sóc Trăng', 'Sơn La', 'TT Huế',
        'Thanh Hóa', 'Thái Bình', 'Thái Nguyên', 'Tiền Giang', 'Trà Vinh',
        'Tuyên Quang', 'Tây Ninh', 'Vĩnh Phúc', 'Yên Bái', 'Điện Biên',
        'Đà Nẵng', 'Đắk Nông', 'Đắk Lắk', 'Đồng Tháp'
]
cities = ['An Giang','BR Vũng Tàu']

In [462]:
# Set hyperparameters as args using the Configuration class
class Configuration():
    def __init__(self):
        # lấy bộ test dài 36 tháng = 3 năm
        self.test_size = 36
        # là nhìn vào dữ liệu trước 3 tháng và dự phóng        
        self.look_back = 3
        # dự phóng n-step trong 6 tháng
        self.n_predicted_period_months = 6
        self.n_features = 3
        self.seed = 42
        # mỗi phần tử x trong tập suppervise có độ lớn là 16 = 16 tháng
        self.batch_size = 16
        self.device = torch.device("cuda")
        self.epochs = 300
        #others
        self.labels = "Dengue_fever_rates"
        # Input param for Optimize Run
        self.ntry = 1
        self.njob = 1

args = Configuration()

# Seeding

In [463]:
def seed_everything(seed: int):
    import random
    import numpy as np
    import torch
    
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything(args.seed)

# Supporting functions

In [464]:
import requests

def send_to_telegram(message):

    apiToken = '5908735099:AAGVSLrW62aXPBP-GrMvxoVgMsuJxXJpP1Q'
    chatID = '@ptn_announcement'
    apiURL = f'https://api.telegram.org/bot{apiToken}/sendMessage'

    try:
        response = requests.post(apiURL, json={'chat_id': chatID, 'text': message})
        print(response.text)
    except Exception as e:
        message_error = "Bị lỗi rùi: "+str(e)
        response = requests.post(apiURL, json={'chat_id': chatID, 'text': message_error})
        print(e)

In [465]:
def get_dict_all_city_data():
  cities_data = {}  
  for city in cities:
    city_result = pd.read_excel(prj_path+'data/new_data/DH/squeezed/squeezed_'+city+'.xlsx')  
    """Get all data from all city in 1997 - 2016""" 
    city_result = city_result.loc[city_result['year_month'] < '2017-1-1'] 
    cities_data[city] = city_result
  return cities_data

In [466]:
# Define data (pre-)processing functions
# modification
def get_city_data(city_name, dict_full_data):
    """Returns Diarrhoea rate and climate data""" 
    city_data = dict_full_data[city_name].drop(columns=['Diarrhoea_cases','Diarrhoea_rates', 'province',
                                                        'Influenza_rates','Influenza_cases',
                                                        'Dengue_fever_cases', 'year', 'month'], 
                                                                  axis=1, 
                                                                  inplace=False)    
    return city_data

def convert_to_stationary(city_data):
    """Subtracts previous value for all cols except disease rates"""
    for col_name in city_data.columns:
        if col_name != 'Dengue_fever_rates':
            try:
                city_data[col_name] = city_data[col_name] - city_data[col_name].shift()
            except:
                print(col_name)
    return city_data

def impute_missing_value(city_data):
    """
    Imputes 0 for first 12 months, 
    last year's value for months 12-24, 
    and minimum value of last two years for months 25+
    """
    for col in city_data.columns:
        for index in range(len(city_data[col])):
            if np.isnan(city_data[col].iloc[index]):
                if index < 12:
                    city_data[col].iloc[index] = 0
                elif index >= 12 and index <= 24:
                    city_data[col].iloc[index] = city_data[col].iloc[index - 12]
                else:
                    city_data[col].iloc[index] = min(city_data[col].iloc[index - 12], city_data[col].iloc[index - 24])
    return city_data

In [467]:
def clean_full_data(dict_full_data):
    climate_and_disease_feats = ['Total_Evaporation',
       'Total_Rainfall', 'Max_Daily_Rainfall', 'n_raining_days',
       'Average_temperature', 'Max_Average_Temperature',
       'Min_Average_Temperature', 'Max_Absolute_Temperature',
       'Min_Absolute_Temperature', 'Average_Humidity', 'Min_Humidity',
       'n_hours_sunshine', 'Dengue_fever_rates']
    for city in cities:
        city_data = get_city_data(city_name=city,dict_full_data = dict_full_data)
        city_data_features = city_data[climate_and_disease_feats]
        city_data_features = impute_missing_value(city_data_features)
        city_data_features = convert_to_stationary(city_data_features)
        city_data_features.dropna(inplace=True)
        city_data_features.loc[:, "year_month"] = city_data["year_month"]
        dict_full_data[city] = city_data_features
    return dict_full_data


In [468]:
def split_data(data, look_back, n_nextstep = args.n_predicted_period_months):
    """Splits data into train and test sets based on args (Configuration class)"""
    train = data[: -args.test_size]    
    test = data[-args.test_size - look_back-(n_nextstep - 1): ]
    return train, test

In [469]:
def to_supervised(data,  d_out, d_in, features_list=[]):
    """
    Frames time-series as supervised learning dataset.
    
    Args:
      d_in: lookback window
      d_out: number of predicted months
      features_list: list of all features **where last col is the disease incidence**

    Returns:
      Numpy arrays of disease incidence (y) and other predictors (X)
    """
    X, y = list(), list()
    for index, _ in enumerate(data):
        in_end = index + d_in
        out_end = in_end + d_out
        if out_end <= len(data):
            if len(features_list) == 0 :
                X.append(data[index: in_end, :-1])
            else:
                X.append(data[index: in_end, features_list])
            y.append(data[out_end-1: out_end, -1])
    return np.array(X), np.array(y)

In [None]:
model = SVC(
          random_state = 30,
          kernel = 'rbf',
          probability=True
      )

# model = NBEATSModel(
#                             input_chunk_length = 3,
#                             output_chunk_length = 6)

# model = PoissonRegressor(
#           max_iter = 20,
#       )
model_name = type(model).__name__
city = "An Giang"

nstep = 1

specific_data = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
scaler = joblib.load(output_process+city+'_train_scalerMinMaxNorm.save') #ok
df_train, df_valid = split_data(specific_data, args.look_back,nstep)

# train_X, train_y = to_supervised(df_train, d_out=nstep, d_in=args.look_back )
# eval_X, eval_y = to_supervised(df_valid, d_out=nstep, d_in=args.look_back )
# model.fit(
#         train_X,
#         train_y
#     )
# prediction = model.predict(eval_X)
# print(prediction)
# if model_name in ["PoissonRegressor","SVC"]:
#     print(model_name)
# else:
#     print("HOi nè")
#     print(model_name)
# PoissonRegressor
# SVC

In [736]:
df.iloc[:,:-1]

Unnamed: 0,Total_Evaporation,Total_Rainfall,Max_Daily_Rainfall,n_raining_days,Average_temperature,Max_Average_Temperature,Min_Average_Temperature,Max_Absolute_Temperature,Min_Absolute_Temperature,Average_Humidity,Min_Humidity,n_hours_sunshine,Dengue_fever_rates
0,0.367241,0.466281,0.428136,0.658537,0.838710,0.582090,0.780822,0.5500,0.883495,0.45,0.578947,0.2030,0.519037
1,0.569540,0.462807,0.418119,0.439024,0.612903,0.552239,0.479452,0.7125,0.533981,0.25,0.368421,0.9885,0.514028
2,0.520115,0.496458,0.500436,0.707317,0.790323,0.626866,0.698630,0.6875,0.786408,0.45,0.236842,0.2800,0.553026
3,0.186207,0.491621,0.499565,0.658537,0.629032,0.268657,0.698630,0.3875,0.679612,0.50,0.789474,0.5210,0.538357
4,0.601149,0.435082,0.320122,0.634146,0.516129,0.313433,0.493151,0.4250,0.611650,0.45,0.526316,0.6420,0.583438
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,0.494253,1.000000,0.401568,0.560976,0.596774,0.447761,0.589041,0.3750,0.689320,0.35,0.578947,0.6950,0.542559
199,0.275862,0.000000,0.493031,0.560976,0.532258,0.223881,0.493151,0.5500,0.611650,0.45,0.605263,0.4400,0.537881
200,0.465517,0.474796,0.475610,0.560976,0.548387,0.388060,0.479452,0.3125,0.524272,0.35,0.394737,0.8300,0.534874
201,0.500000,0.354905,0.244774,0.560976,0.612903,0.388060,0.534247,0.4125,0.660194,0.25,0.394737,0.6500,0.560604


In [733]:
df = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
features_list = ['Total_Evaporation', 'Total_Rainfall']
columns = features_list + [args.labels]
train = df.iloc[:,:-1][columns].to_numpy()
train_X, train_y = to_supervised(train, d_out=nstep, d_in=args.look_back )
train_X

array([[[0.3672414, 0.4662807, 0.519037 ],
        [0.5695402, 0.4628065, 0.514028 ],
        [0.5201149, 0.4964578, 0.5530263]],

       [[0.5695402, 0.4628065, 0.514028 ],
        [0.5201149, 0.4964578, 0.5530263],
        [0.1862069, 0.4916213, 0.5383572]],

       [[0.5201149, 0.4964578, 0.5530263],
        [0.1862069, 0.4916213, 0.5383572],
        [0.6011494, 0.4350817, 0.5834378]],

       ...,

       [[0.5114943, 0.4659401, 0.545901 ],
        [0.4942529, 1.       , 0.5425595],
        [0.2758621, 0.       , 0.5378813]],

       [[0.4942529, 1.       , 0.5425595],
        [0.2758621, 0.       , 0.5378813],
        [0.4655172, 0.4747956, 0.5348739]],

       [[0.2758621, 0.       , 0.5378813],
        [0.4655172, 0.4747956, 0.5348739],
        [0.5      , 0.3549046, 0.5606038]]])

In [710]:
def to_supervised(data,  d_out, d_in, features_list=[]):
    """
    Frames time-series as supervised learning dataset.
    
    Args:
      d_in: lookback window
      d_out: number of predicted months
      features_list: list of all features **where last col is the disease incidence**

    Returns:
      Numpy arrays of disease incidence (y) and other predictors (X)
    """
    X, y = list(), list()
    for index, _ in enumerate(data):
        in_end = index + d_in
        out_end = in_end + d_out
        if out_end <= len(data):
            if len(features_list) == 0 :
                X.append(data[index: in_end, :])
            else:
                # display(data)
                print("index:",index)
                print("in_end:",in_end)
                X.append(data[index: in_end, features_list])
                
            y.append(data[out_end-1: out_end, -1])
    return np.array(X), np.array(y)

In [725]:
df.iloc[:,:]

Unnamed: 0,Total_Evaporation,Total_Rainfall,Max_Daily_Rainfall,n_raining_days,Average_temperature,Max_Average_Temperature,Min_Average_Temperature,Max_Absolute_Temperature,Min_Absolute_Temperature,Average_Humidity,Min_Humidity,n_hours_sunshine,Dengue_fever_rates,year_month
0,0.367241,0.466281,0.428136,0.658537,0.838710,0.582090,0.780822,0.5500,0.883495,0.45,0.578947,0.2030,0.519037,1997-02-28
1,0.569540,0.462807,0.418119,0.439024,0.612903,0.552239,0.479452,0.7125,0.533981,0.25,0.368421,0.9885,0.514028,1997-03-31
2,0.520115,0.496458,0.500436,0.707317,0.790323,0.626866,0.698630,0.6875,0.786408,0.45,0.236842,0.2800,0.553026,1997-04-30
3,0.186207,0.491621,0.499565,0.658537,0.629032,0.268657,0.698630,0.3875,0.679612,0.50,0.789474,0.5210,0.538357,1997-05-31
4,0.601149,0.435082,0.320122,0.634146,0.516129,0.313433,0.493151,0.4250,0.611650,0.45,0.526316,0.6420,0.583438,1997-06-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,0.494253,1.000000,0.401568,0.560976,0.596774,0.447761,0.589041,0.3750,0.689320,0.35,0.578947,0.6950,0.542559,2013-08-31
199,0.275862,0.000000,0.493031,0.560976,0.532258,0.223881,0.493151,0.5500,0.611650,0.45,0.605263,0.4400,0.537881,2013-09-30
200,0.465517,0.474796,0.475610,0.560976,0.548387,0.388060,0.479452,0.3125,0.524272,0.35,0.394737,0.8300,0.534874,2013-10-31
201,0.500000,0.354905,0.244774,0.560976,0.612903,0.388060,0.534247,0.4125,0.660194,0.25,0.394737,0.6500,0.560604,2013-11-30


In [723]:
df = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
train = df.iloc[:,:-1].to_numpy()
train

array([[0.3672414, 0.4662807, 0.4281359, ..., 0.5789474, 0.203    ,
        0.519037 ],
       [0.5695402, 0.4628065, 0.4181185, ..., 0.3684211, 0.9885   ,
        0.514028 ],
       [0.5201149, 0.4964578, 0.5004355, ..., 0.2368421, 0.28     ,
        0.5530263],
       ...,
       [0.4655172, 0.4747956, 0.4756098, ..., 0.3947368, 0.83     ,
        0.5348739],
       [0.5      , 0.3549046, 0.2447735, ..., 0.3947368, 0.65     ,
        0.5606038],
       [0.5747126, 0.4536785, 0.5235192, ..., 0.5526316, 0.61     ,
        0.5315323]])

In [714]:
data =df_train
data[0:3,:]

InvalidIndexError: (slice(0, 3, None), slice(None, None, None))

In [711]:
# df_train, df_valid = split_data(specific_data, args.look_back,nstep)
features_list = ['Total_Rainfall', 'Min_Humidity', 'Dengue_fever_rates']
train_X, train_y = to_supervised(df_train, d_out=nstep, d_in=args.look_back, features_list=feature_list+[args.labels] )
train_X

index: 0
in_end: 3


InvalidIndexError: (slice(0, 3, None), ['Max_Daily_Rainfall', 'Min_Average_Temperature', 'Dengue_fever_rates', 'Dengue_fever_rates'])

In [669]:
def train_and_evaluate(df_train, df_eval, model, feature_list , labels, scaler, is_dl_algo,nstep):
  """
  $df: pandas.DataFrame object containing data for training and testing model:
  $model: darts model object
  $feature_list: Names of the features used as model input
  $label: the value the model will be trained to predict
  $scaler: scaler object. Note: the scaler will be fitted on training data and applied to test data
  $lags: how much to look back into the past to output prediction
  $split_index: the point at which to divide train and test_data

  """

  if is_dl_algo == 1:
    print("🍋")
  else:
    x_train = TimeSeries.from_dataframe(df_train, "year_month", feature_list)
    y_train = TimeSeries.from_dataframe(df_train, "year_month", labels)

    x_test = TimeSeries.from_dataframe(df_eval, "year_month", feature_list)
    y_test = TimeSeries.from_dataframe(df_eval, "year_month", labels)

    model.fit(y_train, past_covariates = x_train)

    prediction = model.predict(len(y_test)-args.look_back, past_covariates = x_test, num_samples=1)

    y_true = scaler.inverse_transform(df_eval.iloc[:,:-1])[:,[-1]].reshape(len(df_eval))[args.look_back:]

    df_eval[labels][args.look_back:] = np.array(prediction._xa).squeeze()
    y_pred = scaler.inverse_transform(df_eval.iloc[args.look_back:,:-1])[:,[-1]].reshape(len(prediction))

    # df_compare_test_predict = pd.DataFrame({'y_true':y_true, 'y_pred':y_pred})
    # df_compare_test_predict.plot()
    # plt.legend()
    # plt.show()

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mse**0.5
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(f"mean_squared_error: {mse:.4f}")
    print(f"rmse: {rmse}")
    print(f"mape: {mape}")
    return model, y_true, y_pred, mse, mae, rmse, mape

In [670]:
def output_prediction_for_location(df_train, df_eval, model, location, feature_list, 
                                                labels, scaler, is_dl_algo,nstep):
    """train and generate prediction for a province
    df: DataFrame object containing features and label(s) for training model
    localtion: location_name
    feature_list: list of features used as model input,  must be among the column names of df
    labels: the values model will be trained to predict
    scaler: sklearn scaler object
    lags: how long into the past to look back when making prediction
    split_index: the point at which to divide data into the train and test subsets.
    """
    model, y_true, prediction_inverse, mse, mae, rmse, mape = train_and_evaluate(df_train, df_eval, model, feature_list, labels, scaler,is_dl_algo,nstep)
    df_prediction = pd.DataFrame({"Date": df_eval["year_month"][-len(prediction_inverse):],
                                  "Observed": y_true[-len(prediction_inverse):],
                                  "1-month": prediction_inverse})
    
    df_prediction["City"] = location
    df_prediction["RMSE_1-month"] = rmse
    df_prediction["MAE_1-month"] = mae
    df_prediction["MAPE_1-month"] = mape

    return mae

In [472]:
def getDataWithSelectedFeature(city, next_predicted_month):
  selected_feature = []
  df = pd.read_csv(output_featureselection+str(next_predicted_month)+"step_feature_selection_3_most.csv", encoding = 'unicode_escape')
  for row in range(len(df)):
    if (df["City"][row] == city):
      selected_feature.append(df["1st_Feature"][row])
      selected_feature.append(df["2nd_Feature"][row])
      selected_feature.append(df["3rd_Feature"][row])
  return selected_feature

# Objective and Suggest Hyperparams of Darts Models


In [None]:

city = "An Giang"
nstep = 6

lags_by_nstep = args.look_back + nstep - 1
lags_past_covariates_by_nstep = [-lags_by_nstep+2,-lags_by_nstep+1,-lags_by_nstep] #Mảng này chứa ba giá trị tương ứng cho args.lookback 3

specific_data = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
scaler = joblib.load(output_process+city+'_train_scalerMinMaxNorm.save') #ok
labels=args.labels

df_train, df_eval = split_data(specific_data, args.look_back,nstep)
selected_features = getDataWithSelectedFeature(city, nstep)


predicted_train_days = len(df_train)-args.look_back-nstep+1
predicted_test_days = len(df_eval)-args.look_back-nstep+1
x_train = TimeSeries.from_dataframe(df_train, "year_month", selected_features)
y_train = TimeSeries.from_dataframe(df_train, "year_month", labels)

x_test = TimeSeries.from_dataframe(df_eval, "year_month", selected_features)
y_test = TimeSeries.from_dataframe(df_eval, "year_month", labels)

random_state = 300#trial.suggest_int('random_state', 0, 1000)
n_rnn_layers =  2#trial.suggest_int('n_rnn_layers', 1, 3)
dropout =  0.2#trial.suggest_uniform('dropout', 0.1, 0.5)
hidden_dim =  10#trial.suggest_int('n_rnn_layers', 5, 20)
n_epochs =  100#trial.suggest_int('n_epochs', 50, 200)

pl_trainer_kwargs = {
              "accelerator": "cpu",
            #   "devices": -1,
            #   "auto_select_gpus": True,
          }

# model = BlockRNNModel(
#                     input_chunk_length = 3,
#                     output_chunk_length = 6,
#                     hidden_dim = hidden_dim,
#                     n_rnn_layers = n_rnn_layers,
#                     dropout = dropout,
#                     n_epochs = n_epochs,
#                     pl_trainer_kwargs = pl_trainer_kwargs,
#                     random_state=random_state)
# model = TFTModel(
#                     input_chunk_length = 3,
#                     output_chunk_length = 6,
#                     add_relative_index = True,
#                     dropout = dropout,
#                     n_epochs = n_epochs ,
#                     pl_trainer_kwargs = pl_trainer_kwargs,
#                     random_state=random_state)
# model = NHiTSModel(
#                           input_chunk_length = 3,
#                           output_chunk_length = 6,
#                           MaxPool1d = True,
#                           dropout = dropout,
#                           n_epochs = n_epochs ,
#                           pl_trainer_kwargs = pl_trainer_kwargs,
#                           random_state=random_state)
model = NBEATSModel(
                            input_chunk_length = 3,
                            output_chunk_length = 6,
                            dropout = dropout,
                            n_epochs = n_epochs ,
                            pl_trainer_kwargs = pl_trainer_kwargs,
                            random_state=random_state)
# model = TCNModel(
#           input_chunk_length = 3,
#           output_chunk_length = 6,
#           batch_size=16,
#           n_epochs=50,
#           nr_epochs_val_period=1,
#           kernel_size=2,
#           num_filters=1,
#           weight_norm=True,
#           dilation_base=3,
#           dropout=0.2,
#           optimizer_kwargs={"lr": 5e-5},
#           add_encoders=None,
#           likelihood=GaussianLikelihood(),
#           pl_trainer_kwargs=pl_trainer_kwargs,
#           model_name="tcn_model",
#           force_reset=True,
#           save_checkpoints=True,
#       )
model.fit(y_train, past_covariates = x_train)

In [627]:
prediction = model.predict(6, past_covariates = x_test[0:8], num_samples=1)
prediction

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Predicting DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 125.50it/s]


In [671]:
def objective(model_name, trial, city, nstep):   
    specific_data = pd.read_csv(output_process+city+'_train_preprocessed.csv', parse_dates=True, index_col= None, encoding = 'unicode_escape')
    scaler = joblib.load(output_process+city+'_train_scalerMinMaxNorm.save') #ok

    df_train, df_valid = split_data(specific_data, args.look_back,nstep)

    selected_features = getDataWithSelectedFeature(city, nstep)

    lags_by_nstep = args.look_back + nstep - 1
    lags_past_covariates_by_nstep = [-lags_by_nstep+2,-lags_by_nstep+1,-lags_by_nstep] #Mảng này chứa ba giá trị tương ứng cho args.lookback 3
    is_dl_algo = 0

    pl_trainer_kwargs = {
              "accelerator": "cpu",
              # "devices": -1,
              # "auto_select_gpus": True,
          }

    if model_name == "RandomForest":
      random_state = trial.suggest_int('random_state', 0, 42)
      n_estimators = trial.suggest_int('n_estimators', 50, 200)
      max_depth = trial.suggest_int('max_depth', 1, 15)
      # Create the RandomForest model
      model = RandomForest(
                    lags = lags_by_nstep,
                    lags_past_covariates = lags_past_covariates_by_nstep,
                    output_chunk_length = 1,
                    n_estimators = n_estimators,
                    max_depth = max_depth,
                    random_state=random_state)
    elif model_name == 'XGBModel':
      random_state = trial.suggest_int('random_state', 0, 43)
      likelihood = trial.suggest_categorical('likelihood', ['quantile'])
      # Create the  model
      model = XGBModel(
                      lags = lags_by_nstep,
                      lags_past_covariates = lags_past_covariates_by_nstep,
                      output_chunk_length = 1,
                      random_state=random_state,
                      likelihood = likelihood
                    )
    elif model_name == 'LinearRegressionModel':
      random_state = trial.suggest_int('random_state', 0, 43)
      # Create the  model
      model = LinearRegressionModel(
                      lags = lags_by_nstep,
                      lags_past_covariates = lags_past_covariates_by_nstep,
                      output_chunk_length = 1,
                      random_state=random_state)
    elif model_name == "CatBoostModel":
      #suggest hyperparams
      learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
      n_estimators = trial.suggest_int('n_estimators', 50, 200)
      max_depth = trial.suggest_int('max_depth', 1, 15)
      random_state = trial.suggest_int('random_state', 0, 1000)
      likelihood = trial.suggest_categorical('likelihood', ['quantile'])
      quantiles =  trial.suggest_categorical('quantiles', [None, [0.1, 0.5, 0.9]])
      bagging_temperature = trial.suggest_float('bagging_temperature', 0.01, 100.0)
      border_count = trial.suggest_int('border_count', 1, 255)
      l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 0.1, 10)
      random_strength = trial.suggest_float('random_strength', 0.1, 10)
      model = CatBoostModel(
                            lags=lags_by_nstep,
                            lags_past_covariates=lags_past_covariates_by_nstep, 
                            learning_rate=learning_rate,
                            n_estimators=n_estimators,
                            max_depth=max_depth, 
                            output_chunk_length = 1,
                            likelihood = likelihood,
                            quantiles = quantiles,
                            bagging_temperature = bagging_temperature,
                            border_count = border_count,
                            l2_leaf_reg = l2_leaf_reg,
                            random_strength = random_strength,
                            random_state=random_state)
    elif model_name == "LightGBMModel":
      params = {
        "lags": lags_by_nstep,
        "lags_past_covariates": lags_past_covariates_by_nstep,
        "random_state": trial.suggest_int("random_state", 0, 999),
        "multi_models": trial.suggest_categorical("multi_models", [True, False]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'verbose': -1,
        'likelihood' : trial.suggest_categorical("likelihood", ["quantile"])
      }

      param = params
      model = LightGBMModel(
          lags = param['lags'],
          lags_past_covariates = param['lags_past_covariates'],
          output_chunk_length = 1,
          random_state = param['random_state'],
          multi_models = param['multi_models'],
          likelihood = param['likelihood'],
          num_leaves = param['num_leaves'],
          learning_rate = param['learning_rate'],
          feature_fraction = param['feature_fraction'],
          bagging_fraction = param['bagging_fraction'],
          min_child_samples = param['min_child_samples'],
          lambda_l1 = param['lambda_l1'],
          verbose = param['verbose']
      )
    elif model_name == "SVMRBF":
      random_state = trial.suggest_int('random_state', 0, 42)
      model = SVC(
          random_state = random_state,
          kernel = 'rbf',
          probability=True
      )
    elif model_name == "PoissonRegressor":
      max_iter = trial.suggest_int('max_iter', 50, 200)
      model = PoissonRegressor(
          max_iter = max_iter,
      )
    elif model_name == "BlockRNNModel":
      #suggest hyperparams
      random_state = trial.suggest_int('random_state', 0, 1000)
      n_rnn_layers = trial.suggest_int('n_rnn_layers', 1, 3)
      dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
      hidden_dim = trial.suggest_int('n_rnn_layers', 5, 20)
      n_epochs = trial.suggest_int('n_epochs', 50, 200)

      model = BlockRNNModel(
                          input_chunk_length = args.look_back,
                          output_chunk_length = args.n_predicted_period_months,
                          hidden_dim = hidden_dim,
                          n_rnn_layers = n_rnn_layers,
                          dropout = dropout,
                          n_epochs = n_epochs,
                          pl_trainer_kwargs = pl_trainer_kwargs,
                          random_state=random_state)
      is_dl_algo = 1
    elif model_name == 'TFTModel':
      # Define the hyperparameters to optimize
      random_state = trial.suggest_int('random_state', 0, 42)
      dropout = trial.suggest_uniform('dropout', 0.01, 0.8)
      n_epochs = trial.suggest_int('n_epochs', 50, 200)

      # Create the TFTModel model
      model = TFTModel(
                    input_chunk_length = args.look_back,
                    output_chunk_length = args.n_predicted_period_months,
                    add_relative_index = True,
                    dropout = dropout,
                    n_epochs = n_epochs ,
                    random_state=random_state)
      is_dl_algo = 1
    elif model_name == 'NHiTSModel':
      #suggest hyperparams
      random_state = trial.suggest_int('random_state', 0, 42)
      dropout = trial.suggest_uniform('dropout', 0.01, 0.80)
      n_epochs = trial.suggest_int('n_epochs', 100, 500, step=10)
      MaxPool1d = trial.suggest_categorical('MaxPool1d', [True, False])

      model = NHiTSModel(
                          input_chunk_length = args.look_back,
                          output_chunk_length = args.n_predicted_period_months,
                          MaxPool1d = MaxPool1d,
                          dropout = dropout,
                          n_epochs = n_epochs ,
                          pl_trainer_kwargs = pl_trainer_kwargs,
                          random_state=random_state)
      is_dl_algo = 1
    elif model_name == "NBEATSModel":
      random_state = trial.suggest_int('random_state', 0, 42)
      dropout = trial.suggest_uniform('dropout', 0.01, 0.80)
      n_epochs = trial.suggest_int('n_epochs', 50, 200)
      model = NBEATSModel(
                            input_chunk_length = args.look_back,
                            output_chunk_length = args.n_predicted_period_months,
                            dropout = dropout,
                            n_epochs = n_epochs ,
                            pl_trainer_kwargs = pl_trainer_kwargs,
                            random_state=random_state)
      is_dl_algo = 1
    elif model_name == "TCNModel":
      params = {
        'kernel_size': trial.suggest_int("kernel_size", 2, lags_by_nstep),
        'num_filters': trial.suggest_int("num_filters", 1, 5),
        'weight_norm': trial.suggest_categorical("weight_norm", [False, True]),
        'dilation_base': trial.suggest_int("dilation_base", 2, 4),
        'dropout': trial.suggest_float("dropout", 0.0, 0.4),
        'learning_rate': trial.suggest_float("learning_rate", 5e-5, 1e-3, log=True),
        'include_year': trial.suggest_categorical("year", [False, True]),
        'n_epochs': trial.suggest_int("n_epochs", 100, 300),
      }
      # select input and output chunk lengths
      params['input_chunk_length'] = args.look_back
      params['output_chunk_length'] = args.n_predicted_period_months  
      # optionally also add the (scaled) year value as a past covariate
      if params['include_year']:
          encoders = {"datetime_attribute": {"past": ["year"]},
                      "transformer": Scaler()}
      else:
          encoders = None
      params['encoders'] = encoders
      param = params
      model = TCNModel(
          input_chunk_length=param['input_chunk_length'],
          output_chunk_length=param['output_chunk_length'],
          batch_size=16,
          n_epochs=param['n_epochs'],
          nr_epochs_val_period=1,
          kernel_size=param['kernel_size'],
          num_filters=param['num_filters'],
          weight_norm=param['weight_norm'],
          dilation_base=param['dilation_base'],
          dropout=param['dropout'],
          optimizer_kwargs={"lr": param['learning_rate']},
          add_encoders=param['encoders'],
          likelihood=GaussianLikelihood(),
          pl_trainer_kwargs=pl_trainer_kwargs,
          model_name="tcn_model",
          force_reset=True,
          save_checkpoints=True,
      )
      is_dl_algo = 1
    
    mae_error = output_prediction_for_location(df_train, df_valid, model, location=city, feature_list=selected_features,
                                                labels=args.labels, scaler=scaler, is_dl_algo = is_dl_algo,nstep)

    return mae_error

# Main run optimize and save

In [738]:
#########################
# Main cell for optimize ML algorithm
#########################

model_name_list = [
     "RandomForest",
    #  "LinearRegressionModel",
    #  "LightGBMModel",
    #  "CatBoostModel",
    #  "XGBModel",
    # "PoissonRegressor",
    # "SVMRBF"
]


# Lưu thông tin traceback study và error city trong quá trình optimize
l_study_city ={}
l_errCity =[]

if __name__ == '__main__':
  for nstep in range(1,args.n_predicted_period_months+1):
    print("⭐️ Nstep: ",nstep)
    lags_by_nstep = args.look_back + nstep - 1
    lags_past_covariates_by_nstep = [-lags_by_nstep+2,-lags_by_nstep+1,-lags_by_nstep] #Mảng này chứa ba giá trị tương ứng cho args.lookback 3
    lags_past_covariates_in_str = str(-lags_by_nstep+2)+","+str(-lags_by_nstep+1)+","+str(-lags_by_nstep)
    for model_name in model_name_list: 
      print("⭐️ Model_name: ",model_name)
      best_param = pd.DataFrame()
      for city_index in range(len(cities)):
        print("⭐️ City: ",cities[city_index])
        # Use Tree-structured Parzen Estimator sampler to minimise RMSE
        sampler = optuna.samplers.TPESampler()
        study = optuna.create_study(sampler=sampler, direction='minimize', study_name = model_name)
        # truyền multiple param vào trong biến trial
        obj_func = lambda trial: objective(model_name, trial, cities[city_index], nstep = nstep)
        try:
          # Optimise over 100 trials
          study.optimize(obj_func, n_trials=args.ntry, n_jobs=args.njob)

          # Print results
          print("Study statistics for : ")
          print("  Number of finished trials: ", len(study.trials))
          print("Best trial of city: ",cities[city_index])

          best_trial = study.best_trial
          # lưu best param vào trong biến toàn cục

          if model_name == "LinearRegressionModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'LinearRegressionModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'lags' : lags_by_nstep,
                                'lags_past_covariates': [lags_past_covariates_in_str],
                                'output_chunk_length': 1,
                                'random_state':best_trial.params['random_state'],
                                })
          elif model_name == 'XGBModel':
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'XGBModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'lags' : lags_by_nstep,
                                'lags_past_covariates': [lags_past_covariates_in_str],
                                'output_chunk_length': 1,
                                'random_state':best_trial.params['random_state'],
                                'likelihood': best_trial.params['likelihood'],
                                })
          elif model_name == "LightGBMModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'LightGBMModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'lags': lags_by_nstep,
                                'lags_past_covariates': [lags_past_covariates_in_str],
                                'multi_models': best_trial.params['multi_models'],
                                'num_leaves': best_trial.params['num_leaves'], 
                                'feature_fraction': best_trial.params['feature_fraction'], 
                                'min_child_samples': best_trial.params['min_child_samples'], 
                                'lambda_l1': best_trial.params['lambda_l1'], 
                                'lambda_l2': best_trial.params['lambda_l2'], 
                                'likelihood': best_trial.params['likelihood'], 
                                'learning_rate': best_trial.params['learning_rate']
                                })
          elif model_name == "CatBoostModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'CatBoost',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'lags' : lags_by_nstep,
                                'lags_past_covariates': [lags_past_covariates_in_str],
                                'output_chunk_length': 1,
                                'likelihood': best_trial.params['likelihood'],
                                'learning_rate': best_trial.params['learning_rate'],
                                'n_estimators': best_trial.params['n_estimators'],
                                'max_depth': best_trial.params['max_depth'],
                                'bagging_temperature': best_trial.params['bagging_temperature'],
                                'l2_leaf_reg': best_trial.params['l2_leaf_reg'],
                                'random_strength':best_trial.params['random_strength'],
                                })
          elif model_name == "RandomForest":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'RandomForest',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'lags' : lags_by_nstep,
                                'lags_past_covariates': [lags_past_covariates_in_str],
                                'output_chunk_length': 1,
                                'n_estimators': best_trial.params['n_estimators'],
                                'max_depth': best_trial.params['max_depth'],
                                'random_state':best_trial.params['random_state'],
                                })
          elif model_name == "SVMRBF":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'RandomForest',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'random_state':best_trial.params['random_state'],
                                'kernel': 'rbf',
                                'probability': True,
                                })
          elif model_name == "PoissonRegressor":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'RandomForest',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'max_iter':best_trial.params['max_iter'],
                                })
          # file_path = 'opt_results/opt_res_ml_26102023/261023_DF_opt_hyperparam_'+ model_name + '_'+str(nstep)+'-nstep.xlsx'
          folder_path = f'opt_results/opt_res_ml_26102023/{model_name}/'
          file_path = folder_path+ f'261023_DF_opt_hyperparam_{model_name}_{nstep}-nstep.xlsx'
          if(os.path.isfile(file_path)):
              with pd.ExcelWriter(file_path,mode="a",engine="openpyxl",if_sheet_exists="overlay") as writer:
                  one_city_param.to_excel(writer, header=None, startrow=city_index+1,index=False)
          else:
              if(not (os.path.isdir(folder_path))):
                os.mkdir(folder_path)
              with pd.ExcelWriter(file_path,engine="openpyxl") as writer:
                  one_city_param.to_excel(writer, startrow=city_index,index=False)
        except:# có error thì lưu vào l_errCity để check lại sau 
          l_errCity.append(cities[city_index])
          #send_to_telegram(f'Tỉnh bị lỗi trong quá trình optimize bằng model {model_name}: {cities[city_index]}')

[I 2023-10-27 12:27:26,746] A new study created in memory with name: RandomForest


⭐️ Nstep:  1
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:27,158] Trial 0 finished with value: 6.097256629441549 and parameters: {'random_state': 12, 'n_estimators': 175, 'max_depth': 6}. Best is trial 0 with value: 6.097256629441549.
[I 2023-10-27 12:27:27,191] A new study created in memory with name: RandomForest
[I 2023-10-27 12:27:27,306] Trial 0 finished with value: 13.60339303873743 and parameters: {'random_state': 28, 'n_estimators': 88, 'max_depth': 12}. Best is trial 0 with value: 13.60339303873743.
[I 2023-10-27 12:27:27,312] A new study created in memory with name: RandomForest


mean_squared_error: 105.0770
rmse: 10.250708569440274
mape: 86762.76542375951
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 319.2892
rmse: 17.868664186081478
mape: 5.1471560068223
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu
⭐️ Nstep:  2
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:27,439] Trial 0 finished with value: 5.416176071564901 and parameters: {'random_state': 42, 'n_estimators': 89, 'max_depth': 4}. Best is trial 0 with value: 5.416176071564901.
[I 2023-10-27 12:27:27,446] A new study created in memory with name: RandomForest
[I 2023-10-27 12:27:27,539] Trial 0 finished with value: 10.398870733354892 and parameters: {'random_state': 36, 'n_estimators': 62, 'max_depth': 13}. Best is trial 0 with value: 10.398870733354892.
[I 2023-10-27 12:27:27,549] A new study created in memory with name: RandomForest


mean_squared_error: 78.3160
rmse: 8.849632941120936
mape: 36683.25692995537
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 186.4894
rmse: 13.656110371310708
mape: 5.926614935339083
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu
⭐️ Nstep:  3
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:27,767] Trial 0 finished with value: 5.883820809194738 and parameters: {'random_state': 4, 'n_estimators': 122, 'max_depth': 13}. Best is trial 0 with value: 5.883820809194738.
[I 2023-10-27 12:27:27,773] A new study created in memory with name: RandomForest
[I 2023-10-27 12:27:27,891] Trial 0 finished with value: 12.800498357354186 and parameters: {'random_state': 4, 'n_estimators': 82, 'max_depth': 14}. Best is trial 0 with value: 12.800498357354186.
[I 2023-10-27 12:27:27,898] A new study created in memory with name: RandomForest


mean_squared_error: 92.6774
rmse: 9.626912151081413
mape: 56348.513387068066
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 262.6209
rmse: 16.205581472183017
mape: 12.117646372883518
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu
⭐️ Nstep:  4
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:28,094] Trial 0 finished with value: 4.5817607680430985 and parameters: {'random_state': 5, 'n_estimators': 154, 'max_depth': 2}. Best is trial 0 with value: 4.5817607680430985.
[I 2023-10-27 12:27:28,101] A new study created in memory with name: RandomForest
[I 2023-10-27 12:27:28,186] Trial 0 finished with value: 9.032816742330965 and parameters: {'random_state': 30, 'n_estimators': 55, 'max_depth': 6}. Best is trial 0 with value: 9.032816742330965.
[I 2023-10-27 12:27:28,193] A new study created in memory with name: RandomForest


mean_squared_error: 32.8157
rmse: 5.728502505361685
mape: 23855.24432182322
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 154.3058
rmse: 12.42198751084482
mape: 3.6104415506733516
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu
⭐️ Nstep:  5
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:28,419] Trial 0 finished with value: 4.176993094262514 and parameters: {'random_state': 35, 'n_estimators': 175, 'max_depth': 1}. Best is trial 0 with value: 4.176993094262514.
[I 2023-10-27 12:27:28,426] A new study created in memory with name: RandomForest


mean_squared_error: 28.7323
rmse: 5.360255789860392
mape: 27959.55600678128
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 121.1243
rmse: 11.00564820384095
mape: 2.5147025401791265


[I 2023-10-27 12:27:28,624] Trial 0 finished with value: 7.537401765990689 and parameters: {'random_state': 36, 'n_estimators': 131, 'max_depth': 13}. Best is trial 0 with value: 7.537401765990689.
[I 2023-10-27 12:27:28,631] A new study created in memory with name: RandomForest


Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu
⭐️ Nstep:  6
⭐️ Model_name:  RandomForest
⭐️ City:  An Giang


[I 2023-10-27 12:27:28,980] Trial 0 finished with value: 5.036410127138646 and parameters: {'random_state': 33, 'n_estimators': 187, 'max_depth': 13}. Best is trial 0 with value: 5.036410127138646.
[I 2023-10-27 12:27:28,987] A new study created in memory with name: RandomForest
[I 2023-10-27 12:27:29,108] Trial 0 finished with value: 9.733181115039953 and parameters: {'random_state': 10, 'n_estimators': 79, 'max_depth': 7}. Best is trial 0 with value: 9.733181115039953.


mean_squared_error: 40.9196
rmse: 6.39684056164939
mape: 60145.4629309903
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  An Giang
⭐️ City:  BR Vũng Tàu
mean_squared_error: 152.8215
rmse: 12.36210033301853
mape: 3.1381712810440496
Study statistics for : 
  Number of finished trials:  1
Best trial of city:  BR Vũng Tàu


In [None]:
#########################
# Main cell for optimize DL algorithm
#########################

model_name_list = [
    #  "BlockRNNModel",
    #  "NBEATSModel",
    #  "NHiTSModel",
    #  "TFTModel",
    #  "TCNModel",
    
]

# Lưu thông tin traceback study và error city trong quá trình optimize
l_study_city ={}
l_errCity =[]

if __name__ == '__main__':
    for model_name in model_name_list: 
      print("⭐️ Model_name: ",model_name)
      best_param = pd.DataFrame()
      for city_index in range(len(cities)):
        print("⭐️ City: ",cities[city_index])
        # Use Tree-structured Parzen Estimator sampler to minimise RMSE
        sampler = optuna.samplers.TPESampler()
        study = optuna.create_study(sampler=sampler, direction='minimize', study_name = model_name)
        # truyền multiple param vào trong biến trial
        obj_func = lambda trial: objective(model_name, trial, cities[city_index], nstep = args.n_predicted_period_months)
        try:
          # Optimise over 100 trials
          study.optimize(obj_func, n_trials=args.ntry, n_jobs=args.njob)

          # Print results
          print("Study statistics for : ")
          print("  Number of finished trials: ", len(study.trials))
          print("Best trial of city: ",cities[city_index])

          best_trial = study.best_trial
          # lưu best param vào trong biến toàn cục
          if model_name == "NHiTSModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'N-HiTS',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'input_chunk_length' : lags_by_nstep,
                                'output_chunk_length' : 1,
                                'MaxPool1d' : best_trial.params['MaxPool1d'],
                                'dropout' : best_trial.params['dropout'],
                                'n_epochs' : best_trial.params['n_epochs'],
                                'random_state' : best_trial.params['random_state'],
                                })
          elif model_name == "TCNModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'TCNModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'input_chunk_length': lags_by_nstep,
                                'output_chunk_length': 1,
                                'n_epochs':best_trial.params['n_epochs'],
                                'num_filters':best_trial.params['num_filters'],
                                'weight_norm':best_trial.params['weight_norm'],
                                'dilation_base':best_trial.params['dilation_base'],
                                'dropout':best_trial.params['dropout'],
                                'learning_rate':best_trial.params['learning_rate'],
                                'year':best_trial.params['year'],
                                })
          elif model_name == "NBEATSModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'NBeatsModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'output_chunk_length': 1,
                                'input_chunk_length': lags_by_nstep,
                                'n_epochs':best_trial.params['n_epochs'],
                                'dropout':best_trial.params['dropout'],
                                'random_state':best_trial.params['random_state'],
                                })  
          elif model_name == "TFTModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'TFTModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'output_chunk_length': 1,
                                'input_chunk_length': lags_by_nstep,
                                'add_relative_index': True,
                                'random_state':best_trial.params['random_state'],
                                'n_epochs':best_trial.params['n_epochs'],
                                'dropout':best_trial.params['dropout']
                                })
          elif model_name == "BlockRNNModel":
            one_city_param = pd.DataFrame({
                                'City':  cities[city_index],
                                'Alg_name': 'BlockRNNModel',
                                'Best_value': best_trial.value,
                                'n_try_opt': args.ntry,
                                'input_chunk_length': lags_by_nstep,
                                'output_chunk_length': 1,
                                'random_state':best_trial.params['random_state'],
                                'n_epochs':best_trial.params['n_epochs'],
                                'n_rnn_layers': best_trial.params['n_rnn_layers'],
                                'dropout':best_trial.params['dropout']
                                })
          # file_path = 'opt_results/opt_res_ml_26102023/261023_DF_opt_hyperparam_'+ model_name + '_'+str(nstep)+'-nstep.xlsx'
          folder_path = f'opt_results/opt_res_ml_26102023/{model_name}/'
          file_path = folder_path+ f'261023_DF_opt_hyperparam_{model_name}_{nstep}-nstep.xlsx'
          if(os.path.isfile(file_path)):
              with pd.ExcelWriter(file_path,mode="a",engine="openpyxl",if_sheet_exists="overlay") as writer:
                  one_city_param.to_excel(writer, header=None, startrow=city_index+1,index=False)
          else:
              if(not (os.path.isdir(folder_path))):
                os.mkdir(folder_path)
              with pd.ExcelWriter(file_path,engine="openpyxl") as writer:
                  one_city_param.to_excel(writer, startrow=city_index,index=False)
        except:# có error thì lưu vào l_errCity để check lại sau 
          l_errCity.append(cities[city_index])
          #send_to_telegram(f'Tỉnh bị lỗi trong quá trình optimize bằng model {model_name}: {cities[city_index]}')

In [475]:
# send_to_telegram("Chạy xong optimize rùiii!!Vô check thuiii!!!" )