In [1]:
import numpy as np
import pandas as pd
import os
import re
import math
import random
import pickle

import matplotlib.dates as mdates
from datetime import time
xformatter = mdates.DateFormatter('%H:%M')  # for time axis plots
import datetime
from dateutil.parser import parse
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
%matplotlib inline
from matplotlib import style
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were dropped in later releases, so use whichever name this
# installation actually provides.
style.use('seaborn-v0_8-whitegrid'
          if 'seaborn-v0_8-whitegrid' in style.available
          else 'seaborn-whitegrid')

from bayes_opt import BayesianOptimization
from pandas.tseries.frequencies import to_offset
from pickle import load,dump
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional, BatchNormalization, Dropout
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.backend import clear_session
import warnings
warnings.filterwarnings('ignore')

In [2]:
# strftime/strptime pattern: ISO-like date followed by a 24-hour clock time.
FORMAT_TIME = '%Y-%m-%d %H:%M:%S'
# Clock times 05:00:00 and 18:30:00.
# NOTE(review): presumably the daily window of interest; neither constant is
# referenced in the cells visible here - confirm before relying on them.
START_TIME_DAY = time(hour=5, minute=0, second=0)
END_TIME_DAY = time(hour=18, minute=30, second=0)

In [3]:
TIME_1_STEP = 15  # length of one time step, in minutes
# Number of TIME_1_STEP-minute steps in one day and in two hours.
step_lag_1_day = (24 * 60) // TIME_1_STEP
steps_2hours = (2 * 60) // TIME_1_STEP

In [4]:
# Lookup tables for the categorical hyper-parameters: the optimiser proposes a
# float, which is rounded and used as an index into these lists.
activation_list = [
    'linear',
    'relu',
    'tanh',
    'sigmoid',
]
optimizer_list = [
    'adam',
    'rmsprop',
    'sgd',
    'Adamax',
]
# [low, high] pair; not referenced by the cells visible here.
range_neurons = [8, 100]
# Starts empty; not referenced at module level by the cells visible here.
selectors = list()

In [5]:
# Load the training data
def import_train_data(path_file_train):
    """Read the training CSV and index it by its parsed ``TimeStamp`` column.

    path_file_train: anything ``pd.read_csv`` accepts (path or file-like).
    Returns a DataFrame whose index is the ``TimeStamp`` column converted with
    ``pd.to_datetime``; the remaining columns are left as read.
    """
    raw = pd.read_csv(path_file_train)
    parsed = raw.assign(TimeStamp=pd.to_datetime(raw['TimeStamp']))
    return parsed.set_index('TimeStamp')

In [6]:
def resample_df(df, resample_time, time_col='TimeStamp'):
    """
    Average `df` into fixed-width time bins labelled by their right edge.

    resample_time: bin width in `minute`
    time_col: name of the timestamp column. Only consulted when
        resample_time >= 30; if no such column exists, a DatetimeIndex on
        `df` (e.g. the one import_train_data() creates) is used instead.

    For resample_time >= 30 the timestamps are moved back by
    resample_time // 2 minutes before binning. Narrower bins are computed on
    the existing index, which must therefore already be a DatetimeIndex.

    Returns a new DataFrame; `df` itself is not modified.
    Raises KeyError when resample_time >= 30 and the timestamps are available
    neither as the column `time_col` nor as a DatetimeIndex.
    """
    resampled = df.copy()
    if resample_time >= 30:
        if time_col in resampled.columns:
            timestamps = resampled[time_col]
        elif isinstance(resampled.index, pd.DatetimeIndex):
            # The timestamps were already moved into the index, so the
            # column lookup above would fail with KeyError.
            timestamps = resampled.index
        else:
            raise KeyError(time_col)
        half_window = to_offset(str(resample_time // 2) + "min")
        resampled = resampled.set_index(timestamps - half_window)
    return resampled.resample(str(resample_time) + 'min', label='right').mean()

In [None]:
# Load './LN2_training.csv' and average it into 15-minute bins.
all_plant = resample_df(
    import_train_data('./LN2_training.csv'),
    resample_time=15,
    time_col='TimeStamp',
)

In [None]:
# Split the data into train and validation sets (80/20 by default)
def train_valid_split(df, split_ratio=(0.8, 0.2)):
    """Split `df` by position into a leading train part and a trailing validation part.

    df: any sliceable, sized object (DataFrame, list, ...); rows are not shuffled.
    split_ratio: (train_ratio, valid_ratio) pair that must sum to 1.

    Returns (train, val): the first int(len(df) * train_ratio) rows and all
    remaining rows. The sizes of both parts are printed.
    Raises AssertionError if the two ratios do not sum to 1.
    """
    train_ratio, valid_ratio = split_ratio
    # Compare with a tolerance: an exact float check rejects valid pairs such
    # as (0.7, 0.3), whose sum evaluates to 0.9999999999999999.
    assert math.isclose(train_ratio + valid_ratio, 1.0), \
        'split_ratio must sum to 1'

    train_split = int(len(df) * train_ratio)
    train = df[:train_split]
    # Everything after the train rows: the ratios sum to 1, and slicing to the
    # end cannot lose the last row to float truncation.
    val = df[train_split:]

    print(f'Train set: {len(train)} ')
    print(f'Validation set: {len(val)} ')

    return train, val

In [None]:
def num_step_1hour(df):
    """
    Get number step of 1 hours

    The step length is taken from the gap between the first two index
    entries, truncated to whole minutes; later gaps are not inspected.

    Returns 60 // step_minutes, or None when the step cannot be determined:
    the index is not a DatetimeIndex, it has fewer than two entries, or the
    first gap is not at least one whole positive minute.
    """
    index = df.index
    if not isinstance(index, pd.DatetimeIndex) or len(index) < 2:
        return None
    minutes_per_step = int((index[1] - index[0]) / np.timedelta64(1, 'm'))
    if minutes_per_step <= 0:
        # Duplicate, descending or sub-minute timestamps: no meaningful count
        # (and a zero step would divide by zero below).
        return None
    return 60 // minutes_per_step

In [None]:
def make_data_supervised(dt, num_pre_around=5, num_day_pre=3):
    """Turn a time-indexed frame into a table of lagged features.

    For every column ``c`` of `dt` the result holds, in this order:
      * ``c(t)``: the column itself;
      * ``c(t-1)`` ... ``c(t-num_pre_around)``: the immediately preceding steps;
      * for each day offset d in 1 .. num_day_pre - 1: ``c(t-k)`` for every k
        within +/- num_pre_around steps of d * (steps per day).
    Rows containing any NaN (mostly the leading rows without enough history)
    are dropped, then the calendar columns ``hour``, ``day``, ``day_of_week``,
    ``month`` and ``day_of_year`` are appended from the index.

    dt: DataFrame with string column names; its index must let
        num_step_1hour() infer the sampling step.
    num_pre_around: number of neighbouring steps taken around each anchor.
    num_day_pre: day offsets run over range(1, num_day_pre), i.e. the default
        3 looks back one and two days.
        NOTE(review): confirm that "exclusive" is the intended meaning.

    Returns a new DataFrame; `dt` is not modified.
    Raises TypeError when the sampling step cannot be inferred.
    """
    steps_per_hour = num_step_1hour(dt)
    if steps_per_hour is None:
        raise TypeError(
            'cannot infer the sampling step of `dt`; it needs a DatetimeIndex')
    steps_per_day = steps_per_hour * 24

    # Collect every lagged series first and assemble the frame in one go:
    # inserting hundreds of columns one at a time fragments the DataFrame.
    # A dict keeps first-insertion order, and re-assigning an existing name
    # keeps its position, exactly like `frame[name] = ...` does.
    lagged = {}
    for col in dt.columns:
        series = dt[col]
        lagged[col + '(t)'] = series
        for lag in range(1, num_pre_around + 1):
            lagged[col + '(t-' + str(lag) + ')'] = series.shift(lag)
        for day in range(1, num_day_pre):
            day_lag = steps_per_day * day
            for offset in range(num_pre_around + 1):
                for shift in (day_lag + offset, day_lag - offset):
                    lagged[col + '(t-' + str(shift) + ')'] = series.shift(shift)

    dt_lag = pd.concat(list(lagged.values()), axis=1,
                       keys=list(lagged.keys())).dropna()

    stamps = dt_lag.index
    dt_lag['hour'] = stamps.hour
    dt_lag['day'] = stamps.day
    dt_lag['day_of_week'] = stamps.dayofweek
    dt_lag['month'] = stamps.month
    dt_lag['day_of_year'] = stamps.dayofyear
    return dt_lag

In [None]:
def select_Kbest(train,score_f):
    """Rank the feature columns of `train` by their score against 'TotW(t)'.

    train: DataFrame containing a 'TotW(t)' column. Every column except the
        first one is treated as a feature, so 'TotW(t)' is expected to be the
        first column.
    score_f: scoring function handed to SelectKBest (e.g. f_regression).

    Returns a DataFrame with columns 'Specs' (feature name) and 'Score',
    sorted by 'Score' in descending order.
    """
    features = train[train.columns[1:]]
    target = train['TotW(t)']
    # k='all' keeps every feature: only the fitted scores are of interest.
    fitted = SelectKBest(score_func=score_f, k='all').fit(features, target)
    ranking = pd.DataFrame({'Specs': features.columns,
                            'Score': fitted.scores_})
    return ranking.sort_values(by='Score', ascending=False)

In [None]:
def make_model(train, val, bo_result):
    """Build, train and return the final bidirectional-LSTM regressor.

    train, val: DataFrames whose first column is the target and whose
        remaining columns are the features. Each row becomes one sample with
        a single time step.
    bo_result: dict as returned by BO_result(); 'neurons' is the LSTM width,
        'activations' the LSTM activation and 'optimizers' the optimizer name.

    Trains for up to 100 epochs with early stopping on 'val_loss' (patience
    15, best weights restored), plots the train/validation MAE curves and
    returns the fitted model.
    """
    def to_xy(frame):
        # Column 0 is the target; the rest is reshaped to
        # (samples, 1 time step, features) for the LSTM layer.
        values = frame.values
        features = values[:, 1:]
        return (features.reshape(features.shape[0], 1, features.shape[1]),
                values[:, :1])

    train_X, train_y = to_xy(train)
    val_X, val_y = to_xy(val)

    # Size the layer with the optimised neuron count. The feature count
    # ('num_feature') was used here before, which discarded the tuned value;
    # it is kept only as a fallback for dicts that carry no 'neurons' key.
    units = (bo_result['neurons'] if 'neurons' in bo_result
             else bo_result['num_feature'])

    model = Sequential()
    model.add(
        Bidirectional(LSTM(units,
                           activation=bo_result['activations'],
                           input_shape=(train_X.shape[1], train_X.shape[2])),
                      merge_mode='ave'))
    model.add(Dense(1))
    # NOTE(review): newer Keras releases call this metric 'cosine_similarity';
    # confirm 'cosine_proximity' is accepted by the installed version.
    model.compile(loss='mae',
                  optimizer=bo_result['optimizers'],
                  metrics=['mse', 'mae', 'cosine_proximity'])
    # fit network; `callbacks` is documented as a list of callbacks
    history = model.fit(train_X,
                        train_y,
                        epochs=100,
                        batch_size=50,
                        validation_data=(val_X, val_y),
                        verbose=1,
                        shuffle=False,
                        callbacks=[EarlyStopping(monitor='val_loss',
                                                 patience=15,
                                                 restore_best_weights=True)])
    fig, ax = plt.subplots(figsize=(16, 8))
    fig.suptitle('Loss', y=0.93)
    ax.plot(history.history['mae'], label='train')
    ax.plot(history.history['val_mae'], label='val')
    ax.set_title('mae')
    ax.legend(loc='upper right')
    return model

In [None]:
def get_or_create_path(path):
    """Make sure the directory `path` exists, creating missing parents too.

    Does nothing when the directory is already there. Raises OSError
    (FileExistsError) when `path` exists but is not a directory.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the gap between
    # a separate "does it exist?" check and the creation.
    os.makedirs(path, exist_ok=True)

In [None]:
def scale_func(train, val, path_scale):
    """Min-max scale every column and persist the fitted scalers.

    One MinMaxScaler per column of `train` is fitted on the train values and
    then applied to the same column of `val`. Each scaler is pickled to
    ``path_scale + <column> + '.pkl'``.

    train, val: DataFrames; `val` must contain every column of `train`.
    path_scale: directory prefix, created if missing. It is concatenated as
        is, so it has to end with a path separator for the files to land
        inside that directory.

    Returns (train_scaled, val_scaled), new DataFrames on the original indexes.
    """
    get_or_create_path(path_scale)
    train_scale_df = pd.DataFrame(index=train.index)
    val_scale_df = pd.DataFrame(index=val.index)

    for col in train.columns:
        scaler = MinMaxScaler()
        train_scale_df[col] = scaler.fit_transform(
            train[col].values.reshape(-1, 1))[:, 0]
        val_scale_df[col] = scaler.transform(
            val[col].values.reshape(-1, 1))[:, 0]
        # Overwrite ('wb') rather than append: with an appending mode every
        # re-run adds another pickle to the file and pickle.load() keeps
        # returning the first, stale scaler. `with` also closes the handle.
        with open(path_scale + col + '.pkl', 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
    return train_scale_df, val_scale_df

In [None]:
def BO_result(bo_model):
    """Translate the optimiser's best parameters into model settings.

    bo_model: object whose ``max['params']`` mapping holds the float values
        'neurons', 'activation', 'optimizer' and 'number_features'.

    Every value is rounded to the nearest integer; the activation and
    optimizer integers are then looked up in `activation_list` and
    `optimizer_list`.

    Returns a dict with keys 'activations', 'optimizers', 'neurons' and
    'num_feature'.
    """
    best = bo_model.max['params']
    activation_idx = round(best['activation'])
    optimizer_idx = round(best['optimizer'])

    summary = dict()
    summary['activations'] = activation_list[activation_idx]
    summary['optimizers'] = optimizer_list[optimizer_idx]
    summary['neurons'] = round(best['neurons'])
    summary['num_feature'] = round(best['number_features'])
    return summary

In [None]:
# Main entry point
def LSTM_BO(df, path_model, path_scale, path_feature):
    """Tune, train and save a bidirectional-LSTM model with Bayesian optimisation.

    Steps:
      1. split `df` 80/20 into train/validation and min-max scale both
         (scalers are pickled under `path_scale`);
      2. build lagged features with make_data_supervised();
      3. search neurons, activation, optimizer and number of features with
         BayesianOptimization (10 initial points + 25 iterations), maximising
         the negated best validation MAE of a short training run;
      4. keep the top-scoring features, pickle their names to
         ``path_feature + 'best_feature.pkl'``, train the final model with
         make_model() and save it to `path_model`.

    df: time-indexed DataFrame. The first lagged column is used as the
        target, so 'TotW' is expected to be the first column of `df`
        - TODO confirm against the data file.
    path_model: file the trained model is saved to.
    path_scale, path_feature: prefixes (concatenated as is) for the scaler and
        feature-list pickles.

    Returns (bo_optimize, model).
    """
    # Objective function for the optimiser
    def evaluate(neurons, activation, optimizer, number_features):
        # The optimiser proposes floats; every parameter here is discrete.
        neurons = round(neurons)
        activation = round(activation)
        optimizer = round(optimizer)
        number_features = round(number_features)
        print(neurons, activation, optimizer, number_features)

        selector = SelectKBest(score_func=f_regression, k=number_features)
        train_X_ = selector.fit_transform(train_features, train_y)
        val_X_ = selector.transform(val_features)
        # (samples, 1 time step, features) as expected by the LSTM layer
        train_X_ = train_X_.reshape(train_X_.shape[0], 1, train_X_.shape[1])
        val_X_ = val_X_.reshape(val_X_.shape[0], 1, val_X_.shape[1])
        print('\nNumber of neurons: ', neurons,
              ', activation function: ', activation_list[activation],
              ', optimizer function: ', optimizer_list[optimizer],
              ', features: ', number_features)

        # Drop the graph of the previous trial before building a new model.
        clear_session()
        model = Sequential()
        model.add(Bidirectional(
            LSTM(neurons,
                 activation=activation_list[activation],
                 input_shape=(train_X_.shape[1], train_X_.shape[2])),
            merge_mode='ave'))
        model.add(Dense(1))
        model.compile(loss='mae', optimizer=optimizer_list[optimizer])
        history = model.fit(train_X_, train_y,
                            validation_data=(val_X_, val_y),
                            callbacks=[EarlyStopping(monitor='val_loss',
                                                     patience=15,
                                                     restore_best_weights=True)],
                            epochs=100, batch_size=50, verbose=0,
                            shuffle=False)
        best_val_loss = min(history.history['val_loss'])
        print('val_loss: ', best_val_loss)
        # The optimiser maximises, so return the negated loss.
        return -best_val_loss

    train_solar, val_solar = train_valid_split(df, split_ratio=[0.8, 0.2])  # split train/val
    train_scale, val_scale = scale_func(train_solar, val_solar, path_scale)

    train_scaler_lag = make_data_supervised(train_scale)
    val_scaler_lag = make_data_supervised(val_scale)

    # Use the train column order for both frames.
    col_analysis = list(train_scaler_lag)
    train_solar = train_scaler_lag[col_analysis].copy()
    val_solar = val_scaler_lag[col_analysis].copy()

    featureScores = select_Kbest(train_solar, f_regression)

    # Column 0 is the target, the remaining columns are candidate features.
    train_features, train_y = train_solar.values[:, 1:], train_solar.values[:, :1]
    val_features, val_y = val_solar.values[:, 1:], val_solar.values[:, :1]

    pbounds = {'neurons': (8, 100),
               'activation': (0, 3),
               'optimizer': (0, 3),
               'number_features': (1, train_features.shape[1])}
    bo_optimize = BayesianOptimization(f=evaluate,
                                       pbounds=pbounds,
                                       random_state=3)
    bo_optimize.maximize(init_points=10, n_iter=25)
    print(bo_optimize.max)

    bo_result = BO_result(bo_optimize)
    n_best = round(bo_optimize.max['params']['number_features'])
    best_feature = list(featureScores['Specs'].iloc[:n_best])
    # `with` closes the file, so the pickle is fully flushed to disk.
    with open(path_feature + 'best_feature.pkl', 'wb') as feature_file:
        pickle.dump(best_feature, feature_file)

    train_solar = train_scaler_lag[['TotW(t)'] + best_feature]
    val_solar = val_scaler_lag[['TotW(t)'] + best_feature]

    model = make_model(train_solar, val_solar, bo_result)
    model.save(path_model)

    return bo_optimize, model

In [None]:
# Run the full tuning + training pipeline; the model, the scalers and the
# feature list are all written under './LN2/'.
bo_optimize, model = LSTM_BO(
    df=all_plant,
    path_model='./LN2/4H.h5',
    path_scale='./LN2/',
    path_feature='./LN2/',
)