In [None]:
import numpy as np
import pandas as pd
import os
import pickle

import matplotlib.dates as mdates
from datetime import time
xformatter = mdates.DateFormatter('%H:%M')  # for time axis plots
import datetime
from dateutil.parser import parse
from sklearn.metrics import mean_squared_error

%matplotlib inline
from matplotlib import style
import matplotlib.pyplot as plt
style.use('seaborn-whitegrid')


from pandas.tseries.frequencies import to_offset
from pickle import load,dump
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression
from keras.models import Sequential,tf
from keras.layers import Dense
from keras.layers import LSTM,Bidirectional, BatchNormalization, Dropout, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping
from geneticalgorithm import geneticalgorithm as ga
from keras.backend import clear_session
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Timestamp string layout: year-month-day hour:minute:second.
FORMAT_TIME='%Y-%m-%d %H:%M:%S'
# 05:00:00 and 18:30:00 -- presumably the daylight window of interest (TODO confirm;
# neither constant is used in the visible cells).
START_TIME_DAY = time(5,0,0)
END_TIME_DAY = time(18,30,0)
TIME_1_STEP = 15 # minute
# Number of TIME_1_STEP-minute steps in 24 hours (96) and in 2 hours (8).
step_lag_1_day = 24*60//TIME_1_STEP
steps_2hours = 60*2//TIME_1_STEP

In [None]:
# Search space for the genetic algorithm (GA).
# Integer genes index into these two lists (see `evaluate` and `GA_result`).
activation_list = ['linear','relu', 'tanh','sigmoid']
optimizer_list = ['adam', 'rmsprop','sgd','Adamax']
# [lower, upper] bounds used to build `varbound` for the LSTM units and Conv1D filters.
range_neurons_lstm = [8,100]
range_neurons_cnn = [8,100]
selectors = []  # not referenced by any visible cell
# [lower, upper] bounds for the MaxPooling1D pool size and the Conv1D kernel size.
range_pool_size = [1,2]
range_kernel_size = [1,3]
# Settings handed to `geneticalgorithm` through its `algorithm_parameters` argument.
# NOTE(review): 5 iterations x 5 individuals looks like a smoke-test budget -- confirm
# before relying on the selected hyper-parameters.
algorithm_param = {
    'max_num_iteration': 5,
    'population_size': 5,
    'mutation_probability': 0.1,
    'elit_ratio': 0.2,
    'crossover_probability': 0.5,
    'parents_portion': 0,
    'crossover_type': 'uniform',
    'max_iteration_without_improv': 2
}

In [None]:
# Load the training data
def import_train_data(path_file_train):
    """Read the training CSV and index it by its parsed ``TimeStamp`` column.

    Parameters
    ----------
    path_file_train : str or path-like
        Location of a CSV file that contains a ``TimeStamp`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``TimeStamp`` converted by ``pd.to_datetime``
        and moved from the columns into the index.
    """
    raw = pd.read_csv(path_file_train)
    parsed = raw.assign(TimeStamp=pd.to_datetime(raw['TimeStamp']))
    return parsed.set_index('TimeStamp')

In [None]:
def resample_df(df, resample_time, time_col='TimeStamp'):
    """Down-sample ``df`` to ``resample_time``-minute means, labelled by right bin edge.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified. The timestamps may live
        either in the index or in the ``time_col`` column.
    resample_time : int
        Bin width in minutes.
    time_col : str, default 'TimeStamp'
        Column holding the timestamps. When no such column exists the index
        is used instead, which is the case for frames returned by
        ``import_train_data``.

    Returns
    -------
    pandas.DataFrame
        Per-bin means, indexed by the right edge of each bin.

    Notes
    -----
    For bins of 30 minutes or more the timestamps are first shifted back by
    half a bin before resampling. For narrower bins the frame is resampled on
    its existing index, so that index must already be datetime-like.
    """
    resampled = df.copy()
    if resample_time >= 30:
        half_bin = to_offset(str(resample_time // 2) + "min")
        if time_col in resampled.columns:
            resampled = resampled.set_index(resampled[time_col] - half_bin)
        else:
            # Timestamps are already the index: shift it in place instead of
            # looking up a column that does not exist (previously a KeyError).
            resampled.index = resampled.index - half_bin
    return resampled.resample(str(resample_time) + 'min', label='right').mean()

In [None]:
# Load the training CSV (indexed by TimeStamp), then replace it with its 15-minute
# right-labelled bin means. NOTE(review): `all_plant` is rebound in place, so the raw
# frame is no longer available after this cell.
all_plant = import_train_data('./LN2_training.csv')
all_plant = resample_df(all_plant, resample_time = 15, time_col='TimeStamp')

In [None]:
# Split the data into train and validation sets (80/20 by default)
def train_valid_split(df, split_ratio=(0.8, 0.2)):
    """Split ``df`` chronologically into a leading train part and a trailing validation part.

    Parameters
    ----------
    df : sliceable with ``len`` (e.g. pandas.DataFrame)
        Data in time order; rows are not shuffled.
    split_ratio : sequence of two floats, default (0.8, 0.2)
        ``(train_ratio, valid_ratio)``; the two values must sum to 1.

    Returns
    -------
    tuple
        ``(train, val)`` where ``train`` is the first
        ``int(len(df) * train_ratio)`` rows and ``val`` is everything after it.

    Raises
    ------
    AssertionError
        If the two ratios do not sum to 1.
    """
    train_ratio, valid_ratio = split_ratio
    # Compare with a tolerance: an exact float `== 1.0` can reject ratios that
    # sum to 1 on paper but not in binary floating point.
    assert abs(train_ratio + valid_ratio - 1.0) < 1e-9, 'split_ratio must sum to 1'

    train_split = int(len(df) * train_ratio)

    train = df[:train_split]
    # The ratios sum to 1, so the validation set is simply the remainder. Slicing to
    # the end avoids losing the last row when the float sum lands just below 1.
    val = df[train_split:]

    print(f'Train set: {len(train)} ')
    print(f'Validation set: {len(val)} ')

    return train, val

In [None]:
def num_step_1hour(df):
    """Return how many rows of ``df`` make up one hour.

    The step length is taken from the gap between the first two index
    entries, truncated to whole minutes, and divided into 60.

    Returns ``None`` when the index is not a ``DatetimeIndex``.
    """
    if type(df.index) != pd.core.indexes.datetimes.DatetimeIndex:
        return None
    one_minute = np.timedelta64(1, 'm')
    first_gap = df.index[1] - df.index[0]
    minutes_per_step = int(first_gap / one_minute)
    return 60 // minutes_per_step

In [None]:
def make_data_supervised(dt, num_pre_around=5, num_day_pre=3):
    """Turn a time-indexed frame into a supervised-learning table of lagged columns.

    For every column ``c`` of ``dt`` the result holds, in this order:

    * ``c(t)`` and the recent lags ``c(t-1)`` ... ``c(t-num_pre_around)``;
    * for each whole day ``d`` in ``1 .. num_day_pre - 1``, the lags from
      ``d`` days back minus ``num_pre_around`` steps to ``d`` days back plus
      ``num_pre_around`` steps.

    Rows containing any missing lag are dropped, then the calendar columns
    ``hour``, ``day``, ``day_of_week``, ``month`` and ``day_of_year`` derived
    from the index are appended.

    The number of steps per day comes from ``num_step_1hour(dt) * 24``.
    """
    steps_per_day = num_step_1hour(dt) * 24

    # Collect columns in insertion order; writing a name twice keeps its first position.
    lagged = {}
    for col in dt.columns:
        series = dt[col]
        for back in range(num_pre_around + 1):
            if back == 0:
                lagged[col + '(t)'] = series
            else:
                lagged[col + '(t-' + str(back) + ')'] = series.shift(back)
        for day in range(1, num_day_pre):
            centre = steps_per_day * day
            for offset in range(num_pre_around + 1):
                for shift_by in (centre + offset, centre - offset):
                    lagged[col + '(t-' + str(shift_by) + ')'] = series.shift(shift_by)

    dt_lag = pd.DataFrame(lagged)
    dt_lag = dt_lag.dropna()

    calendar_fields = (
        ('hour', 'hour'),
        ('day', 'day'),
        ('day_of_week', 'dayofweek'),
        ('month', 'month'),
        ('day_of_year', 'dayofyear'),
    )
    for column_name, index_attr in calendar_fields:
        dt_lag[column_name] = getattr(dt_lag.index, index_attr)
    return dt_lag

In [None]:
def select_Kbest(train,score_f):
    """Score every feature column against ``TotW(t)`` and rank them, best first.

    Parameters
    ----------
    train : pandas.DataFrame
        All columns except the first are treated as features; the target is
        the ``TotW(t)`` column.
    score_f : callable
        Scoring function passed to ``SelectKBest`` as ``score_func``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Specs`` (feature name) and ``Score``, sorted by descending score.
    """
    features = train[train.columns[1:]]
    target = train['TotW(t)']
    fitted = SelectKBest(score_func=score_f, k='all').fit(features, target)

    names_frame = pd.DataFrame(features.columns)
    scores_frame = pd.DataFrame(fitted.scores_)
    ranking = pd.concat([names_frame, scores_frame], axis=1)
    ranking.columns = ['Specs', 'Score']
    return ranking.sort_values(by='Score', ascending=False)

In [None]:
def make_model(train, val, GA_result):
    """Build, train and return the final Conv1D + bidirectional-LSTM model.

    Parameters
    ----------
    train, val : pandas.DataFrame
        The first column is used as the target, all remaining columns as features.
    GA_result : dict
        Hyper-parameters with the keys ``'neurons_cnn'``, ``'kernel_size'``,
        ``'activations'``, ``'pool_size'``, ``'neurons_lstm'`` and ``'optimizers'``.

    Returns
    -------
    The fitted Keras model. The train/validation MAE curves are plotted as a side effect.

    Notes
    -----
    Relies on the notebook-level ``window_size`` and ``create_window_data``,
    which must be defined before this function is called.
    """
    train_X, train_y = train.values[:, 1:], train.values[:, :1]
    val_X, val_y = val.values[:, 1:], val.values[:, :1]
    # Windows of `window_size` consecutive rows, each paired with the target of the next row.
    train_X, train_y = create_window_data(train_X, train_y, window_size)
    val_X, val_y = create_window_data(val_X, val_y, window_size)

    model = Sequential()
    model.add(Conv1D(filters=GA_result['neurons_cnn'],
                     kernel_size=GA_result['kernel_size'],
                     padding='valid',
                     activation=GA_result['activations'],
                     input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(MaxPooling1D(pool_size=GA_result['pool_size'], padding='SAME'))
    model.add(Flatten())
    # Feed the flattened convolution output to the LSTM as a single time step.
    model.add(tf.keras.layers.Reshape((1, model.output_shape[1])))
    model.add(
        Bidirectional(LSTM(GA_result['neurons_lstm'],
                           activation=GA_result['activations'],
                           return_sequences=True)))
    model.add(Dense(1))
    # NOTE(review): 'cosine_proximity' may not be a recognised metric name in every
    # Keras release -- confirm against the installed version.
    model.compile(loss='mae',
                  optimizer=GA_result['optimizers'],
                  metrics=['mse', 'mae', 'cosine_proximity'])
    # fit network; `callbacks` is documented as a list of callbacks, so wrap the
    # single EarlyStopping instance instead of passing it bare.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=15,
                                   restore_best_weights=True)
    history = model.fit(train_X,
                        train_y,
                        epochs=100,
                        batch_size=50,
                        validation_data=(val_X, val_y),
                        verbose=1,
                        shuffle=False,
                        callbacks=[early_stopping])
    fig, ax = plt.subplots(figsize=(16, 8))
    fig.suptitle('Loss', y=0.93)
    ax.plot(history.history['mae'], label='train')
    ax.plot(history.history['val_mae'], label='val')
    ax.set_title('mae')
    ax.legend(loc='upper right')
    return model

In [None]:
def get_or_create_path(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. Returns ``None``.

    Raises
    ------
    OSError
        If ``path`` cannot be created, e.g. because a regular file with that
        name already exists.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the race between
    # checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)

In [None]:
def scale_func(train, val, path_scale):
    """Min-max scale every column and persist one fitted scaler per column.

    Each scaler is fitted on the ``train`` column only and then applied to the
    matching ``val`` column.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames sharing the same columns.
    path_scale : str
        Directory prefix for the pickled scalers. The file name is built by
        plain concatenation (``path_scale + col + '.pkl'``), so the prefix
        should end with a path separator.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scale_df, val_scale_df)`` with the original indexes.
    """
    get_or_create_path(path_scale)
    train_scale_df = pd.DataFrame(index=train.index)
    val_scale_df = pd.DataFrame(index=val.index)

    for col in train.columns:
        scaler = MinMaxScaler()
        train_scale_df[col] = scaler.fit_transform(train[col].values.reshape(-1,1))[:,0]
        val_scale_df[col] = scaler.transform(val[col].values.reshape(-1,1))[:,0]
        # Overwrite rather than append: in append mode every re-run stacked another
        # pickle in the file and pickle.load kept returning the oldest scaler.
        # The context manager also closes the handle, which was previously leaked.
        with open(path_scale + col+'.pkl', 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
    return train_scale_df,val_scale_df

In [None]:
def GA_result(ga_model):
    """Decode the best GA chromosome into a hyper-parameter dictionary.

    The seven entries of ``ga_model.best_variable`` are truncated to ``int``
    and interpreted, in order, as: LSTM units, CNN filters, index into
    ``activation_list``, index into ``optimizer_list``, pool size, kernel size
    and number of features.

    Returns
    -------
    dict
        Keys ``'activations'``, ``'optimizers'``, ``'neurons_lstm'``,
        ``'neurons_cnn'``, ``'pool_size'``, ``'kernel_size'`` and ``'num_feature'``.
    """
    (lstm_units, cnn_filters, activation_idx, optimizer_idx,
     pool, kernel, feature_count) = (
        int(ga_model.best_variable[position]) for position in range(7))

    decoded = {}
    decoded['activations'] = activation_list[activation_idx]
    decoded['optimizers'] = optimizer_list[optimizer_idx]
    decoded['neurons_lstm'] = lstm_units
    decoded['neurons_cnn'] = cnn_filters
    decoded['pool_size'] = pool
    decoded['kernel_size'] = kernel
    decoded['num_feature'] = feature_count
    return decoded

In [None]:
def get_or_create_path(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. Returns ``None``.

    Raises
    ------
    OSError
        If ``path`` cannot be created, e.g. because a regular file with that
        name already exists.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the race between
    # checking for the directory and creating it.
    os.makedirs(path, exist_ok=True)

In [None]:
def scale_func(train, val, path_scale):
    """Min-max scale every column and persist one fitted scaler per column.

    Each scaler is fitted on the ``train`` column only and then applied to the
    matching ``val`` column.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames sharing the same columns.
    path_scale : str
        Directory prefix for the pickled scalers. The file name is built by
        plain concatenation (``path_scale + col + '.pkl'``), so the prefix
        should end with a path separator.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scale_df, val_scale_df)`` with the original indexes.
    """
    get_or_create_path(path_scale)
    train_scale_df = pd.DataFrame(index=train.index)
    val_scale_df = pd.DataFrame(index=val.index)

    for col in train.columns:
        scaler = MinMaxScaler()
        train_scale_df[col] = scaler.fit_transform(train[col].values.reshape(-1,1))[:,0]
        val_scale_df[col] = scaler.transform(val[col].values.reshape(-1,1))[:,0]
        # Overwrite rather than append: in append mode every re-run stacked another
        # pickle in the file and pickle.load kept returning the oldest scaler.
        # The context manager also closes the handle, which was previously leaked.
        with open(path_scale + col+'.pkl', 'wb') as scaler_file:
            pickle.dump(scaler, scaler_file)
    return train_scale_df,val_scale_df

In [None]:
# 1) Chronological 80/20 split, 2) per-column min-max scaling (scalers pickled under ./LN2/).
train_solar, val_solar = train_valid_split(all_plant,split_ratio=[0.8, 0.2]) # split train/val
train_scale, val_scale = scale_func(train_solar, val_solar, './LN2/')

# 3) Build the lagged supervised tables separately for train and validation.
train_scaler_lag = make_data_supervised(train_scale)
val_scaler_lag = make_data_supervised(val_scale)

# `col_analysis` is every lagged column, so these are plain copies of the lagged frames.
# NOTE(review): `train_solar` / `val_solar` are rebound here from the raw split to the
# lagged, scaled frames.
col_analysis = list(train_scaler_lag)
train_solar = train_scaler_lag[col_analysis].copy()
val_solar = val_scaler_lag[col_analysis].copy()

# Rank every feature column against 'TotW(t)' with f_regression.
featureScores = select_Kbest(train_solar[col_analysis],f_regression)
# featureScores.to_csv('./LN2_4H_kbest.csv')

# Column 0 is taken as the target and the rest as features -- assumes 'TotW(t)' is the
# first lagged column (TODO confirm against the CSV column order).
train_X, train_y= train_solar.values[:, 1:],train_solar.values[:, :1]
# train_X = train_X.reshape(train_X.shape[0] ,1, train_X.shape[1])
val_X, val_y= val_solar.values[:, 1:],val_solar.values[:, :1]
# val_X = val_X.reshape(val_X.shape[0], 1, val_X.shape[1])

In [None]:
def create_window_data(train_X, train_y, window_size):
    """
    Build sliding windows for time-series forecasting with the CNN-LSTM model.

    Parameters:
    train_X: ndarray, inputs of the time series
    train_y: ndarray, outputs of the time series
    window_size: int, number of consecutive rows per window

    Returns:
    X_window: ndarray, the inputs cut into windows of `window_size` rows
    y_window: ndarray, for each window the output of the row right after it
    """
    window_starts = range(len(train_X) - window_size)
    X_window = [train_X[start:start + window_size] for start in window_starts]
    y_window = [train_y[start + window_size] for start in window_starts]
    return np.array(X_window), np.array(y_window)

In [None]:
# Number of consecutive rows per sliding window; read as a global by `evaluate` and `make_model`.
window_size = 5

In [None]:
# Number of candidate feature columns; also the upper bound of the feature-count gene in `varbound`.
train_X.shape[1]

In [None]:
def evaluate(gene):
    """GA fitness function: train one CNN + BiLSTM candidate and return its best val loss.

    Parameters
    ----------
    gene : sequence of 7 numbers
        Truncated to ``int`` and read, in order, as: LSTM units, Conv1D
        filters, index into ``activation_list``, index into ``optimizer_list``,
        pool size, kernel size and number of features kept by ``SelectKBest``.

    Returns
    -------
    float
        Minimum validation MAE observed during training (lower is better).

    Notes
    -----
    Reads the notebook-level ``train_X``, ``train_y``, ``val_X``, ``val_y``
    and ``window_size``; none of them is modified.
    """
    g = [int(i) for i in gene]
    neurons_lstm = g[0]
    neurons_cnn = g[1]
    activation = activation_list[g[2]]
    optimizer = optimizer_list[g[3]]
    pool_size = g[4]
    kernel_size = g[5]
    num_feature = g[6]

    # Keep the `num_feature` best-scoring features. The selector is fitted on the
    # training data only; f_regression expects a 1-D target, hence ravel().
    selector = SelectKBest(score_func=f_regression, k=num_feature)
    train_X_ = selector.fit_transform(train_X, train_y.ravel())
    val_X_ = selector.transform(val_X)

    # Window the SELECTED features. Windowing the unselected globals, as before,
    # discarded the selection and made the feature-count gene irrelevant.
    train_X_, train_y_ = create_window_data(train_X_, train_y, window_size)
    val_X_, val_y_ = create_window_data(val_X_, val_y, window_size)

    print('\nNumber of neurons LSTM: ', neurons_lstm,
            ', Number of neurons CNN:',neurons_cnn,
            ', activation function: ', activation,
            ', optimizer function: ', optimizer,
            ', features: ', num_feature,
            ', pool_size:', pool_size,
            ', kernel_size', kernel_size)

    # Drop the previous candidate's graph/state before building the next model.
    clear_session()
    model = Sequential()
    model.add(Conv1D(filters=neurons_cnn, kernel_size=kernel_size, activation=activation,
                     padding='valid',
                     input_shape=(train_X_.shape[1], train_X_.shape[2])))
    model.add(MaxPooling1D(pool_size=pool_size, padding='SAME'))
    model.add(Flatten())
    print(model.output_shape)
    # Feed the flattened convolution output to the LSTM as a single time step.
    model.add(tf.keras.layers.Reshape((1, model.output_shape[1])))
    model.add(Bidirectional(LSTM(neurons_lstm, activation=activation,
                                 return_sequences=True)))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer=optimizer)

    # `callbacks` is documented as a list of callbacks, so wrap the single instance.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=15,
                                   restore_best_weights=True)
    history = model.fit(train_X_, train_y_,
                        validation_data=(val_X_, val_y_),
                        callbacks=[early_stopping],
                        epochs=100, batch_size=50, verbose=0, shuffle=False)
    best_val_loss = min(history.history['val_loss'])
    print('val_loss: ', best_val_loss)
    return best_val_loss
    
        
    

In [None]:
# One [lower, upper] row per gene, in the order `evaluate` decodes them:
# LSTM units, CNN filters, activation index, optimizer index, pool size, kernel size,
# number of selected features.
# NOTE(review): the [0, 3] bounds are hard-coded and must be kept in step with the
# lengths of `activation_list` and `optimizer_list`.
varbound = np.array([range_neurons_lstm, range_neurons_cnn,[0, 3], [0, 3] , 
                        range_pool_size,range_kernel_size ,[1, train_X.shape[1]]])
# Every fitness evaluation trains a full Keras model, hence the large per-call timeout.
ga_model = ga(function=evaluate,
            dimension=7,
            variable_type='int',
            function_timeout=10000,
            variable_boundaries=varbound,
            convergence_curve=False,
            algorithm_parameters=algorithm_param)
ga_model.run()

In [None]:
# Decode the best chromosome found by the GA into a hyper-parameter dictionary.
ga_result = GA_result(ga_model)

In [None]:
# Display the selected hyper-parameters.
ga_result

In [None]:
# Names of the top `num_feature` features; `featureScores` is already sorted best-first.
best_feature = list(featureScores['Specs'][:ga_result['num_feature']])

# Target first, then the selected features: `make_model` treats column 0 as the target.
train_solar = train_scaler_lag[['TotW(t)']+best_feature]
val_solar = val_scaler_lag[['TotW(t)']+best_feature]

In [None]:
# Train the final model with the GA-selected hyper-parameters and features.
model = make_model(train_solar,val_solar,ga_result)