訓練模型

In [30]:
#%%
import tensorflow as tf
from keras.models import Sequential
from keras.models import load_model, save_model
from keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import ReduceLROnPlateau, Callback, EarlyStopping
from keras import regularizers

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, MaxAbsScaler
from sklearn.compose import ColumnTransformer

import joblib
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import os

#載入訓練資料
device = 'L1'
SourceData = pd.read_csv(f"..//Data//MergedSorted//{device}_Merged_Sorted.csv")


In [31]:
SourceData.columns.to_list()

['DateTime',
 'SeqNumber',
 'Device_ID',
 'Year',
 'Month',
 'Day',
 'Hour',
 'Minute',
 'Avg_Temperature(°C)',
 'Avg_Humidity(%)',
 'Avg_Sunlight(Lux)',
 'Avg_Power(mW)',
 'Avg_Diff_Temperature(°C)',
 'Avg_Diff_Humidity(%)',
 'Avg_Diff_Sunlight(Lux)',
 'Avg_Diff_Power(mW)',
 'Avg_Lag_1_Temperature(°C)',
 'Avg_Lag_2_Temperature(°C)',
 'Avg_Lag_1_Humidity(%)',
 'Avg_Lag_2_Humidity(%)',
 'Avg_Lag_1_Sunlight(Lux)',
 'Avg_Lag_2_Sunlight(Lux)',
 'Avg_Lag_1_Power(mW)',
 'Avg_Lag_2_Power(mW)',
 'Max_Temperature(°C)',
 'Max_Humidity(%)',
 'Max_Sunlight(Lux)',
 'Max_Power(mW)',
 'Max_Diff_Temperature(°C)',
 'Max_Diff_Humidity(%)',
 'Max_Diff_Sunlight(Lux)',
 'Max_Diff_Power(mW)',
 'Max_Lag_1_Temperature(°C)',
 'Max_Lag_2_Temperature(°C)',
 'Max_Lag_1_Humidity(%)',
 'Max_Lag_2_Humidity(%)',
 'Max_Lag_1_Sunlight(Lux)',
 'Max_Lag_2_Sunlight(Lux)',
 'Max_Lag_1_Power(mW)',
 'Max_Lag_2_Power(mW)',
 'Min_Temperature(°C)',
 'Min_Humidity(%)',
 'Min_Sunlight(Lux)',
 'Min_Power(mW)',
 'Min_Diff_Temperatu

In [13]:
one_hot_encode_features = [
    # 'Device_ID',
    # 'Year',
    # 'Month',
    # 'Day',
    # 'Hour',
    # 'Minute',
]

input_features_model_1 = to_predict_features_model_1 = [
    'Avg_Temperature(°C)',
    'Avg_Humidity(%)',
    'Avg_Sunlight(Lux)',
    'Avg_Power(mW)',
    'Avg_Diff_Temperature(°C)',
    'Avg_Diff_Humidity(%)',
    'Avg_Diff_Sunlight(Lux)',
    'Avg_Diff_Power(mW)',
    'Avg_Lag_1_Temperature(°C)',
    'Avg_Lag_2_Temperature(°C)',
    'Avg_Lag_1_Humidity(%)',
    'Avg_Lag_2_Humidity(%)',
    'Avg_Lag_1_Sunlight(Lux)',
    'Avg_Lag_2_Sunlight(Lux)',
    'Avg_Lag_1_Power(mW)',
    'Avg_Lag_2_Power(mW)',
    'Max_Temperature(°C)',
    'Max_Humidity(%)',
    'Max_Sunlight(Lux)',
    'Max_Power(mW)',
    'Max_Diff_Temperature(°C)',
    'Max_Diff_Humidity(%)',
    'Max_Diff_Sunlight(Lux)',
    'Max_Diff_Power(mW)',
    'Max_Lag_1_Temperature(°C)',
    'Max_Lag_2_Temperature(°C)',
    'Max_Lag_1_Humidity(%)',
    'Max_Lag_2_Humidity(%)',
    'Max_Lag_1_Sunlight(Lux)',
    'Max_Lag_2_Sunlight(Lux)',
    'Max_Lag_1_Power(mW)',
    'Max_Lag_2_Power(mW)',
    'Min_Temperature(°C)',
    'Min_Humidity(%)',
    'Min_Sunlight(Lux)',
    'Min_Power(mW)',
    'Min_Diff_Temperature(°C)',
    'Min_Diff_Humidity(%)',
    'Min_Diff_Sunlight(Lux)',
    'Min_Diff_Power(mW)',
    'Min_Lag_1_Temperature(°C)',
    'Min_Lag_2_Temperature(°C)',
    'Min_Lag_1_Humidity(%)',
    'Min_Lag_2_Humidity(%)',
    'Min_Lag_1_Sunlight(Lux)',
    'Min_Lag_2_Sunlight(Lux)',
    'Min_Lag_1_Power(mW)',
    'Min_Lag_2_Power(mW)',
    'MA6_Temperature(°C)',
    'MA6_Max_Temperature(°C)',
    'MA6_Min_Temperature(°C)',
    'MA6_Humidity(%)',
    'MA6_Max_Humidity(%)',
    'MA6_Min_Humidity(%)',
    'MA6_Sunlight(Lux)',
    'MA6_Max_Sunlight(Lux)',
    'MA6_Min_Sunlight(Lux)',
    'MA6_Power(mW)',
    'MA6_Max_Power(mW)',
    'MA6_Min_Power(mW)',
    'MA6_Diff_Temperature(°C)',
    'MA6_Max_Diff_Temperature(°C)',
    'MA6_Min_Diff_Temperature(°C)',
    'MA6_Diff_Humidity(%)',
    'MA6_Max_Diff_Humidity(%)',
    'MA6_Min_Diff_Humidity(%)',
    'MA6_Diff_Sunlight(Lux)',
    'MA6_Max_Diff_Sunlight(Lux)',
    'MA6_Min_Diff_Sunlight(Lux)',
    'MA6_Diff_Power(mW)',
    'MA6_Max_Diff_Power(mW)',
    'MA6_Min_Diff_Power(mW)',
    'MA6_Lag_1_Temperature(°C)',
    'MA6_Max_Lag_1_Temperature(°C)',
    'MA6_Min_Lag_1_Temperature(°C)',
    'MA6_Lag_2_Temperature(°C)',
    'MA6_Max_Lag_2_Temperature(°C)',
    'MA6_Min_Lag_2_Temperature(°C)',
    'MA6_Lag_1_Humidity(%)',
    'MA6_Max_Lag_1_Humidity(%)',
    'MA6_Min_Lag_1_Humidity(%)',
    'MA6_Lag_2_Humidity(%)',
    'MA6_Max_Lag_2_Humidity(%)',
    'MA6_Min_Lag_2_Humidity(%)',
    'MA6_Lag_1_Sunlight(Lux)',
    'MA6_Max_Lag_1_Sunlight(Lux)',
    'MA6_Min_Lag_1_Sunlight(Lux)',
    'MA6_Lag_2_Sunlight(Lux)',
    'MA6_Max_Lag_2_Sunlight(Lux)',
    'MA6_Min_Lag_2_Sunlight(Lux)',
    'MA6_Lag_1_Power(mW)',
    'MA6_Max_Lag_1_Power(mW)',
    'MA6_Min_Lag_1_Power(mW)',
    'MA6_Lag_2_Power(mW)',
    'MA6_Max_Lag_2_Power(mW)',
    'MA6_Min_Lag_2_Power(mW)',
    'Sin_Hour',
    'Cos_Hour',
    'Sin_Minute',
    'Cos_Minute',
    'Sin_Month',
    'Cos_Month',
    'Sin_Day',
    'Cos_Day'
 ]
target_column = ['Avg_Power(mW)']
SourceData = SourceData[['SeqNumber'] + to_predict_features_model_1 + one_hot_encode_features]
SourceData = pd.get_dummies(SourceData, columns=one_hot_encode_features, dtype='int')

In [14]:
def create_dataset(data, LookBackNum):
    X = []
    y = []

    #設定每i-12筆資料(X_train)就對應到第i筆資料(y_train)
    for i in range(LookBackNum,len(data)):
        X.append(data[i-LookBackNum:i, :])
        y.append(data[i, :])

    return np.array(X), np.array(y)
  
#設定LSTM往前看的筆數和預測筆數
n_timesteps = LookBackNum = 12 #LSTM往前看的筆數，一筆10分鐘


preprocess_pipe = make_pipeline(
    MinMaxScaler(),
    # PCA(n_components=11),
)

SourceData_encode = SourceData.copy()
SourceData_encode.dropna(inplace=True)
#正規化
SourceData_encode[to_predict_features_model_1] = preprocess_pipe.fit_transform(SourceData_encode[to_predict_features_model_1])


X_train, _ = create_dataset(SourceData_encode.drop(columns='SeqNumber').values, LookBackNum=LookBackNum)
_, y_train = create_dataset(SourceData_encode[to_predict_features_model_1].values, LookBackNum=LookBackNum)

n_features = X_train.shape[2]
n_prediction = y_train.shape[1]

# Reshaping
#(samples 是訓練樣本數量,timesteps 是每個樣本的時間步長,features 是每個時間步的特徵數量)
X_train = np.reshape(X_train,(X_train.shape[0], n_timesteps, n_features))
X_train.shape

(8194, 12, 104)

In [None]:
#============================建置&訓練「LSTM模型」============================
#建置LSTM模型
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=15, 
    restore_best_weights=True
    )

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.5,     # 衰減率
    patience=10,    
    min_lr=1e-7
    )

def build_lstm_model(n_timesteps, n_features, n_prediction):
    model = Sequential()
    
    model.add(LSTM(units=256, return_sequences=True, activation='tanh',input_shape=(n_timesteps, n_features)))
    model.add(Dropout(0.2))

    
    # model.add(LSTM(units=256, return_sequences=True, activation='tanh'))
    # model.add(Dropout(0.2))
   
    model.add(LSTM(units=256, return_sequences=False, activation='tanh'))
    model.add(Dropout(0.2))


    model.add(Dense(units=128))
    # model.add(Dropout(0.2))

    
    model.add(Dense(units=n_prediction, activation='relu'))

    
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='mse',
        metrics=['mae', 'mse']
    )
    model.summary()
    return model

regressor = build_lstm_model(n_timesteps, n_features, n_prediction)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 12, 256)           369664    
                                                                 
 dropout_1 (Dropout)         (None, 12, 256)           0         
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 104)               13416     
                                                                 
Total params: 941,288
Trainable params: 941,288
Non-tr

In [16]:
#開始訓練

history = regressor.fit(
    X_train, 
    y_train, 
    epochs = 100, 
    batch_size = 32,
    validation_split=0.2,
    callbacks=[reduce_lr,early_stopping],
    )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [17]:
# import matplotlib.pyplot as plt


# train_loss = history.history['loss']
# val_loss = history.history['val_loss']


# plt.figure(figsize=(10, 6))
# plt.plot(train_loss, label='Train Loss', color='blue')
# plt.plot(val_loss, label='Validation Loss', color='orange')
# plt.title('Train Loss vs Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.show()

In [18]:
#保存模型
model_path = f'..//Model//WheatherLSTM_{device}.h5'
regressor.save(model_path)
print('Model Saved')

Model Saved


## 訓練迴歸模型

In [19]:
TrainData = pd.read_csv(f"..//Data//MergedSorted//{device}_Merged_Sorted.csv")
TrainData.dropna(inplace=True)

In [20]:
X_full = TrainData[input_features_model_1]
X_full[input_features_model_1] = preprocess_pipe.transform(X_full[input_features_model_1])

if 'Avg_Power(mW)' in input_features_model_1 :
    X_full = X_full.drop(columns='Avg_Power(mW)')
else:
    X_full = X_full
reg_scaler = MinMaxScaler()
X_full = reg_scaler.fit_transform(X_full.values)
y_full = TrainData['Avg_Power(mW)'].values

X_train, X_val, y_train, y_val = train_test_split(X_full,y_full,test_size=0.2,shuffle=True)

reg_model = make_pipeline(
    LinearRegression(),
)

cv_scores = cross_val_score(reg_model, X_train, y_train, cv=20)
cv_scores

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_full[input_features_model_1] = preprocess_pipe.transform(X_full[input_features_model_1])


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [21]:
reg_model.fit(X_train, y_train)

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = reg_model.predict(X_val)
y_pred = y_pred = np.clip(y_pred, 0, None)

print('MSE: ',mean_squared_error(y_val, y_pred))
print('MAE: ',mean_absolute_error(y_val, y_pred))
print('R2:',r2_score(y_val, y_pred))

MSE:  2.3319201889583334e-05
MAE:  0.002975639124123466
R2: 0.9999999999048782


In [23]:
reg_model.fit(X_full, y_full)

## 預測答案

In [24]:
#載入模型
model_path = f'..//Model//WheatherLSTM_{device}.h5'
model = load_model(model_path, compile=False)
print('Model Loaded Successfully')

Model Loaded Successfully


In [25]:
TestData = pd.read_csv('..//Data/TestData//upload(no answer).csv')

TestData = TestData[TestData['序號'] % 100 == int(device[1:])]

to_predict_sequmber = TestData['序號'].to_list()

# 預測的資料 的 index
indices_1 = SourceData[SourceData['SeqNumber'].isin(to_predict_sequmber)][to_predict_features_model_1].index.to_list()
len(indices_1)

1152

In [26]:
index_min = min(indices_1) - n_timesteps
index_max = max(indices_1)

indices_2 = SourceData.loc[index_min:index_max][to_predict_features_model_1].index.tolist()

# 找出有 NaN 的 row
rows_with_na = SourceData.loc[indices_2, to_predict_features_model_1].isnull().any(axis=1)
rows_with_na_data = SourceData.loc[indices_2, to_predict_features_model_1][rows_with_na]

# 有 NaN 的 row 的 index
indices_with_na =  rows_with_na_data.index.to_list()
len(indices_with_na)

4054

In [27]:
 # 如果 LookBackNum > 12 選 indices_with_na
 # 其餘選 indices_1
PredictedData = SourceData.copy()
indices_to_use = indices_with_na if LookBackNum > 12 else indices_1

for index in indices_to_use:
    X = PredictedData.loc[index-LookBackNum : index-1].drop(columns="SeqNumber")
    # if 'Avg_Power(mW)' in X.columns.to_list():
    #     X = X.drop(columns='Avg_Power(mW)')
    
    X[to_predict_features_model_1] = preprocess_pipe.transform(X[to_predict_features_model_1])
    X = X.values
    X = np.reshape(X,(1, n_timesteps, n_features))
    
    pred = model.predict(X)
    pred = preprocess_pipe.inverse_transform(pred)
    PredictedData.loc[index, to_predict_features_model_1] = pred
    PredictedData.loc[index, ['Avg_Power(mW)']] = PredictedData.loc[index, ['Avg_Power(mW)']].apply(lambda x: 0 if x <= 0 else x)
    
    # X = PredictedData.loc[index, to_predict_features_model_1].to_frame().T
    # X[to_predict_features_model_1] = preprocess_pipe.transform(X[to_predict_features_model_1])
    # if 'Avg_Power(mW)' in to_predict_features_model_1:
    #     X = X.drop(columns='Avg_Power(mW)')
    # else:
    #     X = X
    # X = reg_scaler.transform(X.values)
    # pred = reg_model.predict(X)
    # pred = pred[0]
    # pred = np.clip(pred, 0, None)
    # PredictedData.loc[index, 'Avg_Power(mW)'] = pred



In [28]:
i = 0
day = 48
PredictedData.loc[indices_1][(day*i):(day*(i+1))]

Unnamed: 0,SeqNumber,Avg_Temperature(°C),Avg_Humidity(%),Avg_Sunlight(Lux),Avg_Power(mW),Avg_Diff_Temperature(°C),Avg_Diff_Humidity(%),Avg_Diff_Sunlight(Lux),Avg_Diff_Power(mW),Avg_Lag_1_Temperature(°C),...,MA3_Max_Lag_2_Power(mW)_3,MA3_Min_Lag_2_Power(mW)_3,Sin_Hour,Cos_Hour,Sin_Minute,Cos_Minute,Sin_Month,Cos_Month,Sin_Day,Cos_Day
317,20240106090001,28.473604,51.486279,65694.960938,991.481689,0.155564,-0.388955,-83.391907,-3.830927,27.926376,...,1213.013184,508.312012,0.740411,-0.607318,-0.038402,1.012552,0.536738,0.841031,0.936173,0.282
318,20240106091001,28.814751,50.724808,66618.484375,996.249451,0.098568,-0.332644,141.566696,3.813333,28.372652,...,1269.318115,641.396179,0.712215,-0.655345,0.816282,0.538932,0.530407,0.834995,0.904285,0.264188
319,20240106092001,29.460173,49.047424,67946.960938,1010.746399,0.055462,-0.276521,200.773468,4.34965,29.086966,...,1278.682251,720.041748,0.72575,-0.66143,0.878559,-0.44412,0.520937,0.830901,0.879182,0.26606
320,20240106093001,30.187696,46.769062,70365.421875,1049.217041,0.025441,-0.231738,164.326294,3.275194,29.885406,...,1295.888672,782.491882,0.752851,-0.641428,0.038517,-1.0,0.511442,0.83404,0.870346,0.268871
321,20240106094001,30.721939,44.927608,72853.5,1089.662842,0.00754,-0.160952,116.823296,3.219523,30.533499,...,1315.595825,832.139465,0.779952,-0.637831,-0.866025,-0.545756,0.508445,0.833726,0.869129,0.268123
322,20240106095001,31.085222,43.719566,74702.648438,1112.010376,0.008962,-0.110463,113.354027,4.017045,30.973843,...,1321.844116,871.682129,0.704904,-0.682969,-0.866025,0.479887,0.510995,0.832711,0.855856,0.256727
323,20240106100001,31.35173,42.880493,75973.414062,1119.430176,0.018614,-0.11659,111.112122,5.633994,31.304111,...,1316.392578,905.927612,0.56312,-0.759167,-0.054391,0.99979,0.514657,0.831297,0.836822,0.23922
324,20240106101001,31.771671,41.77705,77467.390625,1137.263184,0.022518,-0.142734,62.424332,5.126468,31.753025,...,1318.421753,944.903625,0.501869,-0.813513,0.821798,0.55423,0.514099,0.826395,0.822365,0.230318
325,20240106102001,32.431503,40.082825,78993.875,1163.43396,0.011956,-0.147119,27.273134,2.315014,32.349567,...,1337.201294,988.637329,0.505087,-0.826906,0.881771,-0.444014,0.508473,0.826827,0.807354,0.233565
326,20240106103001,33.045021,38.290871,80639.4375,1196.47522,-0.005835,-0.118277,-7.611725,0.155839,32.918606,...,1369.973633,1024.401001,0.529955,-0.813794,0.023614,-1.0,0.503228,0.831799,0.801625,0.237361


In [29]:
PredictedData.loc[indices_1].to_csv(f'..//Data//PredictedData//Predicted_{device}.csv', index=False)
PredictedData.to_csv(f'..//Data//PredictedOverAllData//Predicted_OverAll_{device}.csv', index=False)