# Importing Librairies

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
import kerastuner as kt
from tensorflow.keras.layers import LSTM, Dense, GRU
import numpy as np # linear algebra
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

  from kerastuner.tuners import RandomSearch


# Loading Data

In [3]:
folder_path = 'data'
files_list = glob.glob(folder_path + '/*.xlsx')
dfs = []
for file in files_list:
    df = pd.read_excel(file, parse_dates=(['created_at']))
    dfs.append(df)
    
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.head()

Unnamed: 0,created_at,entry_id,field1
0,2018-02-02 04:59:00,17392,25.0
1,2018-02-02 04:59:19,17393,25.0
2,2018-02-02 04:59:38,17394,25.0
3,2018-02-02 04:59:58,17395,25.0
4,2018-02-02 05:00:17,17396,25.0


# Data Visualization

In [4]:
merged_df['created_at'] = pd.to_datetime(merged_df['created_at'], format='%d/%m/%Y %H:%M')
merged_df["year"] = merged_df["created_at"].dt.year
merged_df["month"] = merged_df["created_at"].dt.month
merged_df["day"] = merged_df["created_at"].dt.day
merged_df["hour"] = merged_df["created_at"].dt.hour
merged_df["minute"] = merged_df["created_at"].dt.minute
merged_df.head()

Unnamed: 0,created_at,entry_id,field1,year,month,day,hour,minute
0,2018-02-02 04:59:00,17392,25.0,2018,2,2,4,59
1,2018-02-02 04:59:19,17393,25.0,2018,2,2,4,59
2,2018-02-02 04:59:38,17394,25.0,2018,2,2,4,59
3,2018-02-02 04:59:58,17395,25.0,2018,2,2,4,59
4,2018-02-02 05:00:17,17396,25.0,2018,2,2,5,0


In [5]:
null_data = merged_df.isna().sum()
duplicate_rows = merged_df.duplicated()
print('Nulls', null_data)
print('Duplicates', duplicate_rows)

Nulls created_at    0
entry_id      0
field1        0
year          0
month         0
day           0
hour          0
minute        0
dtype: int64
Duplicates 0         False
1         False
2         False
3         False
4         False
          ...  
171692    False
171693    False
171694    False
171695    False
171696    False
Length: 171697, dtype: bool


In [6]:
# Detecting outliers using Z-score or IQR
# Let's focus on the "field1" column for outlier detection
Q1 = merged_df["field1"].quantile(0.25)
Q3 = merged_df["field1"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


# Identify and remove outliers
df_no_outliers = merged_df[
    (merged_df["field1"].iloc[:] >= lower_bound) &
    (merged_df["field1"].iloc[:] <= upper_bound)
]
df_no_outliers.head()

Unnamed: 0,created_at,entry_id,field1,year,month,day,hour,minute
0,2018-02-02 04:59:00,17392,25.0,2018,2,2,4,59
1,2018-02-02 04:59:19,17393,25.0,2018,2,2,4,59
2,2018-02-02 04:59:38,17394,25.0,2018,2,2,4,59
3,2018-02-02 04:59:58,17395,25.0,2018,2,2,4,59
4,2018-02-02 05:00:17,17396,25.0,2018,2,2,5,0


In [7]:
X = df_no_outliers.drop(['field1', 'created_at'], axis=1)
y = df_no_outliers['field1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train

Unnamed: 0,entry_id,year,month,day,hour,minute
70353,134969,2018,3,18,21,19
112325,117274,2018,3,14,5,25
157053,159672,2018,3,25,13,32
88896,28691,2018,2,10,13,22
123459,128408,2018,3,17,0,17
...,...,...,...,...,...,...
124272,129221,2018,3,17,4,44
106610,185624,2018,4,4,6,15
136325,15640,2018,1,27,0,39
151260,153879,2018,3,24,2,48


In [17]:
y_train

70353     45.0
112325    62.0
157053    47.0
88896     20.0
123459    28.0
          ... 
124272    29.0
106610    19.0
136325    37.0
151260    23.0
126351    64.0
Name: field1, Length: 133843, dtype: float64

In [62]:
# Applying Standard Scaling to the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Saving Scaler

In [63]:
import joblib
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

# Reshaping

In [19]:
# Reshape data for LSTM and  gru (samples, timesteps, features)
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

In [20]:
X_train_reshaped

array([[[ 0.61804076,  0.        ,  0.36062987,  0.3672895 ,
          1.3258919 , -0.60521328]],

       [[ 0.29388007,  0.        ,  0.36062987, -0.07072983,
         -0.92831012, -0.25871055]],

       [[ 1.07058337,  0.        ,  0.36062987,  1.13382334,
          0.19879089,  0.14554264]],

       ...,

       [[-1.56798755,  0.        , -2.25274803,  1.35283301,
         -1.63274825,  0.54979583]],

       [[ 0.96445944,  0.        ,  0.36062987,  1.02431851,
         -1.350973  ,  1.06954993]],

       [[-1.75070463,  0.        , -2.25274803,  1.02431851,
          0.90322902, -1.47147012]]])

# Modeling Training

## LSTM

In [21]:
lstm_model = Sequential([
    LSTM(128, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True),
    LSTM(64, activation='relu'),
    Dense(1)  # Output layer
])

lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# # Use early stopping to prevent overfitting
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

lstm_model.fit(X_train_reshaped, y_train, epochs=100, validation_split=0.2)

# Evaluate the LSTM model
y_pred_lstm = lstm_model.predict(X_test_reshaped)

# Calculate and print R-squared
r_squared_lstm = r2_score(y_test, y_pred_lstm)
print("LSTM R-squared:", r_squared_lstm)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
r_squared = r2_score(y_test, y_pred_lstm)
mae = mean_absolute_error(y_test, y_pred_lstm)
rmse = mean_squared_error(y_test, y_pred_lstm, squared=False)
mse = mean_squared_error(y_test, y_pred_lstm)

print("TANK2:")
print("R-squared:", r_squared)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Squared Error:", mse)

TANK2:
R-squared: 0.9903910077328384
Mean Absolute Error: 0.9431088099879118
Root Mean Squared Error: 1.8268546632331253
Mean Squared Error: 3.3373979605766153


In [25]:
lstm_model.save('model.h5')

# Finetuning

In [None]:
# Define a function to build the LSTM model with hyperparameters
def build_model(hp):
    model = keras.Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=256, step=32),
                   activation=hp.Choice('activation', values=['relu', 'tanh']),
                   return_sequences=True,
                   input_shape=(1, X_train_scaled.shape[1])))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=128, step=32),
                   activation=hp.Choice('activation', values=['relu', 'tanh'])))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='mse',
                  metrics=['mse'])
    return model

# Define the tuner and search space
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,  # Number of trials to run
    directory='my_tuning_directory',  # Directory to save results
    project_name='my_lstm_tuning'  # Name for this tuning project
)

# Perform hyperparameter tuning
tuner.search(X_train_reshaped, y_train, epochs=100, validation_data=(X_test_reshaped, y_test))

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
best_model.fit(X_train_reshaped, y_train, epochs=100, validation_data=(X_test_reshaped, y_test))

# Evaluate the best model
y_pred = best_model.predict(X_test_reshaped)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE) for Best Model:", mse)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
32                |32                |units
tanh              |tanh              |activation
0.001             |0.001             |learning_rate

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

In [None]:
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mse = mean_squared_error(y_test, y_pred)

print("TANK2:")
print("R-squared:", r_squared)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Squared Error:", mse)

# GRU

In [None]:
gru_model = Sequential([
    GRU(128, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True),
    GRU(64, activation='relu'),
    Dense(1)  # Output layer
])

gru_model.compile(optimizer='adam', loss='mean_squared_error')

# Use early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

gru_model.fit(X_train_reshaped, y_train, epochs=100, validation_split=0.2)

# Evaluate the GRU model
y_pred_gru = gru_model.predict(X_test_reshaped)

# Calculate and print R-squared
r_squared_gru = r2_score(y_test, y_pred_gru)
print("GRU R-squared:", r_squared_gru)

In [None]:
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mse = mean_squared_error(y_test, y_pred)

print("TANK2:")
print("R-squared:", r_squared)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("Mean Squared Error:", mse)