In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.api import Sequential
from keras.api.layers import LSTM, Dense, Input

In [155]:
data = pd.read_csv('data/outputRegion.csv', delimiter=';', decimal=',') # read processed csv

features = ['AQI', 'Humidity', 'Temperature', 'Precipitation', 'PM2.5']
target = 'AQI' # target prediction parameter

# Scale the data for normalization
scaler = MinMaxScaler()
data[features] = scaler.fit_transform(data[features])

print(data)

             Date      Time           Station       AQI  Humidity  \
0         4/16/24  23:00:00  Халқлар дўстлиги  0.164537  1.000000   
1      2024-04-16  22:00:00  Халқлар дўстлиги  0.141720  1.000000   
2      2024-04-16  21:00:00  Халқлар дўстлиги  0.148239  1.000000   
3      2024-04-16  20:00:00  Халқлар дўстлиги  0.144979  1.000000   
4      2024-04-16  19:00:00  Халқлар дўстлиги  0.128681  1.000000   
...           ...       ...               ...       ...       ...   
11262  2022-11-07  17:00:00  Халқлар дўстлиги  0.018600  0.368113   
11263  2022-11-07  16:00:00  Халқлар дўстлиги  0.018600  0.368113   
11264  2022-11-07  15:00:00  Халқлар дўстлиги  0.000000  0.370457   
11265  2022-11-07  14:00:00  Халқлар дўстлиги  0.033066  0.366549   
11266  2022-11-07  13:00:00  Халқлар дўстлиги  0.043399       NaN   

       Temperature  Precipitation     PM2.5  
0         0.502660            0.0  0.063683  
1         0.508865            0.0  0.051635  
2         0.517730            0.0

In [156]:
def cleanDataset(dataset, look_back):
    subsets = []
    
    currentSet = []
    for row in dataset:
        if (np.isnan(row).any()):
            if (len(currentSet) > look_back):
                subsets.append(currentSet)    
            currentSet = []
        else:
            currentSet.append(row)
    
    return subsets

In [170]:
look_back = 7 * 24  # 1 days of hourly data
cleanedSubsets = cleanDataset(data[features].values, look_back)
print(cleanedSubsets[0][0])

[0.21017146 0.69109027 0.41843972 0.         0.08777969]


In [171]:
# creates a data sequences by look_back
def create_sequences(dataset, look_back):
    X, y = [], []        
    
    for i in range(len(dataset) - look_back - 1):
        X.append(dataset[i:(i + look_back)])
        y.append(dataset[i + look_back][0]) 
        
    return np.array(X), np.array(y)

In [172]:
X = []
y = []
for subset in cleanedSubsets:
    X_sub, y_sub = create_sequences(subset, look_back)

    X.extend(X_sub)
    y.extend(y_sub)

X = np.array(X)
y = np.array(y)

print(len(X[0]))
print(y[0])

168
0.1221620775471167


In [173]:
# Splitting the data into training and test sets
# 70 % training and 20 % validation and 10 % test
train_size = int(len(X) * 0.7) 
validation_size = int(len(X) * 0.2)

X_train = X[:train_size]
y_train = y[:train_size]

X_validation = X[train_size:(train_size + validation_size)]
y_validation = y[train_size:(train_size + validation_size)]

X_test = X[(train_size + validation_size):]
y_test = y[(train_size + validation_size):]

print(len(X_train))
print(len(y_train))
print(len(X_validation))
print(len(y_validation))
print(len(X_test))
print(len(y_test))

4468
4468
1276
1276
639
639


In [174]:
# Build the LSTM model
model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(50, return_sequences=True, activation='relu'))
model.add(LSTM(50, activation='relu'))
model.add(Dense(1)) 
model.compile(loss='mean_squared_error', optimizer='adam')


# 8. Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_validation, y_validation))



Epoch 1/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 56ms/step - loss: 0.0138 - val_loss: 0.0143
Epoch 2/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0045 - val_loss: 0.0085
Epoch 3/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step - loss: 0.0029 - val_loss: 0.0057
Epoch 4/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0023 - val_loss: 0.0052
Epoch 5/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0021 - val_loss: 0.0048
Epoch 6/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0018 - val_loss: 0.0045
Epoch 7/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0019 - val_loss: 0.0044
Epoch 8/50
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 0.0017 - val_loss: 0.0042
Epoch 9/50
[1m140/140[0m [32m

<keras.src.callbacks.history.History at 0x36112bfd0>

In [175]:
predictions = model.predict(X_test)
predictions = np.concatenate((predictions, np.zeros((len(predictions), 4))), axis=1)
tests = np.concatenate((np.array(y_test)[:, np.newaxis], np.zeros((len(y_test), 4))), axis=1)

# Inverse transform the predictions and actual values to original scale
predictions = scaler.inverse_transform(predictions)
tests = scaler.inverse_transform(tests)

diffs = []
for i in range(len(predictions)):
    diffs.append(int(abs(predictions[i][0] - tests[i][0])))
    # print(f"{predictions[i][0]} {tests[i][0]}")
diffs.sort(reverse=True)
print(len(diffs))
print(diffs)

from sklearn.metrics import mean_squared_error, mean_absolute_error
import math  

# Calculate RMSE
rmse = math.sqrt(mean_squared_error(tests, predictions))
print('RMSE:', rmse)

# Calculate MAE
mae = mean_absolute_error(tests, predictions)
print('MAE:', mae)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
639
[79, 55, 52, 52, 52, 51, 51, 50, 47, 46, 46, 46, 45, 43, 43, 41, 41, 40, 40, 40, 40, 39, 39, 37, 37, 36, 36, 36, 36, 35, 33, 33, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1