In [28]:
!pip install tensorflow keras scikit-learn



Data Processing

In [29]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Load Data
def preprocess_selected_params(csv_path, seq_len=24):
    # Load data
    df = pd.read_csv(csv_path)

    # Simulate CO if not present
    if 'co' not in df.columns:
        np.random.seed(42)
        df['co'] = np.random.normal(loc=0.8, scale=0.3, size=len(df))

    # Drop rows with missing target or required inputs
    df = df.dropna(subset=["pm2.5", "TEMP", "humidity", "co"])

    # Keep only selected features
    df = df[["co", "TEMP", "humidity", "pm2.5"]]

    # Normalize
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df)

    joblib.dump(scaler, "scaler.pkl")


    # Create time series sequences
    X, y = [], []
    for i in range(len(scaled_data) - seq_len):
        X.append(scaled_data[i:i+seq_len])
        y.append(scaled_data[i+seq_len, 3])  # pm2.5 is at index 3

    return np.array(X), np.array(y)




Model parameters

In [30]:
# Step 3: Process Data
SEQ_LEN = 24
X, y = preprocess_selected_params("PRSA_data_2010.1.1-2014.12.31.csv", seq_len=SEQ_LEN)

# Step 4: Train/Test Split
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Step 5: Define and Train LSTM
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

  super().__init__(**kwargs)


Train model

In [31]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1,
          callbacks=[EarlyStopping(patience=3)])

Epoch 1/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - loss: 0.0020 - val_loss: 6.1584e-04
Epoch 2/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - loss: 7.0667e-04 - val_loss: 5.1897e-04
Epoch 3/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - loss: 6.8719e-04 - val_loss: 4.4874e-04
Epoch 4/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - loss: 6.0045e-04 - val_loss: 4.8502e-04
Epoch 5/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - loss: 6.3455e-04 - val_loss: 4.5419e-04
Epoch 6/10
[1m939/939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 13ms/step - loss: 6.2013e-04 - val_loss: 4.8739e-04


<keras.src.callbacks.history.History at 0x791123971fd0>

Save model

In [32]:
model.save("lstm_model.keras")

Predict

In [34]:
# Load model
model = load_model("lstm_model.keras")

# Predict again
future_prediction = model.predict(X_test)

[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step


In [35]:
future_prediction

array([[0.05617213],
       [0.08636937],
       [0.17702842],
       ...,
       [0.00737313],
       [0.0066048 ],
       [0.00439043]], dtype=float32)