# Beijing Air Quality Forecasting Starter Notebook

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8

import json
import os

# SECURITY: never hardcode the Kaggle API token in a notebook. Anyone who can
# read the file (or its git history) can act on the account. A token that was
# ever stored in plain text must be regenerated from the Kaggle account page.
#
# Credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables and written to ~/.kaggle/kaggle.json, the location the Kaggle CLI
# reads. The home directory is resolved at runtime instead of being hardcoded
# to one user's machine.
kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
kaggle_json = os.path.join(kaggle_dir, "kaggle.json")

kaggle_username = os.environ.get("KAGGLE_USERNAME")
kaggle_key = os.environ.get("KAGGLE_KEY")

if kaggle_username and kaggle_key:
    # Create the directory if it doesn't exist
    os.makedirs(kaggle_dir, exist_ok=True)

    # Write kaggle.json; json.dump takes care of quoting/escaping the values
    with open(kaggle_json, "w") as f:
        json.dump({"username": kaggle_username, "key": kaggle_key}, f)

    # Restrict the token file to the current user where the OS supports it
    os.chmod(kaggle_json, 0o600)
elif not os.path.exists(kaggle_json):
    # Best effort: only the `!kaggle ...` cells need credentials, so warn
    # instead of stopping the whole notebook.
    print(
        "Kaggle credentials not configured: set KAGGLE_USERNAME and KAGGLE_KEY "
        "or place kaggle.json in " + kaggle_dir
    )


: 

In [None]:
!pip install python-slugify


In [None]:
!pip install --upgrade kaggle


In [None]:
!pip install tqdm


In [None]:
!kaggle datasets list

In [None]:
import zipfile
import os

# Archive to unpack and the folder it is unpacked into.
extract_to = r"C:\Users\Merveille\notebooks\assignment-1"
zip_path = r"C:\Users\Merveille\notebooks\assignment-1-time-series-forecasting-may-2025.zip"

# exist_ok makes this a no-op when the destination folder is already there.
os.makedirs(extract_to, exist_ok=True)

# Unpack every member of the archive into the destination folder.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

print("✅ Unzipped successfully to:", extract_to)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Bidirectional
import tensorflow as tf

In [None]:
# Local paths to the train/test CSV files (nothing is mounted here; the files are read from disk).
# NOTE(review): these are absolute Windows paths tied to one machine — adjust before running elsewhere.
test_path = r'C:\Users\Merveille\notebooks\assignment-1\test.csv'
train_path = r'C:\Users\Merveille\notebooks\assignment-1\train.csv'


In [None]:
# Read the two CSV files located by train_path / test_path into DataFrames
# (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview the first five training rows.
train.head(n=5)


# Explore the training data

In this section, explore your dataset with appropriate statistics and visualisations to understand your data better. Ensure that you explain the output of every code cell and what it entails.

In [None]:
# Print a heading, then show the opening rows of the training frame
# so its column layout can be checked at a glance.
print("Training Data Overview:")
train.head(n=5)

In [None]:
# List the column labels of the training frame.
train.columns

In [None]:
# Parse the 'datetime' column and promote it to the index of each frame for
# better time-series handling.
#
# The guard makes the cell safe to re-run: once 'datetime' has become the index
# it is no longer a column, so an unguarded second run would raise KeyError.
for frame in (train, test):
    if 'datetime' in frame.columns:
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        frame.set_index('datetime', inplace=True)


# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with the mean. You can experiment with other strategies.

In [None]:
# Impute missing values with per-column means computed on the TRAINING data.
# The same training means are applied to the test set so that test features are
# filled with the values the model saw during training, rather than with
# statistics taken from the test set itself.
# numeric_only=True keeps the mean well-defined if a non-numeric column is present.
train_means = train.mean(numeric_only=True)
train.fillna(train_means, inplace=True)
test.fillna(train_means, inplace=True)

# Fallback for any test column that has no counterpart in the training means.
test.fillna(test.mean(numeric_only=True), inplace=True)


In [None]:
# True if at least one missing value is still present anywhere in the training frame.
train.isna().to_numpy().any()

# Separate features and target

- Feel free to drop any non-essential columns that you think might not contribute to modeling.

In [None]:
# Target is the 'pm2.5' column; features are every remaining column except 'No'.
y_train = train['pm2.5']
X_train = train.drop(columns=['pm2.5', 'No'])

In [None]:
# Reshape data for LSTM input
# LSTM models require data in the shape (samples, timesteps, features).
# Here, the data is reshaped to add a "timesteps" dimension of length 1.
#
# The ndim guard keeps the cell idempotent: re-running it must not stack a
# second extra axis onto an array that is already 3-D.
X_train = np.asarray(X_train)
if X_train.ndim == 2:
    X_train = X_train[:, np.newaxis, :]

# Build model

Below is a simple LSTM model. Your task is to experiment with different parameters, such as the number of layers, units, activation functions, and optimizers, to get the best-performing model. Experiment with other optimizers (e.g., SGD) or hyperparameters to improve performance.

# model1

Two stacked LSTM layers with ReLU activation and a dropout rate of 0.3.

In [None]:
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Dropout
# Model 1: two stacked LSTM layers (128 -> 32 units) with dropout after each,
# followed by a small dense head that outputs a single value.
model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    LSTM(32, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model.
# The built-in RMSE metric is used instead of a lambda: it is reported under
# the explicit name 'rmse' and accumulates over the whole epoch rather than
# averaging per-batch values. This matches how model6 is compiled.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model.summary()



In [None]:
# Fit `model` on the full training set: 20 passes over the data in
# mini-batches of 32. Both values are free to tune.
history = model.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
from sklearn.metrics import mean_squared_error
def evaluate_and_plot_on_train_only(model, history, X_train, y_train, scaler_y=None, model_name='Model'):
    """Evaluate a fitted model on its own training data and plot the results.

    Produces two figures (actual vs. predicted target, and the per-epoch
    training loss with the final training MSE as a reference line) and prints
    the training RMSE.

    Args:
        model: Object exposing ``predict(X_train)``.
        history: Object whose ``history['loss']`` holds one loss value per
            epoch, e.g. the return value of ``model.fit``.
        X_train: Inputs passed unchanged to ``model.predict``.
        y_train: True target values, one per sample (array-like or Series).
        scaler_y: Optional object exposing ``inverse_transform``; when given,
            targets and predictions are mapped back through it before
            plotting and before the RMSE is computed.
        model_name: Label used in plot titles and in the printed summary.

    Returns:
        None.

    Raises:
        ValueError: If the number of predictions differs from ``len(y_train)``.
    """
    # Flatten both series to plain 1-D float arrays. This drops any pandas
    # index so that actuals and predictions share the same positional x-axis,
    # and avoids (n,) vs (n, 1) broadcasting when they are subtracted.
    y_pred = np.asarray(model.predict(X_train), dtype=float).reshape(-1)
    y_true = np.asarray(y_train, dtype=float).reshape(-1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_train has {y_true.size} values but the model produced {y_pred.size} predictions"
        )

    # MSE in the units the model was trained in, so it is directly comparable
    # with the loss curve stored in `history`.
    train_mse = float(np.mean((y_true - y_pred) ** 2))

    # Rescale if a scaler was used (inverse_transform is given 2-D column input)
    if scaler_y is not None:
        predictions_rescaled = np.asarray(scaler_y.inverse_transform(y_pred.reshape(-1, 1))).reshape(-1)
        y_train_rescaled = np.asarray(scaler_y.inverse_transform(y_true.reshape(-1, 1))).reshape(-1)
    else:
        predictions_rescaled = y_pred
        y_train_rescaled = y_true

    # RMSE (square root of the MSE) in the same units as the plotted values
    rmse = float(np.sqrt(np.mean((y_train_rescaled - predictions_rescaled) ** 2)))

    # Plot: Actual vs Predicted
    plt.figure(figsize=(10, 5))
    plt.plot(y_train_rescaled, label='Actual')
    plt.plot(predictions_rescaled, label='Predicted')
    plt.title(f'{model_name} - Predictions vs Actuals\nRMSE: {rmse:.4f}')
    plt.xlabel('Time Step')
    plt.ylabel('Target')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot: Training loss per epoch
    plt.figure(figsize=(8, 5))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.axhline(y=train_mse, color='blue', linestyle='--', label='Final Train MSE')
    plt.title(f'{model_name} - Training Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss (MSE)')
    plt.legend()
    plt.grid(True)
    plt.show()

    print(f"[{model_name}] Final Training RMSE: {rmse:.4f}")

In [None]:
# Evaluate model1 on the training data. The label matches its architecture:
# stacked LSTM layers, with no Bidirectional wrapper.
evaluate_and_plot_on_train_only(
    model, history, X_train, y_train,
    scaler_y=None,
    model_name='Stacked LSTM'
)


# model2

This model uses a mixture of GRU and LSTM

In [None]:
# Model 2: an LSTM layer feeding a GRU layer, with dropout after each,
# followed by a small dense head that outputs a single value.
model2 = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    GRU(32, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model.
# The built-in RMSE metric is used instead of a lambda: it is reported under
# the explicit name 'rmse' and accumulates over the whole epoch rather than
# averaging per-batch values. This matches how model6 is compiled.
model2.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the architecture of model2 (not of the first model)
model2.summary()


In [None]:
# Train model2 for 20 epochs with mini-batches of 32 samples.
history = model2.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
# Training-set evaluation and plots for model2.
evaluate_and_plot_on_train_only(
    model=model2,
    history=history,
    X_train=X_train,
    y_train=y_train,
    scaler_y=None,
    model_name='GRU',
)

# model3

This model uses a mixture of GRU and bidirectional LSTM

In [None]:
# Model 3: a bidirectional LSTM layer feeding a GRU layer, with dropout after
# each, followed by a small dense head that outputs a single value.
#
# The input shape is declared with an explicit Input entry. An `input_shape`
# given to the LSTM *inside* the Bidirectional wrapper is not picked up by
# Sequential, which leaves the model unbuilt when summary() is called.
model3 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    Bidirectional(LSTM(128, activation='relu', return_sequences=True)),
    Dropout(0.2),
    GRU(32, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model.
# The built-in RMSE metric is used instead of a lambda: it is reported under
# the explicit name 'rmse' and accumulates over the whole epoch rather than
# averaging per-batch values. This matches how model6 is compiled.
model3.compile(
    optimizer=Adam(learning_rate=0.007),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model3.summary()

In [None]:
# Train model3 for 20 epochs with mini-batches of 32 samples.
history = model3.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
# Training-set evaluation and plots for model3.
evaluate_and_plot_on_train_only(
    model=model3,
    history=history,
    X_train=X_train,
    y_train=y_train,
    scaler_y=None,
    model_name='Bidirectional',
)

# model4

This model uses the same GRU and Bidirectional LSTM architecture as model3, but changes the optimizer (RMSprop instead of Adam) and the learning rate (0.001 instead of 0.007).

In [None]:
# Model 4: a bidirectional LSTM layer feeding a GRU layer, with dropout after
# each, followed by a small dense head; trained with RMSprop.
#
# The input shape is declared with an explicit Input entry. An `input_shape`
# given to the LSTM *inside* the Bidirectional wrapper is not picked up by
# Sequential, which leaves the model unbuilt when summary() is called.
model4 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    Bidirectional(LSTM(128, activation='relu', return_sequences=True)),
    Dropout(0.2),
    GRU(32, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model.
# The built-in RMSE metric is used instead of a lambda: it is reported under
# the explicit name 'rmse' and accumulates over the whole epoch rather than
# averaging per-batch values. This matches how model6 is compiled.
model4.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model4.summary()

In [None]:
# Train model4 for 20 epochs with mini-batches of 32 samples.
history = model4.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
# Training-set evaluation and plots for model4.
evaluate_and_plot_on_train_only(
    model=model4,
    history=history,
    X_train=X_train,
    y_train=y_train,
    scaler_y=None,
    model_name='Bidirectional',
)

# model5

This model uses two stacked LSTM layers with tanh activation, trained with the Adam optimizer at a higher learning rate (0.01).

In [None]:
# Model 5: two stacked LSTM layers (128 -> 32 units) using tanh activation,
# with dropout after each, followed by a small dense head.
model5 = Sequential([
    LSTM(128, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='tanh'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model.
# The built-in RMSE metric is used instead of a lambda: it is reported under
# the explicit name 'rmse' and accumulates over the whole epoch rather than
# averaging per-batch values. This matches how model6 is compiled.
model5.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model5.summary()

In [None]:
# Train model5 for 20 epochs with mini-batches of 32 samples.
history = model5.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
# Training-set evaluation and plots for model5.
evaluate_and_plot_on_train_only(
    model=model5,
    history=history,
    X_train=X_train,
    y_train=y_train,
    scaler_y=None,
    model_name='LSTM',
)

In [None]:
# Model 6: two stacked bidirectional LSTM layers (80 -> 40 units) with input
# and recurrent dropout, followed by a small dense head with dropout.
#
# The explicit Input entry declares the (timesteps, features) shape so the
# model is built at construction time; without it summary() is called on an
# unbuilt model.
model6 = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    Bidirectional(LSTM(80, activation='tanh', return_sequences=True, dropout=0.25, recurrent_dropout=0.15)),
    Bidirectional(LSTM(40, activation='tanh', return_sequences=False, dropout=0.25, recurrent_dropout=0.15)),
    Dense(20, activation='relu'),
    Dropout(0.15),
    Dense(1)
])

# Compile with MSE loss and report RMSE under the name 'rmse'
model6.compile(
    optimizer=Adam(learning_rate=0.0007),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')]
)

# Display the model architecture
model6.summary()

In [None]:
# Train model6 for 20 epochs with mini-batches of 32 samples.
history = model6.fit(x=X_train, y=y_train, batch_size=32, epochs=20)

In [None]:
# Evaluate model6, the model that was just trained and whose `history` is
# passed here. Pairing this history with model5 would plot one model's
# predictions against another model's loss curve.
evaluate_and_plot_on_train_only(
    model6, history, X_train, y_train,
    scaler_y=None,
    model_name='Bidirectional LSTM'
)

In [None]:
# Make sure the output folder for the submission file exists
os.makedirs(r"C:\Users\Merveille\notebooks\assignment-1\data", exist_ok=True)

# Prepare the test data: drop the 'No' column and add the "timesteps" axis,
# mirroring how the training features were prepared.
X_test = test.drop(['No'], axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Make predictions on the test set using the trained model
# NOTE(review): `model` is the first model defined in this notebook (model1);
# swap in whichever model performed best before submitting.
predictions = model.predict(X_test)

# Clean predictions: replace NaN/inf with finite numbers, then round to integers
predictions = np.nan_to_num(predictions)
predictions = np.round(predictions).astype(int)

# Make sure number of predictions matches number of row IDs.
# The row IDs are built from `test.index` below, so `test` is the reference.
assert len(predictions.flatten()) == len(test), "Mismatch between prediction count and row IDs"

# Create submission DataFrame.
# The row ID is assembled piecewise so the hour is written without zero padding.
submission = pd.DataFrame({
    'row ID': test.index.strftime('%Y-%m-%d') + ' ' + test.index.hour.astype(str) + ':' + test.index.strftime('%M:%S'),
    'pm2.5': predictions.flatten()
})

# Save to CSV
submission_path = r"C:\Users\Merveille\notebooks\assignment-1\data\first_submission.csv"
submission.to_csv(submission_path, index=False)

# Submit to Kaggle
!kaggle competitions submit -c assignment-1-time-series-forecasting-may-2025 -f "C:/Users/Merveille/notebooks/assignment-1/data/first_submission.csv" -m "Test Submitted"