# Beijing Air Quality Forecasting Starter Notebook

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

In [5]:
# Colab only: expose Google Drive on the local filesystem so the CSVs can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the datasets
# Ensure train.csv and test.csv are saved in your Google Drive in the same folder.
# Point DATA_DIR at that folder; both files are read from it, so the path only
# needs to be edited in one place.
DATA_DIR = '/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forcasting'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')


# Explore the training data

In this section, explore your dataset with appropriate statistics and visualisations to understand it better. Ensure that you explain the output of every code cell and what it entails.

In [7]:
# Peek at the first five rows to see which columns exist and how the values look.
print("Training Data Overview:")
train.head(n=5)

Training Data Overview:


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,-1.580878,-1.92225,0.443328,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00,1.448138,-0.732019,-0.522096,
1,2,-1.580878,-2.004228,0.345943,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00,1.448138,-0.732019,-0.522096,
2,3,-1.580878,-1.92225,0.248559,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00,1.448138,-0.732019,-0.522096,
3,4,-1.580878,-2.168183,0.248559,-0.280926,-0.069353,-0.137667,2010-01-01 03:00:00,1.448138,-0.732019,-0.522096,
4,5,-1.511594,-2.004228,0.151174,-0.218339,-0.069353,-0.137667,2010-01-01 04:00:00,1.448138,-0.732019,-0.522096,


In [8]:
# Display the column labels of the training frame (shown as this cell's output).
train.columns

Index(['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW',
       'cbwd_SE', 'cbwd_cv', 'pm2.5'],
      dtype='object')

In [9]:
def _index_by_datetime(frame):
    """Return `frame` indexed by its parsed 'datetime' column.

    A frame whose index is already named 'datetime' is returned unchanged, so
    re-running this cell does not fail with KeyError on the column that the
    first run moved into the index. A frame that has neither still raises
    KeyError from the column lookup.
    """
    if frame.index.name == 'datetime':
        return frame
    # Parse the strings to real timestamps before promoting them to the index.
    parsed = frame.assign(datetime=pd.to_datetime(frame['datetime']))
    return parsed.set_index('datetime')


# Use the timestamp as the index for better time-series handling.
train = _index_by_datetime(train)
test = _index_by_datetime(test)


# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with the mean. You can experiment with other strategies.

In [10]:
# Replace every missing value with the mean of its own column, computed
# separately for each frame; both frames are modified in place.
for frame in (train, test):
    column_means = frame.mean()
    frame.fillna(column_means, inplace=True)


# Separate features and target

- Feel free to drop any non-essential columns that you think might not contribute to modeling.

In [11]:
# Features are every column except the target and the running row number 'No'.
target_col = 'pm2.5'
X_train = train.drop(columns=[target_col, 'No'])
y_train = train[target_col]

## Fixed sequence prep + proper LSTM


In [None]:
# === Sequence building and scaling (replace previous expand_dims approach) ===
import numpy as np
from sklearn.preprocessing import StandardScaler

SEQ_LEN = 24  # use last 24 hours to predict next hour

# Base features plus calendar features derived from each row's timestamp.
feat_cols = ['DEWP','TEMP','PRES','Iws','Is','Ir','cbwd_NW','cbwd_SE','cbwd_cv']
for df_ in (train, test):
    # An earlier cell moves 'datetime' into the index, after which
    # df_['datetime'] raises KeyError. Read the timestamps from the column when
    # it is still present and from the index otherwise.
    if 'datetime' in df_.columns:
        df_['datetime'] = pd.to_datetime(df_['datetime'])
        stamps = pd.DatetimeIndex(df_['datetime'])
    elif df_.index.name == 'datetime':
        stamps = pd.DatetimeIndex(pd.to_datetime(df_.index))
    else:
        raise KeyError("no 'datetime' column or index found")
    # np.asarray strips the index so the values are assigned by position.
    df_['hour'] = np.asarray(stamps.hour)
    df_['dayofweek'] = np.asarray(stamps.dayofweek)
    df_['month'] = np.asarray(stamps.month)
feat_cols += ['hour','dayofweek','month']

X = train[feat_cols].values.astype('float32')
y = train['pm2.5'].values.astype('float32')

# Time-based split (last 20% as validation). Slicing from an explicit split
# position avoids X[:-0], which would leave the training part empty when
# val_size rounds down to 0.
val_size = int(0.2 * len(train))
split_at = len(train) - val_size
X_tr, X_val = X[:split_at], X[split_at:]
y_tr, y_val = y[:split_at], y[split_at:]

# Scale features: statistics are fitted on the training part only and then
# reused for the validation part.
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_val = scaler.transform(X_val)

def make_sequences(X, y, L):
    """Build sliding windows of length `L` over `X`, each paired with a target.

    Window k holds rows X[k:k+L] and is paired with y[k+L], i.e. the target of
    the row immediately after the window.

    Parameters
    ----------
    X : array-like, first axis is the row (time) axis.
    y : array-like with at least as many entries as X along the first axis.
    L : int, window length (must be >= 0).

    Returns
    -------
    (Xs, ys) : float32 arrays. Xs has shape (n, L, *X.shape[1:]) and ys has
        shape (n, *y.shape[1:]), where n = max(len(X) - L, 0). When X is too
        short for even one window the arrays are empty but keep their rank,
        so callers can still read Xs.shape[1] and Xs.shape[2].

    Raises
    ------
    ValueError : if L is negative.
    """
    if L < 0:
        raise ValueError(f"L must be non-negative, got {L}")
    X = np.asarray(X, dtype='float32')
    y = np.asarray(y, dtype='float32')

    n_windows = max(len(X) - L, 0)
    # Pre-allocate so the result has the right shape even when n_windows == 0.
    Xs = np.empty((n_windows, L) + X.shape[1:], dtype='float32')
    ys = np.empty((n_windows,) + y.shape[1:], dtype='float32')
    for k in range(n_windows):
        Xs[k] = X[k:k + L]
        ys[k] = y[k + L]
    return Xs, ys

# Window the training and validation parts separately, then show both shapes.
train_windows = make_sequences(X_tr, y_tr, SEQ_LEN)
val_windows = make_sequences(X_val, y_val, SEQ_LEN)
Xtr_seq, ytr_seq = train_windows
Xval_seq, yval_seq = val_windows
(Xtr_seq.shape, Xval_seq.shape)


## Model training with callbacks


In [None]:
# === Build and train an LSTM model ===
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow before building the model so weight
# initialisation, dropout masks and batch shuffling start from the same state
# on every Restart & Run All (some GPU ops may still be non-deterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Two stacked LSTM layers followed by a small dense head producing one value
# per input window.
model = Sequential([
    # Declare the (timesteps, features) input explicitly instead of passing
    # input_shape= to the first layer, which newer Keras versions warn about.
    tf.keras.Input(shape=(Xtr_seq.shape[1], Xtr_seq.shape[2])),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(1)
])
model.compile(optimizer=Adam(1e-3), loss='mse')

# Both callbacks watch the validation loss: the first stops training and
# restores the best weights, the second halves the learning rate on a plateau.
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss'),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
]
history = model.fit(
    Xtr_seq, ytr_seq,
    validation_data=(Xval_seq, yval_seq),
    epochs=50,
    batch_size=128,
    callbacks=callbacks,
    verbose=1
)
model.summary()


## Validation RMSE and sanity plot


In [None]:
# === Evaluate on validation ===
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Score the validation windows and report the root-mean-squared error.
val_pred = model.predict(Xval_seq, verbose=0).ravel()
val_mse = mean_squared_error(yval_seq, val_pred)
rmse = float(np.sqrt(val_mse))
print(f'Validation RMSE: {rmse:.2f}')

# Overlay actual and predicted values for the first 500 validation windows.
fig, ax = plt.subplots()
ax.plot(yval_seq[:500], label='Actual')
ax.plot(val_pred[:500], label='Pred')
ax.legend()
ax.set_title('Validation predictions (first 500)')
ax.set_xlabel('time')
ax.set_ylabel('pm2.5')
fig.tight_layout()


## Build Kaggle submission (aligned to sample)


In [None]:
# === Predict test set and create submission ===
# Apply the already-fitted scaler to the test features (no refitting here).
test_features = test[feat_cols].to_numpy().astype('float32')
X_test = scaler.transform(test_features)

def make_test_sequences(X, L):
    """Build sliding windows of length `L` over `X` plus the row each one targets.

    For every position i in L..len(X)-1 the window is X[i-L:i] and i is
    recorded as the row that window is paired with.

    Parameters
    ----------
    X : array-like, first axis is the row (time) axis.
    L : int, window length (must be >= 0).

    Returns
    -------
    (Xs, idx) : Xs is a float32 array of shape (n, L, *X.shape[1:]) and idx is
        an integer array holding L..len(X)-1, where n = max(len(X) - L, 0).
        When X is too short for any window both are empty; idx stays an
        integer array so it remains usable for indexing.

    Raises
    ------
    ValueError : if L is negative.
    """
    if L < 0:
        raise ValueError(f"L must be non-negative, got {L}")
    X = np.asarray(X, dtype='float32')

    # np.arange yields an integer array even when empty, whereas
    # np.array([]) would be float64 and cannot be used as an index.
    idx = np.arange(L, len(X))
    Xs = np.empty((len(idx), L) + X.shape[1:], dtype='float32')
    for k, i in enumerate(idx):
        Xs[k] = X[i - L:i]
    return Xs, idx

Xte_seq, te_idx = make_test_sequences(X_test, SEQ_LEN)
te_pred = model.predict(Xte_seq, verbose=0).ravel()

# For the first SEQ_LEN rows, fill with a simple constant (train target mean)
fill_value = float(train['pm2.5'].mean())
final_pred = np.full(len(test), fill_value, dtype='float32')
final_pred[te_idx] = te_pred

# Match sample_submission order/format exactly.
# `sample_sub` was previously used without ever being loaded, so this cell
# raised NameError; load it here.
# NOTE(review): the file name and folder are assumed to match train.csv /
# test.csv above - adjust the path to where your sample submission lives.
SAMPLE_SUB_PATH = '/content/drive/MyDrive/Kaggle_competition_ML/air_quality_forcasting/sample_submission.csv'
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
if len(sample_sub) != len(final_pred):
    raise ValueError(
        f'sample submission has {len(sample_sub)} rows but there are '
        f'{len(final_pred)} test predictions'
    )
submission = sample_sub.copy()
submission['pm2.5'] = final_pred
submission_out = 'submission_lstm_simple.csv'
submission.to_csv(submission_out, index=False)
print('Saved:', submission_out)
