In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [None]:
# CONFIG
# Input CSV files, given relative to the notebook's working directory.
TRAIN_PATH = "LSTM-Multivariate_pollution.csv"
TEST_PATH = "pollution_test_data1.csv"

# Feature-engineering knobs consumed by the lag/rolling cell further down:
#   LAG         -> lags 1..LAG are generated for every numeric column
#   ROLL_WINDOW -> number of rows in the rolling mean/std/min/max window
# NOTE(review): this cell has no execution count, and the saved outputs of the
# feature-engineering cells start at row index 24, which looks like a run with
# LAG = 24 rather than 12 -- restart the kernel and run all cells to confirm.
LAG = 12
ROLL_WINDOW = 3

# REPRODUCIBILITY
# Seed both the NumPy global RNG and Python's `random` module with one value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# LOAD DATA
# Train first, then test; both are parsed with pandas' default CSV settings.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
df_test = pd.read_csv(filepath_or_buffer=TEST_PATH)

In [6]:
# Target column first, then the weather covariates; both frames are forced
# into this exact column order so they stay aligned.
cols = ["pollution", "dew", "temp", "press", "wnd_dir", "wnd_spd", "snow", "rain"]

# Remove the "date" column when it is present (errors="ignore" makes the drop
# a no-op otherwise), then keep only the expected columns in the fixed order.
df_train = df_train.drop(columns=["date"], errors="ignore")
df_test = df_test.drop(columns=["date"], errors="ignore")

df_train = df_train[cols]
df_test = df_test[cols]

# Last expression of the cell: renders the training frame.
df_train

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,138.0,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...
43795,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43796,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43797,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43798,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [7]:
# Render the test frame so it can be compared with the training frame shown above.
df_test

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,128,-16,4,1027,SE,3.58,0,0
1,77,-17,5,1027,SE,7.60,0,0
2,65,-16,4,1027,SE,9.39,0,0
3,79,-16,1,1028,cv,0.89,0,0
4,93,-14,0,1028,NE,1.79,0,0
...,...,...,...,...,...,...,...,...
341,8,-23,-2,1034,NW,231.97,0,0
342,10,-22,-3,1034,NW,237.78,0,0
343,10,-22,-3,1034,NW,242.70,0,0
344,8,-22,-4,1034,NW,246.72,0,0


In [8]:
# HANDLE MISSING VALUES
# Column groups: the single categorical column and the numeric columns.
cat_cols = ["wnd_dir"]
num_cols = ["pollution", "dew", "temp", "press", "wnd_spd", "snow", "rain"]

# Apply the same imputation to the training frame and then the test frame.
for frame in (df_train, df_test):
    # Numeric gaps: interpolate down the rows (pandas' default method);
    # limit_direction="both" also fills gaps at the start and end of a column.
    frame[num_cols] = frame[num_cols].interpolate(limit_direction="both", axis=0)
    # Categorical gaps: carry the last seen value forward, then back-fill
    # anything still missing at the top of the column.
    frame[cat_cols] = frame[cat_cols].ffill().bfill()

In [10]:
# FEATURE ENGINEERING: LAG & ROLLING
def build_lag_rolling_features(frame, columns, n_lags, window):
    """Build lag and rolling-statistic feature Series for ``columns`` of ``frame``.

    For every column, in the order given, the result contains:

    * ``{col}_lag1`` .. ``{col}_lag{n_lags}``: the column shifted down by
      1..n_lags rows.
    * ``{col}_roll_mean{window}``, ``{col}_roll_std{window}``,
      ``{col}_roll_min{window}``, ``{col}_roll_max{window}``: statistics over a
      ``window``-row rolling window of the column shifted down by one row, so a
      row's own value never contributes to its rolling features.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding the source columns; it is not modified.
    columns : iterable of str
        Names of the columns to derive features from.
    n_lags : int
        Largest lag to generate (lags ``1..n_lags`` inclusive).
    window : int
        Number of rows in the rolling window.

    Returns
    -------
    list of pd.Series
        Named feature Series sharing ``frame``'s index. Leading rows hold NaN
        where there is not enough history.

    Raises
    ------
    ValueError
        If ``frame`` already has a column with one of the generated names,
        which happens when the cell is re-run on an already-featurized frame.
        Without this check a re-run would silently append duplicate columns
        and drop additional rows.
    """
    generated = []
    for name in columns:
        source = frame[name]
        # lag features
        generated.extend(
            source.shift(step).rename(f"{name}_lag{step}")
            for step in range(1, n_lags + 1)
        )
        # rolling statistics over strictly earlier rows (hence the shift by 1)
        trailing = source.shift(1).rolling(window)
        generated.extend([
            trailing.mean().rename(f"{name}_roll_mean{window}"),
            trailing.std().rename(f"{name}_roll_std{window}"),
            trailing.min().rename(f"{name}_roll_min{window}"),
            trailing.max().rename(f"{name}_roll_max{window}"),
        ])

    clashes = [feature.name for feature in generated if feature.name in frame.columns]
    if clashes:
        raise ValueError(
            "lag/rolling features already present (e.g. "
            f"{clashes[0]!r}); re-run the notebook from the data-loading cell "
            "instead of re-running this cell"
        )
    return generated


# Train and test are featurized independently with identical settings; the
# test frame's lags are built from test rows only.
lag_rolling_train = build_lag_rolling_features(df_train, num_cols, LAG, ROLL_WINDOW)
df_train = pd.concat([df_train] + lag_rolling_train, axis=1)

lag_rolling_test = build_lag_rolling_features(df_test, num_cols, LAG, ROLL_WINDOW)
df_test = pd.concat([df_test] + lag_rolling_test, axis=1)

# Drop rows with NaN from lag/rolling (the leading rows lacking full history).
# Plain assignment instead of inplace=True keeps the step explicit.
df_train = df_train.dropna()
df_test = df_test.dropna()

df_train

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,pollution_lag1,pollution_lag2,...,rain_lag7,rain_lag8,rain_lag9,rain_lag10,rain_lag11,rain_lag12,rain_roll_mean3,rain_roll_std3,rain_roll_min3,rain_roll_max3
24,90.0,-7,-6.0,1027.0,SE,58.56,4,0,126.0,156.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,63.0,-8,-6.0,1026.0,SE,61.69,5,0,90.0,126.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,65.0,-8,-7.0,1026.0,SE,65.71,6,0,63.0,90.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,55.0,-8,-7.0,1025.0,SE,68.84,7,0,65.0,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,65.0,-8,-7.0,1024.0,SE,72.86,8,0,55.0,65.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43795,8.0,-23,-2.0,1034.0,NW,231.97,0,0,10.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43796,10.0,-22,-3.0,1034.0,NW,237.78,0,0,8.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43797,10.0,-22,-3.0,1034.0,NW,242.70,0,0,10.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43798,8.0,-22,-4.0,1034.0,NW,246.72,0,0,10.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Render the test frame after the previous cell added lag/rolling features and dropped NaN rows.
df_test

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain,pollution_lag1,pollution_lag2,...,rain_lag7,rain_lag8,rain_lag9,rain_lag10,rain_lag11,rain_lag12,rain_roll_mean3,rain_roll_std3,rain_roll_min3,rain_roll_max3
24,142,-7,2,1027,SE,3.58,0,0,133.0,132.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,163,-7,2,1026,SE,5.37,0,0,142.0,133.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,166,-7,1,1026,SE,7.16,0,0,163.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,189,-6,-1,1026,cv,0.89,0,0,166.0,163.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,202,-6,-1,1025,cv,1.78,0,0,189.0,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,8,-23,-2,1034,NW,231.97,0,0,10.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342,10,-22,-3,1034,NW,237.78,0,0,8.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,10,-22,-3,1034,NW,242.70,0,0,10.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,8,-22,-4,1034,NW,246.72,0,0,10.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
