In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [None]:
# CONFIG
# CSV files holding the training split and the held-out test split.
TRAIN_PATH = "LSTM-Multivariate_pollution.csv"
TEST_PATH = "pollution_test_data1.csv"

# Feature-engineering knobs: how many lag steps to build and the length of
# the rolling window.
LAG, ROLL_WINDOW = 12, 3

# REPRODUCIBILITY
# Seed both the stdlib and the NumPy global generators with the same value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# LOAD DATA
# Train is read first, then test; a failure on either path raises immediately.
df_train, df_test = map(pd.read_csv, (TRAIN_PATH, TEST_PATH))

In [None]:
# Discard the "date" column when it is present; errors="ignore" makes the
# call a no-op for a frame that does not have it.
df_train = df_train.drop(columns=["date"], errors="ignore")
df_test = df_test.drop(columns=["date"], errors="ignore")

# Keep exactly these columns, in this order, in both frames.
cols = [
    "pollution",
    "dew",
    "temp",
    "press",
    "wnd_dir",
    "wnd_spd",
    "snow",
    "rain",
]
df_train, df_test = df_train[cols], df_test[cols]

# Last expression of the cell: rendered as the cell output.
df_train

In [None]:
# Display the test frame (bare last expression -> rendered as the cell output).
df_test

In [None]:
# HANDLE MISSING VALUES
# Columns treated as numeric vs. categorical when filling gaps.
num_cols = ["pollution", "dew", "temp", "press", "wnd_spd", "snow", "rain"]
cat_cols = ["wnd_dir"]

# Apply the same gap-filling recipe to each frame independently.
for frame in (df_train, df_test):
    # Numeric gaps: interpolate down the rows, filling in both directions so
    # leading and trailing gaps are covered as well.
    frame[num_cols] = frame[num_cols].interpolate(limit_direction="both", axis=0)
    # Categorical gaps: carry the previous value forward, then back-fill
    # whatever is still missing at the start.
    frame[cat_cols] = frame[cat_cols].ffill().bfill()

In [None]:
# FEATURE ENGINEERING: LAG & ROLLING
def _build_lag_rolling_features(df, cols, n_lags, window):
    """Build lagged copies and rolling statistics for the given columns.

    For every column in ``cols`` the following series are produced, in this
    order:

    * ``n_lags`` series named ``{col}_lag{k}`` holding the column shifted by
      ``k`` rows, for ``k = 1 .. n_lags``;
    * four series named ``{col}_roll_mean{window}``, ``{col}_roll_std{window}``,
      ``{col}_roll_min{window}`` and ``{col}_roll_max{window}``, computed over
      a ``window``-row rolling window of the column shifted by one row, so a
      row's own value is never part of its rolling statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read the source columns from. It is not modified.
    cols : list of str
        Names of the columns to derive features from.
    n_lags : int
        Number of lag steps to generate per column.
    window : int
        Rolling-window length, also embedded in the generated names.

    Returns
    -------
    list of pandas.Series
        The derived series, ready to be concatenated column-wise with ``df``.
        The first rows contain NaN where no history is available.
    """
    features = []
    for col in cols:
        source = df[col]
        # lag features
        features.extend(
            source.shift(lag).rename(f"{col}_lag{lag}")
            for lag in range(1, n_lags + 1)
        )
        # rolling statistics over strictly past values (hence the shift by 1)
        roll = source.shift(1).rolling(window)
        features.extend([
            roll.mean().rename(f"{col}_roll_mean{window}"),
            roll.std().rename(f"{col}_roll_std{window}"),
            roll.min().rename(f"{col}_roll_min{window}"),
            roll.max().rename(f"{col}_roll_max{window}"),
        ])
    return features


# Train and test are processed separately with the same recipe, so each
# frame's features only ever look at that frame's own earlier rows.
lag_rolling_train = _build_lag_rolling_features(df_train, num_cols, LAG, ROLL_WINDOW)
lag_rolling_test = _build_lag_rolling_features(df_test, num_cols, LAG, ROLL_WINDOW)

# Attach the derived columns, then drop the rows that contain NaN (the leading
# rows without enough history). Rebinding replaces the former in-place dropna.
df_train = pd.concat([df_train] + lag_rolling_train, axis=1).dropna()
df_test = pd.concat([df_test] + lag_rolling_test, axis=1).dropna()

df_train

In [None]:
# Display the test frame (bare last expression -> rendered as the cell output).
df_test

In [None]:
# PREPROCESSING
# Every column except the target is used as a model input.
target_col = 'pollution'
feature_cols = [c for c in df_train.columns if c != target_col]

# Numeric/categorical separation
# A feature counts as numeric when its dtype is one of the listed float/int
# dtypes; every remaining feature is treated as categorical.
num_features = [c for c in feature_cols if df_train[c].dtype in [np.float32, np.float64, np.int32, np.int64]]
cat_features = [c for c in feature_cols if c not in num_features]

# Scale numeric features with MinMaxScaler and one-hot encode categorical ones.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('num', MinMaxScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
], remainder='drop'
)

# Fit on the training frame only, then reuse the fitted transformer on the
# test frame. (The method is fit_transform; the earlier spelling
# "fit_trainsform" raised AttributeError.)
X_train = preprocessor.fit_transform(df_train[feature_cols])
y_train = df_train[target_col].values
X_test = preprocessor.transform(df_test[feature_cols])
y_test = df_test[target_col].values

In [None]:
# DEFINE MODELS
# Label -> estimator mapping; each estimator is instantiated with no arguments.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
])