# Phase 3 - Starter Script


In [18]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error



## Step 1: Load & Inspect the Data

In [19]:
# Location of the cleaned air-quality CSV, relative to the notebook's working directory
path = "../data/cleaned_air_quality.csv"

# Read the file, build a single timestamp from the separate 'Date' and 'Time'
# text columns, promote it to the index and discard the two source columns.
df = (
    pd.read_csv(path)
    .assign(
        Datetime=lambda frame: pd.to_datetime(
            frame['Date'] + ' ' + frame['Time'],
            format="%d/%m/%Y %H.%M.%S",  # day/month/year with dot-separated time, e.g. "10/03/2004 18.00.00"
        )
    )
    .set_index('Datetime')
    .drop(columns=['Date', 'Time'])
)

# Quick sanity check: dimensions plus the first few rows
print("Data shape:", df.shape)
print(df.head())


Data shape: (6941, 14)
                     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  \
Datetime                                                                      
2004-03-10 18:00:00     2.6       1360.0     150.0      11.9         1046.0   
2004-03-10 19:00:00     2.0       1292.0     112.0       9.4          955.0   
2004-03-10 20:00:00     2.2       1402.0      88.0       9.0          939.0   
2004-03-10 21:00:00     2.2       1376.0      80.0       9.2          948.0   
2004-03-10 22:00:00     1.6       1272.0      51.0       6.5          836.0   

                     NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  \
Datetime                                                            
2004-03-10 18:00:00    166.0        1056.0    113.0        1692.0   
2004-03-10 19:00:00    103.0        1174.0     92.0        1559.0   
2004-03-10 20:00:00    131.0        1140.0    114.0        1555.0   
2004-03-10 21:00:00    172.0        1092.0    122.0        1584.0   
2004-03-1

## Step 2: Feature Engineering

In [20]:
# Temporal features read straight off the DatetimeIndex
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek  # 0=Monday
df['month'] = df.index.month
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)  # Saturday=5, Sunday=6

# Cyclical encoding for hour + month: mapping onto a circle keeps hour 23
# adjacent to hour 0 and month 12 adjacent to month 1.
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Lagged features (example for NO2, can repeat for others)
target_pollutant = 'NO2(GT)'  # confirm this is the right column name
# NOTE(review): shift() and rolling() below count rows, not hours. If the
# cleaned file has gaps in its hourly index, "lag24" is not necessarily the
# value from 24 hours earlier — confirm the index is regular.
for lag in [1, 6, 12, 24]:
    df[f'{target_pollutant}_lag{lag}'] = df[target_pollutant].shift(lag)

# Rolling statistics over PREVIOUS rows only.
# The series is shifted by one row before the window is applied so that the
# current row's own value never enters its rolling mean/std. Without the shift
# each window would include the very value named as the prediction target,
# leaking it into the features.
# NOTE(review): assumes the model predicts `target_pollutant` of the same row —
# confirm against the modelling cells.
past_values = df[target_pollutant].shift(1)
for window in [3, 6, 12]:
    past_window = past_values.rolling(window=window)
    df[f'{target_pollutant}_rollmean{window}'] = past_window.mean()
    df[f'{target_pollutant}_rollstd{window}'] = past_window.std()

# Drop rows with NaNs
# NOTE(review): this removes a row when ANY column is NaN, not only the
# engineered lag/rolling columns. The recorded run went from 6941 rows at load
# to 803 rows after this cell — confirm that losing most of the data is intended.
df = df.dropna()

print("\nFinal dataset shape after feature engineering:", df.shape)
print(df.head())


Final dataset shape after feature engineering: (803, 32)
                     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  \
Datetime                                                                      
2004-03-11 20:00:00     6.1       1640.0     401.0      24.0         1404.0   
2004-03-11 21:00:00     3.9       1313.0     197.0      12.8         1076.0   
2004-03-11 22:00:00     1.5        965.0      61.0       4.7          749.0   
2004-03-11 23:00:00     1.0        913.0      26.0       2.6          629.0   
2004-03-12 00:00:00     1.7       1080.0      55.0       5.9          805.0   

                     NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  \
Datetime                                                            
2004-03-11 20:00:00    351.0         743.0    165.0        2191.0   
2004-03-11 21:00:00    240.0         957.0    136.0        1707.0   
2004-03-11 22:00:00     94.0        1325.0     85.0        1333.0   
2004-03-11 23:00:00     47.0        1565.0 