In [13]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

# Global plotting defaults: seaborn "whitegrid" style and a 12 x 6 default figure size
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Load cleaned data
df = pd.read_csv("../data/processed/air_quality_cleaned.csv", parse_dates=["Datetime"], index_col="Datetime")

# Focus on target
target = "C6H6(GT)"

# Bookkeeping for the diagnostic message at the end of this stage: every step
# below can silently discard rows, so record how many each one removes.
n_loaded = len(df)
raw_index_sample = [str(value) for value in df.index[:3]]

# Ensure index is datetime and valid (unparseable timestamps become NaT and are dropped)
df.index = pd.to_datetime(df.index, errors='coerce')
n_bad_timestamps = int(df.index.isna().sum())
df = df[~df.index.isna()]

# Drop duplicate timestamps, keeping the first occurrence in file order
duplicate_mask = df.index.duplicated(keep='first')
n_duplicates = int(duplicate_mask.sum())
df = df[~duplicate_mask]

# Sort chronologically: the lag features (row-based shift) and the positional
# train/test split further down both rely on rows being in time order.
df = df.sort_index()

# Make sure the target is numeric before interpolating; values that cannot be
# parsed as numbers become NaN and are counted in the diagnostics below.
df[target] = pd.to_numeric(df[target], errors='coerce')

# Interpolate target early to avoid data loss
df[target] = df[target].interpolate(method='time')

# Drop rows whose target is still missing after interpolation
# (e.g. gaps at the very start of the series, which have no earlier value to interpolate from)
n_missing_target = int(df[target].isna().sum())
if n_missing_target > 0:
    df = df.dropna(subset=[target])

# Confirm minimum data; report where the rows went so the failure is actionable
if len(df) < 100:
    raise ValueError(
        f"Insufficient data for modeling after cleaning. Only {len(df)} rows available. "
        f"(rows loaded: {n_loaded}; invalid timestamps dropped: {n_bad_timestamps}; "
        f"duplicate timestamps dropped: {n_duplicates}; "
        f"rows with missing {target} after interpolation: {n_missing_target}; "
        f"first raw index values: {raw_index_sample})"
    )

# Create lag features: the previous 24 observations of the target.
# NOTE(review): shift() moves by rows, not by time, so lag_k equals "k hours ago"
# only if the index is hourly, sorted and gap-free — confirm against the data.
lag_cols = [f"lag_{lag}" for lag in range(1, 25)]
for lag, lag_col in enumerate(lag_cols, start=1):
    df[lag_col] = df[target].shift(lag)

# Drop rows with NaN caused by lagging. Restrict the check to the target and lag
# columns: a blanket dropna() would also discard every row that has a NaN in any
# unrelated column, which can wipe out the whole frame even though those columns
# are never used by the model.
df = df.dropna(subset=[target] + lag_cols)

# Final check
if len(df) < 100:
    raise ValueError(f"Insufficient usable rows after lagging. Only {len(df)} rows remain.")

# Positional 80/20 split (no shuffling): first 80% of rows train, the rest test
split_index = int(len(df) * 0.8)
df_train, df_test = df.iloc[:split_index], df.iloc[split_index:]

# Features are every "lag_*" column; the label is the target column
feature_cols = [name for name in df.columns if name.startswith("lag_")]
X_train = df_train[feature_cols]
y_train = df_train[target]
X_test = df_test[feature_cols]
y_test = df_test[target]

# Fit a random forest with a fixed seed so the run is repeatable
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Evaluation
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises a
# TypeError. Taking the square root works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")

# Overlay the model's predictions on the observed test-set values
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y_test.index, y_test, label="Actual", color="blue")
ax.plot(y_test.index, y_pred, label="Predicted", color="red", alpha=0.7)
ax.set_title("Actual vs Predicted C6H6(GT) Concentrations")
ax.set_xlabel("Datetime")
ax.set_ylabel("C6H6(GT)")
ax.legend()
fig.tight_layout()
plt.show()


ValueError: Insufficient data for modeling after cleaning. Only 0 rows available.