In [2]:
# Week 2 - Real-Time Air Quality Prediction System
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Simulated correlated dataset for one city (Delhi).
# The legacy global NumPy RNG is seeded once, so every array below is
# reproducible as long as the draws happen in exactly this order.
np.random.seed(42)
n = 200


def _simulate_pair(mean, std, carry, noise_std):
    """Draw one pollutant series and its paired prediction target.

    Parameters
    ----------
    mean, std : float
        Location and scale of the normal distribution for the base series.
    carry : float
        Factor applied to the base series when deriving the target.
    noise_std : float
        Scale of the zero-mean Gaussian noise added to the target.

    Returns
    -------
    tuple of numpy.ndarray
        ``(current, following)``, each of length ``n``. ``following`` is
        computed from the same-index element of ``current``; it is not a
        time-shifted copy of the series.
    """
    current = np.random.normal(loc=mean, scale=std, size=n)
    noise = np.random.normal(loc=0, scale=noise_std, size=n)
    return current, current * carry + noise


# One (current, target) pair per pollutant. The call order is significant:
# each call consumes two blocks of draws from the shared global RNG stream.
PM25, PM25_next = _simulate_pair(mean=60, std=15, carry=0.8, noise_std=5)
PM10, PM10_next = _simulate_pair(mean=100, std=25, carry=0.8, noise_std=10)
O3, O3_next = _simulate_pair(mean=30, std=10, carry=0.7, noise_std=3)
CO, CO_next = _simulate_pair(mean=0.5, std=0.1, carry=0.7, noise_std=0.05)
SO2, SO2_next = _simulate_pair(mean=20, std=5, carry=0.75, noise_std=2)
NO2, NO2_next = _simulate_pair(mean=40, std=10, carry=0.75, noise_std=4)

# Assemble the modelling table: an hourly timestamp, the six current pollutant
# readings, then the six matching "<pollutant>_next" target columns.
# The lowercase "h" frequency alias is used to avoid a pandas warning.
timestamps = pd.date_range(end=pd.Timestamp.now(), periods=n, freq="h")

current_levels = {
    "PM2.5": PM25,
    "PM10": PM10,
    "O3": O3,
    "CO": CO,
    "SO2": SO2,
    "NO2": NO2,
}

# Target columns, listed in the same pollutant order as the feature columns
next_levels = {
    "PM2.5_next": PM25_next,
    "PM10_next": PM10_next,
    "O3_next": O3_next,
    "CO_next": CO_next,
    "SO2_next": SO2_next,
    "NO2_next": NO2_next,
}

# Dict insertion order fixes the column order: datetime, features, targets
df = pd.DataFrame({"datetime": timestamps, **current_levels, **next_levels})

# Fit one single-feature Random Forest per pollutant and score it on held-out rows.
# Each entry appended to `results` is [pollutant, MAE, R2], both rounded to 3 decimals.
results = []
for pollutant in ("PM2.5", "PM10", "O3", "CO", "SO2", "NO2"):
    target = f"{pollutant}_next"

    # Keep only the feature/target pair and discard any rows with missing values
    data = df.loc[:, [pollutant, target]].dropna()
    X = data[[pollutant]].to_numpy()  # 2-D: a single feature column
    y = data[target].to_numpy()

    # Positional hold-out with no shuffling: first 80% of rows train, the rest test
    split = int(0.8 * len(X))
    X_train, y_train = X[:split], y[:split]
    X_test, y_test = X[split:], y[split:]

    # Fixed random_state keeps the forest reproducible between runs
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append([pollutant, round(mae, 3), round(r2, 3)])

# Tabulate the per-pollutant metrics and print them as plain text without the index
metric_columns = ["Pollutant", "MAE", "R2_Score"]
eval_df = pd.DataFrame(results, columns=metric_columns)
print("Model Evaluation Results:", eval_df.to_string(index=False), sep="\n")


Model Evaluation Results:
Pollutant    MAE  R2_Score
    PM2.5  4.566     0.763
     PM10 10.270     0.696
       O3  3.392     0.749
       CO  0.046     0.516
      SO2  1.649     0.665
      NO2  4.004     0.727
