In [None]:
%pip install wandb xgboost scikit-learn pandas joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os
import joblib
import xgboost as xgb
import wandb

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [4]:
# ---------------------------
# 1. Initialize Weights & Biases
# ---------------------------
wandb.login()
wandb.init(
    project="pm25_airquality",
    name="model_comparison_v1"
)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Vineet Raj\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mrajvineet968[0m ([33mrajvineet968-rv-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
print("üì• Loading dataset...")
df = pd.read_csv("data/master_airquality_clean.csv", low_memory=False)
df.head()

üì• Loading dataset...


Unnamed: 0,Timestamp,PM2.5,PM10,O3,CO,Source
0,2016-07-01 10:00:00,10.67,39.0,14.5,0.48,KaggleStation
1,2016-07-01 11:00:00,2.0,39.0,15.0,0.49,KaggleStation
2,2016-07-01 14:00:00,20.5,50.0,10.5,0.47,KaggleStation
3,2016-07-01 15:00:00,15.25,59.5,6.6,0.51,KaggleStation
4,2016-07-01 16:00:00,11.67,60.0,17.43,0.46,KaggleStation


In [6]:
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
df.dropna(subset=["Timestamp"], inplace=True)

df["hour"] = df["Timestamp"].dt.hour
df["dayofweek"] = df["Timestamp"].dt.dayofweek
df["month"] = df["Timestamp"].dt.month

In [7]:
for col in ["PM2.5", "PM10", "O3", "CO"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df.dropna(subset=["PM2.5"], inplace=True)

FEATURES = ["PM10", "O3", "CO", "hour", "dayofweek", "month"]
df[FEATURES] = df[FEATURES].fillna(df[FEATURES].median())

In [8]:
n = len(df)
test_size = int(0.2 * n)

train_df = df.iloc[: n - test_size]
test_df = df.iloc[n - test_size :]

X_train = train_df[FEATURES]
y_train = train_df["PM2.5"]

X_test = test_df[FEATURES]
y_test = test_df["PM2.5"]


def rmse(pred, true):
    return mean_squared_error(true, pred) ** 0.5


In [9]:
print("üéØ Training models...")

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_rmse = rmse(lr.predict(X_test), y_test)

# Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train, y_train)
rf_rmse = rmse(rf.predict(X_test), y_test)

# XGBoost
xgr = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    device="cpu",
    tree_method="hist",
    objective="reg:squarederror",
    random_state=42
)
xgr.fit(X_train, y_train)
xgb_rmse = rmse(xgr.predict(X_test), y_test)

üéØ Training models...


In [10]:
# ---------------------------
# 2. Log results to W&B
# ---------------------------
wandb.log({
    "lr_rmse": lr_rmse,
    "rf_rmse": rf_rmse,
    "xgb_rmse": xgb_rmse
})

In [11]:
results = {
    "Linear Regression": lr_rmse,
    "Random Forest": rf_rmse,
    "XGBoost": xgb_rmse
}

results

{'Linear Regression': 56.94103561458107,
 'Random Forest': 50.916393935274385,
 'XGBoost': 50.13245736754763}

In [14]:
best_model_name = min(results, key=results.get)
best_model_name

best_model = {
    "Linear Regression": lr,
    "Random Forest": rf,
    "XGBoost": xgr
}[best_model_name]

In [15]:
# ---------------------------
# 3. Save model as W&B artifact
# ---------------------------
model_path = "models/best_pm25_model.pkl"
os.makedirs("models", exist_ok=True)
joblib.dump(best_model, model_path)

artifact = wandb.Artifact("pm25-best-model", type="model")
artifact.add_file(model_path)
wandb.log_artifact(artifact)


<Artifact pm25-best-model>

In [16]:
os.makedirs("models", exist_ok=True)
joblib.dump(best_model, "models/best_pm25_model.pkl")

print("Model saved successfully:", best_model_name)

Model saved successfully: XGBoost


In [None]:
%pip install shap matplotlib

In [None]:
import shap
import matplotlib.pyplot as plt

# ---------------------------
# 4. Interpretability with SHAP
# ---------------------------
print("üîç Running SHAP analysis...")

# SHAP works best with tree-based models (XGBoost, RF)
if best_model_name in ["XGBoost", "Random Forest"]:
    # Use TreeExplainer for tree-based models
    explainer = shap.TreeExplainer(best_model)
    
    # Calculate SHAP values (using a sample of test data for speed if needed, here using full X_test)
    # For large datasets, consider: shap_values = explainer.shap_values(X_test.sample(1000))
    shap_values = explainer.shap_values(X_test)
    
    # Create artifacts directory if it doesn't exist
    os.makedirs("artifacts", exist_ok=True)
    
    # Summary Plot
    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("artifacts/shap_summary.png", bbox_inches='tight')
    plt.close()
    
    # Log to W&B
    wandb.log({"shap_summary": wandb.Image("artifacts/shap_summary.png")})
    print("SHAP summary plot saved and logged.")
else:
    print(f"SHAP analysis skipped for {best_model_name} (Linear Regression not fully supported in this snippet).")

wandb.finish()