<a href="https://colab.research.google.com/github/Mr-Hrishikesh-Ghosh/Real-Time-Air-Quality-Prediction-System/blob/main/air_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# real_time_air_quality_predictor_fixed.py
# Run in Google Colab or locally.
# Requirements:
# pip install pandas numpy scikit-learn requests joblib matplotlib gradio python-dateutil

import math
import os
import time
from datetime import datetime, timedelta, timezone

import gradio as gr
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from dateutil import parser
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# -------------------------
# Configuration & Cities
# -------------------------
# Cities offered in the UI dropdown. Each name is sent verbatim as the `city`
# query parameter and is also used (spaces -> underscores) in model file names.
CITIES = [
    "New Delhi", "Mumbai", "Kolkata", "Chennai", "Bengaluru",
    "Hyderabad", "Ahmedabad", "Pune", "Lucknow", "Jaipur"
]
# Pollutant parameter codes kept from the API response; one model is trained
# per code, and every hourly frame is expected to carry one column per code.
POLLUTANTS = ["pm25", "pm10", "o3", "co", "so2", "no2"]

# NOTE(review): OpenAQ has been moving clients from the v2 API to v3 (which
# requires an API key) -- confirm this endpoint still answers; when it does
# not, the pipeline silently falls back to synthetic data.
OPENAQ_MEASUREMENTS = "https://api.openaq.org/v2/measurements"
# Directory holding the per-city, per-pollutant joblib model files.
MODEL_DIR = "aq_models"
# Created at import time so later joblib.dump calls have a target directory.
os.makedirs(MODEL_DIR, exist_ok=True)

# -------------------------
# Data fetching
# -------------------------
def fetch_openaq_city_measurements(city, days=7, limit=10000):
    """Download recent measurements for one city from the OpenAQ endpoint.

    Pages through ``OPENAQ_MEASUREMENTS`` (newest first) for the window
    ``[now - days, now]`` in UTC and flattens the response into a long-format
    frame.

    Args:
        city: City name sent as the ``city`` query parameter.
        days: Size of the look-back window in days.
        limit: Maximum number of measurements to return. It is also used as
            the per-request page size, capped at 10000.

    Returns:
        A ``pandas.DataFrame`` with columns ``datetime``, ``parameter``,
        ``value`` and ``unit``, or ``None`` when nothing usable was fetched.
        Network/HTTP/parsing errors are reported with ``print`` and never
        raised; pages fetched before an error are still returned.
    """
    max_pages = 10  # hard stop so a misbehaving API cannot keep us looping
    limit = int(limit)
    page_size = max(1, min(limit, 10000))

    to_date = datetime.now(timezone.utc)
    from_date = to_date - timedelta(days=days)
    params = {
        "city": city,
        "date_from": from_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "date_to": to_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "limit": page_size,
        "page": 1,
        "sort": "desc",
        "order_by": "datetime"
    }
    all_results = []
    try:
        while params["page"] <= max_pages:
            resp = requests.get(OPENAQ_MEASUREMENTS, params=params, timeout=15)
            resp.raise_for_status()
            payload = resp.json()
            results = payload.get("results") or []
            if not results:
                break
            all_results.extend(results)
            # `found` is not guaranteed to be an integer (NOTE(review): the API
            # appears to report strings such as ">1000" in some responses --
            # confirm). Fall back to the caller's limit when it is unusable.
            found = (payload.get("meta") or {}).get("found")
            target = min(found, limit) if isinstance(found, int) else limit
            if len(all_results) >= target:
                break
            params["page"] += 1
            time.sleep(0.2)  # stay polite between page requests
    except Exception as e:
        # Best-effort boundary: report the failure and continue with whatever
        # pages were already downloaded instead of discarding them.
        print(f"[fetch_openaq_city_measurements] error for {city}: {e}")

    all_results = all_results[:max(limit, 0)]
    if not all_results:
        return None

    rows = []
    for r in all_results:
        date_info = r.get("date") or {}  # tolerate a null/missing "date" entry
        raw_dt = date_info.get("utc") or date_info.get("local")
        if not raw_dt:
            continue
        try:
            dt = parser.isoparse(raw_dt)
        except (ValueError, TypeError):
            continue  # skip records whose timestamp cannot be parsed
        rows.append({
            "datetime": dt,
            "parameter": r.get("parameter"),
            "value": r.get("value"),
            "unit": r.get("unit")
        })
    if not rows:
        return None
    return pd.DataFrame(rows)

def pivot_and_hourly_average(df):
    """Turn long-format measurements into a gap-filled hourly wide frame.

    Args:
        df: Frame with at least ``datetime``, ``parameter`` and ``value``
            columns (the shape produced by ``fetch_openaq_city_measurements``).
            It is not modified.

    Returns:
        A frame indexed by a continuous hourly range from the first to the
        last observed hour, with one column per entry of ``POLLUTANTS``
        holding the hourly mean. Gaps of up to three hours are forward- then
        backward-filled; longer gaps and pollutants that were never reported
        stay NaN. Returns ``None`` when there is no input or no row belongs
        to a tracked pollutant.
    """
    if df is None or df.empty:
        return None
    # Filter first and work on a copy so the caller's frame is left untouched.
    data = df[df["parameter"].isin(POLLUTANTS)].copy()
    if data.empty:
        return None
    # Lowercase "h" is the non-deprecated hourly alias in current pandas.
    data["datetime_hour"] = pd.to_datetime(data["datetime"]).dt.floor("h")
    # Non-numeric readings become NaN instead of breaking the mean below.
    data["value"] = pd.to_numeric(data["value"], errors="coerce")

    pivot = data.groupby(["datetime_hour", "parameter"])["value"].mean().unstack(level=-1)
    for p in POLLUTANTS:
        if p not in pivot.columns:
            pivot[p] = np.nan
    pivot = pivot.sort_index()
    idx = pd.date_range(start=pivot.index.min(), end=pivot.index.max(), freq="h")
    pivot = pivot.reindex(idx)
    return pivot.ffill(limit=3).bfill(limit=3)

# -------------------------
# Synthetic fallback
# -------------------------
def generate_synthetic_series(days=7, freq="H", seed=42, base_values=None):
    """Generate a reproducible synthetic pollutant history as a fallback.

    Each pollutant follows ``base * daily * weekly + noise`` clipped at zero,
    where ``daily`` is a 24-step sine (+/-20%), ``weekly`` a 168-step sine
    (+/-5%) and ``noise`` is Gaussian with a standard deviation of 10% of the
    base level. The cycles are computed per row, so they correspond to real
    days/weeks only for an hourly ``freq``.

    Args:
        days: Length of the series; ``int(days * 24)`` rows are produced.
        freq: Pandas frequency alias for the index spacing.
        seed: Seed for the random noise.
        base_values: Optional mapping pollutant -> base level. Pollutants
            missing from it use the built-in defaults.

    Returns:
        A frame ending at the current UTC time (timezone-naive index) with one
        column per entry of ``POLLUTANTS``.
    """
    defaults = {"pm25": 60, "pm10": 100, "o3": 30, "co": 0.7, "so2": 10, "no2": 40}
    # Overlay caller-supplied levels on the defaults so a partial (or empty)
    # mapping no longer raises KeyError for the pollutants it leaves out.
    levels = {**defaults, **(base_values or {})}

    # The signature keeps the historical "H" default; newer pandas deprecates
    # that spelling in favour of lowercase "h".
    if freq == "H":
        freq = "h"

    rng = np.random.RandomState(seed)
    end = datetime.now(timezone.utc).replace(tzinfo=None)  # naive UTC, as before
    idx = pd.date_range(end=end, periods=int(days*24), freq=freq)

    # Seasonal factors do not depend on the pollutant, so compute them once.
    hours = np.arange(len(idx))
    daily = 1 + 0.2 * np.sin(2 * math.pi * (hours % 24) / 24.0)
    weekly = 1 + 0.05 * np.sin(2 * math.pi * (hours % (24*7)) / (24*7))

    df = pd.DataFrame(index=idx)
    for p in POLLUTANTS:
        noise = rng.normal(scale=0.1 * levels[p], size=len(idx))
        df[p] = np.maximum(0.0, levels[p] * daily * weekly + noise)
    return df

# -------------------------
# Feature engineering
# -------------------------
def create_features(df):
    """Derive the supervised-learning matrix from an hourly pollutant frame.

    For every pollutant four predictors are added (previous value, value two
    steps back, 3-step and 24-step rolling means that include the current
    row), together with the row's hour and day of week. The target for each
    pollutant is its value one step ahead. Rows containing any NaN -- which
    always includes the final row, since it has no next value -- are dropped.

    Returns:
        ``(X, y_dict, index)``: the feature frame, a dict pollutant -> target
        series aligned with ``X``, and the index of the retained rows.
    """
    frame = df.copy()
    timestamps = frame.index
    frame["hour"] = timestamps.hour
    frame["dayofweek"] = timestamps.dayofweek

    feature_cols = ["hour", "dayofweek"]
    for name in POLLUTANTS:
        series = frame[name]
        derived = {
            f"{name}_lag1": series.shift(1),
            f"{name}_lag2": series.shift(2),
            f"{name}_roll3": series.rolling(3, min_periods=1).mean(),
            f"{name}_roll24": series.rolling(24, min_periods=1).mean(),
        }
        for column, values in derived.items():
            frame[column] = values
        feature_cols.extend(derived)

    # Next-step value of each pollutant is what the models learn to predict.
    for name in POLLUTANTS:
        frame[f"{name}_target"] = frame[name].shift(-1)

    complete = frame.dropna()
    targets = {name: complete[f"{name}_target"] for name in POLLUTANTS}
    return complete[feature_cols], targets, complete.index

# -------------------------
# Training models
# -------------------------
def train_models_for_city(city, df_hourly, force_retrain=False):
    """Load or train one random-forest regressor per pollutant for a city.

    Args:
        city: City name; used (spaces replaced by underscores) in the model
            file names under ``MODEL_DIR``.
        df_hourly: Hourly pollutant frame accepted by ``create_features``.
        force_retrain: When true, ignore model files already on disk.

    Returns:
        ``(models, metrics)``. ``models`` maps pollutant -> fitted estimator;
        ``metrics`` maps pollutant -> RMSE on the chronologically last 20% of
        rows, or ``None`` for a model that was loaded from disk. Both are
        empty when no feature rows can be built; a pollutant is absent when
        it could neither be loaded nor trained.
    """
    models, metrics = {}, {}
    if df_hourly is None or df_hourly.empty:
        return models, metrics

    X, y_dict, _ = create_features(df_hourly)
    if X.empty:
        return models, metrics

    # An unshuffled 80/20 split needs at least two rows; with a single row
    # train_test_split raises because the training part would be empty.
    can_train = len(X) >= 2
    city_slug = city.replace(' ', '_')

    for p in POLLUTANTS:
        model_path = os.path.join(MODEL_DIR, f"{city_slug}_{p}.joblib")
        if os.path.exists(model_path) and not force_retrain:
            try:
                # joblib files are pickles: only load files this app wrote.
                models[p] = joblib.load(model_path)
            except Exception as e:
                # Unreadable cache file: report it and retrain below.
                print(f"[train_models_for_city] could not load {model_path}: {e}")
            else:
                metrics[p] = None
                continue
        if not can_train:
            print(f"[train_models_for_city] not enough data to train {p} for {city}")
            continue
        y = y_dict[p]
        # shuffle=False keeps the time order, so the test part is the newest data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        models[p] = rf
        metrics[p] = math.sqrt(mean_squared_error(y_test, y_pred))
        try:
            joblib.dump(rf, model_path)
        except Exception as e:
            # Persisting is an optimisation only; keep the in-memory model.
            print(f"[train_models_for_city] could not save {model_path}: {e}")
    return models, metrics

# -------------------------
# Prediction
# -------------------------
def load_models_if_exist(city):
    """Return previously saved models for *city*, keyed by pollutant.

    Every entry of ``POLLUTANTS`` appears in the result; the value is the
    estimator loaded from ``MODEL_DIR`` or ``None`` when the file is missing
    or cannot be loaded.
    """
    city_slug = city.replace(' ', '_')
    # Start with "nothing available" for every pollutant, then fill in hits.
    loaded = dict.fromkeys(POLLUTANTS)
    for pollutant in POLLUTANTS:
        candidate = os.path.join(MODEL_DIR, f"{city_slug}_{pollutant}.joblib")
        if not os.path.exists(candidate):
            continue
        try:
            loaded[pollutant] = joblib.load(candidate)
        except Exception:
            # An unreadable file is treated the same as a missing one.
            loaded[pollutant] = None
    return loaded

def _latest_feature_row(df_hourly):
    """Build the one-row feature frame describing the newest timestamp.

    Uses the same definitions and column order as ``create_features`` (lags
    relative to the row itself, rolling means including the row) but does not
    need a known next-hour target, so the newest observation can be used.
    """
    latest = df_hourly.index[-1]
    row = {"hour": latest.hour, "dayofweek": latest.dayofweek}
    for p in POLLUTANTS:
        series = df_hourly[p]
        n = len(series)
        # With fewer than three rows, reuse the oldest value we have so a
        # usable row can still be produced.
        row[f"{p}_lag1"] = series.iloc[-2] if n >= 2 else series.iloc[-1]
        row[f"{p}_lag2"] = series.iloc[-3] if n >= 3 else series.iloc[0]
        row[f"{p}_roll3"] = series.iloc[-3:].mean()
        row[f"{p}_roll24"] = series.iloc[-24:].mean()
    return pd.DataFrame([row], index=[latest])


def _last_observed(series):
    """Return the most recent non-NaN value of *series* as a float (NaN if none)."""
    valid = series.dropna()
    return float(valid.iloc[-1]) if not valid.empty else float("nan")


def make_prediction_for_city(city, df_hourly, models=None):
    """Predict next-hour pollutant levels for a city.

    Args:
        city: City name, used to locate/store model files.
        df_hourly: Hourly frame with one column per entry of ``POLLUTANTS``.
        models: Optional dict pollutant -> fitted estimator (or ``None``).
            When omitted, models are loaded from disk. Missing models are
            trained on ``df_hourly`` and added to this dict.

    Returns:
        ``(preds, df_hourly)`` where ``preds`` maps pollutant -> non-negative
        float forecast, or ``(None, None)`` when there is no data. If a
        pollutant has no model or its model fails to predict, the latest
        observed value is used instead (NaN if it was never observed).
    """
    if df_hourly is None or df_hourly.empty:
        return None, None
    if models is None:
        models = load_models_if_exist(city)
    if any(models.get(p) is None for p in POLLUTANTS):
        trained, _ = train_models_for_city(city, df_hourly, force_retrain=False)
        models.update(trained)

    # Features must describe the newest hour T so the model output is T+1.
    # The last row of the training matrix describes T-1 (row T is dropped
    # there for lacking a target) and would only re-predict the known hour T.
    X_latest = _latest_feature_row(df_hourly)

    preds = {}
    for p in POLLUTANTS:
        model = models.get(p)
        if model is None:
            preds[p] = _last_observed(df_hourly[p])
            continue
        try:
            pred = model.predict(X_latest)[0]
        except Exception as e:
            # Best effort (e.g. NaN features): fall back to persistence.
            print(f"[make_prediction_for_city] prediction failed for {p}: {e}")
            preds[p] = _last_observed(df_hourly[p])
        else:
            preds[p] = float(max(0.0, pred))
    return preds, df_hourly

# -------------------------
# Health advice
# -------------------------
def simple_health_advice(preds):
    """Map the predicted PM2.5 level to a short health recommendation.

    Args:
        preds: Mapping pollutant -> predicted value; only ``"pm25"`` is read.

    Returns:
        An advice string chosen by the PM2.5 band (upper bounds 12, 35.4,
        55.4 and 150.4, each inclusive), or ``"No prediction available."``
        when the value is missing or NaN.
    """
    pm25 = preds.get("pm25", None)
    # NaN fails every "<=" test and would otherwise fall through to the most
    # alarming message, so treat it like a missing prediction.
    if pm25 is None or math.isnan(pm25):
        return "No prediction available."

    bands = (
        (12, "Good — Everyone can go outside as usual."),
        (35.4, "Moderate — Sensitive groups should reduce prolonged outdoor exertion."),
        (55.4, "Unhealthy for sensitive groups — Respiratory patients should be careful."),
        (150.4, "Unhealthy — Older adults and children should avoid prolonged outdoor exertion."),
    )
    for upper_bound, message in bands:
        if pm25 <= upper_bound:
            return message
    return "Very Unhealthy/Hazardous — Everyone should reduce outdoor exposure."

# -------------------------
# Plotting
# -------------------------
def plot_history(df_hourly, pollutant_list=None, hours=24):
    """Draw the most recent pollutant history as a line chart.

    Args:
        df_hourly: Hourly frame with pollutant columns, or ``None``/empty.
        pollutant_list: Columns to draw; defaults to ``POLLUTANTS``. Names not
            present in the frame are skipped.
        hours: Number of trailing rows to show (coerced to ``int``).

    Returns:
        A matplotlib ``Figure``. When there is no data the figure only holds a
        "No historical data available" message. The figure is detached from
        pyplot's registry before being returned, so repeated calls in a
        long-running app do not accumulate open figures.
    """
    if df_hourly is None or df_hourly.empty:
        fig, ax = plt.subplots(figsize=(8,3))
        ax.text(0.5, 0.5, "No historical data available", ha="center", va="center")
        ax.axis("off")
        plt.close(fig)  # the Figure object stays usable for rendering
        return fig

    hours = int(hours)  # UI widgets may hand over a float; tail() needs an int
    sub = df_hourly.tail(hours)
    if pollutant_list is None:
        pollutant_list = POLLUTANTS
    fig, ax = plt.subplots(figsize=(9,4))
    for p in pollutant_list:
        if p in sub.columns:
            ax.plot(sub.index, sub[p], label=p.upper())
    ax.set_title(f"Past {hours} hours pollutant history")
    ax.set_xlabel("Time (UTC)")
    ax.set_ylabel("Concentration")
    if ax.lines:
        # legend() warns when nothing was drawn, so only add it for real lines.
        ax.legend()
    fig.tight_layout()
    plt.close(fig)  # release pyplot's reference; the caller owns the figure
    return fig

# -------------------------
# Data preparation
# -------------------------
def prepare_city_data(city, days=7, fallback_bases=None):
    """Return an hourly pollutant frame for *city*, real if possible.

    Live measurements are fetched and averaged per hour. When that yields
    nothing (or only NaN), a synthetic series is generated instead, using
    ``fallback_bases[city]`` as base levels if such a mapping is supplied.
    The result always has one column per entry of ``POLLUTANTS`` and is
    forward- then backward-filled with a limit of three rows each.
    """
    measurements = fetch_openaq_city_measurements(city, days=days)
    hourly = pivot_and_hourly_average(measurements)

    no_real_data = hourly is None or hourly.isna().all().all()
    if no_real_data:
        bases = fallback_bases.get(city) if fallback_bases else None
        hourly = generate_synthetic_series(days=days, base_values=bases)

    # Guarantee the full pollutant schema before gap filling.
    absent = [p for p in POLLUTANTS if p not in hourly.columns]
    for p in absent:
        hourly[p] = np.nan

    hourly = hourly.ffill(limit=3)
    hourly = hourly.bfill(limit=3)
    return hourly

# -------------------------
# Gradio UI
# -------------------------
def gradio_interface():
    """Assemble and return the Gradio Blocks app (not yet launched).

    The page has a city dropdown, a history-length slider and a button; a
    click fills a prediction table, a health-advice textbox and a history
    plot.
    """

    def predict_ui(city, history_hours):
        """Click handler: returns (table rows, advice text, figure)."""
        city_frame = prepare_city_data(city, days=7)
        preds, df_hourly = make_prediction_for_city(city, city_frame)
        if preds is None:
            return [], "No data available", None

        labels = [p.upper() for p in POLLUTANTS]
        rounded = [round(preds[p], 3) for p in POLLUTANTS]
        table = pd.DataFrame({"Pollutant": labels, "Predicted": rounded})
        advice = simple_health_advice(preds)
        fig = plot_history(df_hourly, pollutant_list=POLLUTANTS, hours=history_hours)
        rows = table.values.tolist()
        return rows, advice, fig

    with gr.Blocks() as app:
        gr.Markdown("# Real-Time Air Quality Prediction System")
        gr.Markdown("Select a city and get predicted pollutant levels, health advice, and a historical chart.")

        # Inputs share one row; each output gets a row of its own.
        with gr.Row():
            city_input = gr.Dropdown(choices=CITIES, value=CITIES[0], label="City")
            hours_input = gr.Slider(minimum=6, maximum=168, step=6, value=48, label="History (hours)")
            trigger = gr.Button("Get Prediction")
        with gr.Row():
            table_view = gr.Dataframe(headers=["Pollutant","Predicted"], label="Predicted next-hour values")
        with gr.Row():
            advice_view = gr.Textbox(label="Health Advice")
        with gr.Row():
            plot_view = gr.Plot(label="Historical Chart")

        trigger.click(
            fn=predict_ui,
            inputs=[city_input, hours_input],
            outputs=[table_view, advice_view, plot_view],
        )
    return app

# -------------------------
# Main
# -------------------------
# Entry point: build the UI and start the local Gradio server when this file
# is executed as a script or as a notebook cell.
if __name__ == "__main__":
    print("Launching Real-Time Air Quality Prediction System...")
    demo = gradio_interface()
    # share=False keeps the app local (no public Gradio link is created).
    demo.launch(share=False)


Launching Real-Time Air Quality Prediction System...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>