In [59]:
!pip install -q gradio pandas numpy scikit-learn openpyxl

In [60]:
!pip install xgboost



In [61]:
import gradio as gr
import pandas as pd
import numpy as np
import os, traceback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [62]:
_model = None
_scaler = None
_features = []

In [63]:
def _load_file(file_obj):
    # file_obj in Colab has .name pointing to the uploaded temp file path
    if file_obj is None:
        raise ValueError("No file provided")
    path = getattr(file_obj, "name", None)
    if not path or not os.path.exists(path):
        raise ValueError("Uploaded file path not found.")
    ext = os.path.splitext(path)[1].lower()
    if ext in [".xls", ".xlsx"]:
        return pd.read_excel(path)
    else:
        # try csv, fallback to semicolon
        try:
            return pd.read_csv(path)
        except Exception:
            return pd.read_csv(path, sep=';')

In [64]:
class _WeightedEnsemble:
    """Weighted blend of fitted regressors exposed through a single ``predict``.

    Storing this object (rather than one of its members) means later
    predictions come from the same blend whose hold-out metrics are reported.
    """

    def __init__(self, members):
        # members: iterable of (weight, fitted_model) pairs.
        self.members = list(members)

    def predict(self, X):
        """Return the weighted sum of every member's predictions for ``X``."""
        return sum(weight * model.predict(X) for weight, model in self.members)


def _build_feature_frame(df):
    """Select the model inputs from ``df`` and return them as a clean float frame.

    Columns whose upper-cased name contains a pollutant keyword are preferred;
    if none of them holds usable numbers, every numeric column except ``AQI``
    is used instead. Missing values are filled with the column mean and each
    column is clipped to its 1st-99th percentile range. The result has zero
    columns when nothing usable is found.
    """
    keys = ["PM", "NO", "CO", "SO", "O3", "NH3", "BENZ", "TOLU", "XYL"]
    matched = [
        c for c in df.columns
        if c != "AQI" and any(k in str(c).upper() for k in keys)
    ]

    X = pd.DataFrame(index=df.index)
    if matched:
        # Substring matching can hit text columns (e.g. "Country" contains "CO");
        # coercing to numbers and dropping all-NaN columns filters those out.
        X = df[matched].apply(pd.to_numeric, errors="coerce").dropna(axis=1, how="all")
    if X.shape[1] == 0:
        numeric = df.select_dtypes(include=[np.number])
        X = numeric.drop(columns=["AQI"], errors="ignore").dropna(axis=1, how="all")
    if X.shape[1] == 0:
        return X

    X = X.astype(float)
    X = X.fillna(X.mean())
    # NOTE: fill values and clip bounds come from the full dataset (before the
    # train/test split), matching the original intent of this step.
    return X.clip(lower=X.quantile(0.01), upper=X.quantile(0.99), axis=1)


def train(file_obj):
    """Train the AQI ensemble on an uploaded dataset and report hold-out metrics.

    Rows without a strictly positive numeric ``AQI`` are discarded, features
    are chosen by ``_build_feature_frame``, and an 80/20 split is used to fit
    a scaler plus three regressors (linear, random forest, XGBoost) whose
    predictions are blended with weights 0.2 / 0.3 / 0.5.

    On success the module globals ``_model`` (the blended ensemble),
    ``_scaler`` and ``_features`` are replaced together; on any failure they
    are left untouched.

    Parameters
    ----------
    file_obj
        Upload object or path accepted by ``_load_file``.

    Returns
    -------
    str
        A human-readable status message: the metrics on success, or a
        description of the problem (including the traceback for unexpected
        exceptions).
    """
    global _model, _scaler, _features
    min_rows = 10  # smallest dataset for which a 20% hold-out has >= 2 rows
    try:
        # xgboost is not imported at the top of the notebook, so import it here.
        from xgboost import XGBRegressor

        df = _load_file(file_obj)
        if "AQI" not in df.columns:
            return "❌ 'AQI' column missing in the dataset. Please include it."

        # Keep only rows with a usable target; NaN > 0 is False, so missing or
        # non-numeric AQI values are dropped by the same filter.
        df = df.assign(AQI=pd.to_numeric(df["AQI"], errors="coerce"))
        df = df[df["AQI"] > 0].reset_index(drop=True)
        if len(df) < min_rows:
            return f"❌ Not enough usable rows: need at least {min_rows} rows with AQI > 0."

        X = _build_feature_frame(df)
        features = [str(c) for c in X.columns]
        if not features:
            return "❌ No pollutant/numeric features found. Ensure dataset has pollutant columns."

        # Split on plain arrays so the scaler is fitted without column names,
        # matching the bare array predict_manual() passes in later.
        X_train, X_test, y_train, y_test = train_test_split(
            X.to_numpy(), df["AQI"].to_numpy(), test_size=0.2, random_state=42
        )
        scaler = StandardScaler().fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s = scaler.transform(X_test)

        lin_model = LinearRegression()
        rf_model = RandomForestRegressor(
            n_estimators=300,
            max_depth=15,
            min_samples_split=4,
            min_samples_leaf=2,
            random_state=42,
        )
        xgb_model = XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=10,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        )
        for model in (lin_model, rf_model, xgb_model):
            model.fit(X_train_s, y_train)

        ensemble = _WeightedEnsemble(
            [(0.2, lin_model), (0.3, rf_model), (0.5, xgb_model)]
        )
        y_pred = ensemble.predict(X_test_s)

        # Evaluate the blend on the hold-out rows.
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Publish model, scaler and feature list together, only after
        # everything succeeded, so the three can never disagree.
        _model, _scaler, _features = ensemble, scaler, features

        # R² * 100 is the share of variance explained, not a classification
        # accuracy, so it is labelled accordingly.
        result = (
            f"✅ Model trained!\nDetected features: {', '.join(features)}\n\n"
            f"MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.3f} | "
            f"Variance explained ≈ {r2 * 100:.1f}%"
        )
        print(result)
        return result
    except Exception as e:
        return "❌ Training failed: " + str(e) + "\n\n" + traceback.format_exc()

In [65]:
def predict_manual(text):
    """Predict AQI from one comma-separated line of feature values.

    Parameters
    ----------
    text : str
        Values separated by commas, one per entry of the module-level
        ``_features`` list and in the same order.

    Returns
    -------
    str
        The formatted prediction, or a warning/error message when the model
        has not been trained, the input is empty, the number of values does
        not match, a value is not a finite number, or prediction fails.
    """
    # Only reads the module-level state set by train(); no `global` needed.
    if _model is None or _scaler is None:
        return "⚠️ Train the model first."
    # Treat whitespace-only input like empty input instead of letting float("")
    # surface as a conversion error.
    if not text or not text.strip():
        return "⚠️ Paste comma-separated pollutant values matching detected features."
    try:
        vals = [float(x.strip()) for x in text.split(",")]
        if len(vals) != len(_features):
            return f"⚠️ Expected {len(_features)} values (features: {', '.join(map(str, _features))}). You provided {len(vals)}."
        arr = np.array(vals).reshape(1, -1)
        # float() happily parses "nan" and "inf"; reject them with a clear message.
        if not np.isfinite(arr).all():
            return "⚠️ Values must be finite numbers (no NaN or inf)."
        arr_s = _scaler.transform(arr)
        pred = _model.predict(arr_s)[0]
        return f"🌤️ Predicted AQI: {pred:.2f}"
    except Exception as e:
        return "❌ Prediction error: " + str(e)

In [68]:
# Gradio UI: left column uploads a dataset and trains, right column predicts
# from manually entered values using the most recently trained model.
with gr.Blocks() as demo:
    gr.Markdown("## AQI Predictor — Minimal (Upload dataset, Train, Predict)")
    with gr.Row():
        with gr.Column():
            file_in = gr.File(label="Upload CSV or Excel (must include 'AQI')")
            train_btn = gr.Button("Train Model")
            train_out = gr.Textbox(label="Training Output", lines=6)
        with gr.Column():
            manual_in = gr.Textbox(label="Paste comma-separated pollutant values (order shown after training)", placeholder="e.g. 50,80,10,30,...")
            pred_btn = gr.Button("Predict AQI")
            pred_out = gr.Textbox(label="Prediction Output", lines=2)
    # Each button feeds its input component to a callback and shows the
    # returned status string in the matching output box.
    train_btn.click(train, inputs=file_in, outputs=train_out)
    pred_btn.click(predict_manual, inputs=manual_in, outputs=pred_out)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8755e9b4df00f7f453.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


