In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Time-varying inputs: one forecast model is loaded per name, and each name
# gets lagged copies when the time series is prepared.
DYNAMIC_FEATURES = ["ndvi", "t2m_c", "td2m_c", "rh_pct", "tp_m", "ssrd_jm2"]

# Per-location inputs that are copied unchanged onto every forecast row
# before classification.
STATIC_FEATURES = [
    "sand",
    "silt",
    "clay",
    "soc",
    "ph",
    "bdod",
    "cec",
    "nitrogen",
    "phosphorus",
    "potassium",
    "lc_type1",
]

# Row offsets used to build the `<feature>_lag<k>` columns.
LAGS = [1, 2, 3]

# Keys every payload item's "features" mapping must contain: the calendar
# position plus every dynamic feature.
REQUIRED_DYNAMIC = ["year", "month", *DYNAMIC_FEATURES]

In [3]:
# UTILS
class FeatureFilter(BaseEstimator, TransformerMixin):
    """Align a DataFrame to a fixed, ordered list of feature columns.

    Columns not listed in ``expected_features`` are dropped, listed columns
    that are absent from the input are added and filled with NaN, and the
    output columns follow the order of ``expected_features``.

    Parameters
    ----------
    expected_features : list-like of str
        Column names, in the order the downstream estimator expects them.
    """

    def __init__(self, expected_features):
        # Stored untouched so scikit-learn's get_params/clone keep working.
        self.expected_features = expected_features

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame restricted and ordered to ``expected_features``.

        The input frame is not modified.
        """
        # A single reindex replaces copy + subset + per-column NaN assignment.
        # Assigning new columns onto a column-subset of a frame can trigger
        # pandas' SettingWithCopyWarning; reindex builds the aligned frame
        # directly and fills absent columns with NaN.
        return X.reindex(columns=list(self.expected_features))


def validate_api_payload(payload):
    """Check that an API payload can feed the forecasting pipeline.

    Parameters
    ----------
    payload : sequence of dict
        One item per month; each item must carry a ``"features"`` mapping
        containing every key in ``REQUIRED_DYNAMIC``.

    Raises
    ------
    ValueError
        If the payload is empty/too short, an item has no ``"features"``
        entry, or a required feature key is missing.
    """
    # A lag of k needs k earlier rows, so the first row with every lag
    # populated is row max(LAGS) + 1. With fewer rows the lagged frame is
    # empty after dropping incomplete rows and the forecast step has nothing
    # to start from.
    min_months = max(LAGS, default=0) + 1
    if not payload or len(payload) < min_months:
        raise ValueError(f"At least {min_months} months of data are required")

    for item in payload:
        try:
            features = item["features"]
        except (KeyError, TypeError, IndexError):
            # Surface malformed items as a validation error, not a raw KeyError.
            raise ValueError(
                "Each payload item must contain a 'features' mapping"
            ) from None

        for f in REQUIRED_DYNAMIC:
            if f not in features:
                raise ValueError(f"Missing required feature: {f}")


def sanitize_features(df):
    """Clamp selected dynamic features to fixed bounds, in place.

    ``ndvi`` is limited to [-1, 1], ``rh_pct`` to [0, 100], and ``tp_m`` and
    ``ssrd_jm2`` are floored at 0. The passed frame itself is modified and
    returned.
    """
    clip_bounds = {
        "ndvi": {"lower": -1, "upper": 1},
        "rh_pct": {"lower": 0, "upper": 100},
        "tp_m": {"lower": 0},
        "ssrd_jm2": {"lower": 0},
    }
    for column, limits in clip_bounds.items():
        df[column] = df[column].clip(**limits)
    return df

In [4]:
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts that come from a trusted source.

# One forecast model per dynamic feature, keyed by feature name.
forecast_models = {
    feature_name: joblib.load(f"xgb_forecast_{feature_name}.pkl")
    for feature_name in DYNAMIC_FEATURES
}

# Risk classifier, the encoder used to turn its predictions back into labels,
# and the column order handed to FeatureFilter before classification.
cls_model = joblib.load("xgb_desertification_pipeline.pkl")
label_encoder = joblib.load("label_encoder.pkl")
feature_order = joblib.load("feature_names.pkl")

In [5]:
# API → DataFrame
def api_payload_to_df(payload):
    """Flatten an API payload into one DataFrame row per payload item.

    Each row holds the item's ``"features"`` values plus a ``location_name``
    column taken from the item's ``"metadata"``. The ``LC_Type1`` column is
    renamed to ``lc_type1``, and ``latitude``/``longitude`` are dropped when
    present. The payload itself is left untouched.
    """
    records = [
        {**entry["features"], "location_name": entry["metadata"]["location_name"]}
        for entry in payload
    ]
    return (
        pd.DataFrame(records)
        .rename(columns={"LC_Type1": "lc_type1"})
        .drop(columns=["latitude", "longitude"], errors="ignore")
    )

In [6]:
# PREPARE TIME SERIES
def prepare_time_series(df):
    """Sort the history and add seasonal and lagged feature columns.

    Rows are ordered by ``location_name``, ``year``, ``month``. Two cyclic
    month encodings (``month_sin``, ``month_cos``) are added, and for every
    name in ``DYNAMIC_FEATURES`` a ``<name>_lag<k>`` column is created for
    each ``k`` in ``LAGS`` by shifting within each location.

    Rows are dropped only when a column the forecaster reads is missing:
    ``year``, ``month``, the dynamic features, the month encodings, or any
    lag column. A NaN in another column (for example a static feature) no
    longer discards the row; a bare ``dropna()`` would empty the whole
    history whenever one such column is missing for a location.

    Returns a new frame; the input frame is not modified.
    """
    df = df.sort_values(
        ["location_name", "year", "month"]
    ).reset_index(drop=True)

    df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

    lag_columns = []
    for col in DYNAMIC_FEATURES:
        # Group once per feature instead of once per (feature, lag) pair.
        per_location = df.groupby("location_name")[col]
        for lag in LAGS:
            lag_name = f"{col}_lag{lag}"
            df[lag_name] = per_location.shift(lag)
            lag_columns.append(lag_name)

    # The first max(LAGS) rows of each location have NaN lags and go here.
    required = [
        "year", "month",
        *DYNAMIC_FEATURES,
        "month_sin", "month_cos",
        *lag_columns,
    ]
    df = df.dropna(subset=required).reset_index(drop=True)
    return df

In [7]:
# FORECAST
def forecast_future(df, years=5):
    """Roll the per-feature forecast models forward one month at a time.

    Starts from the last row of ``df`` and produces ``years * 12`` future
    months. For each month every model in ``forecast_models`` is called on
    the same feature vector; only after all predictions are made is the
    rolling state advanced.

    Parameters
    ----------
    df : pandas.DataFrame
        History with ``year``, ``month``, the dynamic features and their
        ``_lag<k>`` columns (the layout built by ``prepare_time_series``).
    years : int, default 5
        Forecast horizon in years.

    Returns
    -------
    pandas.DataFrame
        One row per forecast month with ``year``, ``month`` and one column
        per dynamic feature.

    Raises
    ------
    ValueError
        If ``df`` has no rows to start from.

    Notes
    -----
    NOTE(review): the rollover assumes the models were trained on rows shaped
    like ``prepare_time_series`` output, where ``<f>_lag1`` is the value one
    step before ``<f>`` — confirm against the training code.
    The lag rollover also assumes ``LAGS`` is a contiguous run starting at 1.
    """
    if len(df) == 0:
        raise ValueError("Cannot forecast from an empty history frame")

    months = years * 12

    forecast_feature_cols = (
        DYNAMIC_FEATURES +
        ["month_sin", "month_cos"] +
        [f"{c}_lag{l}" for c in DYNAMIC_FEATURES for l in LAGS]
    )
    # Oldest lag first so each slot is overwritten only after it was copied.
    lag_chain = sorted(LAGS, reverse=True)

    state = df.iloc[-1].copy()
    forecasts = []

    year = int(state["year"])
    month = int(state["month"])

    for _ in range(months):
        month += 1
        if month > 12:
            month = 1
            year += 1

        state["month_sin"] = np.sin(2 * np.pi * month / 12)
        state["month_cos"] = np.cos(2 * np.pi * month / 12)

        # One snapshot per month: every model sees the same inputs. Shifting
        # lags inside the per-feature loop would feed later models a row in
        # which earlier features had already moved to the next step.
        # The row may be object-typed (it can carry non-numeric columns), so
        # cast the selected features to float explicitly.
        X = state[forecast_feature_cols].to_numpy(dtype=float).reshape(1, -1)

        row_pred = {"year": year, "month": month}
        for feature in DYNAMIC_FEATURES:
            pred = forecast_models[feature].predict(X)[0]

            if feature == "ssrd_jm2":
                # This model's output is passed through expm1 to get the value.
                pred = np.expm1(pred)

            row_pred[feature] = pred

        # Advance the state: each lag takes the next-newer value, lag1 takes
        # the previous current value, and the prediction becomes current.
        # This keeps `<f>_lag<k>` equal to the value k steps before `<f>`.
        for feature in DYNAMIC_FEATURES:
            for lag in lag_chain:
                newer = feature if lag == 1 else f"{feature}_lag{lag - 1}"
                state[f"{feature}_lag{lag}"] = state[newer]
            state[feature] = row_pred[feature]

        forecasts.append(row_pred)

    return pd.DataFrame(forecasts)

In [8]:
# CLASSIFICATION
def classify_risk(future_df, static_row):
    """Attach static features to the forecast and classify every row.

    Parameters
    ----------
    future_df : pandas.DataFrame
        Forecast rows. Modified in place: the static feature columns,
        ``risk_level`` and ``risk_confidence`` are added to it.
    static_row : pandas.Series or dict
        Source of the static feature values, broadcast to every row.

    Returns
    -------
    pandas.DataFrame
        The same ``future_df`` object with ``risk_level`` (decoded label) and
        ``risk_confidence`` (highest class probability per row) added.
    """
    for col in STATIC_FEATURES:
        # Static features are not checked by validate_api_payload, so one may
        # be absent. Use NaN instead of raising KeyError; FeatureFilter
        # likewise NaN-fills columns it cannot find.
        future_df[col] = static_row.get(col, np.nan)

    feature_filter = FeatureFilter(feature_order)
    X = feature_filter.transform(future_df)

    encoded = cls_model.predict(X)
    labels = label_encoder.inverse_transform(encoded)
    proba = cls_model.predict_proba(X).max(axis=1)

    future_df["risk_level"] = labels
    future_df["risk_confidence"] = proba

    return future_df

In [9]:
# MAIN PIPELINE
def run_pipeline(api_payload, years=5):
    """Validate a payload, forecast the dynamic features and classify risk.

    Steps: validate -> build DataFrame -> clamp feature ranges -> add lag
    features -> forecast ``years`` ahead -> classify each forecast month.
    The forecast starts from the last row of the prepared history, and the
    static features are taken from that same row.

    Parameters
    ----------
    api_payload : list of dict
        Monthly observations with ``"features"`` and ``"metadata"`` entries.
    years : int, default 5
        Forecast horizon in years; must be at least 1.

    Returns
    -------
    dict
        ``{"success": True, "forecast": [...]}`` where each forecast record
        has ``year``, ``month``, the dynamic features, ``risk_level`` and
        ``risk_confidence``.

    Raises
    ------
    ValueError
        If ``years`` is below 1, the payload fails validation, or no history
        row remains once lag features are built.
    """
    if years < 1:
        raise ValueError("years must be at least 1")

    validate_api_payload(api_payload)

    df = api_payload_to_df(api_payload)
    df = sanitize_features(df)

    ts_df = prepare_time_series(df)
    if ts_df.empty:
        # Without this guard the forecast step fails with an opaque
        # IndexError when it reads the last row of an empty frame.
        raise ValueError(
            "Not enough complete monthly history to build lag features"
        )

    future = forecast_future(ts_df, years)
    static_row = ts_df.iloc[-1]

    final = classify_risk(future, static_row)

    output_cols = (
        ["year", "month"] +
        DYNAMIC_FEATURES +
        ["risk_level", "risk_confidence"]
    )
    return {
        "success": True,
        "forecast": final[output_cols].to_dict(orient="records")
    }