In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import json
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score
)

from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor

In [2]:
# ---------------------------
# Configuration
# ---------------------------
# Every output location is derived from a single root so that relocating the
# outputs only requires changing one value.
_OUT_ROOT = "output"
_ARTIFACT_DIR = os.path.join(_OUT_ROOT, "artifacts")

CONFIG = {
    # input data
    "CSV_PATH": "dataset/mango_price_dataset.csv",
    # reproducibility / evaluation split
    "RANDOM_STATE": 42,
    "TEST_SIZE": 0.20,
    # output locations
    "OUT_ROOT": _OUT_ROOT,
    "MODEL_DIR": os.path.join(_OUT_ROOT, "models"),
    "PLOTS_DIR": os.path.join(_ARTIFACT_DIR, "plots"),
    "ARTIFACT_DIR": _ARTIFACT_DIR,
    # classifier/regressor hyperparameters (from your tuning; tweak if desired)
    "CAT_CLASS_PARAMS": {"depth": 4, "iterations": 395, "learning_rate": 0.23419603304121425, "l2_leaf_reg": 8.600804638103362},
    "XGB_CLASS_PARAMS": {"n_estimators": 329, "max_depth": 5, "learning_rate": 0.05603751468349913, "subsample": 0.8435991664891758, "colsample_bytree": 0.9750220237762934},
    "CAT_REG_PARAMS": {"depth": 5, "iterations": 566, "learning_rate": 0.12481222299146678, "l2_leaf_reg": 1.8997742423620259},
    "XGB_REG_PARAMS": {"n_estimators": 441, "max_depth": 3, "learning_rate": 0.21386535711370855, "subsample": 0.8916028672163949, "colsample_bytree": 0.602208846849441},
    # weight applied to encoded class 2 of the Local Market classifier
    "BEST_PETTAH_WEIGHT": 1.05
}

In [3]:
# ---------------------------
# Ensure folders
# ---------------------------
# Create every configured output directory up front; existing ones are left as-is.
for _dir_key in ("OUT_ROOT", "MODEL_DIR", "PLOTS_DIR", "ARTIFACT_DIR"):
    os.makedirs(CONFIG[_dir_key], exist_ok=True)

In [4]:
# ---------------------------
# Feature engineering
# ---------------------------
def prepare_full_features(df):
    """Return a copy of ``df`` extended with calendar, lag, rolling and interaction features.

    The input must provide the columns ``Date``, ``Region``, ``Local_Price_LKR``,
    ``Export_Price_USD`` and ``Mango_Age_Days``. The caller's frame is not modified.

    The returned frame is sorted by ``Region`` then ``Date`` with a fresh
    0..n-1 index, so its row order generally differs from the input.

    Notes:
        * Rolling means/std are taken over windows that include the current row.
        * ``price_dev`` and ``price_to_age_ratio`` are computed directly from the
          same-row ``Local_Price_LKR``.
    """
    frame = df.copy()
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame = frame.sort_values(['Region', 'Date']).reset_index(drop=True)

    def per_region(column):
        # Always regroup so columns added later in this function are visible.
        return frame.groupby('Region')[column]

    def rolling_by_region(column, window, stat):
        # Trailing window (current row included), evaluated inside each region.
        return per_region(column).transform(
            lambda s: getattr(s.rolling(window, min_periods=1), stat)()
        )

    # (feature prefix, source price column)
    price_sources = (('local', 'Local_Price_LKR'), ('export', 'Export_Price_USD'))

    # --- calendar features, with cyclical encodings of month and day-of-year ---
    when = frame['Date'].dt
    frame['day'] = when.day
    frame['month'] = when.month
    frame['dayofyear'] = when.dayofyear
    frame['year'] = when.year
    for name, source, period in (('month', 'month', 12), ('doy', 'dayofyear', 365)):
        frame[f'{name}_sin'] = np.sin(2 * np.pi * frame[source] / period)
        frame[f'{name}_cos'] = np.cos(2 * np.pi * frame[source] / period)

    # --- previous observation per region ---
    for prefix, price_col in price_sources:
        frame[f'{prefix}_price_lag1'] = per_region(price_col).shift(1)

    # --- rolling means ---
    for prefix, price_col in price_sources:
        for window in (7, 14):
            frame[f'{prefix}_price_roll{window}'] = rolling_by_region(price_col, window, 'mean')

    # --- one-step momentum (computed before the lag is imputed, so the first
    #     row of each region yields NaN here and is zero-filled below) ---
    for prefix, price_col in price_sources:
        frame[f'{prefix}_price_mom1'] = frame[price_col] - frame[f'{prefix}_price_lag1']

    # --- impute lag/rolling gaps: region median first, then overall median ---
    for prefix, price_col in price_sources:
        region_median = per_region(price_col).transform('median')
        overall_median = frame[price_col].median()
        for suffix in ('lag1', 'roll7', 'roll14'):
            target = f'{prefix}_price_{suffix}'
            frame[target] = frame[target].fillna(region_median).fillna(overall_median)
    for prefix, _ in price_sources:
        frame[f'{prefix}_price_mom1'] = frame[f'{prefix}_price_mom1'].fillna(0)

    # --- deviation / ratio / volatility features ---
    frame['local_price_region_mean'] = per_region('Local_Price_LKR').transform('mean')
    frame['price_dev'] = frame['Local_Price_LKR'] - frame['local_price_region_mean']
    age_divisor = frame['Mango_Age_Days'].replace(0, 1)  # avoid division by zero
    frame['price_to_age_ratio'] = frame['Local_Price_LKR'] / age_divisor
    for window in (7, 14):
        frame[f'local_price_vol{window}'] = rolling_by_region('Local_Price_LKR', window, 'std').fillna(0)
    frame['local_mom_region_mean'] = per_region('local_price_mom1').transform('mean')
    frame['momentum_dev'] = frame['local_price_mom1'] - frame['local_mom_region_mean']

    # --- interaction terms ---
    frame['dev_x_ratio'] = frame['price_dev'] * frame['price_to_age_ratio']
    frame['vol_x_mom'] = frame['local_price_vol14'] * frame['momentum_dev']
    frame['age_x_price'] = frame['Mango_Age_Days'] * frame['local_price_lag1']
    frame['roll14_x_mom'] = frame['local_price_roll14'] * frame['local_price_mom1']

    # --- zero-fill whatever NaNs remain in the engineered numeric columns ---
    engineered = (
        'price_dev', 'price_to_age_ratio', 'local_price_vol7', 'local_price_vol14',
        'momentum_dev', 'dev_x_ratio', 'vol_x_mom', 'age_x_price', 'roll14_x_mom',
    )
    for name in engineered:
        if name not in frame.columns:
            continue
        frame[name] = frame[name].fillna(0)

    return frame

In [5]:
# ---------------------------
# Utilities: plotting & feature name extraction
# ---------------------------
def save_confusion_matrix(cm, labels, outpath, title="Confusion Matrix"):
    """Draw ``cm`` as an annotated heatmap and save the figure to ``outpath``.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with ``'d'``).
        labels: Tick labels used for both the predicted (x) and true (y) axes.
        outpath: Destination image path.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    fig.savefig(outpath)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def save_feature_importance_catboost(cat_model, preprocessor, numeric_features, categorical_features, outpath_csv, outpath_png, top_n=20):
    """Export the ``top_n`` most important features of a fitted CatBoost model.

    Feature names are rebuilt as the numeric feature names followed by the
    one-hot column names of the fitted ``'cat'`` transformer in ``preprocessor``,
    i.e. this assumes the model was trained on columns in that order.

    Args:
        cat_model: Fitted model exposing ``get_feature_importance`` (or, as a
            fallback, a ``feature_importances_`` attribute).
        preprocessor: Fitted ColumnTransformer with a transformer named ``'cat'``.
        numeric_features: Names of the passthrough numeric columns.
        categorical_features: Names of the one-hot encoded columns.
        outpath_csv: Where the top ``top_n`` importances are written as CSV.
        outpath_png: Where the horizontal bar chart is written.
        top_n: Number of features to export.

    Returns:
        A Series with all importances sorted in descending order, or ``None``
        when the model exposes no importances.
    """
    try:
        importances = cat_model.get_feature_importance(type='FeatureImportance')
    except Exception:
        importances = getattr(cat_model, 'feature_importances_', None)
    if importances is None:
        print("Feature importances not available")
        return None

    # Recover the expanded column names produced by the one-hot encoder.
    encoder = preprocessor.named_transformers_['cat']
    try:
        encoded_names = list(encoder.get_feature_names_out(categorical_features))
    except Exception:
        # Older scikit-learn: build the names by hand from the learned categories.
        encoded_names = [
            f"{col}__{cat}"
            for col, cat_list in zip(categorical_features, encoder.categories_)
            for cat in cat_list
        ]

    ranked = pd.Series(importances, index=[*numeric_features, *encoded_names]).sort_values(ascending=False)
    top = ranked.head(top_n)
    top.to_csv(outpath_csv)

    plt.figure(figsize=(8,6))
    # Re-sort ascending so the largest bar ends up at the top of the chart.
    top.sort_values().plot.barh()
    plt.title("Top feature importances")
    plt.tight_layout()
    plt.savefig(outpath_png)
    plt.close()
    return ranked

In [6]:
# ---------------------------
# Train & evaluate function
# ---------------------------
def train_and_evaluate(csv_path, config):
    """Train, evaluate and persist the market classifiers and the price regressors.

    Steps: load the CSV, engineer features with ``prepare_full_features``, fit a
    CatBoost classifier for ``Local_Market``, an XGBoost classifier for
    ``Export_Market`` and two multi-output regressors (XGBoost and CatBoost)
    for the three price targets, then write models, plots, reports and a
    summary to the directories named in ``config``.

    Args:
        csv_path: Path of the training CSV.
        config: Mapping with the keys used below (``TEST_SIZE``, ``RANDOM_STATE``,
            ``MODEL_DIR``, ``PLOTS_DIR``, ``ARTIFACT_DIR``, the four ``*_PARAMS``
            dicts and ``BEST_PETTAH_WEIGHT``). All output paths are taken from
            this argument rather than from a module-level global.

    Returns:
        dict: Timestamp, both classifier accuracies, per-target regression
        metrics and the paths of every persisted model/encoder.

    Raises:
        ValueError: If an expected engineered feature column is missing.

    NOTE(review): ``price_dev`` and ``price_to_age_ratio`` are computed from the
    same-row ``Local_Price_LKR``, which is also regression target 0, so the
    reported regression metrics are likely optimistic -- confirm whether these
    features are available at prediction time.
    """
    model_dir = config["MODEL_DIR"]
    plots_dir = config["PLOTS_DIR"]
    artifact_dir = config["ARTIFACT_DIR"]
    # Make the function usable with any config, not only the one whose
    # directories were created by an earlier notebook cell.
    for out_dir in (model_dir, plots_dir, artifact_dir):
        os.makedirs(out_dir, exist_ok=True)

    model_paths = {
        "local": os.path.join(model_dir, "local_catboost_weighted.joblib"),
        "export": os.path.join(model_dir, "export_xgboost.joblib"),
        "xgb_reg": os.path.join(model_dir, "regressor_xgb_multi.joblib"),
        "cat_reg": os.path.join(model_dir, "regressor_cat_multi.joblib"),
        "le_local": os.path.join(model_dir, "label_encoder_local.joblib"),
        "le_export": os.path.join(model_dir, "label_encoder_export.joblib"),
    }

    # load
    df = pd.read_csv(csv_path)
    print("Loaded:", df.shape)

    # prepare features
    dfc = prepare_full_features(df)
    print("Prepared features; columns:", len(dfc.columns))

    # features & targets
    feature_cols = [
        'month_sin','month_cos','doy_sin','doy_cos',
        'Mango_Age_Days','Days_To_Maturity','Temp_C','Humidity_%',
        'Region','weather',
        'local_price_lag1','local_price_roll7','local_price_roll14','local_price_mom1',
        'export_price_lag1','export_price_roll7','export_price_roll14','export_price_mom1',
        'price_dev','price_to_age_ratio','local_price_vol7','local_price_vol14','momentum_dev',
        'dev_x_ratio','vol_x_mom','age_x_price','roll14_x_mom'
    ]

    missing = set(feature_cols) - set(dfc.columns)
    if missing:
        raise ValueError("Missing engineered features: " + str(missing))

    X = dfc[feature_cols].copy()
    y_local = dfc['Local_Market'].values
    y_export = dfc['Export_Market'].values
    y_reg = dfc[["Local_Price_LKR","Export_Price_USD","Harvesting_after_3months_price"]].values

    # encoders
    le_local = LabelEncoder(); y_local_enc = le_local.fit_transform(y_local)
    le_export = LabelEncoder(); y_export_enc = le_export.fit_transform(y_export)

    # train/test splits (the classifier split is stratified on the local label,
    # so it is a different partition from the regressor split)
    X_train_cls, X_test_cls, y_local_train, y_local_test, y_export_train, y_export_test = train_test_split(
        X, y_local_enc, y_export_enc, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"],
        shuffle=True, stratify=y_local_enc
    )

    X_train_reg, X_test_reg, y_reg_train, y_reg_test = train_test_split(
        X, y_reg, test_size=config["TEST_SIZE"], random_state=config["RANDOM_STATE"], shuffle=True
    )

    print("Classifier Train/Test:", X_train_cls.shape, X_test_cls.shape)
    print("Regressor Train/Test:", X_train_reg.shape, X_test_reg.shape)

    # Preprocessor
    numeric_features = [
        'Mango_Age_Days','Days_To_Maturity','Temp_C','Humidity_%',
        'month_sin','month_cos','doy_sin','doy_cos',
        'local_price_lag1','local_price_roll7','local_price_roll14','local_price_mom1',
        'export_price_lag1','export_price_roll7','export_price_roll14','export_price_mom1',
        'price_dev','price_to_age_ratio','local_price_vol7','local_price_vol14','momentum_dev',
        'dev_x_ratio','vol_x_mom','age_x_price','roll14_x_mom'
    ]
    categorical_features = ['Region','weather']

    def make_preprocessor():
        """Build a fresh, unfitted preprocessor.

        Each pipeline gets its own instance: Pipeline.fit fits its steps in
        place, so a shared transformer would be refit by every later pipeline.
        """
        try:
            # scikit-learn >= 1.2
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # older scikit-learn only knows the `sparse` keyword
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        return ColumnTransformer(transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', encoder, categorical_features)
        ], remainder='drop')

    # ---------- Local Market classifier (CatBoost weighted) ----------
    print("\nTraining Local Market classifier (CatBoost weighted)...")
    cat_params = config["CAT_CLASS_PARAMS"]
    cb_local = CatBoostClassifier(
        depth=cat_params["depth"],
        iterations=cat_params["iterations"],
        learning_rate=cat_params["learning_rate"],
        l2_leaf_reg=cat_params["l2_leaf_reg"],
        random_state=config["RANDOM_STATE"],
        verbose=0,
        loss_function='MultiClass',
        # NOTE(review): assumes exactly three local-market classes and that the
        # class encoded as 2 (third in sorted label order) is the one
        # BEST_PETTAH_WEIGHT is meant for -- confirm against le_local.classes_.
        class_weights={0:1.0, 1:1.0, 2:config["BEST_PETTAH_WEIGHT"]}
    )
    pipe_local = Pipeline([('pre', make_preprocessor()), ('clf', cb_local)])
    pipe_local.fit(X_train_cls, y_local_train)

    # evaluate local
    y_local_pred = pipe_local.predict(X_test_cls)
    acc_local = accuracy_score(y_local_test, y_local_pred)
    print("Local Accuracy:", acc_local)
    creport_local = classification_report(y_local_test, y_local_pred, target_names=le_local.classes_)

    # save confusion matrix + report
    cm_local = confusion_matrix(y_local_test, y_local_pred)
    save_confusion_matrix(cm_local, le_local.classes_, os.path.join(plots_dir, "confusion_local.png"), title="Local Market Confusion Matrix")
    with open(os.path.join(artifact_dir, "report_local.txt"), "w") as f:
        f.write(creport_local)

    # persist local model + encoders
    joblib.dump(pipe_local, model_paths["local"])
    joblib.dump(le_local, model_paths["le_local"])
    print("Saved Local model and encoder to:", model_dir)

    # feature importances (CatBoost) -- best effort, a failure must not abort training
    try:
        save_feature_importance_catboost(
            pipe_local.named_steps['clf'], pipe_local.named_steps['pre'],
            numeric_features, categorical_features,
            outpath_csv=os.path.join(artifact_dir, "local_feature_importances_top20.csv"),
            outpath_png=os.path.join(plots_dir, "feature_importances_local_top20.png"),
            top_n=20
        )
    except Exception as e:
        print("Feature importance extraction failed:", e)

    # ---------- Export Market classifier (XGBoost) ----------
    print("\nTraining Export Market classifier (XGBoost)...")
    xgbc_params = config["XGB_CLASS_PARAMS"]
    xgb_export = Pipeline([('pre', make_preprocessor()), ('clf', XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_estimators=xgbc_params['n_estimators'],
        max_depth=xgbc_params['max_depth'],
        learning_rate=xgbc_params['learning_rate'],
        subsample=xgbc_params['subsample'],
        colsample_bytree=xgbc_params['colsample_bytree'],
        random_state=config["RANDOM_STATE"],
        n_jobs=-1
    ))])
    xgb_export.fit(X_train_cls, y_export_train)

    # evaluate export
    y_export_pred = xgb_export.predict(X_test_cls)
    acc_export = accuracy_score(y_export_test, y_export_pred)
    print("Export Accuracy:", acc_export)
    creport_export = classification_report(y_export_test, y_export_pred, target_names=le_export.classes_)
    cm_export = confusion_matrix(y_export_test, y_export_pred)
    save_confusion_matrix(cm_export, le_export.classes_, os.path.join(plots_dir, "confusion_export.png"), title="Export Market Confusion Matrix")
    with open(os.path.join(artifact_dir, "report_export.txt"), "w") as f:
        f.write(creport_export)

    joblib.dump(xgb_export, model_paths["export"])
    joblib.dump(le_export, model_paths["le_export"])
    print("Saved Export model and encoder to:", model_dir)

    # ---------- Multi-output regressors ----------
    print("\nTraining multi-output regressors (XGBoost & CatBoost)...")
    # XGBoost regressor (wrapped)
    xgbr_params = config["XGB_REG_PARAMS"]
    xgb_reg_base = XGBRegressor(
        n_estimators=xgbr_params['n_estimators'],
        max_depth=xgbr_params['max_depth'],
        learning_rate=xgbr_params['learning_rate'],
        subsample=xgbr_params['subsample'],
        colsample_bytree=xgbr_params['colsample_bytree'],
        random_state=config["RANDOM_STATE"],
        n_jobs=1
    )
    xgb_reg = Pipeline([('pre', make_preprocessor()), ('reg', MultiOutputRegressor(xgb_reg_base))])
    xgb_reg.fit(X_train_reg, y_reg_train)

    catr_params = config["CAT_REG_PARAMS"]
    cat_reg_base = CatBoostRegressor(
        depth=catr_params['depth'],
        iterations=catr_params['iterations'],
        learning_rate=catr_params['learning_rate'],
        l2_leaf_reg=catr_params['l2_leaf_reg'],
        random_state=config["RANDOM_STATE"],
        verbose=0
    )
    cat_reg = Pipeline([('pre', make_preprocessor()), ('reg', MultiOutputRegressor(cat_reg_base))])
    cat_reg.fit(X_train_reg, y_reg_train)

    # evaluation helper
    def eval_and_plot_reg(name, pipeline, X_test, y_test, prefix):
        """Score each regression target and save a true-vs-predicted scatter per target."""
        y_pred = pipeline.predict(X_test)
        n_targets = y_test.shape[1]
        summary = {}
        for i in range(n_targets):
            y_true_i, y_pred_i = y_test[:,i], y_pred[:,i]
            # RMSE via sqrt(MSE): the `squared=False` keyword was deprecated and
            # later removed from scikit-learn, this form works on every version.
            rmse = float(np.sqrt(mean_squared_error(y_true_i, y_pred_i)))
            mae = float(mean_absolute_error(y_true_i, y_pred_i))
            r2 = float(r2_score(y_true_i, y_pred_i))
            summary[f"target_{i}"] = {"rmse": rmse, "mae": mae, "r2": r2}
            # scatter with the identity line as reference
            plt.figure(figsize=(5,4))
            plt.scatter(y_true_i, y_pred_i, alpha=0.4, s=10)
            mn = min(y_true_i.min(), y_pred_i.min()); mx = max(y_true_i.max(), y_pred_i.max())
            plt.plot([mn,mx],[mn,mx],'k--')
            plt.xlabel("True"); plt.ylabel("Pred")
            plt.title(f"{name} target_{i} (RMSE={rmse:.3f})")
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, f"{prefix}_reg_target{i}_scatter.png"))
            plt.close()
        return summary

    xgb_reg_summary = eval_and_plot_reg("XGB_Regressor", xgb_reg, X_test_reg, y_reg_test, "xgb")
    cat_reg_summary = eval_and_plot_reg("Cat_Regressor", cat_reg, X_test_reg, y_reg_test, "cat")
    print("XGB reg summary:", xgb_reg_summary)
    print("Cat reg summary:", cat_reg_summary)

    joblib.dump(xgb_reg, model_paths["xgb_reg"])
    joblib.dump(cat_reg, model_paths["cat_reg"])
    print("Saved regressors to:", model_dir)

    # ---------- Save training summary & metadata ----------
    summary = {
        "timestamp": datetime.utcnow().isoformat(),
        "local_accuracy": float(acc_local),
        "export_accuracy": float(acc_export),
        "xgb_reg_summary": xgb_reg_summary,
        "cat_reg_summary": cat_reg_summary,
        "models_saved": dict(model_paths)
    }
    with open(os.path.join(artifact_dir, "training_summary.json"), "w") as f:
        json.dump(summary, f, indent=2)

    # also write human-readable summary
    with open(os.path.join(artifact_dir, "training_summary.txt"), "w") as f:
        f.write("Local Accuracy: %.6f\n\n" % acc_local)
        f.write("Local Classification Report:\n")
        f.write(creport_local + "\n\n")
        f.write("Export Accuracy: %.6f\n\n" % acc_export)
        f.write("Export Classification Report:\n")
        f.write(creport_export + "\n\n")
        f.write("XGB Reg Summary:\n"); f.write(str(xgb_reg_summary) + "\n\n")
        f.write("Cat Reg Summary:\n"); f.write(str(cat_reg_summary) + "\n\n")
    print("Training artifacts and summary saved under:", artifact_dir)

    return summary

In [7]:
# ---------------------------
# Main
# ---------------------------
def main():
    """Run the full training pipeline on a shallow copy of ``CONFIG`` and report the output folders."""
    cfg = dict(CONFIG)
    train_and_evaluate(cfg["CSV_PATH"], cfg)

    print("\nDone. See output folders:")
    for label, key in (
        (" - models:", "MODEL_DIR"),
        (" - artifacts:", "ARTIFACT_DIR"),
        (" - plots:", "PLOTS_DIR"),
    ):
        print(label, cfg[key])

if __name__ == "__main__":
    main()

Loaded: (12000, 12)
Prepared features; columns: 39
Classifier Train/Test: (9600, 27) (2400, 27)
Regressor Train/Test: (9600, 27) (2400, 27)

Training Local Market classifier (CatBoost weighted)...
Local Accuracy: 0.6625
Saved Local model and encoder to: output\models

Training Export Market classifier (XGBoost)...
Export Accuracy: 0.6341666666666667
Saved Export model and encoder to: output\models

Training multi-output regressors (XGBoost & CatBoost)...
XGB reg summary: {'target_0': {'rmse': 3.355285689206157, 'mae': 2.543561069488525, 'r2': 0.996905558402411}, 'target_1': {'rmse': 0.021986383960749194, 'mae': 0.015838696230053902, 'r2': 0.9944478753949237}, 'target_2': {'rmse': 13.486434440791902, 'mae': 10.489394558461507, 'r2': 0.9620814529146919}}
Cat reg summary: {'target_0': {'rmse': 1.984412312491834, 'mae': 1.4958844840631613, 'r2': 0.9989176016851002}, 'target_1': {'rmse': 0.013014207330710462, 'mae': 0.008179379915632905, 'r2': 0.9980546980113943}, 'target_2': {'rmse': 13.00

#### INFERENCE

In [13]:
# predict_batch_from_json.py
import os
import json
import joblib
import numpy as np
import pandas as pd
from datetime import datetime

# -------------------------
# User inputs (sample)
# -------------------------
# Each record carries the raw (non-engineered) fields of one prediction request;
# price columns are filled in later from the historical dataset.
sample_inputs = [
    dict([
        ("Date", "2025-11-15"),
        ("Mango_Age_Days", 40),
        ("Days_To_Maturity", 40),
        ("Temp_C", 30.2),
        ("Humidity_%", 72),
        ("Region", "Colombo"),
        ("weather", "Clear"),
    ]),
    dict([
        ("Date", "2025-01-10"),
        ("Mango_Age_Days", 120),
        ("Days_To_Maturity", 5),
        ("Temp_C", 27.5),
        ("Humidity_%", 78),
        ("Region", "Hambantota"),
        ("weather", "Rain"),
    ]),
]

# -------------------------
# Paths (try both conventions)
# -------------------------
possible_model_dirs = ["model", "output/models", "output/model", "models"]
def find_path(rel):
    """Locate ``rel`` under the known model directories.

    Each directory in ``possible_model_dirs`` is tried in order; if none
    contains ``rel``, the path is tried as given. Returns the first existing
    candidate, or ``None`` when nothing exists.
    """
    candidates = [os.path.join(base, rel) for base in possible_model_dirs]
    candidates.append(rel)  # fallback to direct path
    return next((path for path in candidates if os.path.exists(path)), None)

LOCAL_MODEL_PATH = find_path("local_catboost_weighted.joblib")
EXPORT_MODEL_PATH = find_path("export_xgboost.joblib")
REG_MODEL_PATH = find_path("regressor_cat_multi.joblib")
LE_LOCAL_PATH = find_path("label_encoder_local.joblib")
LE_EXPORT_PATH = find_path("label_encoder_export.joblib")
CSV_PATH = "dataset/mango_price_dataset.csv"

# check
# (resolved path, display name, file that was looked for, required?)
# find_path returns None for a missing artifact, so the wanted file name is
# carried separately to keep the error message informative.
_artifact_checks = [
    (LOCAL_MODEL_PATH, "Local model", "local_catboost_weighted.joblib", True),
    (EXPORT_MODEL_PATH, "Export model", "export_xgboost.joblib", True),
    (REG_MODEL_PATH, "Regressor", "regressor_cat_multi.joblib", True),
    # label encoders optional if pipeline outputs labels directly
    (LE_LOCAL_PATH, "LE Local", "label_encoder_local.joblib", False),
    (LE_EXPORT_PATH, "LE Export", "label_encoder_export.joblib", False),
    (CSV_PATH, "History CSV", CSV_PATH, True),
]
for _path, _name, _wanted, _required in _artifact_checks:
    if _path is not None and os.path.exists(_path):
        continue
    if not _required:
        print(f"Warning: {_name} not found at expected places. If model pipelines return labels directly that's okay.")
        continue
    if _path is None:
        # Searched through find_path: report every location that was tried.
        _where = f"'{_wanted}' in {', '.join(possible_model_dirs)} or the working directory"
    else:
        _where = f"'{_wanted}'"
    raise FileNotFoundError(f"Required artifact not found: {_name}. Looked for {_where}.")

# -------------------------
# Load models & encoders
# -------------------------
print("Loading models...")
# The three model pipelines are mandatory and were validated above.
local_pipe, export_pipe, reg_pipe = [
    joblib.load(_model_path)
    for _model_path in (LOCAL_MODEL_PATH, EXPORT_MODEL_PATH, REG_MODEL_PATH)
]

def _load_if_present(path):
    """Load the object stored at ``path``; return None when the path is unset or missing."""
    if path and os.path.exists(path):
        return joblib.load(path)
    return None

# Label encoders are optional: without them predictions are reported as raw indices.
le_local = _load_if_present(LE_LOCAL_PATH)
le_export = _load_if_present(LE_EXPORT_PATH)

# -------------------------
# Feature engineering function (same as training)
# -------------------------
def prepare_full_features(df):
    """Return a copy of ``df`` extended with calendar, lag, rolling and interaction features.

    This is the inference-side copy of the training feature engineering and
    must stay in step with it, since the saved pipelines select these columns
    by name.

    The input must provide the columns ``Date``, ``Region``, ``Local_Price_LKR``,
    ``Export_Price_USD`` and ``Mango_Age_Days``. The caller's frame is not
    modified; the result is sorted by ``Region`` then ``Date`` with a fresh
    0..n-1 index, so its row order generally differs from the input.
    """
    frame = df.copy()
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame = frame.sort_values(['Region', 'Date']).reset_index(drop=True)

    def per_region(column):
        # Always regroup so columns added later in this function are visible.
        return frame.groupby('Region')[column]

    def rolling_by_region(column, window, stat):
        # Trailing window (current row included), evaluated inside each region.
        return per_region(column).transform(
            lambda s: getattr(s.rolling(window, min_periods=1), stat)()
        )

    # (feature prefix, source price column)
    price_sources = (('local', 'Local_Price_LKR'), ('export', 'Export_Price_USD'))

    # time features, with cyclical encodings of month and day-of-year
    when = frame['Date'].dt
    frame['day'] = when.day
    frame['month'] = when.month
    frame['dayofyear'] = when.dayofyear
    frame['year'] = when.year
    for name, source, period in (('month', 'month', 12), ('doy', 'dayofyear', 365)):
        frame[f'{name}_sin'] = np.sin(2 * np.pi * frame[source] / period)
        frame[f'{name}_cos'] = np.cos(2 * np.pi * frame[source] / period)

    # lags
    for prefix, price_col in price_sources:
        frame[f'{prefix}_price_lag1'] = per_region(price_col).shift(1)

    # rolling means
    for prefix, price_col in price_sources:
        for window in (7, 14):
            frame[f'{prefix}_price_roll{window}'] = rolling_by_region(price_col, window, 'mean')

    # momentum (taken before the lag is imputed; first row per region becomes 0 below)
    for prefix, price_col in price_sources:
        frame[f'{prefix}_price_mom1'] = frame[price_col] - frame[f'{prefix}_price_lag1']

    # fills for lag/rolling: region median first, then overall median
    for prefix, price_col in price_sources:
        region_median = per_region(price_col).transform('median')
        overall_median = frame[price_col].median()
        for suffix in ('lag1', 'roll7', 'roll14'):
            target = f'{prefix}_price_{suffix}'
            frame[target] = frame[target].fillna(region_median).fillna(overall_median)
    for prefix, _ in price_sources:
        frame[f'{prefix}_price_mom1'] = frame[f'{prefix}_price_mom1'].fillna(0)

    # new features
    frame['local_price_region_mean'] = per_region('Local_Price_LKR').transform('mean')
    frame['price_dev'] = frame['Local_Price_LKR'] - frame['local_price_region_mean']
    age_divisor = frame['Mango_Age_Days'].replace(0, 1)  # avoid division by zero
    frame['price_to_age_ratio'] = frame['Local_Price_LKR'] / age_divisor
    for window in (7, 14):
        frame[f'local_price_vol{window}'] = rolling_by_region('Local_Price_LKR', window, 'std').fillna(0)
    frame['local_mom_region_mean'] = per_region('local_price_mom1').transform('mean')
    frame['momentum_dev'] = frame['local_price_mom1'] - frame['local_mom_region_mean']

    # interactions
    frame['dev_x_ratio'] = frame['price_dev'] * frame['price_to_age_ratio']
    frame['vol_x_mom'] = frame['local_price_vol14'] * frame['momentum_dev']
    frame['age_x_price'] = frame['Mango_Age_Days'] * frame['local_price_lag1']
    frame['roll14_x_mom'] = frame['local_price_roll14'] * frame['local_price_mom1']

    # fill residual NaNs
    engineered = (
        'price_dev', 'price_to_age_ratio', 'local_price_vol7', 'local_price_vol14',
        'momentum_dev', 'dev_x_ratio', 'vol_x_mom', 'age_x_price', 'roll14_x_mom',
    )
    for name in engineered:
        if name not in frame.columns:
            continue
        frame[name] = frame[name].fillna(0)
    return frame

# -------------------------
# Load historical dataset
# -------------------------
_history_found = os.path.exists(CSV_PATH)
if not _history_found:
    raise FileNotFoundError(f"History CSV not found at '{CSV_PATH}'. It is required to compute lag/rolling features.")

hist = pd.read_csv(CSV_PATH)
hist['Date'] = pd.to_datetime(hist['Date'])
# ensure required price columns exist (absent ones are added as all-NaN)
_absent_price_cols = [
    _c for _c in ('Local_Price_LKR', 'Export_Price_USD', 'Harvesting_after_3months_price')
    if _c not in hist.columns
]
for _c in _absent_price_cols:
    hist[_c] = np.nan

# -------------------------
# Build DataFrame of new inputs + dummy price columns for lagging
# -------------------------
_PLACEHOLDER_PRICE_COLS = ['Local_Price_LKR', 'Export_Price_USD', 'Harvesting_after_3months_price']

def _placeholder_prices(region):
    """Pick stand-in price values for a new row of ``region``.

    The feature engineering needs a price in every row. For each price column
    this uses the most recent non-missing value recorded for the region in
    ``hist`` and falls back to the column's global median when the region has
    no usable value (the result is NaN only if the whole column is empty).
    """
    # Sort once per region; a stable sort keeps file order among equal dates.
    region_hist = hist.loc[hist['Region'] == region].sort_values('Date', kind='stable')
    prices = {}
    for price_col in _PLACEHOLDER_PRICE_COLS:
        known = region_hist[price_col].dropna()
        if not known.empty:
            prices[price_col] = float(known.iloc[-1])
        else:
            prices[price_col] = float(hist[price_col].median())
    return prices

new_rows = []
for inp in sample_inputs:
    row = dict(inp)  # never mutate the caller's input record
    row['Date'] = pd.to_datetime(row['Date'])
    # NOTE(review): the latest price in the history is used even when it is
    # dated after the input's own Date -- confirm this is intended.
    row.update(_placeholder_prices(row['Region']))
    new_rows.append(row)

df_new = pd.DataFrame(new_rows)

# -------------------------
# Concatenate hist + new, compute features, then keep only new final rows
# -------------------------
# Tag every new row with its position in `sample_inputs` before it is mixed
# with the history. prepare_full_features re-sorts the rows, and matching on
# Date/Region/Age afterwards could pick up a history row with the same values.
_INPUT_ID_COL = "_input_idx"
if len(df_new) != len(sample_inputs):
    raise RuntimeError("Number of prepared rows does not match the number of inputs")
_tagged_new = df_new.assign(**{_INPUT_ID_COL: np.arange(len(df_new))})

combined = pd.concat([hist, _tagged_new], ignore_index=True, sort=False)
combined = prepare_full_features(combined)


def _predict_label_index(pipe, row):
    """Return the predicted class index for a one-row frame as a scalar.

    ``predict`` may return a 1-D or a 2-D (n, 1) array depending on the final
    estimator, so the result is flattened before taking the first element.
    """
    return np.ravel(pipe.predict(row))[0]


def _predict_proba_or_none(pipe, row, name):
    """Return class probabilities for a one-row frame, or None (with a warning) if unavailable."""
    try:
        return pipe.predict_proba(row)[0].tolist()
    except Exception as exc:
        # Probabilities are optional extras: report the problem, keep predicting.
        print(f"Warning: {name} probabilities unavailable: {exc}")
        return None


results = []
for _input_idx, inp in enumerate(sample_inputs):
    match = combined[combined[_INPUT_ID_COL] == _input_idx]
    if len(match) != 1:
        raise RuntimeError("Could not align engineered features for input: " + str(inp))
    # The marker is bookkeeping only; do not hand it to the models.
    sample_row = match.drop(columns=[_INPUT_ID_COL]).reset_index(drop=True)

    # -------------------------
    # Make predictions
    # -------------------------
    # classifiers
    local_pred_idx = _predict_label_index(local_pipe, sample_row)
    local_proba = _predict_proba_or_none(local_pipe, sample_row, "Local market")

    export_pred_idx = _predict_label_index(export_pipe, sample_row)
    export_proba = _predict_proba_or_none(export_pipe, sample_row, "Export market")

    # regressors (multi-output): [local price, export price, price after 3 months]
    reg_pred = reg_pipe.predict(sample_row)[0].tolist()

    # decode labels if encoders available
    local_label = le_local.inverse_transform([local_pred_idx])[0] if le_local is not None else str(local_pred_idx)
    export_label = le_export.inverse_transform([export_pred_idx])[0] if le_export is not None else str(export_pred_idx)

    out = {
        "input": inp,
        "Local_Market": local_label,
        "Local_Market_proba": local_proba,
        "Export_Market": export_label,
        "Export_Market_proba": export_proba,
        "Predicted_Local_Price_LKR": reg_pred[0],
        "Predicted_Export_Price_USD": reg_pred[1],
        "Predicted_Harvesting_after_3months_price": reg_pred[2]
    }
    results.append(out)

# -------------------------
# Print results (table)
# -------------------------
# One flat record per prediction: request identifiers plus the predicted values.
_table_records = []
for _res in results:
    _request = _res["input"]
    _table_records.append({
        "Date": _request["Date"],
        "Region": _request["Region"],
        "Local_Market": _res["Local_Market"],
        "Local_Price_LKR": _res["Predicted_Local_Price_LKR"],
        "Export_Market": _res["Export_Market"],
        "Export_Price_USD": _res["Predicted_Export_Price_USD"],
        "Harvesting_after_3months_price": _res["Predicted_Harvesting_after_3months_price"]
    })
df_res = pd.DataFrame(_table_records)

print("\nPredictions:")
print(df_res.to_string(index=False))

# Also print JSON (default=str stringifies any value json cannot encode natively)
print("\nJSON results:")
print(json.dumps(results, indent=2, default=str))


Loading models...

Predictions:
      Date     Region Local_Market  Local_Price_LKR Export_Market  Export_Price_USD  Harvesting_after_3months_price
2025-11-15    Colombo      Colombo       384.316063         Dubai          1.360861                      414.158614
2025-01-10 Hambantota      Colombo       421.674971         Paris          2.060027                      483.117209

JSON results:
[
  {
    "input": {
      "Date": "2025-11-15",
      "Mango_Age_Days": 40,
      "Days_To_Maturity": 40,
      "Temp_C": 30.2,
      "Humidity_%": 72,
      "Region": "Colombo",
      "weather": "Clear"
    },
    "Local_Market": "Colombo",
    "Local_Market_proba": [
      0.9981093464095814,
      3.1291909487282736e-05,
      0.0018593616809314231
    ],
    "Export_Market": "Dubai",
    "Export_Market_proba": [
      0.08683250844478607,
      0.6086015105247498,
      0.13983270525932312,
      0.16473327577114105
    ],
    "Predicted_Local_Price_LKR": 384.31606265034884,
    "Predicted_Exp