In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path


from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, average_precision_score, roc_auc_score, precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.utils import Bunch


from xgboost import XGBClassifier

import joblib
import warnings
warnings.filterwarnings("ignore")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:

CSV_PATH = "Data/Wheat/Wheat_Crop_Disease_Environment_Cures_Maharashtra.csv"  # <-- change this
df = pd.read_csv(CSV_PATH)

needed_cols = ["Crop Stage", "Crop Disease"]
missing = [c for c in needed_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df = df[needed_cols].copy()

df["Crop Stage"] = (
    df["Crop Stage"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

df["Crop Disease"] = (
    df["Crop Disease"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s*\|\s*", "|", regex=True) 
)
df = df.dropna(subset=["Crop Stage", "Crop Disease"]).reset_index(drop=True)

print("Rows:", len(df))
df.head()


Rows: 140


Unnamed: 0,Crop Stage,Crop Disease
0,pre-sowing,Loose Smut
1,pre-sowing,Common Bunt
2,pre-sowing,Karnal Bunt
3,pre-sowing,Loose Smut
4,pre-sowing,Common Bunt


In [3]:
def to_label_list(s):
    s = str(s).strip()
    if "|" in s:
        return [t.strip() for t in s.split("|") if t.strip()]
    return [s]

y_list = df["Crop Disease"].apply(to_label_list)

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_list)

print("Num classes:", len(mlb.classes_))
print("Classes:", list(mlb.classes_)[:10], "..." if len(mlb.classes_) > 10 else "")


Num classes: 19
Classes: ['Barley Yellow Dwarf Virus (BYDV)', 'Black Point', 'Common Bunt', 'Crown Rot', 'Damping-off (Pythium)', 'Fusarium Head Blight', 'Karnal Bunt', 'Leaf Blight', 'Leaf Rust', 'Loose Smut'] ...


In [4]:
X = df[["Crop Stage"]].copy()

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
preproc = ColumnTransformer(
    transformers=[("stage_ohe", ohe, ["Crop Stage"])],
    remainder="drop"
)


In [None]:
from sklearn.model_selection import train_test_split

stage_series = df["Crop Stage"]


X_train, X_temp, Y_train, Y_temp, stg_train, stg_temp = train_test_split(
    X, Y, stage_series,
    test_size=0.30,
    random_state=42,
    stratify=stage_series
)

stage_counts = stg_temp.value_counts()
can_stratify = (stage_counts >= 2).all()

if can_stratify:
    X_val, X_test, Y_val, Y_test, stg_val, stg_test = train_test_split(
        X_temp, Y_temp, stg_temp,
        test_size=0.50,
        random_state=42,
        stratify=stg_temp
    )
else:
    print(" Skipping stratify on 2nd split (some stages have only 1 sample)")
    X_val, X_test, Y_val, Y_test, stg_val, stg_test = train_test_split(
        X_temp, Y_temp, stg_temp,
        test_size=0.50,
        random_state=42,
        stratify=None 
    )

print("Train:", X_train.shape, Y_train.shape)
print("Val  :", X_val.shape, Y_val.shape)
print("Test :", X_test.shape, Y_test.shape)


⚠️ Skipping stratify on 2nd split (some stages have only 1 sample)
Train: (98, 1) (98, 19)
Val  : (21, 1) (21, 19)
Test : (21, 1) (21, 19)


In [None]:

from xgboost import XGBClassifier
from sklearn.utils import Bunch

def fit_ovr_xgb(preprocessor, X_train, Y_train, random_state=42, base_params=None):
    """
    Fits one XGBClassifier per label (OvR) using preprocessed features.
    Returns: Bunch(preprocessor, estimators, classes_)
    """
    if base_params is None:
        base_params = dict(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.85,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            min_child_weight=1.0,
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=random_state,
            n_jobs=-1
        )


    Xtr = preprocessor.fit_transform(X_train)

    estimators = []
    L = Y_train.shape[1]
    for j in range(L):
        yj = Y_train[:, j]
        pos = yj.sum()
        neg = len(yj) - pos
        spw = float(neg / max(1.0, pos)) if pos > 0 else 1.0

        clf = XGBClassifier(**{**base_params, "scale_pos_weight": spw})
        clf.fit(Xtr, yj)
        estimators.append(clf)

    return Bunch(preprocessor=preprocessor, estimators=estimators, classes_=mlb.classes_)


def predict_proba_ovr(model_bunch, X):
    """Return probability matrix for each class."""
    Xp = model_bunch.preprocessor.transform(X)
    probs = np.column_stack([est.predict_proba(Xp)[:, 1] for est in model_bunch.estimators])
    return probs


In [7]:
from sklearn.metrics import f1_score
import numpy as np

def evaluate_f1_micro(preproc, params):
    mb = fit_ovr_xgb(preproc, X_train, Y_train, base_params=params)
    P_val = predict_proba_ovr(mb, X_val)
    Y_pred = (P_val >= 0.5).astype(int)
    return f1_score(Y_val, Y_pred, average="micro")

param_space = [
    dict(
        n_estimators=n,
        learning_rate=lr,
        max_depth=md,
        subsample=ss,
        colsample_bytree=cs,
        reg_lambda=rl,
        min_child_weight=mcw,
    )
    for n in [300, 500, 800]
    for lr in [0.03, 0.05, 0.08]
    for md in [3, 4, 5]
    for ss in [0.8, 0.9]
    for cs in [0.8, 0.9]
    for rl in [1.0, 2.0]
    for mcw in [1.0, 2.0]
]

rng = np.random.default_rng(42)
sample_idx = rng.choice(len(param_space), size=min(20, len(param_space)), replace=False)

best_params = None
best_score = -1

for idx in sample_idx:
    params = param_space[idx]
    score = evaluate_f1_micro(preproc, params)
    if score > best_score:
        best_score = score
        best_params = params

print("Best validation micro-F1:", round(best_score, 4))
print("Best params:", best_params)


Best validation micro-F1: 0.6176
Best params: {'n_estimators': 800, 'learning_rate': 0.03, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'min_child_weight': 1.0}


In [8]:
final_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.85,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=1.0,
)
if best_params is not None:
    final_params.update(best_params)

model = fit_ovr_xgb(preproc, X_train, Y_train, base_params=final_params)


In [9]:
from sklearn.metrics import precision_recall_curve

P_val = predict_proba_ovr(model, X_val)

def best_thresholds_from_val(P, Y_true):
    thresholds = np.zeros(P.shape[1], dtype=float)
    for j in range(P.shape[1]):
        pj = P[:, j]
        yj = Y_true[:, j]
        prec, rec, thr = precision_recall_curve(yj, pj)
        f1s = (2 * prec * rec) / np.clip(prec + rec, 1e-12, None)
        if len(thr) == 0:
            thresholds[j] = 0.5
            continue
        best_idx = np.nanargmax(f1s[1:]) if len(f1s) > 1 else 0
        thresholds[j] = thr[best_idx] if best_idx < len(thr) else 0.5
    return thresholds

opt_thresholds = best_thresholds_from_val(P_val, Y_val)
print("Per-label thresholds (first 10):", np.round(opt_thresholds[:10], 3))


Per-label thresholds (first 10): [0.011 0.015 0.017 0.013 0.002 0.014 0.01  0.013 0.015 0.792]


In [None]:

from sklearn.metrics import classification_report, accuracy_score, average_precision_score, roc_auc_score

P_test = predict_proba_ovr(model, X_test)
Y_pred_test = (P_test >= opt_thresholds.reshape(1, -1)).astype(int)

print("Subset accuracy:", round(accuracy_score(Y_test, Y_pred_test), 4))
print("F1 (micro):", round(f1_score(Y_test, Y_pred_test, average='micro'), 4))
print("F1 (macro):", round(f1_score(Y_test, Y_pred_test, average='macro'), 4))

try:
    mAP = average_precision_score(Y_test, P_test, average="macro")
    print("mAP (macro):", round(mAP, 4))
except:
    print("mAP unavailable.")

try:
    auc = roc_auc_score(Y_test, P_test, average="macro")
    print("ROC-AUC (macro):", round(auc, 4))
except:
    print("ROC-AUC unavailable.")

print("\nPer-class report:")
print(classification_report(Y_test, Y_pred_test, target_names=mlb.classes_))


Subset accuracy: 0.0
F1 (micro): 0.25
F1 (macro): 0.31
mAP (macro): 0.4031
ROC-AUC unavailable.

Per-class report:
                                  precision    recall  f1-score   support

Barley Yellow Dwarf Virus (BYDV)       0.05      1.00      0.09         1
                     Black Point       0.00      0.00      0.00         0
                     Common Bunt       0.25      1.00      0.40         1
                       Crown Rot       0.33      1.00      0.50         1
           Damping-off (Pythium)       0.25      1.00      0.40         1
            Fusarium Head Blight       0.00      0.00      0.00         0
                     Karnal Bunt       0.05      1.00      0.09         1
                     Leaf Blight       0.33      1.00      0.50         1
                       Leaf Rust       0.75      1.00      0.86         3
                      Loose Smut       0.57      1.00      0.73         4
             Root Rot (Fusarium)       0.60      0.75      0.67       