In [None]:
# Cell 1: Imports & basic settings
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Modeling
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, average_precision_score, roc_auc_score, precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.utils import Bunch

# XGBoost
from xgboost import XGBClassifier

# Persistence
import joblib
import warnings
# Install a global "ignore" filter: with no category/module arguments this
# suppresses every warning raised after this point, including deprecation
# notices from the imported libraries.
warnings.filterwarnings("ignore")


In [None]:

# Load the raw CSV and reduce it to a clean two-column frame
# ("Crop Stage", "Crop Disease") with no missing values.
CSV_PATH = "Data/Wheat/Wheat_Crop_Disease_Environment_Cures_Maharashtra.csv"  # <-- change this
df = pd.read_csv(CSV_PATH)

# Keep only necessary columns; fail early if the file lacks any of them.
needed_cols = ["Crop Stage", "Crop Disease"]
missing = [c for c in needed_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Drop rows with missing values BEFORE any string conversion.
# `.astype(str)` turns NaN into the literal text "nan", which `dropna` can no
# longer recognise, so dropping afterwards would silently keep those rows and
# leak a bogus "nan" stage/disease into everything built on this frame.
df = df[needed_cols].dropna(subset=needed_cols).reset_index(drop=True)

# Basic cleaning: trim, lower-case, and collapse runs of whitespace.
df["Crop Stage"] = (
    df["Crop Stage"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# Trim and remove whitespace around "|" so multi-label values share one
# delimiter form.
df["Crop Disease"] = (
    df["Crop Disease"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s*\|\s*", "|", regex=True)  # normalize delimiters if multi-label
)

print("Rows:", len(df))
df.head()
