In [5]:
import pandas as pd
import numpy as np
import joblib

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error, r2_score

from xgboost import XGBClassifier, XGBRegressor

In [6]:
# Read the processed training table, then echo its dimensions and column names
# so the schema is visible in the cell output.
dataset_path = "../data/processed/final_training_dataset.csv"
df = pd.read_csv(dataset_path)
print("✅ Dataset shape:", df.shape)
print("Columns:", list(df.columns))

✅ Dataset shape: (1048575, 7)
Columns: ['soil_type', 'crop', 'region', 'fertilizer_used', 'yield_tons', 'water_usage', 'irrigation_type']


In [7]:
# Column names of the prediction targets: the crop label, the yield value,
# and the list of resource columns.
target_crop, target_yield = "crop", "yield_tons"
target_resources = ["fertilizer_used", "water_usage"]


In [8]:
# Feature matrix: `df` with every target column removed. Names that are not
# present in `df` are skipped rather than raising (errors="ignore").
target_columns = [target_crop, target_yield, *target_resources]
X = df.drop(columns=target_columns, errors="ignore")

# ===============================
# 🏷️ Encode crop labels
# ===============================
# Fit the encoder on the crop column, then map every crop name to its integer code.
crop_encoder = LabelEncoder().fit(df[target_crop])
y_crop_encoded = crop_encoder.transform(df[target_crop])

# Persist the fitted encoder next to the models.
joblib.dump(crop_encoder, "../models/crop_encoder.pkl")

['../models/crop_encoder.pkl']

In [9]:
# Target for the yield regressor: the single column named by `target_yield`.
y_yield = df.loc[:, target_yield]

# Targets for the resource regressors: one column per entry in `target_resources`.
y_resources = df.loc[:, target_resources]


In [10]:
# ===================================
# 🔄 Preprocessing
# ===================================
# Partition the feature columns by dtype: object columns vs. everything else.
categorical = list(X.select_dtypes(include="object").columns)
numerical = list(X.select_dtypes(exclude="object").columns)

# Standard-scale the non-object columns and one-hot encode the object columns.
# handle_unknown="ignore" lets the encoder transform categories it did not see
# during fit without raising.
numeric_step = ("num", StandardScaler(), numerical)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [11]:

# ===================================
# 🌾 Crop Classification (XGBoost)
# ===================================
# Multi-class crop classifier: preprocessing followed by gradient-boosted trees.
#
# * The preprocessor is cloned so this pipeline owns its own transformer.
#   Pipeline.fit does not clone its steps, so placing the shared `preprocessor`
#   object here would let any later pipeline that refits it silently change
#   this pipeline's fitted state.
# * `use_label_encoder` is intentionally not passed: the parameter is deprecated
#   in XGBoost, and the labels in `y_crop_encoded` are already integer codes.
clf_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("clf", XGBClassifier(
        n_estimators=400,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="mlogloss",
        random_state=42
    ))
])

# Stratified 80/20 split so every crop keeps its share in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_crop_encoded, test_size=0.2, stratify=y_crop_encoded, random_state=42
)

clf_pipeline.fit(X_train, y_train)
y_pred = clf_pipeline.predict(X_test)

# zero_division=0 reports 0.0 for classes that are never predicted (same value
# as the default) without emitting an UndefinedMetricWarning for each of them.
print("\n🌾 Crop Classification Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))
print(classification_report(
    y_test, y_pred, target_names=crop_encoder.classes_, zero_division=0
))

# Persist the fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(clf_pipeline, "../models/crop_model.pkl")




🌾 Crop Classification Results:
Accuracy: 0.38697756479031065
F1 Score: 0.27630906374914294
              precision    recall  f1-score   support

      Barley       0.41      0.71      0.52     47128
      Cotton       0.67      0.28      0.40     47383
       Maize       0.00      0.00      0.00     20332
        Rice       0.25      0.28      0.26     33860
     Soybean       0.35      0.73      0.47     33876
       Wheat       0.00      0.00      0.00     27136

    accuracy                           0.39    209715
   macro avg       0.28      0.33      0.28    209715
weighted avg       0.34      0.39      0.33    209715



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


['../models/crop_model.pkl']

In [12]:
# ===================================
# 🌱 Yield Regression (XGBoost)
# ===================================
# Regressor for the yield target: preprocessing followed by gradient-boosted
# trees. The preprocessor is cloned so fitting this pipeline does not refit the
# transformer object shared with the other pipelines (Pipeline.fit does not
# clone its steps).
reg_yield_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("reg", XGBRegressor(
        n_estimators=400,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42
    ))
])

# Plain (non-stratified) 80/20 split for the continuous target.
X_train, X_test, y_train, y_test = train_test_split(X, y_yield, test_size=0.2, random_state=42)

reg_yield_pipeline.fit(X_train, y_train)
y_pred = reg_yield_pipeline.predict(X_test)

# Report each metric once. RMSE is computed as sqrt(MSE) so the call works
# regardless of which RMSE helpers the installed scikit-learn provides.
print("\n🌱 Yield Regression Results:")
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)
print("R² Score:", r2_score(y_test, y_pred))

# Persist the fitted pipeline (preprocessing + regressor) as one artifact.
joblib.dump(reg_yield_pipeline, "../models/yield_model.pkl")


🌱 Yield Regression Results:

🌱 Yield Regression Results:
RMSE: 12.846150711482036
R² Score: 0.12135470027747763
R² Score: 0.12135470027747763


['../models/yield_model.pkl']

In [15]:

# ===================================
# 💧 Resource Models (fertilizer, water)
# ===================================
# Train, evaluate and save one regressor per column listed in `target_resources`.
for resource in target_resources:
    print(f"\n🔧 Training resource model for: {resource}")

    # Each iteration gets its own cloned preprocessor: Pipeline.fit does not
    # clone its steps, so reusing the shared `preprocessor` object would refit
    # the same transformer instance for every model.
    reg_resource_pipeline = Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("reg", XGBRegressor(
            n_estimators=300,
            max_depth=7,
            learning_rate=0.07,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42
        ))
    ])

    # Same 80/20 split parameters for every resource target.
    X_train, X_test, y_train, y_test = train_test_split(X, y_resources[resource], test_size=0.2, random_state=42)

    reg_resource_pipeline.fit(X_train, y_train)
    y_pred = reg_resource_pipeline.predict(X_test)

    # Label the metrics with the resource being modelled and print each once.
    # RMSE is computed as sqrt(MSE).
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"\n💧 Resource Regression Results ({resource}):")
    print(f"RMSE ({resource}):", rmse)
    print(f"R² ({resource}):", r2_score(y_test, y_pred))

    # One saved artifact per resource, named after the target column.
    joblib.dump(reg_resource_pipeline, f"../models/{resource}_model.pkl")

print("\n✅ All models trained and saved successfully.")



🔧 Training resource model for: fertilizer_used

🌱 Yield Regression Results:
RMSE: 2.5538254646155303
R² Score: 0.116131912059663
R² (fertilizer_used): 0.116131912059663

🔧 Training resource model for: water_usage

🌱 Yield Regression Results:
RMSE: 22706.955092170403
R² Score: 0.24045858326716618
R² (water_usage): 0.24045858326716618

✅ All models trained and saved successfully.
