In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import joblib

# ---------------------------------------------------------------------------
# Configuration: every value a reader might want to tune lives in one place.
# ---------------------------------------------------------------------------
DATA_PATH = "processed_agri_dataset.csv"   # input CSV, relative to the working directory
MODEL_PATH = "best_model.pkl"              # where the winning estimator is persisted
# Only the main environmental & soil features are used as predictors.
FEATURE_COLUMNS = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
TARGET_COLUMN = 'Yield'
TEST_SIZE = 0.2                            # fraction of rows held out for evaluation
SEED = 42                                  # shared seed for the split and the stochastic models

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
data = pd.read_csv(DATA_PATH)

print("Data Loaded for Model Training")
print("Rows:", data.shape[0], " Columns:", data.shape[1])

print("\nColumns in dataset:\n", data.columns.tolist())

# Fail fast with an actionable message if the CSV lacks a required column,
# instead of surfacing a bare pandas KeyError from the indexing below.
missing_columns = [
    column for column in FEATURE_COLUMNS + [TARGET_COLUMN]
    if column not in data.columns
]
if missing_columns:
    raise KeyError(
        f"{DATA_PATH} is missing required column(s): {missing_columns}"
    )

X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# ---------------------------------------------------------------------------
# Split into training and testing data
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

print("\nTraining set size:", X_train.shape)
print("Test set size:", X_test.shape)

# ---------------------------------------------------------------------------
# Initialize models (display name -> unfitted estimator)
# ---------------------------------------------------------------------------
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=SEED),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=SEED)
}

# ---------------------------------------------------------------------------
# Train and evaluate models
# ---------------------------------------------------------------------------
# One record per model; turned into a DataFrame once, after the loop.
results = []

for name, model in models.items():
    # Each estimator is fitted in place, so `models[name]` is the trained
    # model once this iteration finishes.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE is the square root of the MSE, so it is in the same units as the target.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({"Model": name, "RMSE": rmse, "MAE": mae, "R2": r2})
    print(f"\n {name} Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

# ---------------------------------------------------------------------------
# Compare model performance and persist the winner
# ---------------------------------------------------------------------------
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df)

# NOTE(review): the winner is picked on the same test split whose metrics are
# reported above, so the score quoted for the saved model is optimistically
# biased. Consider selecting on a separate validation split or with
# cross-validation and keeping the test split for the final report only.
# `idxmax` returns the first row label holding the highest R2.
best_model_name = results_df.loc[results_df['R2'].idxmax(), 'Model']
best_model = models[best_model_name]
joblib.dump(best_model, MODEL_PATH)

print(f"\nBest Performing Model: {best_model_name}")
print("Model saved as best_model.pkl")


Data Loaded for Model Training
Rows: 277000  Columns: 17

Columns in dataset:
 ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield']

Training set size: (221600, 7)
Test set size: (55400, 7)

 Linear Regression Results:
RMSE: 1371.5995
MAE: 872.8950
R²: 0.6492

 Random Forest Results:
RMSE: 946.0120
MAE: 201.0341
R²: 0.8331

 XGBoost Results:
RMSE: 946.0779
MAE: 201.0229
R²: 0.8331

Model Performance Comparison:
               Model         RMSE         MAE        R2
0  Linear Regression  1371.599520  872.895011  0.649238
1      Random Forest   946.011974  201.034116  0.833141
2            XGBoost   946.077873  201.022893  0.833117

Best Performing Model: Random Forest
Model saved as best_model.pkl
