# Model Training and Evaluation
## Agriculture Crop Production Prediction

This notebook focuses on training and evaluating machine learning models for crop yield prediction.

**Models and Steps:**
- Random Forest Regressor
- XGBoost Regressor
- Model comparison, 5-fold cross-validation, and saving the trained models


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib

# Add src to path
sys.path.append('../src')
from utils.data_loader import load_data, preprocess_data
from utils.preprocessing import prepare_model_features

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*".
# Fall back to the pre-3.6 name when the new one is not installed so this cell
# does not fail on older Matplotlib versions; on current versions the behaviour
# is unchanged.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")


## 1. Load and Prepare Data


In [None]:
# Read the dataset and run it through the project's preprocessing helper
df = load_data()
df_processed = preprocess_data(df)

# Derive the model inputs (X), the 'Quantity' target (y) and the encoder
# object returned alongside them
X, y, encoder = prepare_model_features(df_processed, target_col='Quantity')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature names: {list(X.columns) if hasattr(X, 'columns') else 'Array format'}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Target range: {y.min():.2f} to {y.max():.2f}")


## 2. Random Forest Model


In [None]:
# Fit the Random Forest regressor and report its scores on both splits
print("Training Random Forest Regressor...")
print("=" * 60)

rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

# Predict on the training split as well, so train and test scores can be compared
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# (actual, predicted) pairs in train, test order
rf_eval_pairs = [(y_train, y_train_pred_rf), (y_test, y_test_pred_rf)]

train_rmse_rf, test_rmse_rf = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in rf_eval_pairs
)
train_mae_rf, test_mae_rf = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in rf_eval_pairs
)
train_r2_rf, test_r2_rf = (
    r2_score(actual, predicted)
    for actual, predicted in rf_eval_pairs
)

print("Random Forest Results:")
print(f"  Training RMSE: {train_rmse_rf:.4f}")
print(f"  Test RMSE: {test_rmse_rf:.4f}")
print(f"  Training MAE: {train_mae_rf:.4f}")
print(f"  Test MAE: {test_mae_rf:.4f}")
print(f"  Training R²: {train_r2_rf:.4f}")
print(f"  Test R²: {test_r2_rf:.4f}")

# Rank features by importance; use generic names when X carries no column labels
if hasattr(X, 'columns'):
    rf_feature_names = X.columns
else:
    rf_feature_names = [f'Feature_{i}' for i in range(X.shape[1])]

feature_importance_rf = pd.DataFrame(
    {'Feature': rf_feature_names, 'Importance': rf_model.feature_importances_}
)
feature_importance_rf = feature_importance_rf.sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features (Random Forest):")
print(feature_importance_rf.head(10).to_string(index=False))


## 3. XGBoost Model


In [None]:
# Fit the XGBoost regressor and report its scores on both splits
print("Training XGBoost Regressor...")
print("=" * 60)

xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict on the training split as well, so train and test scores can be compared
y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb = xgb_model.predict(X_test)

# Error metrics, training split first and test split second
train_mse_xgb = mean_squared_error(y_train, y_train_pred_xgb)
test_mse_xgb = mean_squared_error(y_test, y_test_pred_xgb)
train_rmse_xgb = np.sqrt(train_mse_xgb)
test_rmse_xgb = np.sqrt(test_mse_xgb)

train_mae_xgb = mean_absolute_error(y_train, y_train_pred_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)

train_r2_xgb = r2_score(y_train, y_train_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

print("XGBoost Results:")
print(f"  Training RMSE: {train_rmse_xgb:.4f}")
print(f"  Test RMSE: {test_rmse_xgb:.4f}")
print(f"  Training MAE: {train_mae_xgb:.4f}")
print(f"  Test MAE: {test_mae_xgb:.4f}")
print(f"  Training R²: {train_r2_xgb:.4f}")
print(f"  Test R²: {test_r2_xgb:.4f}")

# Rank features by importance; use generic names when X carries no column labels
xgb_importance_columns = {
    'Feature': X.columns if hasattr(X, 'columns') else [f'Feature_{i}' for i in range(X.shape[1])],
    'Importance': xgb_model.feature_importances_,
}
feature_importance_xgb = pd.DataFrame(xgb_importance_columns)
feature_importance_xgb = feature_importance_xgb.sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features (XGBoost):")
print(feature_importance_xgb.head(10).to_string(index=False))


## 4. Model Comparison


In [None]:
# Tabulate the train/test metrics of both models, one row per model
comparison_df = pd.DataFrame([
    {
        'Model': 'Random Forest',
        'Train RMSE': train_rmse_rf,
        'Test RMSE': test_rmse_rf,
        'Train MAE': train_mae_rf,
        'Test MAE': test_mae_rf,
        'Train R²': train_r2_rf,
        'Test R²': test_r2_rf,
    },
    {
        'Model': 'XGBoost',
        'Train RMSE': train_rmse_xgb,
        'Test RMSE': test_rmse_xgb,
        'Train MAE': train_mae_xgb,
        'Test MAE': test_mae_xgb,
        'Train R²': train_r2_xgb,
        'Test R²': test_r2_xgb,
    },
])

print("Model Comparison:")
print(comparison_df.to_string(index=False))

# 2x2 figure: three test-metric bar charts plus one predicted-vs-actual scatter
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

model_labels = ['Random Forest', 'XGBoost']
bar_panels = [
    (axes[0, 0], [test_rmse_rf, test_rmse_xgb], 'Test RMSE Comparison', 'RMSE'),
    (axes[0, 1], [test_mae_rf, test_mae_xgb], 'Test MAE Comparison', 'MAE'),
    (axes[1, 0], [test_r2_rf, test_r2_xgb], 'Test R² Comparison', 'R² Score'),
]
for panel, scores, panel_title, y_label in bar_panels:
    panel.bar(model_labels, scores, color=['skyblue', 'lightgreen'])
    panel.set_title(panel_title, fontweight='bold')
    panel.set_ylabel(y_label)
    panel.grid(True, alpha=0.3, axis='y')

# The model with the strictly higher test R² is "best"; a tie goes to Random Forest
if test_r2_xgb > test_r2_rf:
    best_model_name, best_predictions = 'XGBoost', y_test_pred_xgb
else:
    best_model_name, best_predictions = 'Random Forest', y_test_pred_rf

scatter_ax = axes[1, 1]
scatter_ax.scatter(y_test, best_predictions, alpha=0.5)
# Dashed diagonal marks where predicted equals actual
actual_low, actual_high = y_test.min(), y_test.max()
scatter_ax.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', lw=2)
scatter_ax.set_title(f'Predicted vs Actual ({best_model_name})', fontweight='bold')
scatter_ax.set_xlabel('Actual Values')
scatter_ax.set_ylabel('Predicted Values')
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n✅ Best Model: {best_model_name} (R² = {max(test_r2_rf, test_r2_xgb):.4f})")


## 5. Cross-Validation


In [None]:
# Perform cross-validation
print("Performing 5-fold Cross-Validation...")
print("=" * 60)

# Evaluate both metrics from the same five fits per model. Calling
# cross_val_score once per metric refits every fold a second time, which
# doubles the training cost of this cell without changing the scores.
cv_scoring = {'mse': 'neg_mean_squared_error', 'r2': 'r2'}

# Random Forest CV
rf_cv_results = cross_validate(rf_model, X_train, y_train,
                               cv=5, scoring=cv_scoring, n_jobs=-1)
rf_cv_scores = rf_cv_results['test_mse']  # negated MSE, one value per fold
rf_cv_rmse = np.sqrt(-rf_cv_scores)
rf_cv_r2 = rf_cv_results['test_r2']

# XGBoost CV
xgb_cv_results = cross_validate(xgb_model, X_train, y_train,
                                cv=5, scoring=cv_scoring, n_jobs=-1)
xgb_cv_scores = xgb_cv_results['test_mse']  # negated MSE, one value per fold
xgb_cv_rmse = np.sqrt(-xgb_cv_scores)
xgb_cv_r2 = xgb_cv_results['test_r2']

# The "+/-" figure is two standard deviations of the per-fold scores
print("Random Forest Cross-Validation:")
print(f"  RMSE: {rf_cv_rmse.mean():.4f} (+/- {rf_cv_rmse.std() * 2:.4f})")
print(f"  R²: {rf_cv_r2.mean():.4f} (+/- {rf_cv_r2.std() * 2:.4f})")

print("\nXGBoost Cross-Validation:")
print(f"  RMSE: {xgb_cv_rmse.mean():.4f} (+/- {xgb_cv_rmse.std() * 2:.4f})")
print(f"  R²: {xgb_cv_r2.mean():.4f} (+/- {xgb_cv_r2.std() * 2:.4f})")

# Visualize CV results
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
cv_model_labels = ['Random Forest', 'XGBoost']

# Tick labels are set on the axes rather than through boxplot's `labels`
# keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`; this
# form works on both older and newer versions.
axes[0].boxplot([rf_cv_rmse, xgb_cv_rmse])
axes[0].set_xticklabels(cv_model_labels)
axes[0].set_title('Cross-Validation RMSE Distribution', fontweight='bold')
axes[0].set_ylabel('RMSE')
axes[0].grid(True, alpha=0.3, axis='y')

axes[1].boxplot([rf_cv_r2, xgb_cv_r2])
axes[1].set_xticklabels(cv_model_labels)
axes[1].set_title('Cross-Validation R² Distribution', fontweight='bold')
axes[1].set_ylabel('R² Score')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## 6. Save Models


In [None]:
# Write both models and the feature encoder to disk with joblib
models_dir = Path("../models/saved_models")
models_dir.mkdir(parents=True, exist_ok=True)

# (object to persist, file name inside models_dir, confirmation message)
artifacts = [
    (rf_model, "random_forest_model.joblib", "✅ Random Forest model saved"),
    (xgb_model, "xgboost_model.joblib", "✅ XGBoost model saved"),
    (encoder, "feature_encoder.joblib", "✅ Feature encoder saved"),
]
for artifact, file_name, saved_message in artifacts:
    joblib.dump(artifact, models_dir / file_name)
    print(saved_message)

print(f"\nModels saved to: {models_dir.absolute()}")


## 7. Summary

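### Model Performance Summary:
> **TODO:** Replace the bracketed placeholders below with the values printed in Section 4 (Model Comparison) after running the notebook.

- **Best Model:** [Random Forest / XGBoost]
- **Test R² Score:** [Value]
- **Test RMSE:** [Value]
- **Test MAE:** [Value]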

### Key Insights:
- [Add insights about model performance]
- [Add observations about feature importance]
- [Add recommendations for model improvement]

### Next Steps:
- Proceed to ensemble modeling
- Implement time-series models (ARIMA, Prophet)
- Deploy models to API
