In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Load the raw crop-yield table from the current working directory.
df = pd.read_csv("crop_yield.csv")

# Model inputs and the regression target.
features = ['Crop', 'Crop_Year', 'Season', 'State',
            'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
target = 'Yield'

# Fail fast with a readable message if the CSV does not have the columns the
# rest of this cell relies on (still a KeyError, as plain indexing would raise).
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(f"crop_yield.csv is missing required columns: {missing_columns}")

# Drop every row that has a missing value in any column. Reassigning instead of
# using inplace=True avoids mutating the loaded frame in place, which pandas
# style guidance discourages and which brings no performance benefit.
df = df.dropna()

# An empty frame would only surface later as an opaque error in the split/fit steps.
if df.empty:
    raise ValueError("No rows left in crop_yield.csv after dropping missing values")

# NOTE(review): if 'Yield' in this dataset is derived as Production / Area
# (common for crop-yield tables -- confirm against the data dictionary), then
# feeding both 'Production' and 'Area' to the model leaks the target and the
# test metrics will overstate real predictive skill. The feature list is left
# unchanged here because removing a column changes which inputs the saved
# pipeline consumes.
X = df[features]
y = df[target]

# Column groups: string-valued inputs are one-hot encoded, the remaining
# inputs are standardized.
categorical_features = ['Crop', 'Season', 'State']
numeric_features = ['Crop_Year', 'Area', 'Production',
                    'Annual_Rainfall', 'Fertilizer', 'Pesticide']

# One (name, transformer, columns) triple per column group.
# handle_unknown='ignore' makes categories not seen during fit encode as
# all-zero columns at predict time instead of raising.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Full model: preprocessing followed by a seeded random forest, so the
# transformers are fitted only on whatever data the pipeline is fitted on.
forest = RandomForestRegressor(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Hyperparameter candidates for the grid search. Each key is written as
# '<step>__<parameter>' so the value is routed to the pipeline step named
# 'regressor'. 3 x 3 x 2 = 18 combinations in total.
param_grid = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[5, 10, None],
    regressor__min_samples_split=[2, 5],
)

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split);
# the search below only ever sees the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Exhaustive search over param_grid: 5-fold cross-validation scored by R²,
# run on all available cores, with progress reporting enabled.
print("Starting GridSearchCV to find the best model parameters...")
search_options = {'cv': 5, 'scoring': 'r2', 'n_jobs': -1, 'verbose': 1}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, **search_options)

# Fit every candidate on the training split.
grid_search.fit(X_train, y_train)
print("GridSearchCV training complete.")

# Keep the winning pipeline and report the parameters that produced it.
best_model = grid_search.best_estimator_
print(f"Best model parameters found: {grid_search.best_params_}")

# Score the tuned pipeline on the held-out test rows.
y_pred = best_model.predict(X_test)

# Regression metrics (a confusion matrix applies to classification, not here):
# R² for explained variance, MAE/MSE for error size, RMSE as the square root of MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Emit the whole report in one write; the text is line-for-line what
# separate print calls would produce.
report_lines = [
    "\nModel Performance Metrics:",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
]
print("\n".join(report_lines))

# Serialize the tuned pipeline (preprocessing + regressor together) to disk in
# binary mode. Pickle files should only ever be loaded from trusted sources.
filename = 'best_crop_yield_model.pkl'
with open(filename, mode='wb') as file:
    pickle.dump(best_model, file)

saved_message = f"\nBest model pipeline saved as {filename}"
print(saved_message)


GridSearchCV training complete.
Best model parameters found: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}

Model Performance Metrics:
R-squared (R²): 0.9845
Mean Absolute Error (MAE): 8.91
Mean Squared Error (MSE): 12388.60
Root Mean Squared Error (RMSE): 111.30

Best model pipeline saved as best_crop_yield_model.pkl
