In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
yield_df_data = pd.read_csv('/content/yield_df.csv')
# Feature Engineering
yield_df_data['temp_rain_interaction'] = yield_df_data['avg_temp'] * yield_df_data['average_rain_fall_mm_per_year']
yield_df_data['pesticides_rain_interaction'] = yield_df_data['pesticides_tonnes'] * yield_df_data['average_rain_fall_mm_per_year']
yield_df_data['pesticides_temp_interaction'] = yield_df_data['pesticides_tonnes'] * yield_df_data['avg_temp']
yield_df_data['temp_squared'] = yield_df_data['avg_temp'] ** 2

# Scaling features
features_to_scale = ['average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'temp_rain_interaction',
                     'pesticides_rain_interaction', 'pesticides_temp_interaction', 'temp_squared']

scaler = StandardScaler()
yield_df_data[features_to_scale] = scaler.fit_transform(yield_df_data[features_to_scale])

# Splitting the data
X = yield_df_data[features_to_scale]
y = yield_df_data['hg/ha_yield']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection and Tuning
# Random Forest Tuning
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'max_features': ['sqrt']
}
rf_random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, n_iter=5, cv=3, n_jobs=-1, verbose=1, random_state=42)
rf_random_search.fit(X_train, y_train)
rf_best_model = rf_random_search.best_estimator_

# SVR Tuning
svr_param_grid = {
    'C': [1, 10],
    'kernel': ['rbf'],
    'gamma': ['scale']
}
svr_random_search = RandomizedSearchCV(SVR(), svr_param_grid, n_iter=5, cv=3, n_jobs=-1, verbose=1, random_state=42)
svr_random_search.fit(X_train, y_train)
svr_best_model = svr_random_search.best_estimator_

# Gradient Boosting Tuning
gbr_param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [3, 5]
}
gbr_random_search = RandomizedSearchCV(GradientBoostingRegressor(random_state=42), gbr_param_grid, n_iter=5, cv=3, n_jobs=-1, verbose=1, random_state=42)
gbr_random_search.fit(X_train, y_train)
gbr_best_model = gbr_random_search.best_estimator_

# Model Evaluation
models = {
    'Random Forest': rf_best_model,
    'SVR': svr_best_model,
    'Gradient Boosting': gbr_best_model
}

results = {}
for name, model in models.items():
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    results[name] = {'MAE': mae, 'MSE': mse, 'R²': r2}

results

Fitting 3 folds for each of 4 candidates, totalling 12 fits




Fitting 3 folds for each of 2 candidates, totalling 6 fits




Fitting 3 folds for each of 2 candidates, totalling 6 fits


{'Random Forest': {'MAE': 63436.77621518436,
  'MSE': 6785998918.106205,
  'R²': 0.0644739732700228},
 'SVR': {'MAE': 56804.41215971462,
  'MSE': 8690136577.633966,
  'R²': -0.1980327498318517},
 'Gradient Boosting': {'MAE': 61790.0657695027,
  'MSE': 6368633455.603672,
  'R²': 0.12201248123935216}}