In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Read up to 6109 data rows from "Sheet1" (row 0 is the header) and discard
# the leading column in the same expression.
# NOTE(review): the first column is presumably an index/ID field — confirm.
df = pd.read_excel(
    "Updated_BMS_Data11.xlsm",
    sheet_name="Sheet1",
    engine="openpyxl",
    header=0,
    skiprows=0,
    nrows=6109,
).iloc[:, 1:]

# "tbsagrft" is the regression target; every remaining column is a feature.
y = df["tbsagrft"]
X = df.drop(columns=["tbsagrft"])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_normalized, X_test_normalized = (
    scaler.transform(split) for split in (X_train, X_test)
)

# ------------------------------- Random Forest Model -------------------------------

# Base estimator handed to the grid search (seeded for repeatable results).
model_rf = RandomForestRegressor(random_state=42)

# Search space: 3 * 4 * 3 * 3 = 108 hyperparameter combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation on the scaled training data.
# n_jobs=-1 runs the fits in parallel; fit() returns the search object itself.
grid_search_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_normalized, y_train)

# Keep the winning estimator and report the parameters that produced it.
best_rf_model = grid_search_rf.best_estimator_
print("Best Random Forest Parameters:", grid_search_rf.best_params_)

# Score the selected forest on the held-out test split.
y_pred_rf = best_rf_model.predict(X_test_normalized)
print("\nRandom Forest MSE:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest R²:", r2_score(y_test, y_pred_rf))

# ------------------------------- Gradient Boosting Model -------------------------------

# Base estimator handed to the grid search (seeded for repeatable results).
model_gb = GradientBoostingRegressor(random_state=42)

# Search space: 3 * 3 * 3 * 3 = 81 hyperparameter combinations.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
)

# Exhaustive search with 3-fold cross-validation on the scaled training data.
# n_jobs=-1 runs the fits in parallel; fit() returns the search object itself.
grid_search_gb = GridSearchCV(
    estimator=model_gb,
    param_grid=param_grid_gb,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_normalized, y_train)

# Keep the winning estimator and report the parameters that produced it.
best_gb_model = grid_search_gb.best_estimator_
print("Best Gradient Boosting Parameters:", grid_search_gb.best_params_)

# Score the selected booster on the held-out test split.
y_pred_gb = best_gb_model.predict(X_test_normalized)
print("\nGradient Boosting MSE:", mean_squared_error(y_test, y_pred_gb))
print("Gradient Boosting R²:", r2_score(y_test, y_pred_gb))

# ------------------------------- XGBoost Model -------------------------------

# XGBoost is trained once with fixed hyperparameters (no grid search here):
# squared-error objective, tree booster, 100 trees of depth <= 6, and a 0.1
# learning rate. fit() returns the estimator, so construction and training
# are chained into a single expression.
model_xgb = XGBRegressor(
    objective="reg:squarederror",
    booster="gbtree",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
).fit(X_train_normalized, y_train)

# Score the fitted model on the held-out test split.
y_pred_xgb = model_xgb.predict(X_test_normalized)
print("\nXGBoost Model MSE:", mean_squared_error(y_test, y_pred_xgb))
print("XGBoost Model R²:", r2_score(y_test, y_pred_xgb))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Random Forest Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

Random Forest MSE: 68.97052843661415
Random Forest R²: 0.806398055696665
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Gradient Boosting Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}

Gradient Boosting MSE: 72.64003456519963
Gradient Boosting R²: 0.7960976630908572

XGBoost Model MSE: 68.9565914588794
XGBoost Model R²: 0.8064371771308242
