In [1]:
# Standard library
# `math` used to be pulled in via `from numpy import math`; that alias was
# deprecated in NumPy 1.25 and removed in NumPy 2.0, so the stdlib module is
# imported directly (it is the same module object the alias pointed to).
import math
import warnings

# Third-party
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Suppress every warning for the remainder of the session.
# NOTE(review): this also hides deprecation notices from the libraries above;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

  from numpy import math


In [2]:
import pickle


def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
concrete_data = _read_pickle('concrete_content.pkl')
strength_data = _read_pickle('strength.pkl')

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    concrete_data,
    strength_data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics enter the fit.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [5]:
# Baseline: gradient boosting with default hyperparameters.
# The seed is pinned (same value the tuned model uses) so that re-running the
# notebook reproduces the metrics printed below.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train, y_train)
y_train_pred = gbr.predict(x_train)
y_test_pred = gbr.predict(x_test)

# Evaluate Model
# RMSE is computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version. For a single regression target the
# two are numerically the same quantity.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
# R² is reported on the held-out test split.
print(f"R² Score: {r2_score(y_test, y_test_pred):.4f}")

Train RMSE: 3.7737
Test RMSE: 5.6010
R² Score: 0.8948


In [6]:
# Base estimator for the search; the seed keeps every candidate fit repeatable.
gbr = GradientBoostingRegressor(random_state=42)

# Define Hyperparameter Grid
# 3 * 3 * 3 * 2 * 2 * 2 * 2 = 432 candidate combinations.
param_grid = {
    'n_estimators': [200, 500, 800],          # Number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1],       # Step size
    'max_depth': [3, 4, 5],                   # Tree depth
    'min_samples_split': [2, 5],              # Minimum samples to split
    'min_samples_leaf': [1, 3],               # Minimum samples per leaf
    'subsample': [0.8, 1.0],                  # Subsampling ratio
    'max_features': ['sqrt', 'log2']          # Feature selection
}

# Set Up Grid Search
# 5-fold CV over the training split only, using all available cores.
# The scorer is the *negated* RMSE (higher is better), hence the sign flip
# when reporting the best score below.
grid_search = GridSearchCV(
    gbr,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)

# Fit Model
grid_search.fit(x_train, y_train)

# Best Parameters & Performance
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

# Evaluate on Test Data
# `best_estimator_` is the winning configuration as exposed by the search.
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(x_test)
# sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6; the value is the same for a single regression target.
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test RMSE:", test_rmse)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 800, 'subsample': 1.0}
Best RMSE: 4.299509334842413
Test RMSE: 4.085243854986707


In [7]:
# R² of the most recent test-set predictions (the tuned model's, when the
# cells are run in order).
tuned_r2 = r2_score(y_test, y_test_pred)
print("R² Score:", tuned_r2)

R² Score: 0.9440569059614008
