In [None]:
import pandas as pd
from google.colab import files

# Ask for a file upload through google.colab's `files` helper and keep the
# call's return value in `uploaded`. The cell output below reports the file
# being saved as modified_file.csv.
uploaded = files.upload()

Saving modified_file.csv to modified_file.csv


In [None]:
# Load the uploaded dataset. This read used to be commented out, which left
# `df` undefined on a fresh kernel, so this cell and every cell after it
# failed with NameError under Restart & Run All.
df = pd.read_csv("modified_file.csv")

# Preview the last three rows as a quick sanity check of the load.
df.tail(3)

Unnamed: 0,OPC (Kg/m³),Fine agg (Kg/m³),Coarse agg (Kg/m³),Fly ash (Kg/m³),Silica fume (Kg/m³),Marble powder (Kg/m³),Water (Kg/m³),Superplasticizer (Kg/m³),Curing age (days),Compressive strength (Mpa)
246,555.0,968.0,720.0,0.0,0.0,0.0,180.0,7.5,7,40.2
247,525.0,972.0,718.0,0.0,30.0,0.0,182.0,8.0,7,45.3
248,470.0,970.0,715.0,0.0,25.0,50.0,183.0,6.1,7,48.0


In [None]:
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor


# One LabelEncoder per object-dtype column; the fitted encoders are kept in
# `label_encoders` (keyed by column name) so the integer codes can be mapped
# back later. The encoded values overwrite the column in `df`.
label_encoders = {
    col: LabelEncoder() for col in df.select_dtypes(include='object').columns
}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# Features are every column except the last; the last column is the target.
# NOTE(review): the tail() output header starts with 'Unnamed: 0'; if that is
# a real column (a saved index) rather than the displayed index, it is
# included in X as a feature — confirm.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for the final test; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter search spaces, keyed by model display name. Each inner dict
# maps an estimator parameter to the list of candidate values to try.
param_grids = {
    "Gradient Boosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 6, 9]
    },
    "XGBoost": {
        'learning_rate': [0.1],
        'max_depth': [4],
        'reg_alpha': [0.1],
        'reg_lambda': [0.001],
        'n_estimators': [100]
    },
    "LightGBM": {
        'learning_rate': [0.1],
        'max_depth': [4],
        'subsample': [0.5],
        # LightGBM only applies row subsampling when subsample_freq > 0; with
        # the default of 0 the subsample value above is silently ignored.
        'subsample_freq': [1],
        'n_estimators': [100],
        'num_leaves': [31],
        # scikit-learn API name for LightGBM's `min_data_in_leaf`; passing the
        # alias next to the constructor's own min_child_samples default makes
        # LightGBM warn on every fit.
        'min_child_samples': [10],
        'boosting_type': ['gbdt']
    },
    "CatBoost": {
        'iterations': [100],
        'learning_rate': [0.1],
        'depth': [4, 6, 8]
    }
}
# Base estimators to tune, keyed by display name.
# - Every estimator gets a fixed seed so repeated runs give the same fits.
# - `use_label_encoder` was dropped: XGBoost no longer uses it and printed a
#   "Parameters: { "use_label_encoder" } are not used." warning on every fit.
# - LightGBM's verbose=-1 silences its per-fit [Info] log lines.
models = {
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(eval_metric='rmse', random_state=42),
    "LightGBM": LGBMRegressor(random_state=42, verbose=-1),
    "CatBoost": CatBoostRegressor(verbose=0, random_seed=42)
}



In [None]:
from sklearn.model_selection import cross_val_score

def evaluate_model_with_cv(model_name, model, param_grid, X_train, y_train, X_test, y_test):
    """Tune `model` with a 10-fold grid search, then score the best estimator.

    The grid search selects hyperparameters by negative mean squared error.
    The refitted best estimator is then cross-validated on the training data
    (10 folds, RMSE and R² from the same fits) and evaluated once on the
    held-out test data. A summary of all metrics is printed.

    Args:
        model_name: Label used in the printed report.
        model: Unfitted estimator handed to GridSearchCV.
        param_grid: Mapping of parameter name -> list of candidate values.
        X_train, y_train: Data used for the grid search and cross-validation.
        X_test, y_test: Held-out data used only for the final test metrics.

    Returns:
        dict with keys 'Best Model', 'Best Params', 'MAE', 'MSE', 'RMSE',
        'R² Score', 'CE', 'CV RMSE Mean', 'CV RMSE Std', 'CV R² Mean' and
        'CV R² Std'.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cells only bring in cross_val_score.
    from sklearn.model_selection import cross_validate

    print(f"Evaluating {model_name}...")

    # Hyperparameter tuning. Train scores are not requested because nothing
    # below reads them, and computing them costs extra predictions per fold.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=10,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # One 10-fold pass yields both metrics, so RMSE and R² describe the same
    # fitted folds and the best model is refit 10 times instead of 20.
    cv_scores = cross_validate(
        best_model, X_train, y_train, cv=10,
        scoring=('neg_mean_squared_error', 'r2')
    )

    # The scorer reports negated MSE; flip the sign before taking the root.
    cv_rmse_scores = np.sqrt(-cv_scores['test_neg_mean_squared_error'])
    cv_rmse_mean = cv_rmse_scores.mean()
    cv_rmse_std = cv_rmse_scores.std()

    cv_r2_scores = cv_scores['test_r2']
    cv_r2_mean = cv_r2_scores.mean()
    cv_r2_std = cv_r2_scores.std()

    # Predictions on the held-out test set (best_model was refit on all of
    # the training data by the grid search).
    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Coefficient of efficiency: 1 - SS_res / SS_tot. For a single target this
    # is the same quantity as the R² above; it is kept because the 'CE' key is
    # part of the returned summary.
    ss_tot = np.sum((y_test - np.mean(y_test))**2)
    ss_res = np.sum((y_test - y_pred)**2)
    ce = 1 - (ss_res / ss_tot)

    print(f"Best Hyperparameters: {best_params}")
    print(f"CV RMSE Mean: {cv_rmse_mean:.4f}, CV RMSE Std: {cv_rmse_std:.4f}")
    print(f"CV R² Mean: {cv_r2_mean:.4f}, CV R² Std: {cv_r2_std:.4f}")
    print(f"Test MAE: {mae:.4f}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test R² Score: {r2:.4f}")
    print(f"CE: {ce:.4f}")
    print('-'*60)

    return {
        'Best Model': best_model,
        'Best Params': best_params,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R² Score': r2,
        'CE': ce,
        'CV RMSE Mean': cv_rmse_mean,
        'CV RMSE Std': cv_rmse_std,
        'CV R² Mean': cv_r2_mean,
        'CV R² Std': cv_r2_std
    }


In [None]:
# Tune and score each estimator in `models`, collecting one metrics dict per name.
results = dict()
for model_name in models:
    model = models[model_name]
    # Estimators without an entry in param_grids fall back to an empty grid.
    param_grid = param_grids.get(model_name, {})
    results[model_name] = evaluate_model_with_cv(
        model_name, model, param_grid,
        X_train, y_train, X_test, y_test,
    )

# Summary table: one row per model, one column per returned metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)

Evaluating Gradient Boosting...
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
CV RMSE Mean: 2.3957, CV RMSE Std: 0.5643
CV R² Mean: 0.9487, CV R² Std: 0.0262
Test MAE: 1.7454
Test RMSE: 2.2149
Test R² Score: 0.9632
CE: 0.9632
------------------------------------------------------------
Evaluating XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 0.001}
CV RMSE Mean: 2.9884, CV RMSE Std: 0.7158
CV R² Mean: 0.9201, CV R² Std: 0.0427
Test MAE: 1.9897
Test RMSE: 2.6479
Test R² Score: 0.9474
CE: 0.9474
------------------------------------------------------------
Evaluating LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 161
[LightGBM] [Info] Number of data points in the train set: 199, number of used features: 9
[LightGBM] [Info] Start training from score 49.884020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 179, number of used features: 9
[LightGBM] [Info] Start