In [1]:
import pandas as pd
import time
import logging
import argparse
import joblib
import sys
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
# Configure logging
# Name used to recognise the console handler this notebook installs.
CONSOLE_HANDLER_NAME = "model_training_console"


def configure_logging(log_file='model_training.log', level=logging.INFO):
    """Set up file and console logging on the root logger without duplicating handlers.

    Re-running the configuration cell used to attach a brand-new
    ``StreamHandler`` every time, so each message was echoed once per run of
    the cell. This function tags the console handler with a name and reuses it
    when it is already attached, which makes the cell safe to re-run.

    Args:
        log_file: Path handed to ``logging.basicConfig(filename=...)``.
        level: Level applied to the root configuration and the console handler.

    Returns:
        The console ``logging.StreamHandler`` attached to the root logger.
    """
    # basicConfig is a no-op when the root logger already has handlers, so
    # calling it again on a re-run is harmless.
    logging.basicConfig(filename=log_file, level=level, format='%(asctime)s - %(levelname)s - %(message)s')
    root_logger = logging.getLogger('')

    # Reuse the console handler from a previous run instead of stacking another.
    for handler in root_logger.handlers:
        if handler.get_name() == CONSOLE_HANDLER_NAME:
            handler.setLevel(level)
            return handler

    console_handler = logging.StreamHandler()
    console_handler.set_name(CONSOLE_HANDLER_NAME)
    console_handler.setLevel(level)
    root_logger.addHandler(console_handler)
    return console_handler


console = configure_logging()

In [27]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises; it is logged at ERROR
            level and then re-raised unchanged.
    """
    try:
        logging.info("Loading dataset from %s", file_path)
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Record the failure, then let the caller see the original exception.
        logging.error("Error loading dataset: %s", err)
        raise
    else:
        return frame

In [28]:
def split_data(df, target="concrete_compressive_strength", random_state=42):
    """Split a dataset into train, validation, and test sets.

    The frame is first split 70/30, then the 30% remainder is split so that
    one third of it becomes the test set. This gives roughly 70% train,
    20% validation and 10% test.

    Args:
        df: DataFrame holding the feature columns plus the target column.
        target: Name of the column to predict. It is removed from the
            features and returned as ``y``.
        random_state: Seed passed to both ``train_test_split`` calls so the
            partition is reproducible.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        Exception: Any error raised while splitting (for example a missing
            target column); it is logged at ERROR level and re-raised.
    """
    try:
        logging.info("Splitting dataset.")
        X = df.drop(columns=[target])
        y = df[target]
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_state)
        # One third of the 30% hold-out becomes the test set; the rest is validation.
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=random_state)
        return X_train, X_val, X_test, y_train, y_val, y_test
    except Exception as e:
        logging.error("Error splitting dataset: %s", e)
        raise

In [29]:
def train_and_evaluate(models, X_train, y_train, X_val, y_val):
    """Train and evaluate multiple models."""
    logging.info("Training and evaluating models.")
    model_results = {}
    for name, model in models.items():
        try:
            start_time = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            end_time = time.time()
            
            mse = mean_squared_error(y_val, y_pred)
            rmse = mse ** 0.5
            r2 = r2_score(y_val, y_pred)
            latency = end_time - start_time

            model_results[name] = {"MSE": mse, "RMSE": rmse, "R² Score": r2, "Latency (s)": latency}
            logging.info(f"{name} - MSE: {mse}, RMSE: {rmse}, R² Score: {r2}, Latency: {latency}")
        except Exception as e:
            logging.error("Error training %s: %s", name, e)
    return pd.DataFrame(model_results).T

In [30]:
def hyperparameter_tuning(X_train, y_train, param_grid=None, model_path="best_rf_model.pkl", cv=3):
    """Grid-search a RandomForestRegressor and save the best estimator to disk.

    Args:
        X_train: Training features.
        y_train: Training targets.
        param_grid: Grid handed to ``GridSearchCV``. When ``None`` the
            original grid is used: ``n_estimators`` in (50, 100, 150) and
            ``max_depth`` in (5, 10, 15).
        model_path: File the best estimator is written to with
            ``joblib.dump``. An existing file at that path is overwritten.
        cv: Number of cross-validation folds passed to ``GridSearchCV``.

    Returns:
        Tuple ``(best_params, best_score)`` taken from the fitted search's
        ``best_params_`` and ``best_score_`` (scored with ``"r2"``).

    Raises:
        Exception: Any error raised by the search or while saving the model;
            it is logged at ERROR level and re-raised.
    """
    if param_grid is None:
        # Built per call so the default grid is never shared or mutated across calls.
        param_grid = {"n_estimators": [50, 100, 150], "max_depth": [5, 10, 15]}
    try:
        logging.info("Starting hyperparameter tuning for Random Forest.")
        rf_model = RandomForestRegressor(random_state=42)
        grid_search = GridSearchCV(rf_model, param_grid, cv=cv, scoring="r2", n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        # Log the outcome before writing to disk so a failed dump does not
        # lose the result of the search from the log.
        logging.info("Best RF Params: %s, Best RF R² Score: %s", grid_search.best_params_, grid_search.best_score_)
        joblib.dump(grid_search.best_estimator_, model_path)
        return grid_search.best_params_, grid_search.best_score_
    except Exception as e:
        logging.error("Error in hyperparameter tuning: %s", e)
        raise

In [31]:
def is_jupyter_notebook():
    """Report whether the code is executing inside an active IPython shell.

    Returns:
        ``True`` when IPython can be imported and ``get_ipython()`` returns a
        shell object; ``False`` when IPython is not installed or no shell is
        active. The check does not distinguish a Jupyter kernel from other
        IPython front ends, so any interactive IPython session returns
        ``True``.
    """
    try:
        from IPython import get_ipython as _active_shell
    except ImportError:
        # IPython is not available, so this cannot be a notebook session.
        inside_shell = False
    else:
        inside_shell = _active_shell() is not None
    return inside_shell

In [32]:
# Entry point: choose the dataset path, load and split the data, compare four
# regressors on the validation split, then grid-search a Random Forest.
if __name__ == "__main__":
    # Inside an IPython session the command line is not parsed and a fixed
    # file name is used; otherwise the path comes from the --data option.
    # NOTE(review): the two branches default to different file names
    # ("concrete_data.csv" vs "cleaned_dataset.csv") - confirm this is intended.
    if is_jupyter_notebook():
        logging.info("Running in Jupyter Notebook. Using default dataset.")
        data_path = "concrete_data.csv"
    else:
        parser = argparse.ArgumentParser(description="Train ML models on concrete dataset.")
        parser.add_argument("--data", type=str, default="cleaned_dataset.csv", help="Path to dataset")
        args = parser.parse_args()
        data_path = args.data
    
    df = load_data(data_path)
    # X_test / y_test are produced here but not used anywhere in this block.
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)
    
    # Candidate models keyed by the label shown in the results table.
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(objective="reg:squarederror", n_estimators=50, random_state=42)
    }
    
    # Metrics are computed on the validation split.
    results_df = train_and_evaluate(models, X_train, y_train, X_val, y_val)
    print(results_df)
    
    # The reported score is the search's cross-validated R² on the training
    # split, so it is not directly comparable with the validation R² above.
    best_params, best_score = hyperparameter_tuning(X_train, y_train)
    print("Best RF Params:", best_params)
    print("Best RF R² Score:", best_score)

Running in Jupyter Notebook. Using default dataset.
Running in Jupyter Notebook. Using default dataset.
Running in Jupyter Notebook. Using default dataset.
Loading dataset from concrete_data.csv
Loading dataset from concrete_data.csv
Loading dataset from concrete_data.csv
Splitting dataset.
Splitting dataset.
Splitting dataset.
Training and evaluating models.
Training and evaluating models.
Training and evaluating models.
Linear Regression - MSE: 113.44365763931016, RMSE: 10.650993270080972, R² Score: 0.6006252770706919, Latency: 0.005258798599243164
Linear Regression - MSE: 113.44365763931016, RMSE: 10.650993270080972, R² Score: 0.6006252770706919, Latency: 0.005258798599243164
Linear Regression - MSE: 113.44365763931016, RMSE: 10.650993270080972, R² Score: 0.6006252770706919, Latency: 0.005258798599243164
Decision Tree - MSE: 48.65592669902914, RMSE: 6.975380039756195, R² Score: 0.8287083857426686, Latency: 0.04659008979797363
Decision Tree - MSE: 48.65592669902914, RMSE: 6.975380039

                          MSE       RMSE  R² Score  Latency (s)
Linear Regression  113.443658  10.650993  0.600625     0.005259
Decision Tree       48.655927   6.975380  0.828708     0.046590
Random Forest       26.613475   5.158825  0.906308     1.763304
XGBoost             19.705484   4.439086  0.930627     0.101183
Fitting 3 folds for each of 9 candidates, totalling 27 fits


Best RF Params: {'max_depth': 15, 'n_estimators': 150}, Best RF R² Score: 0.8847256340402937
Best RF Params: {'max_depth': 15, 'n_estimators': 150}, Best RF R² Score: 0.8847256340402937
Best RF Params: {'max_depth': 15, 'n_estimators': 150}, Best RF R² Score: 0.8847256340402937


Best RF Params: {'max_depth': 15, 'n_estimators': 150}
Best RF R² Score: 0.8847256340402937
