In [76]:
# Import necessary modules
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor,HistGradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor


In [71]:
# Read the cleaned battery dataset (the alternative source file is kept below for reference)
df = pd.read_csv('data/Battery_RUL_cleaned.csv')
#df = pd.read_csv('data/Battery_RUL.csv')

# Keep every column except the first one
df = df[df.columns[1:]]

# Features are all remaining columns but the last; the last column is the target
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:
def plot_grid_search_results(results, param1, param2, score1, score2, xscale='log'):
    """
    Plot two grid-search scores against one hyperparameter on twin y-axes.

    One line is drawn per distinct value of ``param2``. ``score1`` is drawn on
    the primary (left) axis with solid lines and 'o' markers, ``score2`` on the
    secondary (right) axis with dashed lines and 'x' markers. Both score
    columns are negated before plotting, which turns the negative values
    produced by scorers built with ``greater_is_better=False`` into positive
    errors.

    Parameters:
    - results: DataFrame containing the GridSearchCV results.
    - param1: The first hyperparameter to plot on the x-axis.
    - param2: The second hyperparameter to differentiate the lines.
    - score1: The first score to plot on the primary y-axis.
    - score2: The second score to plot on the secondary y-axis.
    - xscale: Matplotlib scale name applied to the x-axis. Defaults to 'log'
      (the previous hard-coded behavior); pass 'linear' for hyperparameters
      that are not spread over orders of magnitude.

    Returns:
    - None. The figure is rendered with ``plt.show()``.
    """
    group_column = results[param2]

    def _plot_score(ax, score, linestyle, marker):
        """Draw one line per ``param2`` value for ``score`` and label the y-axis."""
        for value in group_column.unique():
            # A missing param2 value (None/NaN) never compares equal to itself,
            # so `== value` would select no rows and draw an empty line.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                mask = group_column.isna()
            else:
                mask = group_column == value
            subset = results[mask]
            ax.plot(
                subset[param1],
                -subset[score],
                label=f'{score} - {param2}: {value}',
                linestyle=linestyle,
                marker=marker,
            )
        ax.set_ylabel(score)

    # Create a single plot with two y-axes
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot the first score on the primary y-axis
    _plot_score(ax1, score1, linestyle='-', marker='o')
    ax1.set_xlabel(param1)
    ax1.set_xscale(xscale)
    ax1.legend(loc='upper left')
    ax1.set_title('Grid Search Results')

    # Create a secondary y-axis for the second score
    ax2 = ax1.twinx()
    _plot_score(ax2, score2, linestyle='--', marker='x')
    ax2.legend(loc='upper right')

    # Show the plot
    plt.tight_layout()
    plt.show()

In [77]:
# Define models to evaluate.
# A shared seed makes the stochastic estimators repeatable across kernel
# restarts, so the ranking printed below can be reproduced.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Support Vector Regressor": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Multi-layer Perceptron": MLPRegressor(random_state=RANDOM_STATE),
    "Elastic Net": ElasticNet(),
    "Bayesian Ridge": BayesianRidge(),
    "Extra Trees": ExtraTreesRegressor(n_estimators=1000, random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Bagging": BaggingRegressor(random_state=RANDOM_STATE),
    "Hist Gradient Boosting": HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    "CatBoost": CatBoostRegressor(verbose=0, random_seed=RANDOM_STATE),
    "LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE)
}


# Collect one metrics record (a dict) per model; converted to a DataFrame below
results = []

# Evaluate each model on the held-out test split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # reuse the MSE instead of recomputing it
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name,"MAE": mae, "MSE": mse, "RMSE": rmse, "R²": r2})

# Convert results to a DataFrame and rank them (lowest MAE first)
results_df = pd.DataFrame(results).sort_values(by="MAE", ascending=True)
print(results_df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 10799, number of used features: 7
[LightGBM] [Info] Start training from score 583.079452
                       Model        MAE          MSE       RMSE        R²
11               Extra Trees   6.409673   161.862840  12.722533  0.998251
4              Random Forest   8.037400   237.464521  15.409884  0.997435
13                   Bagging   8.661406   291.514112  17.073784  0.996851
6              Decision Tree   8.911481   564.532963  23.759902  0.993901
17                   XGBoost  11.417299   323.778256  17.993839  0.996502
15                  CatBoost  14.020744   410.249081  20.254606  0.995568
16                  LightGBM  16.361694   550.314488  23.458783  0.994055
14    Hist Gradient Boosting  16.452931   560.758489  23.680340  

In [84]:
# Grid-search setup (currently disabled)
#param_grid = {
#    "n_estimators": [100, 200, 500],
#    "max_depth": [10, 20, 50, None],
#    "min_samples_split": [2, 5, 10],
#    "min_samples_leaf": [1, 2, 4],
#    "max_features": [None, "sqrt", "log2"]}
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Active configuration: one ExtraTreesRegressor with 10000 trees, using all cores.
# The commented alternative below runs the grid search instead.
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=10000)
#model = GridSearchCV(ExtraTreesRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=1, verbose=3)

# Train on the training split
model.fit(X_train, y_train)

# Grid-search only: persist cv_results_ and inspect the best estimator
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/ETR1.csv', index=False)
#best_model = model.best_estimator_
#print(best_model)

# Score the held-out test split
y_pred = model.predict(X_test)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 6.3922
Mean Squared Error: 161.5680


In [None]:
# Grid-search setup (currently disabled)
#param_grid = [
#    {'C': [1e+4, 1e+5], 'kernel': ['rbf'], 'gamma': [1e-5, 1e-4]}]
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Active configuration: linear-kernel SVR with C=0.0001.
# Previously recorded:
#   Mean Absolute Error: 35.6885
#   Mean Squared Error: 2358.7762
#   first run: 39.9814
model = SVR(C=0.0001, kernel='linear')
#model = GridSearchCV(SVR(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)

# Train on the training split
model.fit(X_train, y_train)

# Grid-search only: persist cv_results_, plot them, and inspect the best estimator
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/SVR2.csv', index=False)
#plot_grid_search_results(results, 'param_C', 'param_gamma', 'mean_test_MSE', 'mean_test_MAE')
#best_model = model.best_estimator_
#print(best_model)

# Score the held-out test split
y_pred = model.predict(X_test)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

In [15]:
# Grid-search setup (currently disabled)
#param_grid = [
#    {'max_depth': [None, 1, 2, 3, 4, 5, 6], 'n_estimators': [1000] }]
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Active configuration: random forest with 1000 trees, using all cores.
# Previously recorded:
#   Mean Absolute Error: 7.9780
#   Mean Squared Error: 236.9407
#   first run: 8.0703
#   best run: 7.9780
model = RandomForestRegressor(n_jobs=-1, n_estimators=1000)
#model = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)

# Train on the training split
model.fit(X_train, y_train)

# Grid-search only: persist cv_results_, plot them, and inspect the best estimator
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/RFR1.csv', index=False)
#plot_grid_search_results(results, 'param_max_depth', 'param_n_estimators', 'mean_test_MSE', 'mean_test_MAE')
#best_model = model.best_estimator_
#print(best_model)

# Score the held-out test split
y_pred = model.predict(X_test)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 7.9351
Mean Squared Error: 233.6358


In [10]:
# Grid-search setup (currently disabled)
#param_grid = [
#    {'n_estimators': [1000, 10000, 100000, 1000000], 'learning_rate':[0.1], 'objective': ['reg:squarederror']}]
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Active configuration: XGBoost with 10000 boosting rounds at learning rate 0.1.
# Previously recorded:
#   Mean Absolute Error: 7.9632
#   Mean Squared Error: 210.3707
#   first run: 15.3975
#   best run: 7.9632
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10000,
    learning_rate=0.1,
    n_jobs=-1,
)
# The grid-search alternative uses the XGBRegressor name imported by this notebook
# (no `xgb` module alias is imported).
#model = GridSearchCV(XGBRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)

# Train on the training split
model.fit(X_train, y_train)

# Grid-search only: persist cv_results_, plot them, and inspect the best estimator
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/XGB2.csv', index=False)
#plot_grid_search_results(results, 'param_n_estimators', 'param_learning_rate','mean_test_MSE', 'mean_test_MAE')
#best_model = model.best_estimator_
#print(best_model)

# Score the held-out test split
y_pred = model.predict(X_test)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 7.9632
Mean Squared Error: 210.3707


In [4]:
# Logistic Regression baseline.
# NOTE(review): LogisticRegression is a classifier, so every distinct target
# value is treated as a class label rather than a point on a continuous scale.
# Confirm this baseline is intended for a regression target.
model = LogisticRegression(n_jobs=-1, max_iter=1000)

# Fit on the training split and predict the held-out test split in one chain
# (fit returns the fitted estimator itself)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report the test-set errors
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

# first run: 39.9814

Mean Absolute Error: 47.2215
Mean Squared Error: 3565.5578


In [24]:
# CatBoost regressor: 100000 iterations, depth-6 trees, RMSE loss, silent training.
# Previously recorded best run:
#   Mean Absolute Error: 7.5714
#   Mean Squared Error: 186.3288
model = CatBoostRegressor(
    iterations=100000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=0,
)

# Train on the training split
model.fit(X_train, y_train)

# Score the held-out test split
y_pred = model.predict(X_test)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 7.5714
Mean Squared Error: 186.3288


In [27]:
# Gaussian Process regressor with its default settings
model = GaussianProcessRegressor()

# Fit on the training split and predict the held-out test split in one chain
# (fit returns the fitted estimator itself)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report the test-set errors
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")


Mean Absolute Error: 567.5223
Mean Squared Error: 419274.5008


In [88]:
# Function to save real and predicted values to a CSV file
def save_predictions_to_csv(model, X_test, y_test, filename):
    """
    Predict on ``X_test`` and write real vs. predicted values to a CSV file.

    The file has two columns, ``Real`` (taken from ``y_test``) and
    ``Predicted`` (``model.predict(X_test)``), and no index column.

    Parameters:
    - model: Fitted estimator exposing ``predict(X)``.
    - X_test: Feature rows to predict on.
    - y_test: True target values, one per row of ``X_test``.
    - filename: Destination path (str or path-like) or a writable buffer.
      For paths, missing parent directories are created first.

    Returns:
    - None.
    """
    import os

    y_pred = model.predict(X_test)
    results_df = pd.DataFrame({'Real': y_test, 'Predicted': y_pred})

    # to_csv does not create directories; without this, writing into a
    # not-yet-existing folder such as 'data/predictions' raises an error.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    results_df.to_csv(filename, index=False)

# Build every model up front, then fit each one and export its test-set predictions.
model_svr = SVR(kernel='linear', C=0.0001)                       # Support Vector Regressor
model_rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)   # Random Forest Regressor
model_xgb = XGBRegressor(                                        # XGBoost Regressor
    objective='reg:squarederror',
    n_estimators=10000,
    learning_rate=0.1,
    n_jobs=-1,
)
model_lr = LogisticRegression(max_iter=1000, n_jobs=-1)          # Logistic Regression
model_cb = CatBoostRegressor(                                    # CatBoost Regressor
    iterations=100000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=0,
)
model_et = ExtraTreesRegressor(n_estimators=10000, n_jobs=-1)    # Extra Trees Regressor

# Fit and export in the same order as the models are declared above
for _model, _csv_path in (
    (model_svr, 'data/predictions/Support_Vector_Regression.csv'),
    (model_rf, 'data/predictions/Random_Forest_Regression.csv'),
    (model_xgb, 'data/predictions/XGBoost_Regression.csv'),
    (model_lr, 'data/predictions/Logistic_Regression.csv'),
    (model_cb, 'data/predictions/CatBoost_Regression.csv'),
    (model_et, 'data/predictions/Extra_Trees_Regression.csv'),
):
    _model.fit(X_train, y_train)
    save_predictions_to_csv(_model, X_test, y_test, _csv_path)