In [14]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LogisticRegression, HuberRegressor, TheilSenRegressor, PoissonRegressor, TweedieRegressor, GammaRegressor, SGDRegressor, OrthogonalMatchingPursuit, PassiveAggressiveRegressor, RANSACRegressor, ElasticNetCV, OrthogonalMatchingPursuitCV, LarsCV, LassoCV, RidgeCV, ARDRegression, LassoLars
from sklearn.isotonic import IsotonicRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor,HistGradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split


In [15]:
# Read the cleaned battery dataset and discard its first column in one step
df = pd.read_csv('data/Battery_RUL_cleaned.csv').iloc[:, 1:]
#df = pd.read_csv('data/Battery_RUL.csv').iloc[:, 1:]

# Features are every column except the last; the last column is the target
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
def plot_grid_search_results(results, param1, param2, score1, score2, log_x=True):
    """
    Plots the grid search results with two y-axes for different hyperparameter values.

    One line per distinct value of `param2` is drawn on each axis: `score1`
    on the primary axis (solid line, 'o' markers) and `score2` on the
    secondary axis (dashed line, 'x' markers). Both score columns are negated
    before plotting, so negated-error columns (scorers created with
    `greater_is_better=False`) are displayed as positive errors.

    Parameters:
    - results: DataFrame containing the GridSearchCV results.
    - param1: The first hyperparameter to plot on the x-axis.
    - param2: The second hyperparameter to differentiate the lines.
    - score1: The first score to plot on the primary y-axis.
    - score2: The second score to plot on the secondary y-axis.
    - log_x: If True (default, the previous behaviour) the x-axis is
      logarithmic. Pass False when `param1` is not a strictly positive
      numeric hyperparameter.

    Returns:
    - None. The figure is displayed with `plt.show()`.
    """
    group_column = results[param2]

    # Build one (value, rows) pair per distinct param2 value. Missing values
    # (None / NaN, e.g. max_depth=None) need an explicit isna() mask because
    # an equality comparison against them never matches in pandas.
    groups = []
    for value in group_column.unique():
        is_missing = value is None or (isinstance(value, float) and value != value)
        mask = group_column.isna() if is_missing else group_column == value
        groups.append((value, results[mask]))

    # Create a single plot with two y-axes
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Plot the first score on the primary y-axis
    for value, subset in groups:
        ax1.plot(subset[param1], -subset[score1], label=f'{score1} - {param2}: {value}', linestyle='-', marker='o')
    ax1.set_xlabel(param1)
    ax1.set_ylabel(score1)
    if log_x:
        ax1.set_xscale('log')
    ax1.legend(loc='upper left')
    ax1.set_title('Grid Search Results')

    # Create a secondary y-axis (sharing the x-axis) for the second score
    ax2 = ax1.twinx()
    for value, subset in groups:
        ax2.plot(subset[param1], -subset[score2], label=f'{score2} - {param2}: {value}', linestyle='--', marker='x')
    ax2.set_ylabel(score2)
    ax2.legend(loc='upper right')

    # Show the plot
    plt.tight_layout()
    plt.show()

In [36]:
# Seed passed to the stochastic estimators below so the ranking is repeatable
# (same value as the seed used for the train/test split).
RANDOM_STATE = 42

# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(n_jobs=-1),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Support Vector Regressor": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=1000, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsRegressor(n_jobs=-1),
    "Multi-layer Perceptron": MLPRegressor(random_state=RANDOM_STATE),
    "Elastic Net": ElasticNet(),
    "Bayesian Ridge": BayesianRidge(),
    "Extra Trees": ExtraTreesRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Bagging": BaggingRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE),
    "Hist Gradient Boosting": HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    "CatBoost": CatBoostRegressor(n_estimators=1000, verbose=0, random_state=RANDOM_STATE),
    "LightGBM": LGBMRegressor(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE),
    "XGBoost": XGBRegressor(objective='reg:squarederror', n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE),
    #"Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "Gaussian Process": GaussianProcessRegressor(),
    "Huber Regressor": HuberRegressor(),
    "Theil-Sen Regressor": TheilSenRegressor(random_state=RANDOM_STATE),
    "Poisson Regressor": PoissonRegressor(),
    "Tweedie Regressor": TweedieRegressor(),
    "Gamma Regressor": GammaRegressor(),
    #"SGD Regressor": SGDRegressor(),
    "Orthogonal Matching Pursuit": OrthogonalMatchingPursuit(),
    "Passive Aggressive Regressor": PassiveAggressiveRegressor(random_state=RANDOM_STATE),
    "RANSAC Regressor": RANSACRegressor(random_state=RANDOM_STATE),
    "Elastic Net CV": ElasticNetCV(),
    "Orthogonal Matching Pursuit CV": OrthogonalMatchingPursuitCV(),
    "Lars CV": LarsCV(),
    "Lasso CV": LassoCV(),
    "Ridge CV": RidgeCV(),
    "ARD Regression": ARDRegression(),
    "Lasso Lars": LassoLars(),
    "Isotonic Regression": IsotonicRegression()
}


# Collect one record (dict of metrics) per model; converted to a DataFrame below
results = []

# Evaluate each model on the shared train/test split
for name, model in models.items():
    if name == "Isotonic Regression":
        # This estimator is fitted on the first feature column only
        X_train_model = X_train[:, 0].reshape(-1, 1)  # Select a single feature
        X_test_model = X_test[:, 0].reshape(-1, 1)
    else:
        X_train_model = X_train
        X_test_model = X_test
    model.fit(X_train_model, y_train)
    y_pred = model.predict(X_test_model)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name, "MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2})

# Convert results to a DataFrame, rank by MAE (lowest first) and persist the ranking
results_df = pd.DataFrame(results).sort_values(by="MAE", ascending=True)
results_df.to_csv('data/ranking.csv', index=False)
print(results_df)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 10799, number of used features: 7
[LightGBM] [Info] Start training from score 583.079452


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


                             Model         MAE            MSE        RMSE  \
11                     Extra Trees    6.405731     162.577321   12.750581   
13                         Bagging    7.931399     232.256216   15.239955   
4                    Random Forest    7.937425     232.661285   15.253238   
17                         XGBoost    8.668144     245.356720   15.663867   
6                    Decision Tree    8.943704     556.126852   23.582342   
16                        LightGBM    8.973900     239.627970   15.479922   
15                        CatBoost   14.020744     410.249081   20.254606   
5                Gradient Boosting   15.482934     505.434515   22.481871   
14          Hist Gradient Boosting   16.404646     563.925931   23.747125   
7              K-Nearest Neighbors   22.810074    1374.014252   37.067698   
1                 Ridge Regression   34.076042    2103.815289   45.867366   
31                        Ridge CV   34.124713    2081.687420   45.625513   

In [16]:
# NOTE(review): this cell does not create an estimator -- every candidate
# assignment below is commented out -- so `model` is whatever object an
# earlier cell left bound in the kernel. If nothing has bound `model` yet,
# the fit call raises NameError. Re-enable one assignment before running.
# The "Mean ... Error" comments are metrics noted by the author next to the
# candidate configurations.
#model = LGBMRegressor(n_estimators=10000, n_jobs=-1)
#model = LogisticRegression(max_iter=1000, n_jobs=-1)
#Mean Absolute Error: 7.9007
#Mean Squared Error: 211.5364
#model = GradientBoostingRegressor(n_estimators=1000, max_depth=6)
#Mean Absolute Error: 9.1008
#Mean Squared Error: 256.1412


# Refit the current `model` on the full training feature matrix
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 35.4393
Mean Squared Error: 2248.5742


In [84]:
# Hyperparameter grid from an earlier grid-search experiment (kept for reference)
#param_grid = {
#    "n_estimators": [100, 200, 500],
#    "max_depth": [10, 20, 50, None],
#    "min_samples_split": [2, 5, 10],
#    "min_samples_leaf": [1, 2, 4],
#    "max_features": [None, "sqrt", "log2"]}

# Scoring dictionary used by that grid search
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Train one large Extra Trees ensemble on all cores. The fixed seed makes the
# randomized trees, and therefore the metrics printed below, repeatable.
model = ExtraTreesRegressor(n_estimators=10000, n_jobs=-1, random_state=42)
# Grid-search variant. NOTE(review): GridSearchCV is not among the imports in
# the first cell; import it from sklearn.model_selection before enabling this.
#model = GridSearchCV(ExtraTreesRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=1, verbose=3)

# Fit the model
model.fit(X_train, y_train)

# Extract the results and convert to a DataFrame (grid-search variant only)
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/ETR1.csv', index=False)

# Get the best model (grid-search variant only)
#best_model = model.best_estimator_
#print(best_model)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Mean Absolute Error: 6.3922
Mean Squared Error: 161.5680


In [None]:
# Grid explored in an earlier grid-search experiment (kept for reference)
#param_grid = [
#    {'C': [1e+4, 1e+5], 'kernel': ['rbf'], 'gamma': [1e-5, 1e-4]}]
# Scoring dictionary used by that grid search
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Support Vector Regressor with a linear kernel and a very small C
model = SVR(C=0.0001, kernel='linear')
# Metrics noted by the author next to this configuration:
# Mean Absolute Error: 35.6885
# Mean Squared Error: 2358.7762
#model = GridSearchCV(SVR(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)
model.fit(X_train, y_train)

# Grid-search variant only: dump the cross-validation table and plot it
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/SVR2.csv', index=False)

#plot_grid_search_results(results, 'param_C', 'param_gamma', 'mean_test_MSE', 'mean_test_MAE')

# Grid-search variant only: inspect the winning estimator
#best_model = model.best_estimator_
#print(best_model)

# Score the fitted model on the held-out test set
y_pred = model.predict(X_test)
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
# first run: 39.9814

In [15]:
# Hyperparameter grid from an earlier grid-search experiment (kept for reference)
#param_grid = [
#    {'max_depth': [None, 1, 2, 3, 4, 5, 6], 'n_estimators': [1000] }]
# Scoring dictionary used by that grid search
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# Initialize and train the Random Forest model. The fixed seed removes the
# run-to-run drift visible in the metrics noted below, so results are repeatable.
model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
# Metrics noted by the author for an earlier (unseeded) run:
#Mean Absolute Error: 7.9780
#Mean Squared Error: 236.9407
# Grid-search variant. NOTE(review): GridSearchCV is not among the imports in
# the first cell; import it from sklearn.model_selection before enabling this.
#model = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)
model.fit(X_train, y_train)

# Extract the results and convert to a DataFrame (grid-search variant only)
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/RFR1.csv', index=False)

# Plot the grid search results (grid-search variant only)
#plot_grid_search_results(results, 'param_max_depth', 'param_n_estimators', 'mean_test_MSE', 'mean_test_MAE')

# Get the best model (grid-search variant only)
#best_model = model.best_estimator_
#print(best_model)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
# first run: 8.0703
# best run: 7.9780

Mean Absolute Error: 7.9351
Mean Squared Error: 233.6358


In [10]:
# Grid explored in an earlier grid-search experiment (kept for reference)
#param_grid = [
#    {'n_estimators': [1000, 10000, 100000, 1000000], 'learning_rate':[0.1], 'objective': ['reg:squarederror']}]
# Scoring dictionary used by that grid search
#scoring = {
#    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#    'MSE': make_scorer(mean_squared_error, greater_is_better=False)}

# XGBoost regressor: squared-error objective, 10000 boosting rounds, all cores
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10000,
    learning_rate=0.1,
    n_jobs=-1,
)
# Metrics noted by the author next to this configuration:
#Mean Absolute Error: 7.9632
#Mean Squared Error: 210.3707
# NOTE(review): the grid-search line below refers to an `xgb.` module prefix,
# while the first cell imports `XGBRegressor` directly -- adjust before enabling.
#model = GridSearchCV(xgb.XGBRegressor(), param_grid, cv=5, scoring=scoring, refit='MSE', n_jobs=-1, verbose=3)
model.fit(X_train, y_train)

# Grid-search variant only: dump the cross-validation table
#results = pd.DataFrame(model.cv_results_)
#results.to_csv('data/XGB2.csv', index=False)

# Grid-search variant only: plot the search results
#plot_grid_search_results(results, 'param_n_estimators', 'param_learning_rate','mean_test_MSE', 'mean_test_MAE')

# Grid-search variant only: inspect the winning estimator
#best_model = model.best_estimator_
#print(best_model)

# Score the fitted model on the held-out test set
y_pred = model.predict(X_test)
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
# first run: 15.3975
# best run: 7.9632

Mean Absolute Error: 7.9632
Mean Squared Error: 210.3707


In [4]:
# Logistic Regression baseline (up to 1000 solver iterations, all cores).
# NOTE(review): LogisticRegression is a classifier, so each distinct target
# value is presumably treated as its own class here -- confirm this is the
# intended baseline for a target that is scored with MAE/MSE.
model = LogisticRegression(n_jobs=-1, max_iter=1000)

# fit() returns the fitted estimator, so training and prediction are chained
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions on the held-out test set
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

# first run: 39.9814

Mean Absolute Error: 47.2215
Mean Squared Error: 3565.5578


In [24]:
# CatBoost regressor: 100000 boosting iterations of depth-6 trees, RMSE loss,
# training log suppressed
model = CatBoostRegressor(
    iterations=100000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=0,
)
model.fit(X_train, y_train)

# Score the fitted model on the held-out test set
y_pred = model.predict(X_test)
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
# best run:
# Mean Absolute Error: 7.5714
# Mean Squared Error: 186.3288

Mean Absolute Error: 7.5714
Mean Squared Error: 186.3288


In [27]:
# Gaussian Process regressor with library-default settings.
# NOTE(review): the recorded MAE for this cell (about 567) is close to the
# training-target mean reported in the LightGBM log (about 583), which
# suggests the predictions sit near zero -- consider feature scaling and/or
# normalize_y=True before drawing conclusions from this model.
model = GaussianProcessRegressor()

# fit() returns the fitted estimator, so training and prediction are chained
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions on the held-out test set
mae, mse = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
)
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")


Mean Absolute Error: 567.5223
Mean Squared Error: 419274.5008


In [88]:
# Function to save real and predicted values to a CSV file
def save_predictions_to_csv(model, X_test, y_test, filename):
    """
    Predict on `X_test` with `model` and write real vs. predicted values to CSV.

    Parameters:
    - model: Fitted estimator exposing a `predict(X)` method.
    - X_test: Feature rows passed to `model.predict`.
    - y_test: True target values, written to the 'Real' column.
    - filename: Destination CSV path. Missing parent directories are created,
      so writing into a not-yet-existing folder no longer fails.

    Returns:
    - None. The file has two columns, 'Real' and 'Predicted', and no index.
    """
    from pathlib import Path  # local import keeps this helper self-contained

    y_pred = model.predict(X_test)
    results_df = pd.DataFrame({'Real': y_test, 'Predicted': y_pred})

    output_path = Path(filename)
    # to_csv does not create directories; make sure the target folder exists
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_path, index=False)

# Retrain each selected model on the training split and export its test-set
# predictions (real vs. predicted) to data/predictions/. The randomized tree
# ensembles get a fixed seed so the exported files are repeatable across runs.

# Support Vector Regressor
model_svr = SVR(kernel='linear', C=0.0001)
model_svr.fit(X_train, y_train)
save_predictions_to_csv(model_svr, X_test, y_test, 'data/predictions/Support_Vector_Regression.csv')

# Random Forest Regressor (seeded for repeatable predictions)
model_rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
model_rf.fit(X_train, y_train)
save_predictions_to_csv(model_rf, X_test, y_test, 'data/predictions/Random_Forest_Regression.csv')

# XGBoost Regressor
model_xgb = XGBRegressor(objective='reg:squarederror', n_estimators=10000, learning_rate=0.1, n_jobs=-1)
model_xgb.fit(X_train, y_train)
save_predictions_to_csv(model_xgb, X_test, y_test, 'data/predictions/XGBoost_Regression.csv')

# Logistic Regression
# NOTE(review): this is a classifier applied to the same target as the
# regressors -- confirm it is an intended baseline.
model_lr = LogisticRegression(max_iter=1000, n_jobs=-1)
model_lr.fit(X_train, y_train)
save_predictions_to_csv(model_lr, X_test, y_test, 'data/predictions/Logistic_Regression.csv')

# CatBoost Regressor
model_cb = CatBoostRegressor(iterations=100000, learning_rate=0.1, depth=6, loss_function='RMSE', verbose=0)
model_cb.fit(X_train, y_train)
save_predictions_to_csv(model_cb, X_test, y_test, 'data/predictions/CatBoost_Regression.csv')

# Extra Trees Regressor (seeded for repeatable predictions)
model_et = ExtraTreesRegressor(n_estimators=10000, n_jobs=-1, random_state=42)
model_et.fit(X_train, y_train)
save_predictions_to_csv(model_et, X_test, y_test, 'data/predictions/Extra_Trees_Regression.csv')