<a href="https://colab.research.google.com/github/Kshitijasharma/Li_ion_battery_ML/blob/main/B0018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import pandas as pd
import scipy.io as sio
import numpy as np


# Load the .mat file
def load_mat_file(file_name, key):
    """Loads the .mat file and extracts the specified key."""
    mat_file = sio.loadmat(file_name)
    if key not in mat_file:
        raise KeyError(f"Key '{key}' not found in the .mat file.")
    return mat_file[key]


def flatten_data(data):
    """Flattens data to ensure it is 1-dimensional."""
    return np.array(data).ravel()


def process_cycle(cycle_id, cycle_data, features_cols, cycles_cols):
    """Processes a single cycle and returns a DataFrame."""
    # Initialize a DataFrame for the current cycle
    cycle_type = cycle_data[0]
    if cycle_type not in features_cols:
        print(f"Unknown cycle type: {cycle_type}, skipping...")
        return None

    # Get features for the specific cycle type
    features = features_cols[cycle_type]
    features_x_cycle = cycle_data[-1]
    tmp = pd.DataFrame()

    # Populate features
    for feature, data in zip(features, features_x_cycle):
        flattened_data = flatten_data(data)
        tmp[feature] = pd.Series(flattened_data)

    # Add columns common to the cycle measurements
    tmp['id_cycle'] = cycle_id
    for k, col in enumerate(cycles_cols):
        tmp[col] = cycle_data[k]

    return cycle_type, tmp


def to_df(mat_db):
    """Returns one pd.DataFrame per cycle type."""
    # Features common for every cycle
    cycles_cols = ['type', 'ambient_temperature', 'time']

    # Features monitored during the cycle
    features_cols = {
        'charge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                   'Current_charge', 'Voltage_charge', 'Time'],
        'discharge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                      'Current_charge', 'Voltage_charge', 'Time', 'Capacity'],
        'impedance': ['Sense_current', 'Battery_current', 'Current_ratio',
                      'Battery_impedance', 'Rectified_impedance', 'Re', 'Rct']
    }

    # Define one pd.DataFrame per cycle type
    dfs = {key: [] for key in features_cols.keys()}

    # Get every cycle
    num_cycles = mat_db[0][0][0].shape[1]
    print(f'Number of cycles: {num_cycles}')
    cycles = [[row.flat[0] for row in line] for line in mat_db[0][0][0][0]]

    # Process each cycle
    for cycle_id, cycle_data in enumerate(cycles):
        result = process_cycle(cycle_id, cycle_data, features_cols, cycles_cols)
        if result is not None:
            cycle_type, tmp = result
            dfs[cycle_type].append(tmp)

    # Concatenate the lists of DataFrames into final DataFrames
    return {cycle_type: pd.concat(dfs[cycle_type], ignore_index=True) for cycle_type in dfs}


# Main Execution
mat_db = load_mat_file('B0018.mat', 'B0018')
dfs = to_df(mat_db)

# Display the first few rows of the 'charge' DataFrame
print(dfs['charge'].head())


Number of cycles: 319
   Voltage_measured  Current_measured  Temperature_measured  Current_charge  \
0          3.865713          0.001014             23.735721           0.000   
1          3.447651         -4.034445             23.743956          -4.036   
2          4.005559          1.517435             23.773723           1.507   
3          4.015989          1.514558             23.777077           1.507   
4          4.023230          1.517284             23.792710           1.507   

   Voltage_charge    Time  id_cycle    type  ambient_temperature    time  
0          -0.007   0.000         0  charge                   24  2008.0  
1           1.553   2.484         0  charge                   24  2008.0  
2           4.721   5.109         0  charge                   24  2008.0  
3           4.737   7.562         0  charge                   24  2008.0  
4           4.743  10.062         0  charge                   24  2008.0  


CNN

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use 'discharge' cycle data (since it contains Capacity)
df = dfs['discharge']

# Drop missing values
df = df.dropna()

# Select relevant features
features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Time',
            'Voltage_charge', 'Current_charge', 'ambient_temperature']
X = df[features].values
y = df['Capacity'].values

# Reshape X for CNN (CNN expects 3D input: samples, timesteps, features)
X = X.reshape(X.shape[0], X.shape[1], 1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[1])).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[1])).reshape(X_test.shape)

# CNN Model
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1)  # Regression output
])

# Compile Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Train Model
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test), verbose=1)

# Predictions
y_pred_cnn = model.predict(X_test)

# Evaluation metrics
mae_cnn = mean_absolute_error(y_test, y_pred_cnn)
mse_cnn = mean_squared_error(y_test, y_pred_cnn)
rmse_cnn = np.sqrt(mse_cnn)
r2_cnn = r2_score(y_test, y_pred_cnn)

print("\nConvolutional Neural Network (CNN) Performance:")
print(f"Mean Absolute Error (MAE): {mae_cnn:.4f}")
print(f"Mean Squared Error (MSE): {mse_cnn:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_cnn:.4f}")
print(f"R² Score: {r2_cnn:.4f}")
print(f"Model Accuracy: {r2_cnn * 100:.2f}%")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 193ms/step - loss: 1.9498 - mae: 1.3839 - val_loss: 1.2151 - val_mae: 1.0842
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 0.9478 - mae: 0.9199 - val_loss: 0.3470 - val_mae: 0.5180
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 0.3339 - mae: 0.4835 - val_loss: 0.4157 - val_mae: 0.4919
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 0.3104 - mae: 0.4069 - val_loss: 0.2447 - val_mae: 0.3976
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.2354 - mae: 0.3850 - val_loss: 0.2123 - val_mae: 0.3937
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.2104 - mae: 0.4023 - val_loss: 0.1593 - val_mae: 0.3287
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.1389 - mae: 

KNN

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use 'discharge' cycle data (since it contains Capacity)
df = dfs['discharge']

# Drop missing values
df = df.dropna()

# Select relevant features
features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Time',
            'Voltage_charge', 'Current_charge', 'ambient_temperature']
X = df[features].values
y = df['Capacity'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# KNN Model
knn = KNeighborsRegressor(n_neighbors=5, weights='distance', metric='euclidean')  # K=5, weighted by distance
knn.fit(X_train, y_train)

# Predictions
y_pred_knn = knn.predict(X_test)

# Evaluation metrics
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Print performance metrics
print("\nK-Nearest Neighbors (KNN) Regression Performance:")
print(f"Mean Absolute Error (MAE): {mae_knn:.4f}")
print(f"Mean Squared Error (MSE): {mse_knn:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_knn:.4f}")
print(f"R² Score: {r2_knn:.4f}")
print(f"Model Accuracy: {r2_knn * 100:.2f}%")



K-Nearest Neighbors (KNN) Regression Performance:
Mean Absolute Error (MAE): 0.0840
Mean Squared Error (MSE): 0.0116
Root Mean Squared Error (RMSE): 0.1079
R² Score: 0.4796
Model Accuracy: 47.96%


LSTM

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Load & preprocess dataset
charge_df = dfs['charge'].dropna()

# Ensure balanced classes
threshold = charge_df['Voltage_measured'].quantile(0.5)  # Use median instead of mean
charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)

# Define features & target
features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']
X = charge_df[features].values
y = charge_df['label'].values

# Apply time steps (Use 5 previous readings)
time_steps = 5
X_seq, y_seq = [], []
for i in range(len(X) - time_steps):
    X_seq.append(X[i:i+time_steps])
    y_seq.append(y[i+time_steps])

X_seq = np.array(X_seq)
y_seq = np.array(y_seq)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42, stratify=y_seq)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X_train.shape[2])).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[2])).reshape(X_test.shape)

# Build the LSTM model
model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nLSTM Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Epoch 1/100


  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 93ms/step - loss: 0.4641 - mae: 0.9073 - val_loss: 0.4545 - val_mae: 0.8720
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.4723 - mae: 0.9136 - val_loss: 0.4539 - val_mae: 0.8717
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4660 - mae: 0.9059 - val_loss: 0.4529 - val_mae: 0.8709
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.4432 - mae: 0.8732 - val_loss: 0.4508 - val_mae: 0.8687
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4513 - mae: 0.8814 - val_loss: 0.4474 - val_mae: 0.8652
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.4552 - mae: 0.8918 - val_loss: 0.4435 - val_mae: 0.8610
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.4669 - mae: 0.9046



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step

✅ LSTM Model Performance:
📉 Mean Absolute Error (MAE): 0.0761
📉 Mean Squared Error (MSE): 0.0088
📉 Root Mean Squared Error (RMSE): 0.0936
📈 R² Score: 0.6084
📊 Model Accuracy Estimate: 60.84%


FNN

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Preprocess the dataset (using the 'charge' DataFrame from your earlier processing)
charge_df = dfs['charge'].dropna()  # Remove rows with missing data

# 🛠 Fix 1: Use median instead of mean for balanced labels
threshold = charge_df['Voltage_measured'].median()
charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)

# Define features and target variable
features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']
X = charge_df[features]
y = charge_df['label']

# 🛠 Fix 2: Ensure stratified splitting to handle class imbalance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the Feedforward Neural Network (FNN)
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.3),  # 🛠 Fix 3: Increase dropout to reduce overfitting
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 🛠 Fix 4: Implement Early Stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,
                    callbacks=[early_stopping], verbose=1)

# Evaluate the model on the test data
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary labels

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("\nFeedforward Neural Network Performance Metrics 🔹")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.8932 - loss: 0.2401 - val_accuracy: 0.9375 - val_loss: 0.1394
Epoch 2/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.9420 - loss: 0.1422 - val_accuracy: 0.9556 - val_loss: 0.1206
Epoch 3/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 6ms/step - accuracy: 0.9446 - loss: 0.1324 - val_accuracy: 0.9616 - val_loss: 0.1105
Epoch 4/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - accuracy: 0.9498 - loss: 0.1199 - val_accuracy: 0.9668 - val_loss: 0.0863
Epoch 5/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4ms/step - accuracy: 0.9561 - loss: 0.1062 - val_accuracy: 0.9663 - val_loss: 0.0904
Epoch 6/20
[1m5597/5597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - accuracy: 0.9600 - loss: 0.0960 - val_accuracy: 0.9730 - val_loss: 0.0745
Epoch 7/20

SVM

In [13]:
from sklearn.svm import LinearSVC  # Use LinearSVC for faster training
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Preprocess the dataset (using the 'charge' DataFrame from your earlier processing)
charge_df = dfs['charge'].dropna()  # Remove rows with missing data

# Define the target variable (binary classification)
charge_df['label'] = (charge_df['Voltage_measured'] > charge_df['Voltage_measured'].mean()).astype(int)

# Define features and target variable
features = ['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']
X = charge_df[features]
y = charge_df['label']

# Split the dataset into training and testing sets (use a smaller subset if needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the LinearSVC model (faster for large datasets)
svm_model = LinearSVC(max_iter=1000, random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("SVM Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charge_df['label'] = (charge_df['Voltage_measured'] > charge_df['Voltage_measured'].mean()).astype(int)


SVM Performance Metrics:
Accuracy: 0.9999
Mean Absolute Error (MAE): 0.0001
Mean Squared Error (MSE): 0.0001
Root Mean Squared Error (RMSE): 0.0073
R-squared (R²): 0.9997


Logsitic Regression

In [15]:
import pandas as pd
import scipy.io as sio
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, mean_absolute_error,
    mean_squared_error, r2_score
)

# Load the .mat file
def load_mat_file(file_name, key):
    mat_file = sio.loadmat(file_name)
    if key not in mat_file:
        raise KeyError(f"Key '{key}' not found in the .mat file.")
    return mat_file[key]

def flatten_data(data):
    return np.array(data).ravel()

def process_cycle(cycle_id, cycle_data, features_cols, cycles_cols):
    cycle_type = cycle_data[0]
    if cycle_type not in features_cols:
        print(f"Unknown cycle type: {cycle_type}, skipping...")
        return None

    features = features_cols[cycle_type]
    features_x_cycle = cycle_data[-1]
    tmp = pd.DataFrame()

    for feature, data in zip(features, features_x_cycle):
        flattened_data = flatten_data(data)
        tmp[feature] = pd.Series(flattened_data)

    tmp['id_cycle'] = cycle_id
    for k, col in enumerate(cycles_cols):
        tmp[col] = cycle_data[k]

    return cycle_type, tmp

def to_df(mat_db):
    cycles_cols = ['type', 'ambient_temperature', 'time']
    features_cols = {
        'charge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                   'Current_charge', 'Voltage_charge', 'Time'],
        'discharge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                      'Current_charge', 'Voltage_charge', 'Time', 'Capacity'],
        'impedance': ['Sense_current', 'Battery_current', 'Current_ratio',
                      'Battery_impedance', 'Rectified_impedance', 'Re', 'Rct']
    }

    dfs = {key: [] for key in features_cols.keys()}
    num_cycles = mat_db[0][0][0].shape[1]
    print(f'Number of cycles: {num_cycles}')
    cycles = [[row.flat[0] for row in line] for line in mat_db[0][0][0][0]]

    for cycle_id, cycle_data in enumerate(cycles):
        result = process_cycle(cycle_id, cycle_data, features_cols, cycles_cols)
        if result is not None:
            cycle_type, tmp = result
            dfs[cycle_type].append(tmp)

    return {cycle_type: pd.concat(dfs[cycle_type], ignore_index=True) for cycle_type in dfs}

# Load & process dataset
mat_db = load_mat_file('B0018.mat', 'B0018')
dfs = to_df(mat_db)

# Prepare the 'charge' DataFrame
charge_df = dfs['charge'].dropna()

# Binary classification label based on median instead of mean (for better balance)
threshold = charge_df['Voltage_measured'].median()
charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)

# Define features and labels
X = charge_df[['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']]
y = charge_df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nLogistic Regression Model Performance 🔹")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f" Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Number of cycles: 319


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)



Logistic Regression Model Performance 🔹
Accuracy: 0.8601
Mean Absolute Error (MAE): 0.1399
 Mean Squared Error (MSE): 0.1399
Root Mean Squared Error (RMSE): 0.3740
R-squared (R²): 0.4404

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.78      0.85     27981
           1       0.81      0.94      0.87     27981

    accuracy                           0.86     55962
   macro avg       0.87      0.86      0.86     55962
weighted avg       0.87      0.86      0.86     55962



Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# 🔹 Load dataset and drop NaN rows
charge_df = dfs['charge'].dropna()

# 🔹 Ensure balanced labels using median
threshold = charge_df['Voltage_measured'].median()
charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)

# 🔹 Define features and target
features = ['Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']
X = charge_df[features]  # Removed 'Voltage_measured' to prevent target leakage
y = charge_df['label']

# 🔹 Stratified train-test split to prevent class imbalance issues
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)

# 🔹 Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 🔹 Train Random Forest with regularization to prevent overfitting
rf_model = RandomForestClassifier(
    n_estimators=50,  # Reduce tree count
    max_depth=5,  # Limit tree depth
    min_samples_split=10,  # Prevent deep splits
    min_samples_leaf=5,  # Require more samples per leaf
    random_state=42
)
rf_model.fit(X_train, y_train)

# 🔹 Make predictions
y_pred = rf_model.predict(X_test)

# 🔹 Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 🔹 Print evaluation metrics
print("\nFixed Random Forest Classifier Performance 🔹")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")

# 🔹 Check class balance
print("\n🔹 Class Distribution in Training Set:")
print(y_train.value_counts(normalize=True))
print("\n🔹 Class Distribution in Test Set:")
print(y_test.value_counts(normalize=True))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)



Fixed Random Forest Classifier Performance 🔹
Accuracy: 0.9679
Mean Absolute Error (MAE): 0.0321
Mean Squared Error (MSE): 0.0321
Root Mean Squared Error (RMSE): 0.1792
R-squared (R²): 0.8715

🔹 Class Distribution in Training Set:
label
1    0.5
0    0.5
Name: proportion, dtype: float64

🔹 Class Distribution in Test Set:
label
0    0.5
1    0.5
Name: proportion, dtype: float64


XGBoost

In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Use 'charge' DataFrame from processed cycles
charge_df = dfs['charge'].dropna()  # Remove rows with missing data

# Define Target Variable (Binary Classification)
if 'Capacity' in charge_df.columns:
    charge_df['label'] = (charge_df['Capacity'] > charge_df['Capacity'].mean()).astype(int)
else:
    charge_df['label'] = (charge_df['Voltage_measured'] > charge_df['Voltage_measured'].mean()).astype(int)

# Features and Target
features = [
    'Voltage_measured',
    'Current_measured',
    'Temperature_measured',
    'Current_charge',
    'Voltage_charge'
]
if 'Time' in charge_df.columns:  # Add 'Time' if present
    features.append('Time')

X = charge_df[features]
y = charge_df['label']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print Evaluation Metrics
print("XGBoost Performance Metrics:")
print("Accuracy:", accuracy)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²):", r2)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charge_df['label'] = (charge_df['Voltage_measured'] > charge_df['Voltage_measured'].mean()).astype(int)


XGBoost Performance Metrics:
Accuracy: 0.9994103141417391
Mean Absolute Error (MAE): 0.0005896858582609628
Mean Squared Error (MSE): 0.0005896858582609628
Root Mean Squared Error (RMSE): 0.024283448236627408
R-squared (R²): 0.9965843198577544

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     12416
           1       1.00      1.00      1.00     43546

    accuracy                           1.00     55962
   macro avg       1.00      1.00      1.00     55962
weighted avg       1.00      1.00      1.00     55962



working on B0005

In [18]:
import pandas as pd
import scipy.io as sio
import numpy as np


# Load the .mat file
def load_mat_file(file_name, key):
    """Loads the .mat file and extracts the specified key."""
    mat_file = sio.loadmat(file_name)
    if key not in mat_file:
        raise KeyError(f"Key '{key}' not found in the .mat file.")
    return mat_file[key]


def flatten_data(data):
    """Flattens data to ensure it is 1-dimensional."""
    return np.array(data).ravel()


def process_cycle(cycle_id, cycle_data, features_cols, cycles_cols):
    """Processes a single cycle and returns a DataFrame."""
    # Initialize a DataFrame for the current cycle
    cycle_type = cycle_data[0]
    if cycle_type not in features_cols:
        print(f"Unknown cycle type: {cycle_type}, skipping...")
        return None

    # Get features for the specific cycle type
    features = features_cols[cycle_type]
    features_x_cycle = cycle_data[-1]
    tmp = pd.DataFrame()

    # Populate features
    for feature, data in zip(features, features_x_cycle):
        flattened_data = flatten_data(data)
        tmp[feature] = pd.Series(flattened_data)

    # Add columns common to the cycle measurements
    tmp['id_cycle'] = cycle_id
    for k, col in enumerate(cycles_cols):
        tmp[col] = cycle_data[k]

    return cycle_type, tmp


def to_df(mat_db):
    """Returns one pd.DataFrame per cycle type."""
    # Features common for every cycle
    cycles_cols = ['type', 'ambient_temperature', 'time']

    # Features monitored during the cycle
    features_cols = {
        'charge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                   'Current_charge', 'Voltage_charge', 'Time'],
        'discharge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                      'Current_charge', 'Voltage_charge', 'Time', 'Capacity'],
        'impedance': ['Sense_current', 'Battery_current', 'Current_ratio',
                      'Battery_impedance', 'Rectified_impedance', 'Re', 'Rct']
    }

    # Define one pd.DataFrame per cycle type
    dfs = {key: [] for key in features_cols.keys()}

    # Get every cycle
    num_cycles = mat_db[0][0][0].shape[1]
    print(f'Number of cycles: {num_cycles}')
    cycles = [[row.flat[0] for row in line] for line in mat_db[0][0][0][0]]

    # Process each cycle
    for cycle_id, cycle_data in enumerate(cycles):
        result = process_cycle(cycle_id, cycle_data, features_cols, cycles_cols)
        if result is not None:
            cycle_type, tmp = result
            dfs[cycle_type].append(tmp)

    # Concatenate the lists of DataFrames into final DataFrames
    return {cycle_type: pd.concat(dfs[cycle_type], ignore_index=True) for cycle_type in dfs}


# Main Execution
mat_db = load_mat_file('B0005.mat', 'B0005')
dfs = to_df(mat_db)

# Display the first few rows of the 'charge' DataFrame
print(dfs['charge'].head())


Number of cycles: 616
   Voltage_measured  Current_measured  Temperature_measured  Current_charge  \
0          3.873017         -0.001201             24.655358           0.000   
1          3.479394         -4.030268             24.666480          -4.036   
2          4.000588          1.512731             24.675394           1.500   
3          4.012395          1.509063             24.693865           1.500   
4          4.019708          1.511318             24.705069           1.500   

   Voltage_charge    Time  id_cycle    type  ambient_temperature    time  
0           0.003   0.000         0  charge                   24  2008.0  
1           1.570   2.532         0  charge                   24  2008.0  
2           4.726   5.500         0  charge                   24  2008.0  
3           4.742   8.344         0  charge                   24  2008.0  
4           4.753  11.125         0  charge                   24  2008.0  


logistic regression

In [19]:
import pandas as pd
import scipy.io as sio
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, mean_absolute_error,
    mean_squared_error, r2_score
)

# Load the .mat file
def load_mat_file(file_name, key):
    mat_file = sio.loadmat(file_name)
    if key not in mat_file:
        raise KeyError(f"Key '{key}' not found in the .mat file.")
    return mat_file[key]

def flatten_data(data):
    return np.array(data).ravel()

def process_cycle(cycle_id, cycle_data, features_cols, cycles_cols):
    cycle_type = cycle_data[0]
    if cycle_type not in features_cols:
        print(f"Unknown cycle type: {cycle_type}, skipping...")
        return None

    features = features_cols[cycle_type]
    features_x_cycle = cycle_data[-1]
    tmp = pd.DataFrame()

    for feature, data in zip(features, features_x_cycle):
        flattened_data = flatten_data(data)
        tmp[feature] = pd.Series(flattened_data)

    tmp['id_cycle'] = cycle_id
    for k, col in enumerate(cycles_cols):
        tmp[col] = cycle_data[k]

    return cycle_type, tmp

def to_df(mat_db):
    cycles_cols = ['type', 'ambient_temperature', 'time']
    features_cols = {
        'charge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                   'Current_charge', 'Voltage_charge', 'Time'],
        'discharge': ['Voltage_measured', 'Current_measured', 'Temperature_measured',
                      'Current_charge', 'Voltage_charge', 'Time', 'Capacity'],
        'impedance': ['Sense_current', 'Battery_current', 'Current_ratio',
                      'Battery_impedance', 'Rectified_impedance', 'Re', 'Rct']
    }

    dfs = {key: [] for key in features_cols.keys()}
    num_cycles = mat_db[0][0][0].shape[1]
    print(f'Number of cycles: {num_cycles}')
    cycles = [[row.flat[0] for row in line] for line in mat_db[0][0][0][0]]

    for cycle_id, cycle_data in enumerate(cycles):
        result = process_cycle(cycle_id, cycle_data, features_cols, cycles_cols)
        if result is not None:
            cycle_type, tmp = result
            dfs[cycle_type].append(tmp)

    return {cycle_type: pd.concat(dfs[cycle_type], ignore_index=True) for cycle_type in dfs}

# Load & process dataset
mat_db = load_mat_file('B0005.mat', 'B0005')
dfs = to_df(mat_db)

# Prepare the 'charge' DataFrame
charge_df = dfs['charge'].dropna()

# Binary classification label based on median instead of mean (for better balance)
threshold = charge_df['Voltage_measured'].median()
charge_df['label'] = (charge_df['Voltage_measured'] > threshold).astype(int)

# Define features and labels
X = charge_df[['Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge']]
y = charge_df['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nLogistic Regression Model Performance 🔹")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f" Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Number of cycles: 616

Logistic Regression Model Performance 🔹
Accuracy: 0.8228
Mean Absolute Error (MAE): 0.1772
 Mean Squared Error (MSE): 0.1772
Root Mean Squared Error (RMSE): 0.4210
R-squared (R²): 0.2910

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.66      0.79     54118
           1       0.75      0.98      0.85     54117

    accuracy                           0.82    108235
   macro avg       0.86      0.82      0.82    108235
weighted avg       0.86      0.82      0.82    108235

