## Preprocessing

This notebook preprocesses the Aachen battery degradation dataset, which is available from the following GitLab repository:
https://git.rwth-aachen.de/isea/battery-degradation-trajectory-prediction


#### Import

In [1]:
import pandas as pd
import numpy as np 
from keras.preprocessing.sequence import pad_sequences
import mat4py as mpy

#### Data load

In [2]:
# Relative path to the MATLAB dataset file; it is resolved against the
# notebook's working directory when the file is loaded.
filename_mat = "Degradation_Prediction_Dataset_ISEA.mat"

In [3]:
# Read the .mat file with mat4py and build a DataFrame from its "TDS" entry.
# The sequence columns (History, Target, ...) hold one Python list per row,
# as the preview below shows.
data_loader = mpy.loadmat(filename_mat)
data = pd.DataFrame.from_dict(data_loader["TDS"])
data

Unnamed: 0,Cell,Sample,History_Cycle,History,Target_Cycle_Expanded,Target_expanded,Target_Cycle,Target
0,1,1,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.834012342776987, 1.8321873038112761, 1.8303...","[105, 110, 115, 120, 125, 130, 135, 140, 145, ...","[1.7973466099593307, 1.7957194104838883, 1.794...","[105, 150, 195, 240, 285, 330, 375, 420, 465, ...","[1.7973466099593307, 1.7832393947563925, 1.770..."
1,1,2,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.834012342776987, 1.8321873038112761, 1.8303...","[110, 115, 120, 125, 130, 135, 140, 145, 150, ...","[1.7957194104838883, 1.794106179214274, 1.7925...","[110, 155, 200, 245, 290, 335, 380, 425, 470, ...","[1.7957194104838883, 1.7817527911332156, 1.769..."
2,1,3,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.834012342776987, 1.8321873038112761, 1.8303...","[115, 120, 125, 130, 135, 140, 145, 150, 155, ...","[1.794106179214274, 1.7925073295393719, 1.7909...","[115, 160, 205, 250, 295, 340, 385, 430, 475, ...","[1.794106179214274, 1.7802836132202329, 1.7676..."
3,1,4,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.834012342776987, 1.8321873038112761, 1.8303...","[120, 125, 130, 135, 140, 145, 150, 155, 160, ...","[1.7925073295393719, 1.7909232748480652, 1.789...","[120, 165, 210, 255, 300, 345, 390, 435, 480, ...","[1.7925073295393719, 1.7788295719024707, 1.766..."
4,1,5,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.834012342776987, 1.8321873038112761, 1.8303...","[125, 130, 135, 140, 145, 150, 155, 160, 165, ...","[1.7909232748480652, 1.7893544285292378, 1.787...","[125, 170, 215, 260, 305, 350, 395, 440, 485, ...","[1.7909232748480652, 1.7773897124807985, 1.765..."
...,...,...,...,...,...,...,...,...
10681,47,10682,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.84724471159825, 1.845306403634897, 1.843378...","[1195, 1200, 1205, 1210, 1215, 1220, 1225, 123...","[1.4027216711179482, 1.3979441697822066, 1.393...","[1195, 1240, 1285, 1330, 1375, 1420, 1465, 151...","[1.4027216711179482, 1.3580654511370822, 1.309..."
10682,47,10683,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.84724471159825, 1.845306403634897, 1.843378...","[1200, 1205, 1210, 1215, 1220, 1225, 1230, 123...","[1.3979441697822066, 1.3931186077270967, 1.388...","[1200, 1245, 1290, 1335, 1380, 1425, 1470, 151...","[1.3979441697822066, 1.3528860367290538, 1.303..."
10683,47,10684,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.84724471159825, 1.845306403634897, 1.843378...","[1205, 1210, 1215, 1220, 1225, 1230, 1235, 124...","[1.3931186077270967, 1.3882458358804528, 1.383...","[1205, 1250, 1295, 1340, 1385, 1430, 1475, 152...","[1.3931186077270967, 1.3476662199521636, 1.298..."
10684,47,10685,"[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55,...","[1.84724471159825, 1.845306403634897, 1.843378...","[1210, 1215, 1220, 1225, 1230, 1235, 1240, 124...","[1.3882458358804528, 1.383326705170109, 1.3783...","[1210, 1255, 1300, 1345, 1390, 1435, 1480, 152...","[1.3882458358804528, 1.342406851734246, 1.2921..."


#### Padding

Padding is needed because the history and target sequences have varying lengths.

Pre-padding

- Used on "History"
- The result ensures that the last column consistently holds the most recent cycle (e.g., 100th, 105th, etc.)

Post-padding
- Used on "Target" 
- Ensures the actual target values are aligned with the start of the sequence, and the padding (zeros) comes at the end.

In [4]:
# Padding the 'History' sequences
# Zeros are added at the front ('pre') so the most recent capacity value is
# always in the last position. dtype='float32' is passed explicitly because
# pad_sequences defaults to an integer dtype, which would truncate the
# capacity values. .tolist() stores each padded row back as a plain list so
# it fits in a single DataFrame cell.
data['Padded_History'] = pad_sequences(data['History'], padding='pre', dtype='float32').tolist()

# Padding the 'Target' sequence
# Zeros are added at the end ('post') so the real target values start at
# position 0 of every row.
data['Padded_Target'] = pad_sequences(data['Target'], padding='post', dtype='float32').tolist()

#### Processing

In [5]:
# Columns carried over into the processed frame. The order matters: the two
# padded columns must stay last, because they are later read by position
# (second-to-last = history, last = target).
processed_columns = [
    'Cell',            # Cell ID
    'Sample',          # Sample number
    'History_Cycle',   # Capacity history cycle numbers
    'Target_Cycle',    # Target cycle numbers
    'Padded_History',  # Padded capacity history
    'Padded_Target',   # Padded target curve
]

# Select the columns in one step instead of rebuilding every row as a dict
# inside an iterrows() loop. reset_index(drop=True) returns a new frame with
# a fresh 0..n-1 index, which is what building from a list of records gives.
processed_df = data[processed_columns].reset_index(drop=True)

# Record-per-row view of the same data (one dict per row), kept so that any
# code relying on the `processed_data` list continues to work.
processed_data = processed_df.to_dict('records')

In [6]:
def _column_as_float32(data, position):
    """Stack the cells of the column at `position` into one float32 array."""
    if len(data) == 0:
        # Nothing to stack. Returning early also avoids a positional column
        # lookup on a frame that may have no columns at all.
        return np.array([], dtype=np.float32)
    return np.array(data.iloc[:, position].tolist(), dtype=np.float32)


def generate_dataset(data, shuffle=False, seed=None):
    """Convert a processed DataFrame into input and target arrays.

    The input is read from the second-to-last column and the target from the
    last column, by position. Each cell is expected to hold an equal-length
    numeric sequence, so every column stacks into a 2-D array of shape
    (number_of_rows, sequence_length).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last two columns hold the input and target sequences.
    shuffle : bool, default False
        If True, apply the same random row permutation to both arrays.
    seed : int or None, default None
        Seed for a reproducible shuffle. With None, the global ``np.random``
        state is used, so ``np.random.seed`` still controls the result.
        Ignored when ``shuffle`` is False.

    Returns
    -------
    tuple of numpy.ndarray
        ``(input_array, target_array)``, both of dtype float32.

    Raises
    ------
    ValueError
        If the sequences within a column do not all have the same length.
    """
    number_of_rows = len(data)

    # One column extraction per array instead of a per-row scalar lookup.
    input_array = _column_as_float32(data, -2)   # capacity history
    target_array = _column_as_float32(data, -1)  # capacity degradation curve

    if shuffle:
        # Shuffle input and target arrays in tandem with one shared permutation
        if seed is None:
            indices = np.arange(number_of_rows)
            np.random.shuffle(indices)
        else:
            indices = np.random.default_rng(seed).permutation(number_of_rows)
        input_array = input_array[indices]
        target_array = target_array[indices]
        print('Shuffled set', end=' - ')
    else:
        print('Non Shuffled set', end=' - ')

    print(input_array.shape, target_array.shape)  # Print the shape of the processed arrays
    return input_array, target_array

In [7]:
# Build the feature and label arrays from the processed frame. shuffle is left
# at its default (False), so the rows keep the order of processed_df.
features_file, labels_file = generate_dataset(processed_df)

Non Shuffled set - (10686, 288) (10686, 39)


In [8]:
# Inspect the history matrix; the leading zeros in each row are the pre-padding.
features_file

array([[0.       , 0.       , 0.       , ..., 1.8023078, 1.8006413,
        1.7989874],
       [0.       , 0.       , 0.       , ..., 1.8006413, 1.7989874,
        1.7973466],
       [0.       , 0.       , 0.       , ..., 1.7989874, 1.7973466,
        1.7957194],
       ...,
       [0.       , 0.       , 0.       , ..., 1.4074503, 1.4027216,
        1.3979442],
       [0.       , 0.       , 0.       , ..., 1.4027216, 1.3979442,
        1.3931186],
       [0.       , 0.       , 0.       , ..., 1.3979442, 1.3931186,
        1.3882458]], dtype=float32)

In [9]:
# Inspect the target matrix; the trailing zeros in each row are the post-padding.
labels_file

array([[1.7973466, 1.7832394, 1.7703887, ..., 0.       , 0.       ,
        0.       ],
       [1.7957194, 1.7817528, 1.7690253, ..., 0.       , 0.       ,
        0.       ],
       [1.7941061, 1.7802836, 1.7676731, ..., 0.       , 0.       ,
        0.       ],
       ...,
       [1.3931186, 1.3476663, 1.2980216, ..., 0.       , 0.       ,
        0.       ],
       [1.3882458, 1.3424069, 1.2921164, ..., 0.       , 0.       ,
        0.       ],
       [1.3833266, 1.3371087, 1.2861387, ..., 0.       , 0.       ,
        0.       ]], dtype=float32)