# CWRU Bearing Fault Data - Data Preprocessing (divided by load)

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import scipy.io
import pickle
import math

## 48k Drive-End Bearing Fault Data + Normal Baseline Data (48k)

### Reading and splitting data by load and fault type

In [13]:
# Root folder of the dataset; the two folder names below are joined onto it.
BASE_PATH = "Data"
# Folder (under BASE_PATH) holding the normal baseline .mat files.
NORMAL_DATA_PATH = "NormalBaselineData"
# Folder (under BASE_PATH) holding the 48k drive-end fault .mat files, one level of subfolders deep.
DRIVE_END_48K_DATA_PATH = "48kDriveEndBearingFaultData"

In [14]:
def _load_records_from_dir(directory, load):
  """Load every `*_{load}.mat` file found directly inside `directory`.

  Args:
    directory: Path of the folder to scan (not recursive).
    load: Motor load identifier; only files whose name ends with
      `_{load}.mat` are read.

  Returns:
    A list of dicts with keys 'signal', 'label' and 'load', one per file that
    was read successfully. Files that fail to load, or that contain no
    variable whose name ends in `_DE_time`, are reported with `print` and
    skipped.
  """
  records = []
  # os.listdir returns entries in arbitrary order; sort so the row order of
  # the resulting table is reproducible across machines and runs.
  for filename in sorted(os.listdir(directory)):
    if not filename.endswith(f"_{load}.mat"):
      continue
    filepath = os.path.join(directory, filename)
    try:
      mat = scipy.io.loadmat(filepath)
      # NOTE: only the first variable matching *_DE_time is used; if a file
      # holds several such variables the others are ignored.
      de_key = next((k for k in mat.keys() if re.match(r".*_DE_time$", k)), None)
      if de_key is None:
        print(f"No DE_time key found in {filepath}")
        continue
      records.append({
        'signal': mat[de_key].squeeze(),
        # Strip the ".mat" extension and the trailing "_<load>" part.
        'label': filename[:-4].rsplit('_', 1)[0],
        'load': load
      })
    except Exception as e:
      # Best-effort loading: one unreadable file must not abort the whole scan.
      print(f"Error loading {filepath}: {e}")
  return records


def load_data_for_motorload(load):
  """Collect normal-baseline and drive-end fault signals for one motor load.

  Scans `BASE_PATH/NORMAL_DATA_PATH` directly and every subfolder of
  `BASE_PATH/DRIVE_END_48K_DATA_PATH` for files named `*_{load}.mat`.

  Args:
    load: Motor load identifier used to select files by their name suffix.

  Returns:
    A pandas DataFrame with columns 'signal', 'label' and 'load', one row per
    file read. The columns are present even when no file matched, so
    downstream column access does not fail on an empty result.

  Raises:
    OSError: If one of the dataset folders cannot be listed (e.g. it does not
      exist).
  """
  # Normal baseline data for the chosen motor load.
  normal_dir = os.path.join(BASE_PATH, NORMAL_DATA_PATH)
  records = _load_records_from_dir(normal_dir, load)

  # Drive-end bearing fault data: one level of subfolders below fault_dir.
  fault_dir = os.path.join(BASE_PATH, DRIVE_END_48K_DATA_PATH)
  for subfolder in sorted(os.listdir(fault_dir)):
    subfolder_path = os.path.join(fault_dir, subfolder)
    if os.path.isdir(subfolder_path):
      records.extend(_load_records_from_dir(subfolder_path, load))

  return pd.DataFrame(records, columns=['signal', 'label', 'load'])

In [15]:
# Build one table per motor load (0 through 3) and stack them into a single
# frame with a fresh, continuous index.
per_load_frames = [load_data_for_motorload(load) for load in range(4)]
data = pd.concat(per_load_frames, ignore_index=True)

In [16]:
# Display the merged table: one row per loaded file, with its signal, label and motor load.
data

Unnamed: 0,signal,label,load
0,"[0.05319692307692307, 0.08866153846153846, 0.0...",Normal,0
1,"[-0.16812035928143715, 0.18127760479041916, 0....",B007,0
2,"[-0.14700379241516967, -0.03329920159680639, 0...",B014,0
3,"[-0.0028426147704590818, 0.06091317365269461, ...",B021,0
4,"[-0.025015009980039922, -0.028913453093812373,...",IR007,0
5,"[-0.15918642714570858, -0.1328719361277445, 0....",IR014,0
6,"[-0.5891298461538461, 0.019609846153846155, 0....",IR021,0
7,"[-0.1338465469061876, 0.0029238323353293415, -...",OR007@6,0
8,"[-1.2489637125748503, -0.18761257485029942, 1....",OR014@6,0
9,"[-1.1354215568862276, 0.13116636726546907, 0.9...",OR021@6,0


In [17]:
# Persist the merged table so later steps can reload it without re-reading the .mat files.
with open('48kdrive-end_normalbaseline_loadsplit_data.pkl', 'wb') as pkl_out:
  pickle.dump(data, pkl_out)

### Cropping all of the data to the shortest signal length

In [18]:
# Reload the merged table written earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only open pickles you produced yourself.
with open('48kdrive-end_normalbaseline_loadsplit_data.pkl', 'rb') as pkl_in:
  data = pickle.load(pkl_in)

# Length of the shortest signal; every signal is truncated to this many samples
# so that all rows end up with the same length.
signal_lengths = data['signal'].map(len)
min_len = signal_lengths.min()
data['cropped_signal'] = data['signal'].map(lambda sig: sig[:min_len])

# Save the table again, now including the 'cropped_signal' column.
with open('48kdrive-end_normalbaseline_loadsplit_cropped_data.pkl', 'wb') as pkl_out:
  pickle.dump(data, pkl_out)