# CWRU Bearing Fault Data - Data Preprocessing (divided by load)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import scipy.io
import pickle
import math

## 48k Drive-End Bearing Fault Data + Normal Baseline Data (48k)

### Reading and splitting data by load and fault type

In [None]:
# Dataset location. BASE_PATH is a relative path to the dataset root; the
# other two names are the sub-directories joined onto it when loading the
# normal-baseline recordings and the 48k drive-end fault recordings.
BASE_PATH = "Data"
NORMAL_DATA_PATH = "NormalBaselineData"
DRIVE_END_48K_DATA_PATH = "48kDriveEndBearingFaultData"

In [None]:
def _read_de_record(filepath, filename, load, announce=False):
  """Read one .mat file and build the record for its ``*_DE_time`` signal.

  Args:
    filepath: Full path of the .mat file to read.
    filename: Base name of the file; the label is derived from it.
    load: Motor load value stored in the record as-is.
    announce: When True, print the path once the file has been read.

  Returns:
    A dict with keys 'signal', 'label' and 'load', or None when the file
    cannot be read or contains no variable whose name ends in ``_DE_time``.
    Problems are reported with print() and never raised, so one bad file
    does not abort the whole directory scan.
  """
  try:
    mat = scipy.io.loadmat(filepath)
    if announce:
      print(filepath + "\n")
    # Take the first variable ending in "_DE_time"; later matches are ignored.
    de_key = next((k for k in mat if re.match(r".*_DE_time$", k)), None)
    if de_key is None:
      print(f"No DE_time key found in {filepath}")
      return None
    return {
      'signal': mat[de_key].squeeze(),
      # "B007_0.mat" -> "B007": drop ".mat", then the trailing "_<load>".
      'label': filename[:-4].rsplit('_', 1)[0],
      'load': load,
    }
  except Exception as e:
    # Deliberate best-effort: report the failure and keep scanning.
    print(f"Error loading {filepath}: {e}")
    return None


def load_data_for_motorload(load: int, *, base_path=None, normal_subdir=None,
                            fault_subdir=None) -> pd.DataFrame:
  """Collect every ``*_DE_time`` signal recorded at the given motor load.

  Scans the normal-baseline directory and each sub-folder of the fault
  directory for files whose name ends in ``_<load>.mat``. Directory listings
  are sorted so the row order of the result is reproducible (os.listdir
  returns entries in arbitrary order).

  Args:
    load: Motor load identifier; selects files ending in ``_<load>.mat``.
    base_path: Dataset root. Defaults to the module-level BASE_PATH.
    normal_subdir: Directory (under base_path) with the normal-baseline
      files. Defaults to NORMAL_DATA_PATH.
    fault_subdir: Directory (under base_path) whose sub-folders hold the
      fault files. Defaults to DRIVE_END_48K_DATA_PATH.

  Returns:
    DataFrame with columns 'signal' (squeezed array), 'label' (file name
    without extension and load suffix) and 'load'. The columns are present
    even when no file matched.

  Raises:
    OSError: If one of the scanned directories cannot be listed.
  """
  # Resolve defaults at call time so the module constants are only needed
  # when the caller does not override them.
  if base_path is None:
    base_path = BASE_PATH
  if normal_subdir is None:
    normal_subdir = NORMAL_DATA_PATH
  if fault_subdir is None:
    fault_subdir = DRIVE_END_48K_DATA_PATH

  suffix = f"_{load}.mat"
  records = []

  # Normal baseline data for the chosen motor load.
  normal_dir = os.path.join(base_path, normal_subdir)
  for filename in sorted(os.listdir(normal_dir)):
    if filename.endswith(suffix):
      record = _read_de_record(os.path.join(normal_dir, filename), filename, load)
      if record is not None:
        records.append(record)

  # Fault data: one sub-folder per fault location; plain files are skipped.
  fault_dir = os.path.join(base_path, fault_subdir)
  for subfolder in sorted(os.listdir(fault_dir)):
    subfolder_path = os.path.join(fault_dir, subfolder)
    if not os.path.isdir(subfolder_path):
      continue
    for filename in sorted(os.listdir(subfolder_path)):
      if filename.endswith(suffix):
        record = _read_de_record(
          os.path.join(subfolder_path, filename), filename, load, announce=True
        )
        if record is not None:
          records.append(record)

  # Explicit columns keep the schema stable for an empty result.
  return pd.DataFrame(records, columns=['signal', 'label', 'load'])

In [None]:
# Load the subset for each motor load (0 through 3) and stack the results
# into a single DataFrame with a fresh consecutive index.
data = pd.concat(
  [load_data_for_motorload(load) for load in range(4)],
  ignore_index=True,
)

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/Ball/B007_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/Ball/B014_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/Ball/B021_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/OuterRaceOpposite@12.00/OR021@12_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/OuterRaceOpposite@12.00/OR007@12_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/OuterRaceOrthogonal@3.00/OR021@3_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/OuterRaceOrthogonal@3.00/OR007@3_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/InnerRace/IR007_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/InnerRace/IR014_0.mat

/content/drive/MyDrive/Praca dyplomowa/Data/48kDriveEndBearingFaultData/InnerRace/IR021_0.mat



In [None]:
# Display the merged DataFrame as the cell output.
data

Unnamed: 0,signal,label,load
0,"[0.05319692307692307, 0.08866153846153846, 0.0...",Normal,0
1,"[-0.111192, -0.08302892307692307, -0.042348923...",B007,0
2,"[0.03984553846153846, 0.0897046153846154, 0.13...",B014,0
3,"[0.17152399999999998, 0.326772, 0.491618666666...",B021,0
4,"[0.051749333333333335, 0.05592266666666667, 0....",OR021@12,0
5,"[0.328152, 0.24011630769230768, 0.153332307692...",OR007@12,0
6,"[-0.06802533333333333, -0.27335333333333334, -...",OR021@3,0
7,"[0.471888, 0.4418473846153846, 0.3515169230769...",OR007@3,0
8,"[0.010016, -0.023788, -0.007929333333333333, 0...",IR007,0
9,"[0.18195733333333333, 0.2032413333333333, 0.21...",IR014,0


In [None]:
# Persist the merged DataFrame so it can be reloaded without re-reading the
# .mat files.
merged_pickle_path = '48kdrive-end_normalbaseline_loadsplit_data.pkl'
with open(merged_pickle_path, 'wb') as pickle_file:
  pickle.dump(data, pickle_file)

### Cropping all of the data to the shortest signal length

In [None]:
# Reload the merged DataFrame from its pickle file.
with open('48kdrive-end_normalbaseline_loadsplit_data.pkl', 'rb') as source_file:
  data = pickle.load(source_file)

# Length of the shortest signal; every signal is truncated to this length so
# all entries of 'cropped_signal' have the same number of samples.
signal_lengths = data['signal'].map(len)
min_len = signal_lengths.min()
data['cropped_signal'] = data['signal'].apply(lambda samples: samples[:min_len])

# Save the table (original and cropped signals) under a separate file name.
with open('48kdrive-end_normalbaseline_loadsplit_cropped_data.pkl', 'wb') as target_file:
  pickle.dump(data, target_file)