# Prerequisites
Understanding in
- machine learning and deep learning
- python syntax
- python libraries: numpy, pandas, Pytorch

# Introduction
This notebook shows the study of the [CWRU Bearing Dataset](https://csegroups.case.edu/bearingdatacenter/home), which contains data of normal and fault bearings. Artificial defects of different diameters (0.007 ~ 0.028 Inches) are manufactured at different locations of the bearings: inner raceway(IR), outer raceway(OR) and ball(B) defects. 

Vibration data was recorded for motor loads of 0 to 3 hp (motor speed of 1797 to 1720 RPM) using accelerometers at the drive end (DE) and fan end (FE) and the data is stored as Matlab files. The sampling rate is 12 kHz and each Matlab file contains between ~120k to ~240k sample points. For more information please refer to the [website](https://csegroups.case.edu/bearingdatacenter/home).

This study focuses on the classification of the drive end bearing defects using only the signal data at **DE**. It is a **multiclass classification** problem. The input is the vibration signal data at DE and the output is the type of defects:
- 0 : Normal (N), 
- 1 : Fault at Ball (B),
- 2 : Fault at Inner Raceway (IR), 
- 3 : Fault at Outer Raceway (OR), 



# Import

In [2]:
# Data science libraries
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Pytorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from torch.nn.modules.loss import CrossEntropyLoss

# Others
from IPython.core.debugger import set_trace
from pathlib import Path

from helper import get_df_all, download
from train_helper import get_dataloader, fit, validate 
import nn_model
from data_urls import URLS

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
working_dir = Path('.')
DATA_PATH = Path("./Data")
save_model_path = working_dir / 'Model'
DE_path = DATA_PATH / '12k_DE'

for path in [DATA_PATH, save_model_path]:
    if not path.exists():
        path.mkdir(parents=True)

In [3]:
# # Uncomment this to download the 12k_DE data if needed
# for name, url in URLS["DE_12k"].items():
#     download(url, DE_path, name, suffix=".mat")

In [4]:
#### HYPERPARAMETERS ####
bs = 64
lr = 0.001
wd = 1e-5
betas=(0.99, 0.999)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
random_seed = 42

cuda


# Load Data and Preprocessing

In [None]:
df_all = get_df_all(DE_path, segment_length=500, normalize=False)
features = df_all.columns[2:]
target = 'label'

         filename                                            DE_time label
0      B007_0.mat  [[-0.0027613972055888225], [-0.096324031936127...     B
1      B007_1.mat  [[-0.07212119760479042], [0.30310395209580837]...     B
2      B007_2.mat  [[-0.3144744111776447], [0.12198878243512974],...     B
3      B007_3.mat  [[0.09031393213572854], [0.22497265469061875],...     B
4      B014_0.mat  [[-0.46781317365269465], [0.1790035129740519],...     B
..            ...                                                ...   ...
59  OR021@3_3.mat  [[0.08868958083832335], [0.06854762475049901],...    OR
60  OR021@6_0.mat  [[0.10436457085828342], [0.017461776447105788]...    OR
61  OR021@6_1.mat  [[-0.025989620758483035], [-0.0272078842315369...    OR
62  OR021@6_2.mat  [[-0.08771497005988023], [-0.00203043912175648...    OR
63  OR021@6_3.mat  [[-0.27613972055888225], [0.0625375249500998],...    OR

[64 rows x 3 columns]
----------------------------------------------------
------------------------

In [6]:
df_all.sample(10)

Unnamed: 0,label,filename,0,1,2,3,4,5,6,7,...,490,491,492,493,494,495,496,497,498,499
2990,1,B028_0.mat,0.085747,0.595471,-0.674725,-0.277838,1.004504,0.133938,-1.419917,-0.418492,...,1.035456,0.532784,-1.141743,-0.832226,1.065624,1.098143,-0.838887,-1.26986,0.340805,1.025661
11922,3,OR007@12_3.mat,0.098072,0.224257,0.185577,0.297812,0.39356,-0.038893,-0.263996,0.196357,...,0.490577,1.512737,-0.830243,-1.074369,1.040337,0.462676,-0.788393,0.2325,0.5375,-0.51383
6383,2,IR021_2.mat,0.02123,0.12839,0.21146,0.043659,-0.1582,0.108454,-0.283636,-0.341784,...,-0.197243,-0.463066,0.245519,0.658375,0.272932,-0.018643,0.181555,0.31862,-0.195582,-0.451436
7053,2,IR028_1.mat,1.340372,-0.33723,-1.523649,0.114115,2.380554,1.562401,-0.47618,0.540196,...,0.645137,0.12286,-0.694807,-0.482982,0.174845,0.515904,-0.244435,-0.839587,0.176788,1.101826
4686,2,IR007_3.mat,0.031574,-0.787928,1.640016,2.365757,-0.495249,-0.156466,0.290065,-2.505671,...,-0.242975,-1.351012,1.778844,1.899024,-1.820853,-1.284706,1.305377,0.165741,-0.710226,0.326326
12003,3,OR007@12_3.mat,-0.071231,-0.044599,-0.114984,-0.04777,-0.042697,-0.1283,-0.392718,-0.302676,...,-0.43647,0.209673,0.28513,-0.826438,-0.510025,0.211575,-0.231657,-0.920918,-0.22278,0.397365
5913,2,IR021_0.mat,0.11302,-0.568197,-0.303752,0.467933,0.474892,0.181837,0.424632,0.497316,...,0.279265,-1.635256,-1.916712,0.560721,1.367974,-1.019764,-1.102499,1.760002,1.666441,-1.141934
7819,0,Normal_0.mat,-1.96655,-1.160067,-0.310533,0.605012,1.147451,1.70998,1.827652,1.135971,...,0.315137,0.610752,0.536131,0.140064,-0.626238,-1.197378,-1.246168,-1.142847,-0.987864,-1.025175
11987,3,OR007@12_3.mat,0.321273,-0.340722,-0.31409,0.538134,0.978829,0.229329,-0.390181,0.105681,...,0.700462,-0.739567,0.053051,1.033362,-0.568996,-1.215138,0.607884,0.702364,-0.79727,-0.147323
4775,2,IR007_3.mat,-0.561037,0.517992,0.571348,-0.11088,0.372947,0.96245,0.334096,-0.450181,...,-0.802433,0.632992,1.436953,-0.116061,-0.78482,0.148128,-0.005205,-0.578131,-0.181331,-0.288042


In [7]:
df_all.shape

(17987, 502)

In [8]:
## Split the data into train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(df_all[features], 
                                                      df_all[target], 
                                                      test_size=0.20, random_state=random_seed, shuffle=True
                                                     )

In [9]:
## Create DataLoader of train and validation set
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_valid = torch.tensor(y_valid.values, dtype=torch.long)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
train_dl, valid_dl = get_dataloader(train_ds, valid_ds, bs)

# Training with Adams Optimizer

In [10]:
## Instantiate model, optimizer and loss function
model = nn_model.CNN_1D_2L(len(features))
model.to(device)
opt = optim.Adam(model.parameters(), lr=lr, betas=betas, weight_decay=wd)
loss_func = CrossEntropyLoss()

In [11]:
%%time
## Train
epochs = 20
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
CPU times: total: 281 ms
Wall time: 295 ms


KeyboardInterrupt: 

In [None]:
%%time
## Train
epochs = 20
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.00022 	 0.24229 	 0.00000 	0.93274 	
1 	 0.00020 	 0.05039 	 0.00000 	0.98193 	
2 	 0.00029 	 0.17963 	 0.00000 	0.94997 	
3 	 0.00029 	 0.07733 	 0.00000 	0.97721 	
4 	 0.00028 	 0.04317 	 0.00000 	0.98555 	
5 	 0.00017 	 0.03448 	 0.00000 	0.99138 	
6 	 0.00018 	 0.07745 	 0.00000 	0.97665 	
7 	 0.00014 	 0.06705 	 0.00000 	0.98054 	
8 	 0.00033 	 0.06601 	 0.00000 	0.97860 	
9 	 0.00022 	 0.04243 	 0.00000 	0.98777 	
10 	 0.00020 	 0.08321 	 0.00000 	0.97193 	
11 	 0.00018 	 0.04951 	 0.00000 	0.98888 	
12 	 0.00019 	 0.05189 	 0.00000 	0.98360 	
13 	 0.00014 	 0.06283 	 0.00000 	0.97860 	
14 	 0.00022 	 0.02842 	 0.00000 	0.98999 	
15 	 0.00017 	 0.04112 	 0.00000 	0.98638 	
16 	 0.00020 	 0.02733 	 0.00000 	0.99055 	
17 	 0.00020 	 0.08156 	 0.00000 	0.97387 	
18 	 0.00017 	 0.03271 	 0.00000 	0.98944 	
19 	 0.00016 	 0.06353 	 0.00000 	0.98638 	
CPU times: total: 9.77 s
Wall time: 10.1 s


# Save trained model

In [None]:
torch.save(model.state_dict(), save_model_path / 'model_2.pth')

In [None]:
model2 = nn_model.CNN_1D_2L(len(features))

In [None]:
model2.load_state_dict(torch.load(save_model_path / 'model_2.pth'))
model2.eval()

  model2.load_state_dict(torch.load(save_model_path / 'model_2.pth'))


CNN_1D_2L(
  (layer1): Sequential(
    (0): Conv1d(1, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (linear1): Linear(in_features=16000, out_features=4, bias=True)
)

In [None]:
%%time
print(validate(model, valid_dl, loss_func))

(0.06352972865203424, np.float64(0.9863813229571985), (array([2, 0, 1, ..., 3, 1, 3], shape=(3598,)), array([2, 0, 1, ..., 3, 1, 3], shape=(3598,))))
CPU times: total: 156 ms
Wall time: 151 ms
