# Prerequisites
Understanding in
- machine learning and deep learning
- python syntax
- python libraries: numpy, pandas, Pytorch

# Introduction
This notebook shows the study of the [CWRU Bearing Dataset](https://csegroups.case.edu/bearingdatacenter/home), which contains data of normal and fault bearings. Artificial defects of different diameters (0.007 ~ 0.028 Inches) are manufactured at different locations of the bearings: inner raceway(IR), outer raceway(OR) and ball(B) defects. 

Vibration data was recorded for motor loads of 0 to 3 hp (motor speed of 1797 to 1720 RPM) using accelerometers at the drive end (DE) and fan end (FE) and the data is stored as Matlab files. The sampling rate is 12 kHz and each Matlab file contains between ~120k to ~240k sample points. For more information please refer to the [website](https://csegroups.case.edu/bearingdatacenter/home).

This study focuses on the classification of the drive end bearing defects using only the signal data at **DE**. It is a **multiclass classification** problem. The input is the vibration signal data at DE and the output is the type of defects:
- 0 : Normal (N), 
- 1 : Fault at Ball (B),
- 2 : Fault at Inner Raceway (IR), 
- 3 : Fault at Outer Raceway (OR), 



# Import

In [1]:
# Data science libraries
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Pytorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from torch.nn.modules.loss import CrossEntropyLoss

# Others
from IPython.core.debugger import set_trace
from pathlib import Path

from helper import get_df_all, download
from train_helper import get_dataloader, fit, validate 
import nn_model
from data_urls import URLS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
working_dir = Path('.')
DATA_PATH = Path("./Data")
save_model_path = working_dir / 'Model'
DE_path = DATA_PATH / '12k_DE'

for path in [DATA_PATH, save_model_path]:
    if not path.exists():
        path.mkdir(parents=True)

In [3]:
# # Uncomment this to download the 12k_DE data if needed
for name, url in URLS["DE_12k"].items():
    download(url, DE_path, name, suffix=".mat")

In [4]:
#### HYPERPARAMETERS ####
bs = 64
lr = 0.001
wd = 1e-5
betas=(0.99, 0.999)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
random_seed = 42
print(device)
print(torch.cuda.is_available())

cuda
True


# Load Data and Preprocessing

In [5]:
df_all = get_df_all(DE_path, segment_length=500, normalize=True)
features = df_all.columns[2:]
target = 'label'

In [6]:
df_all.sample(5)

Unnamed: 0,label,filename,0,1,2,3,4,5,6,7,...,490,491,492,493,494,495,496,497,498,499
16920,1,Data/12k_DE/B007_0.mat,0.744324,0.035597,-0.732874,0.348374,0.545178,-0.895706,-0.869934,1.061787,...,0.044969,0.178514,0.253487,-0.031176,0.193743,0.221858,-0.236179,0.254658,0.438576,0.061369
4346,1,Data/12k_DE/B028_1.mat,0.486591,1.582707,0.143404,-1.996307,-0.829228,1.774347,1.044672,-1.405751,...,3.74968,1.29204,-3.31814,-1.191259,3.311875,1.207847,-3.015045,-1.247388,2.413012,1.253952
7454,3,Data/12k_DE/OR021@3_1.mat,-0.026308,0.196805,0.831245,0.623579,0.01431,-0.156743,-0.111548,-0.832946,...,-0.351824,1.125868,1.052642,-0.083516,-0.036033,0.862138,0.78033,-0.220816,-0.582373,0.438796
15439,1,Data/12k_DE/B021_2.mat,-1.061048,0.741058,0.99114,-0.606354,-0.43357,0.469757,-0.203191,-0.613932,...,0.637994,-0.619995,-0.201676,0.086298,-0.101643,0.74712,0.393974,-0.712449,-1.085299,0.345473
5914,2,Data/12k_DE/IR028_1.mat,0.23606,-0.787117,-1.688834,-0.374154,1.346688,1.393329,0.833157,0.924008,...,-0.618531,-0.862908,-0.014633,0.15784,-0.701609,-0.909062,-0.372696,0.013546,0.02715,0.260352


In [7]:
df_all.shape

(17987, 502)

In [8]:
## Split the data into train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(df_all[features], 
                                                      df_all[target], 
                                                      test_size=0.20, random_state=random_seed, shuffle=True
                                                     )

In [9]:
## Create DataLoader of train and validation set
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_valid = torch.tensor(y_valid.values, dtype=torch.long)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
train_dl, valid_dl = get_dataloader(train_ds, valid_ds, bs)

# Training with Adams Optimizer

In [10]:
## Instantiate model, optimizer and loss function
model = nn_model.CNN_1D_2L(len(features))
model.to(device)
opt = optim.Adam(model.parameters(), lr=lr, betas=betas, weight_decay=wd)
loss_func = CrossEntropyLoss()

In [11]:
%%time
## Train
epochs = 20
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.01264 	 0.37007 	 0.00000 	0.88049 	
1 	 0.00329 	 0.22787 	 0.00000 	0.91245 	
2 	 0.00148 	 0.27648 	 0.00000 	0.89550 	
3 	 0.00115 	 0.11189 	 0.00000 	0.96220 	
4 	 0.00085 	 0.09620 	 0.00000 	0.96609 	
5 	 0.00061 	 0.08284 	 0.00000 	0.97193 	
6 	 0.00067 	 0.17547 	 0.00000 	0.93663 	
7 	 0.00054 	 0.09470 	 0.00000 	0.97082 	
8 	 0.00045 	 0.07605 	 0.00000 	0.97610 	
9 	 0.00041 	 0.06929 	 0.00000 	0.97387 	
10 	 0.00039 	 0.05818 	 0.00000 	0.97804 	
11 	 0.00043 	 0.04972 	 0.00000 	0.98166 	
12 	 0.00040 	 0.04208 	 0.00000 	0.98249 	
13 	 0.00032 	 0.05615 	 0.00000 	0.98277 	
14 	 0.00029 	 0.02324 	 0.00000 	0.99444 	
15 	 0.00018 	 0.04442 	 0.00000 	0.98277 	
16 	 0.00030 	 0.03703 	 0.00000 	0.98805 	
17 	 0.00029 	 0.08931 	 0.00000 	0.96998 	
18 	 0.00046 	 0.02955 	 0.00000 	0.99055 	
19 	 0.00054 	 0.08679 	 0.00000 	0.97499 	
CPU times: user 16.8 s, sys: 165 ms, total: 16.9 s
Wall time: 17.2 s


In [17]:
%%time
## Train
epochs = 3
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.00001 	 0.17479 	 0.00000 	0.97437 	
1 	 0.00006 	 0.13076 	 0.00000 	0.97994 	
2 	 0.00004 	 0.11195 	 0.00000 	0.98440 	
CPU times: user 2.13 s, sys: 14.9 ms, total: 2.15 s
Wall time: 2.17 s


# Save trained model

In [None]:
torch.save(model.state_dict(), save_model_path / 'model_12k.pth')

In [13]:
model2 = nn_model.CNN_1D_2L(len(features))

In [None]:
model2.load_state_dict(torch.load(save_model_path / 'model_12k.pth'))
model2.eval()

  model2.load_state_dict(torch.load(save_model_path / 'model_12k_1000.pth'))


CNN_1D_2L(
  (layer1): Sequential(
    (0): Conv1d(1, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (linear1): Linear(in_features=32000, out_features=4, bias=True)
)

In [15]:
%%time
print(validate(model, valid_dl, loss_func))

(0.06863823064439475, np.float64(0.9860724233983287), (array([0, 3, 2, ..., 2, 1, 1], shape=(1795,)), array([0, 3, 2, ..., 2, 1, 1], shape=(1795,))))
CPU times: user 58.6 ms, sys: 2.94 ms, total: 61.5 ms
Wall time: 60.5 ms
