# Import & Setting

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import rc
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# Parameter Setting

In [2]:
# Run configuration shared by the whole notebook.
CFG = dict(
    EPOCHS=60,            # number of training epochs used by Trainer.fit
    LEARNING_RATE=0.001,  # optimizer learning rate (see the training cell)
    BATCH_SIZE=64,        # batch size of the DataLoaders
    SEED=41,              # value handed to seed_everything
)

# Seed

In [3]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, the current CUDA device and all CUDA devices), then sets the cuDNN
    flags for deterministic execution.

    Args:
        seed: Seed value passed unchanged to each library.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # multi-GPU environments
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Force deterministic algorithms and disable cuDNN's algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed

# Load Data

In [4]:
# Project root directory. Set the HEAT_TREATMENT_PDM_DIR environment variable
# to point the notebook at another location without editing it; the original
# local path remains the default. os.path.join(..., '') guarantees the trailing
# separator that the `file_path + 'Data/...'` concatenations rely on.
file_path = os.path.join(
    os.environ.get('HEAT_TREATMENT_PDM_DIR', '/Users/toad/Documents/ToyProject/Heat_Treatment_PdM/'),
    '',
)

In [5]:
def _read_split(csv_name):
    """Read one CSV located under `file_path` and drop its 'Unnamed: 0' column."""
    return pd.read_csv(file_path+csv_name).drop(columns='Unnamed: 0')


train = _read_split('Data/train_normal.csv')
valid = _read_split('Data/valid_normal.csv')
test = _read_split('Data/test_normal.csv')
anomaly = _read_split('Data/anomaly_data.csv')

In [6]:
# Row counts of the three normal splits and of the anomaly set, one per line.
print(
    'Normal Data',
    f'- Train Data : {len(train)}',
    f'- Valid Data : {len(valid)}',
    f'- Test Data : {len(test)}',
    '--------------------------------',
    f'Anomaly Data : {len(anomaly)}',
    sep='\n',
)

Normal Data
- Train Data : 2020534
- Valid Data : 288648
- Test Data : 288648
--------------------------------
Anomaly Data : 2717


In [7]:
# Show the first rows of the training frame, then list its column names.
display(train.head(5))
print('Columns :', train.columns.values)

Unnamed: 0,CP(제어) OP,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,소입4존 OP,...,소입로 CP 모니터 값,소입로 온도 2 Zone,솔트 1존 OP,솔트 2존 OP,솔트 슬러지 제거,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,outliers_cnt,anomaly
0,27.5181,69.4988,28.0363,100.366,100.353,69.2698,84.3415,57.7779,50.808,70.3781,...,1.14953e-10,860.286,70.7822,59.7691,325.772,288.802,285.324,328.856,1,0
1,27.5181,69.5777,28.2541,100.366,100.353,69.2698,89.9508,58.0981,49.5654,70.6791,...,1.15036e-10,860.272,72.1144,59.8864,325.711,288.728,285.311,328.783,1,0
2,28.8494,69.6464,30.5821,100.366,100.291,69.2085,85.0744,58.0155,51.0691,70.6326,...,1.1493e-10,860.272,72.1144,59.8864,325.833,288.789,285.372,328.783,2,0
3,28.8494,69.828,28.39,100.428,100.353,69.2698,84.9387,57.9397,49.485,70.6453,...,1.15036e-10,860.272,71.0169,59.8864,325.833,288.728,285.372,328.844,1,0
4,28.8494,72.1245,26.3033,100.366,100.353,69.2085,84.981,57.9252,49.4771,70.6567,...,1.15036e-10,860.272,71.0169,59.8864,325.833,288.728,285.372,328.844,1,0


Columns : ['CP(제어) OP' '건조 1존 OP' '건조 2존 OP' '건조로 온도 1 Zone' '건조로 온도 2 Zone' '세정기'
 '소입1존 OP' '소입2존 OP' '소입3존 OP' '소입4존 OP' '소입로 CP 값' '소입로 CP 모니터 값'
 '소입로 온도 2 Zone' '솔트 1존 OP' '솔트 2존 OP' '솔트 슬러지 제거' '솔트 컨베이어 온도 1 Zone'
 '솔트 컨베이어 온도 2 Zone' '솔트조 온도 1 Zone' 'outliers_cnt' 'anomaly']


# Data Processing

#### Delete outliers count & anomaly feature

In [8]:
# Column counts before the drop.
print(f'''train cols : {len(train.columns)}
valid cols : {len(valid.columns)}
test cols : {len(test.columns)}
anomaly cols : {len(anomaly.columns)}''')
# drop() returns a new frame and errors='ignore' skips columns that are
# already gone, so re-running this cell no longer raises a KeyError the way
# the in-place drops did.
train = train.drop(columns=['outliers_cnt', 'anomaly'], errors='ignore')
valid = valid.drop(columns=['outliers_cnt', 'anomaly'], errors='ignore')
# Only 'outliers_cnt' is removed from test and anomaly; their 'anomaly' column
# is kept (the inference section slices it off `test` before scaling).
test = test.drop(columns=['outliers_cnt'], errors='ignore')
anomaly = anomaly.drop(columns=['outliers_cnt'], errors='ignore')
# Column counts after the drop.
print(f'''train cols : {len(train.columns)}
valid cols : {len(valid.columns)}
test cols : {len(test.columns)}
anomaly cols : {len(anomaly.columns)}''')

train cols : 21
valid cols : 21
test cols : 21
anomaly cols : 21
train cols : 19
valid cols : 19
test cols : 20
anomaly cols : 20


#### Scaling

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same scaling to
# the train and valid splits.
scaler = MinMaxScaler().fit(train)
train_scaled = scaler.transform(train)
valid_scaled = scaler.transform(valid)
# Scaling of the test / anomaly frames is left disabled in this cell:
# test_scaled = scaler.transform(test)
# anomaly_scaled = scaler.transform(anomaly)

# Dataset


In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [11]:
class CustomDataset(Dataset):
    """Dataset wrapper that serves the rows of an array-like object.

    Args:
        data: Object supporting integer indexing and exposing ``.shape``;
            its first axis is treated as the sample axis.
    """

    def __init__(self, data):
        # Keep a reference to the caller's object; nothing is copied.
        self.data = data

    def __len__(self):
        """Return the number of samples, i.e. the size of the first axis."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        return self.data[index]

In [12]:
train_dataset = CustomDataset(train_scaled)
valid_dataset = CustomDataset(valid_scaled)
# test_dataset = CustomDataset(test_scaled)
# anomaly_dataset = CustomDataset(anomaly_scaled)

# Only the training batches are shuffled. Validation is read in a fixed order:
# the validation loss is averaged per batch, so when the final batch is
# partial a shuffled loader makes the reported mean depend on the draw, and
# shuffling would also consume the global RNG between training epochs.
train_dataloader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)
# test_dataloader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
# anomaly_dataloader = DataLoader(anomaly_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)

# Modeling

In [13]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 32 -> 64 -> 128 -> 64 -> 32 -> input_dim.

    Every linear layer, including the last decoder layer, is followed by a
    ReLU, so reconstructions are never negative.

    Args:
        input_dim: Number of features per sample. Defaults to 19, the width
            this notebook was written for, so ``AutoEncoder()`` builds the
            same network as before.
    """

    def __init__(self, input_dim=19):
        super().__init__()
        # The attribute names `Encoder` / `Decoder` and the layer order define
        # the state_dict keys, so they must stay as they are for saved
        # checkpoints to keep loading.
        self.Encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU()
        )

        self.Decoder = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, input_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        """Encode ``x`` and decode it again; the last dimension is preserved."""
        return self.Decoder(self.Encoder(x))

# Train

In [14]:
def loss_plot(train_loss, valid_loss):
    """Draw the train and validation loss curves on a log-scaled y axis and show the figure.

    Args:
        train_loss: Sequence of training loss values to plot.
        valid_loss: Sequence of validation loss values to plot.
    """
    curves = ((train_loss, 'Train Loss'), (valid_loss, 'Valid Loss'))
    for values, curve_label in curves:
        plt.plot(values, label=curve_label)
    plt.yscale("log")  # logarithmic y axis
    plt.ylabel("Loss (Log Scale)")
    plt.legend()
    plt.show()

In [15]:
class Trainer():
    """Train a model on reconstruction (MSE) loss and record per-epoch losses.

    Args:
        model: ``nn.Module`` whose output is compared with its own input.
        optimizer: Optimizer stepping ``model``'s parameters.
        train_loader: Iterable yielding training batches (tensors).
        valid_loader: Iterable yielding validation batches (tensors).
        device: ``torch.device`` the model and every batch are moved to.
    """

    def __init__(self, model, optimizer, train_loader, valid_loader, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.device = device
        # Loss function
        self.criterion = nn.MSELoss().to(self.device)
        # Mean loss of every epoch, appended to by fit()
        self.train_mean_list = []
        self.valid_mean_list = []

    def fit(self, epochs=None):
        """Run the training loop, checkpoint on improvement, then plot the losses.

        After each epoch the mean of the per-batch train and validation
        losses is printed and stored. Whenever the mean validation loss
        improves, the model's state_dict is written to
        ``'<epoch>_best_model.pth'`` in the current working directory (earlier
        files are kept). Early stopping is disabled: all epochs always run.

        Args:
            epochs: Number of epochs to train for. ``None`` (the default)
                uses ``CFG['EPOCHS']``, as the method always did before.
        """
        if epochs is None:
            epochs = CFG['EPOCHS']

        self.model.to(self.device)
        # inf instead of a large magic number: the first finite validation
        # loss always counts as an improvement.
        best_loss = float('inf')

        for epoch in range(epochs):
            self.model.train()
            train_loss = []

            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)  # reconstruction produced by the model
                # (prediction, target) order as documented for MSELoss; the
                # value is the same as with the arguments swapped.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            valid_loss = self.validation(self.model)

            mean_train_loss = np.mean(train_loss)
            mean_valid_loss = np.mean(valid_loss)
            self.train_mean_list.append(mean_train_loss)
            self.valid_mean_list.append(mean_valid_loss)

            print(f'Epoch : [{epoch}] Train Loss : [{mean_train_loss:.20f}] Valid Loss : [{mean_valid_loss:.20f}]')

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(self.model.state_dict(), f'{epoch}_best_model.pth')
        loss_plot(self.train_mean_list, self.valid_mean_list)  # visualize loss

    def validation(self, model):
        """Return the list of per-batch losses of ``model`` on ``self.valid_loader``.

        Puts ``model`` into eval mode and runs without gradient tracking.
        """
        model.eval()
        valid_loss = []
        with torch.no_grad():
            for x in self.valid_loader:
                x = x.float().to(self.device)
                _x = model(x)
                loss = self.criterion(_x, x)
                valid_loss.append(loss.item())
        return valid_loss


In [None]:
# Training is left disabled in this cell; the result summary that follows it
# reports a run on a Kaggle P100. Uncomment the lines below to retrain.
# model = AutoEncoder()
# optimizer = optim.Adam(params=model.parameters(), lr=CFG['LEARNING_RATE'])

# trainer = Trainer(model, optimizer, train_dataloader, valid_dataloader, device)
# trainer.fit()

Kaggle P100 Training Result 
- Best Model 
    - Epoch : [51]  
    - Train Loss : [0.00000356411259314551]  
    - Valid Loss : [0.00000229669710111774]
- Loss Plot
    
    <img src="/Users/toad/Documents/ToyProject/Heat_Treatment_PdM/Epoch60_Loss.png" alt="Train and validation loss per epoch (60 epochs, log scale)" width="500" height="400">

# Inference

In [17]:
# Rebuild the network and restore the saved weights (deserialised onto the
# CPU), then move the model to the active device and switch to eval mode.
checkpoint_path = file_path+'/Code/best_model.pth'
model = AutoEncoder()
saved_weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(saved_weights)
model.to(device)
model.eval()

AutoEncoder(
  (Encoder): Sequential(
    (0): Linear(in_features=19, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=128, bias=True)
    (5): ReLU()
  )
  (Decoder): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=19, bias=True)
    (5): ReLU()
  )
)

In [20]:
def validation(model, valid_loader, criterion, device=None):
    """Return the per-batch losses of ``model`` reconstructing ``valid_loader``.

    The model is put into eval mode (as ``Trainer.validation`` does) and the
    loop runs without gradient tracking.

    Args:
        model: ``nn.Module`` to evaluate.
        valid_loader: Iterable yielding batches (tensors).
        criterion: Callable invoked as ``criterion(x, model(x))``; its result
            must support ``.item()``.
        device: Device each batch is moved to. ``None`` (the default) uses
            the device of the model's first parameter, or the CPU when the
            model has no parameters.

    Returns:
        list[float]: One loss value per batch, in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    valid_loss = []
    with torch.no_grad():
        for x in valid_loader:
            x = x.float().to(device)
            _x = model(x)
            # Argument order kept as (x, reconstruction) so an asymmetric
            # criterion passed by a caller behaves as before.
            loss = criterion(x, _x)
            valid_loss.append(loss.item())
    return valid_loss

In [22]:
# Evaluate the restored model on the validation loader and report the mean
# of the per-batch MSE values.
mse_criterion = nn.MSELoss().to(device)
valid_loss = validation(model, valid_dataloader, mse_criterion)
mean_valid_loss = np.mean(valid_loss)
print(f'Valid Loss : {mean_valid_loss:.20f}')

Valid Loss : 0.00000229670615638400


In [38]:
# Scale every column of `test` except the last one with the fitted scaler.
test_features = test.iloc[:, :-1]
test_scaled = scaler.transform(test_features)

Unnamed: 0,CP(제어) OP,건조 1존 OP,건조 2존 OP,건조로 온도 1 Zone,건조로 온도 2 Zone,세정기,소입1존 OP,소입2존 OP,소입3존 OP,소입4존 OP,소입로 CP 값,소입로 CP 모니터 값,소입로 온도 2 Zone,솔트 1존 OP,솔트 2존 OP,솔트 슬러지 제거,솔트 컨베이어 온도 1 Zone,솔트 컨베이어 온도 2 Zone,솔트조 온도 1 Zone,anomaly
0,31.6492,70.02040,27.63070,100.2790,99.7349,68.1173,83.50290,58.3751,55.55580,73.0755,0.447044,1.140880e-10,859.675,63.8160,54.2885,330.8930,291.991,277.551,332.240,0
1,31.6492,72.10290,30.14790,100.3490,99.6806,68.1281,84.43160,58.3739,55.59790,73.1007,0.446637,1.140880e-10,859.675,63.8160,53.7397,330.9660,291.991,277.551,332.240,0
2,32.8507,71.64070,27.62800,100.2880,99.6806,68.1281,85.11800,58.3731,55.63670,73.1237,0.446845,1.140880e-10,859.675,63.8160,54.2885,330.9050,292.052,277.612,332.240,0
3,32.8507,69.16080,30.01890,100.3490,99.6806,68.1894,85.74830,58.3726,55.67250,73.1448,0.447216,1.140880e-10,859.675,63.8160,53.7397,330.9660,292.052,277.551,332.240,0
4,32.8507,69.20370,30.02110,100.3490,99.6806,68.1894,81.93010,56.8513,55.65020,73.1641,0.447207,1.140880e-10,859.738,63.8160,54.2885,330.9660,292.052,277.612,332.240,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288643,15.6761,75.07815,25.52975,98.9276,99.5366,68.8107,53.49165,54.6361,56.35050,70.0290,0.449622,1.147110e-10,860.064,73.3786,61.9842,339.0390,279.522,277.586,331.721,0
288644,15.6761,74.52635,26.90115,98.9765,99.5610,68.8594,53.46750,54.6559,56.38505,70.1214,0.449972,1.147110e-10,860.094,73.3786,61.9842,339.0635,279.450,277.573,331.714,0
288645,15.6761,73.43930,26.53660,98.9150,99.4624,68.7981,57.84290,53.8060,56.34330,70.0723,0.450438,1.147110e-10,860.124,73.3786,62.5332,339.0635,279.450,277.512,331.714,0
288646,15.6761,73.50410,26.38570,98.9150,99.5239,68.7981,57.57580,53.8274,56.30550,70.0574,0.450514,1.147110e-10,860.124,73.3786,62.5332,339.0635,279.450,277.512,331.714,0
