# AutoEncoder Study

## Colab setting

In [1]:
# Colab
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Colab
import pandas as pd
train_df = pd.read_csv('./drive/MyDrive/data/train.csv')
train_df = train_df.drop(columns=['ID'])
val_df = pd.read_csv('./drive/MyDrive/data/val.csv')
val_df = val_df.drop(columns=['ID'])
test_df = pd.read_csv('./drive/MyDrive/data/test.csv')
test_df = test_df.drop(columns=['ID'])

## Import

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

## Data Load

In [4]:
# Local
# train_df = pd.read_csv('./data/train.csv')
# train_df = train_df.drop(columns=['ID'])
# val_df = pd.read_csv('./data/val.csv')
# val_df = val_df.drop(columns=['ID'])
# test_df = pd.read_csv('./data/test.csv')
# test_df = test_df.drop(columns=['ID'])

In [5]:
# validation data의 정상, 불량 거래 데이터 비율 확인
print('Normals', round(val_df['Class'].value_counts()[0]/len(val_df) * 100,2), '% of the dataset')
print('Frauds', round(val_df['Class'].value_counts()[1]/len(val_df) * 100,2), '% of the dataset')

Normals 99.89 % of the dataset
Frauds 0.11 % of the dataset


## Pytorch

In [6]:
# device 설정, gpu 있을시 gpu사용
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using Device:", device)

Using Device: cuda


### Hyper parameter

In [7]:
EPOCHS = 400
LR = 1e-2
BS = 16384
SEED = 123

### Fix Seed

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED) # Seed 고정

 ### Make DataSet    

In [9]:
# eval_ mode를 통해 validation, 즉 평가를 위한 데이터 val.df와 train.df 분리
# val_df의 Class인 정상 거래, 비정상 거래 내용을 labels로, 나머지 feature 값을 df로 저장
# train_df의 값을 df로 저장
class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

### Pytorch Data Load

In [10]:
# shuffle을 통해 데이터 과적합해결(신경망이 데이터의 순서를 예측하지 못하게 한다)
train_dataset = MyDataset(df=train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=2)


val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=2)

### AutoEncoder 구조(신경망)

In [11]:
# neural network를 이용해서 AutoEncoder Layer 설정
# BatchNorm1d 정규화 레이어 사용
# LeakyReLU 활성화 함수 사용
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.PReLU(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.PReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.PReLU(),
            nn.Linear(64,30),
        )
        
    def forward(self, x):
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x

### Train

In [12]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                # 역전파 단계 전에, optimizer 객체를 사용하여 (모델의 학습 가능한 가중치인) 갱신할
                # 변수들에 대한 모든 변화도(gradient)를 0으로 만듭니다. 이렇게 하는 이유는 기본적으로 
                # .backward()를 호출할 때마다 변화도가 버퍼(buffer)에 (덮어쓰지 않고) 누적되기
                # 때문입니다. 더 자세한 내용은 torch.autograd.backward에 대한 문서를 참조하세요.
                self.optimizer.zero_grad()

                # AutoEncoder 통과한 예측값
                _x = self.model(x)
                loss = self.criterion(x, _x)

                # 역전파 단계: 모델의 매개변수들에 대한 손실의 변화도를 계산합니다.
                loss.backward()
                # optimizer의 step 함수를 호출하면 매개변수가 갱신됩니다.
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        #  model.eval()는 이런 layer들의 동작을 inference(eval) mode로 바꿔준다는 목적
        eval_model.eval()
        pred = []
        true = []
        # torch.no_grad()의 주된 목적은 autograd(자동으로 gradient를 트래킹)를 끔으로써 메모리 사용량을 줄이고 연산 속도를 높히기 위함
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                #유사도 0.95보다 작은것은 이상거래 1, 아닌 것은 정상거래 0
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [13]:
model = nn.DataParallel(AutoEncoder())
model.eval()

# optim 패키지를 사용하여 모델의 가중치를 갱신할 optimizer를 정의합니다.
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
# 학습률 개선 scheduler, patience번 정체되면 학습률 factor와 곱한다. 
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Epoch : [0] Train loss : [0.5221804465566363] Val Score : [0.011953472826114834])
Epoch : [1] Train loss : [0.2989453354052135] Val Score : [0.31065869644030003])
Epoch : [2] Train loss : [0.19732261981282914] Val Score : [0.47505818666465444])
Epoch : [3] Train loss : [0.14701326404299056] Val Score : [0.5002556988120173])
Epoch : [4] Train loss : [0.11722193977662496] Val Score : [0.5083569226104104])
Epoch : [5] Train loss : [0.09686788597277232] Val Score : [0.5148608177924139])
Epoch : [6] Train loss : [0.0854653971535819] Val Score : [0.5251525663924234])
Epoch : [7] Train loss : [0.07923840518508639] Val Score : [0.5362409406460878])
Epoch : [8] Train loss : [0.07075492079768862] Val Score : [0.5438093350896822])
Epoch : [9] Train loss : [0.06794493538992745] Val Score : [0.5584269155630615])
Epoch : [10] Train loss : [0.06260997802019119] Val Score : [0.5747399991275138])
Epoch : [11] Train loss : [0.05903259292244911] Val Score : [0.5850157421008502])
Epoch : [12] Train loss :

### train 데이터를 예측

In [14]:
# 학습된 내용 불러오기
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

DataParallel(
  (module): AutoEncoder(
    (Encoder): Sequential(
      (0): Linear(in_features=30, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=64, out_features=128, bias=True)
      (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): PReLU(num_parameters=1)
    )
    (Decoder): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=64, out_features=30, bias=True)
    )
  )
)

In [15]:
train_loader2 = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=2)

In [16]:
def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            
            _x = model(x)
            
            diff = cos(x, _x).cpu().tolist()
            batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
            pred += batch_pred
    return pred

In [17]:
train_df2 = train_df
preds = prediction(model,0.95,train_loader2,device)

In [18]:
train_df2['Class'] = preds

In [19]:
train_df2_normal = train_df2[train_df['Class']==0]

In [24]:
train_df2_normal = train_df2_normal.drop(columns=['Class'])

KeyError: ignored

### 예측한 train_df2 로 모델 다시 구성

In [25]:
train_dataset2 = MyDataset(df=train_df2_normal, eval_mode=False)
train_loader2 = DataLoader(train_dataset2, batch_size=BS, shuffle=True, num_workers=2)

In [26]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                # 역전파 단계 전에, optimizer 객체를 사용하여 (모델의 학습 가능한 가중치인) 갱신할
                # 변수들에 대한 모든 변화도(gradient)를 0으로 만듭니다. 이렇게 하는 이유는 기본적으로 
                # .backward()를 호출할 때마다 변화도가 버퍼(buffer)에 (덮어쓰지 않고) 누적되기
                # 때문입니다. 더 자세한 내용은 torch.autograd.backward에 대한 문서를 참조하세요.
                self.optimizer.zero_grad()

                # AutoEncoder 통과한 예측값
                _x = self.model(x)
                loss = self.criterion(x, _x)

                # 역전파 단계: 모델의 매개변수들에 대한 손실의 변화도를 계산합니다.
                loss.backward()
                # optimizer의 step 함수를 호출하면 매개변수가 갱신됩니다.
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model2.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        #  model.eval()는 이런 layer들의 동작을 inference(eval) mode로 바꿔준다는 목적
        eval_model.eval()
        pred = []
        true = []
        # torch.no_grad()의 주된 목적은 autograd(자동으로 gradient를 트래킹)를 끔으로써 메모리 사용량을 줄이고 연산 속도를 높히기 위함
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                #유사도 0.95보다 작은것은 이상거래 1, 아닌 것은 정상거래 0
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [27]:
model2 = nn.DataParallel(AutoEncoder())
model2.eval()

# optim 패키지를 사용하여 모델의 가중치를 갱신할 optimizer를 정의합니다.
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
# 학습률 개선 scheduler, patience번 정체되면 학습률 factor와 곱한다. 
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer2 = Trainer(model, optimizer, train_loader2, val_loader, scheduler, device)
trainer2.fit()

Epoch : [0] Train loss : [0.12345490444983755] Val Score : [0.9165787375726882])
Epoch : [1] Train loss : [0.07479494969759669] Val Score : [0.9165787375726882])
Epoch : [2] Train loss : [0.05588579124638012] Val Score : [0.9165787375726882])
Epoch : [3] Train loss : [0.04396542268139975] Val Score : [0.9165787375726882])
Epoch : [4] Train loss : [0.04076072626880237] Val Score : [0.9165787375726882])
Epoch : [5] Train loss : [0.037180643528699875] Val Score : [0.9165787375726882])
Epoch : [6] Train loss : [0.03802211316568511] Val Score : [0.9165787375726882])
Epoch : [7] Train loss : [0.036426473941121786] Val Score : [0.9165787375726882])
Epoch : [8] Train loss : [0.04184137497629438] Val Score : [0.9165787375726882])
Epoch : [9] Train loss : [0.03848288793648992] Val Score : [0.9165787375726882])
Epoch : [10] Train loss : [0.037617764834846766] Val Score : [0.9165787375726882])
Epoch : [11] Train loss : [0.035705599401678355] Val Score : [0.9165787375726882])
Epoch 00012: reducing 

### 추론

In [28]:
# 학습된 내용 불러오기
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model2.pth'))
model = nn.DataParallel(model)
model.eval()

DataParallel(
  (module): AutoEncoder(
    (Encoder): Sequential(
      (0): Linear(in_features=30, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=64, out_features=128, bias=True)
      (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): PReLU(num_parameters=1)
    )
    (Decoder): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Linear(in_features=64, out_features=30, bias=True)
    )
  )
)

In [29]:
test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=2)

In [30]:
preds = prediction(model, 0.95, test_loader, device)

In [32]:
submit = pd.read_csv('./drive/MyDrive/data/sample_submission.csv')
submit['Class'] = preds
submit.to_csv('./drive/MyDrive/autoencoder_test_hwan2.csv', index=False)