<a href="https://colab.research.google.com/github/seunghee0518/AI_python/blob/main/Outlier%20detection%20%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C%20%EC%82%AC%EA%B8%B0%EA%B1%B0%EB%9E%98%20%EA%B0%90%EC%A7%80/%5Bbaseline2%5D%E1%84%86%E1%85%A2%E1%84%8B%E1%85%AE_%E1%84%80%E1%85%A1%E1%86%AB%E1%84%83%E1%85%A1%E1%86%AB%E1%84%92%E1%85%A1%E1%86%AB_1D_AutoEncoder_%E1%84%92%E1%85%AA%E1%86%AF%E1%84%8B%E1%85%AD%E1%86%BC_(Public_Score___0_926).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data Load

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the data files
# referenced by the following cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the competition files on the mounted Drive.
directory = '/content/drive/MyDrive/DACON_신용카드 사기 거래 탐지 AI 경진대회'

# Train and validation frames: the ID column is dropped right after reading.
train_df = pd.read_csv(f'{directory}/data/train.csv').drop(columns=['ID'])
#display(train_df.head())

val_df = pd.read_csv(f'{directory}/data/val.csv').drop(columns=['ID'])
#display(val_df.head())

# Test frame: read with its ID column still present.
test_df = pd.read_csv(f'{directory}/data/test.csv')
#display(test_df.head())

In [None]:
# Number of 16384-row batches that the training frame is split into.
train_df.shape[0] / 16384

6.9483642578125

## 하이퍼파라미터

In [None]:
EPOCHS = 200   # number of training epochs
LR = 0.01      # learning rate handed to the optimizer
BS = 2 ** 14   # 16384 rows per batch, i.e. roughly 7 batches per epoch
SEED = 41      # value passed to seed_everything()

## 시드고정

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy and PyTorch (CPU plus every CUDA device), then configures
    cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the remaining GPUs, in case the model is spread over several devices.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN pick kernels by timing them, which may select
    # different algorithms from run to run; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED) # Fix the random seeds

## 데이터셋 생성

In [None]:
class MyDataset(Dataset):
    """Tabular dataset that serves one feature row per index.

    Args:
        df: DataFrame of features. When ``eval_mode`` is true it must also
            contain a ``Class`` column, which is split off as the labels.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            each item is the feature tensor alone.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the row at ``index`` as a tensor (plus its label in eval mode)."""
        # Work with locals only: storing the current row on ``self`` (as the
        # previous version did) mutated the dataset on every read for no benefit.
        features = torch.Tensor(self.df[index])
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [None]:
# Training data: rows without labels, reshuffled by the loader.
train_dataset = MyDataset(train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True)  # , num_workers=6)

# Validation data: keeps the Class labels and a fixed row order.
val_dataset = MyDataset(val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False)  # , num_workers=6)
## In our case, change this to num_workers = 1.

## 1D AutoEncoder

* 차원확장 유추(gan)
Generative Adversarial Network(GAN)은 훈련데이터의 확률분포를 학습하는 대표적인 생성 모델(Generative Model)로 여러 분야에 활용되고 있다. 최근 모바일 장치를 이용한 포지셔닝 데이터의 대량수집이 가능해지면서 GAN을 활용해 위치데이터(위도, 경도)를 생성하는 연구가 있었다. 하지만, 훈련 데이터가 위치데이터와 같이 복잡한 분포를 가지는 저차원 데이터인 경우 GAN의 학습이 불안정해진다는 문제점이 있다. 본 논문은 기본적인 Auto Encoder(AE)를 이용해 위치데이터의 차원을 확장시키는 방법을 제시한다. 실험을 통해, 해당 방법으로 차원이 늘어난 데이터를 GAN에 학습시킨다면 학습 안정에 효과가 있고, 의미있는 학습이 가능하다는 것을 확인하였다.

In [None]:
## Batch normalization: during training each batch is normalized with its own
## mean and variance, even when batches follow different distributions.
class AutoEncoder(nn.Module):
    """Fully connected autoencoder that widens 30 features to 128 and back.

    The encoder maps 30 -> 42 -> 64 -> 128 and the decoder mirrors it
    (128 -> 64 -> 42 -> 30). Every Linear layer is followed by BatchNorm1d and
    LeakyReLU, except the final decoder layer, which is a bare Linear.
    """

    def __init__(self):
        super().__init__()
        self.Encoder = nn.Sequential(*self._stack((30, 42, 64, 128), activate_last=True))
        self.Decoder = nn.Sequential(*self._stack((128, 64, 42, 30), activate_last=False))

    @staticmethod
    def _stack(widths, activate_last):
        """Build Linear(+BatchNorm1d+LeakyReLU) layers for consecutive widths."""
        layers = []
        final = len(widths) - 2
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if position < final or activate_last:
                layers.append(nn.BatchNorm1d(n_out))
                # Unlike plain ReLU, negative inputs are scaled by 0.01 instead of zeroed.
                layers.append(nn.LeakyReLU())
        return layers

    def forward(self, x):
        """Encode then decode ``x`` and return the reconstruction."""
        return self.Decoder(self.Encoder(x))

## Train (학습)

In [None]:
class Trainer():
    """Trains a reconstruction model and checkpoints the best validation score.

    Args:
        model: Network to train; may be wrapped in ``nn.DataParallel``.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Iterable yielding feature batches.
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional scheduler stepped with the validation score, or None.
        device: Device on which batches and the model are placed.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss function: mean absolute error between input and reconstruction.
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self, epochs=None, save_path='./best_model.pth'):
        """Run the training loop, saving the weights whenever the score improves.

        Args:
            epochs: Number of epochs; defaults to the notebook-level ``EPOCHS``.
            save_path: Where the best ``state_dict`` is written.
        """
        if epochs is None:
            epochs = EPOCHS
        self.model.to(self.device)
        best_score = 0
        for epoch in range(epochs):  # how many passes to train for
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)  # input data
                self.optimizer.zero_grad()

                _x = self.model(x)  # reconstructed output
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}]')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model (not a notebook global) and unwrap
                # parallel wrappers so the checkpoint loads into a bare network.
                target = self.model
                if isinstance(target, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                    target = target.module
                torch.save(target.state_dict(), save_path, _use_new_zipfile_serialization=False)

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A row is predicted as class 1 when the cosine similarity between the
        input and its reconstruction is below ``thr``, otherwise class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold.

        Returns:
            Macro-averaged F1 score of the predictions against the labels.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)  # library defaults are dim=1, eps=1e-8
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)

                # Use the model that was passed in rather than always self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()  # cosine similarity of input vs. output
                pred += np.where(np.array(diff) < thr, 1, 0).tolist()
                true += y.tolist()

        return f1_score(true, pred, average='macro')

## 모델 학습

In [None]:
model = nn.DataParallel(AutoEncoder())
# train() / eval(): eval mode changes the behavior of layers that act differently
# at evaluation time, e.g. Dropout and BatchNorm. No explicit call is needed here:
# Trainer.fit() puts the model in train mode every epoch and validation() puts it
# in eval mode.

# optimizer : updates the weights so the gap between prediction and target shrinks
# scheduler : adjusts the learning rate, i.e. the step size of the weight updates
# SparseAdam only supports sparse gradients, while this model's layers produce dense
# ones, so training aborted with a RuntimeError; use Adam instead.
# (SGD was also tried: no learning, the value stayed at 0.0010529271374420891.)
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

RuntimeError: ignored

## 추론

In [None]:
# Rebuild the bare network, load the checkpoint written during training, then wrap
# it in DataParallel again. map_location='cpu' lets a checkpoint saved from a GPU
# run load on a CPU-only runtime; prediction() moves the model to `device` later.
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth', map_location='cpu'))
model = nn.DataParallel(model)
model.eval()

DataParallel(
  (module): AutoEncoder(
    (Encoder): Sequential(
      (0): Linear(in_features=30, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
      (3): Linear(in_features=64, out_features=128, bias=True)
      (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): LeakyReLU(negative_slope=0.01)
    )
    (Decoder): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
      (3): Linear(in_features=64, out_features=30, bias=True)
    )
  )
)

In [None]:
# Drop the ID column so only the model features remain. errors='ignore' keeps this
# cell safe to re-run after the column has already been removed.
test_df = test_df.drop(columns=['ID'], errors='ignore')

In [None]:
# NOTE(review): this cell re-declares the MyDataset class that the dataset-creation
# cell already defines; keep the two identical or delete this copy.
class MyDataset(Dataset):
    """Tabular dataset that serves one feature row per index.

    Args:
        df: DataFrame of features. When ``eval_mode`` is true it must also
            contain a ``Class`` column, which is split off as the labels.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            each item is the feature tensor alone.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the row at ``index`` as a tensor (plus its label in eval mode)."""
        # Work with locals only: storing the current row on ``self`` (as the
        # previous version did) mutated the dataset on every read for no benefit.
        features = torch.Tensor(self.df[index])
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [None]:
test_dataset = MyDataset(test_df, False)
# Request at most as many worker processes as there are CPUs: a fixed 6 workers
# triggered the DataLoader warning about exceeding the available cores.
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False,
                         num_workers=min(6, os.cpu_count() or 1))

  cpuset_checked))


In [None]:
def prediction(model, thr, test_loader, device):
    """Label every row served by ``test_loader`` as 0 or 1.

    A row is labelled 1 when the cosine similarity between the row and the
    model's reconstruction of it is below ``thr``, otherwise 0.

    Args:
        model: Reconstruction model; it is moved to ``device`` and put in eval mode.
        thr: Cosine-similarity threshold (the notebook calls this with 0.95).
        test_loader: Iterable yielding feature batches.
        device: Device on which inference runs.

    Returns:
        A flat list of integer labels, in loader order.
    """
    model.to(device)
    model.eval()
    similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            reconstructed = model(batch)
            scores = np.array(similarity(batch, reconstructed).cpu().tolist())
            labels.extend(np.where(scores < thr, 1, 0).tolist())
    return labels

def prediction_raw(model, thr, test_loader, device):
    """Debug variant of ``prediction`` that returns the raw cosine similarities.

    Prints each input batch (shape and values) and the shape of its similarity
    tensor, then collects the similarities without thresholding them.

    Args:
        model: Reconstruction model; it is moved to ``device`` and put in eval mode.
        thr: Unused here; kept so the signature matches ``prediction``.
        test_loader: Iterable yielding feature batches.
        device: Device on which inference runs.

    Returns:
        A flat list of float cosine similarities, in loader order.
    """
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in test_loader:
            x = x.float().to(device)
            print(x.shape)
            print(x)

            _x = model(x)

            similarity = cos(x, _x)
            print('diff')
            # Read the shape from the tensor: the previous version called .shape on
            # the list returned by .tolist(), which raised AttributeError.
            print(similarity.shape)
            pred += similarity.cpu().tolist()
    return pred

In [None]:
# Label the test rows, using 0.95 as the cosine-similarity threshold.
preds = prediction(model=model, thr=0.95, test_loader=test_loader, device=device)

  cpuset_checked))


In [None]:
# print(
# len(preds),
# len(list(filter(lambda x : x == 1, preds))),
# len(list(filter(lambda x : x == 0, preds))),
# len(list(filter(lambda x : x == 1, preds)))/len(list(filter(lambda x : x == 0, preds))))

142503 335 142168 0.0023563671149625797


In [None]:
# Put the predictions into the sample submission's Class column and write it out.
submit = pd.read_csv(f'{directory}/data/sample_submission.csv').assign(Class=preds)
submit.to_csv(f'{directory}/submit/autoencoder_origin.csv', index=False)

In [None]:
# Jaccard similarity example
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections, treated as sets.

    The result is ``|A ∩ B| / |A ∪ B|``. Duplicates within either input are
    ignored because both inputs are converted to sets first.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as identical
        and give 1.0 instead of raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 1.0
    return float(len(s1 & s2) / len(union))

list1 = ["삼성전자", "테슬라", "LG전자", "카카오", "펄어비스"]
list2 = ["삼성전자", "카카오", "넷마블", "현대자동차", "셀트리온"]

print('jaccard_similarity : ', jaccard_similarity(list1, list2))
# Source: https://needjarvis.tistory.com/705 (Tistory blog). Kept as a comment:
# as a bare line inside the code cell this attribution was a SyntaxError.

## 오토인코더와 EllipticEnvelope 모델 결과물 비교

In [None]:
# EllipticEnvelope submission saved earlier, and a copy of the autoencoder submission.
ellp = pd.read_csv(
    '/content/drive/MyDrive/DACON_신용카드 사기 거래 탐지 AI 경진대회/submit/EllipticEnvelope_0.001055_by_sh.csv'
)
auto = submit.copy()

In [None]:
# Row count of the EllipticEnvelope submission.
ellp.shape[0]

142503

In [None]:
# Align both submissions on ID; the Class columns become Class_x (auto) and Class_y (ellp).
total = auto.merge(ellp, how='left', on='ID')

In [None]:
# Row count of the sample submission file.
pd.read_csv(f'{directory}/data/sample_submission.csv').shape[0]

142503

In [None]:
# Row count of the autoencoder submission.
submit.shape[0]

142503

In [None]:
# Index labels of the rows on which the two submissions disagree, then those rows.
idx = total.index[total.Class_x != total.Class_y].tolist()
total.loc[idx]

Unnamed: 0,ID,Class_x,Class_y
32581,A0xff90,0,1
69390,0x21fd2,0,1
76962,0x25aaf,1,0
93774,0x2dd50,1,0
106562,0x340a9,1,0
110069,0x35c35,0,1
112266,0x36d3a,0,1
116944,0x391b1,1,0
137462,0x43154,0,1


In [None]:
# Start the combined submission from an independent copy of the autoencoder one.
total_max = auto.copy(deep=True)

In [None]:
# Autoencoder predictions on the rows where the two submissions disagree.
total_max.loc[idx, :]

Unnamed: 0,ID,Class
32581,A0xff90,0
69390,0x21fd2,0
76962,0x25aaf,1
93774,0x2dd50,1
106562,0x340a9,1
110069,0x35c35,0
112266,0x36d3a,0
116944,0x391b1,1
137462,0x43154,0


In [None]:
# Rows on which the autoencoder and EllipticEnvelope submissions disagree.
disagree_idx = total.index[total.Class_x != total.Class_y]

# OR-combination: every disagreeing row becomes 1. A single .loc[rows, column]
# assignment is used because the chained form (df['Class'].loc[rows] = v) writes
# through an intermediate object and raised SettingWithCopyWarning.
total_max = auto.copy()
total_max.loc[disagree_idx, 'Class'] = 1
total_max.to_csv(f'{directory}/submit/autoencoder&EllipticEnvelope_OR.csv', index=False)

# AND-combination: every disagreeing row becomes 0.
total_min = auto.copy()
total_min.loc[disagree_idx, 'Class'] = 0
total_min.to_csv(f'{directory}/submit/autoencoder&EllipticEnvelope_AND.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Class values now stored in total_max for the rows where the submissions disagree.
total_max.loc[total[total.Class_x != total.Class_y].index, 'Class']

32581     1
69390     1
76962     1
93774     1
106562    1
110069    1
112266    1
116944    1
137462    1
Name: Class, dtype: int64

In [None]:
# Preview the combined submission held in total_max.
total_max

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0
...,...,...
142498,0x4587f,0
142499,0x45880,0
142500,0x45884,0
142501,0x45885,0


In [None]:
# Show the rows with Class == 1 in each frame, in this order: auto, ellp,
# total_max, total_min.
# NOTE(review): the earlier comments gave 317 for auto and 318 for ellp, presumably
# the number of Class == 1 rows — confirm against the current outputs.
display(*(frame[frame.Class == 1] for frame in (auto, ellp, total_max, total_min)))

Unnamed: 0,ID,Class
227,AA0x1cd,1
233,AA0x1d9,1
274,AA0x21e,1
2439,A0x1339,1
3000,A0x17dd,1
...,...,...
138529,0x43981,1
138992,0x43d0d,1
140032,0x44538,1
140182,0x44656,1


Unnamed: 0,ID,Class
227,AA0x1cd,1
233,AA0x1d9,1
274,AA0x21e,1
2439,A0x1339,1
3000,A0x17dd,1
...,...,...
138529,0x43981,1
138992,0x43d0d,1
140032,0x44538,1
140182,0x44656,1


Unnamed: 0,ID,Class
227,AA0x1cd,1
233,AA0x1d9,1
274,AA0x21e,1
2439,A0x1339,1
3000,A0x17dd,1
...,...,...
138529,0x43981,1
138992,0x43d0d,1
140032,0x44538,1
140182,0x44656,1


Unnamed: 0,ID,Class
227,AA0x1cd,1
233,AA0x1d9,1
274,AA0x21e,1
2439,A0x1339,1
3000,A0x17dd,1
...,...,...
138529,0x43981,1
138992,0x43d0d,1
140032,0x44538,1
140182,0x44656,1
