In [1]:
import pandas as pd
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Hyperparameters shared by the cells below.
num_epochs = 400  # upper bound on training epochs used by Trainer.fit
lr = 0.01         # initial learning rate handed to the Adam optimizer
bs = 2_500        # batch size used by every DataLoader
seed = 42         # value passed to seed_everything

In [4]:
def seed_everything(seed):
    """Fix the seed of every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE(review): setting PYTHONHASHSEED at runtime presumably only affects
    # interpreters started after this point, not the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (the model may be
    # wrapped in nn.DataParallel).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed) # Seed 고정

# Load Data

In [5]:
# Mount Google Drive at /content/drive; the CSV paths read below live under this mount point.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Read the training and validation splits from the mounted Drive folder.
train_df = pd.read_csv('/content/drive/MyDrive/dacon/open/train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/dacon/open/val.csv')

# Training features: everything except the row identifier.
train_x = train_df.drop(['ID'], axis=1)

# Validation labels and features (identifier and label removed).
val_y = val_df['Class']
val_x = val_df.drop(['ID', 'Class'], axis=1)

# Scaling

In [20]:
# Fit both scalers on the training features only; the fitted scalers are
# reused later to transform the validation features.
mm_scaler = MinMaxScaler().fit(train_x)
std_scaler = StandardScaler().fit(train_x)

train_mm = mm_scaler.transform(train_x)
train_std = std_scaler.transform(train_x)

In [57]:
def _replace_features(source_df, feature_columns, scaled_values):
    """Return a copy of ``source_df`` whose feature columns hold ``scaled_values``.

    The scaled array is given the original index and feature names explicitly,
    so the result does not depend on ``source_df`` having a default RangeIndex
    or on 'ID' being the first column. Column order of ``source_df`` is kept.

    Args:
        source_df: Frame containing the feature columns plus any untouched
            columns (here 'ID' and, for validation data, 'Class').
        feature_columns: Names of the columns that were scaled, in the same
            order as the columns of ``scaled_values``.
        scaled_values: 2-D array returned by a scaler's ``transform``.

    Returns:
        A new DataFrame with the same columns and index as ``source_df``.
    """
    scaled_features = pd.DataFrame(scaled_values, columns=feature_columns, index=source_df.index)
    untouched = source_df.drop(columns=feature_columns)
    return pd.concat([untouched, scaled_features], axis=1)[source_df.columns]


# Training frames: 'ID' plus min-max / standard scaled features.
train_mm_df = _replace_features(train_df, train_x.columns, train_mm)
train_std_df = _replace_features(train_df, train_x.columns, train_std)

# Validation frames: transformed with the scalers fitted on the training data.
val_mm_df = _replace_features(val_df, val_x.columns, mm_scaler.transform(val_x))
val_std_df = _replace_features(val_df, val_x.columns, std_scaler.transform(val_x))

# Create Dataset

In [60]:
class MyDataset(Dataset):
    """Dataset over the rows of a DataFrame, with or without labels.

    Args:
        df: DataFrame with an 'ID' column, the feature columns and, when
            ``eval_mode`` is true, a 'Class' label column.
        eval_mode: True when ``df`` carries labels; items are then
            ``(features, label)`` pairs. False yields features only.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['ID', 'Class']).values
        else:
            self.df = df.drop(columns=['ID']).values

    def __getitem__(self, index):
        """Return the float32 feature tensor at ``index`` (plus its label in eval mode)."""
        # Build the item from locals only: the dataset object is shared by the
        # DataLoader, so per-item state must not be stored on ``self``.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the underlying feature array."""
        return len(self.df)

In [61]:
# # raw

# train_dataset = MyDataset(df = train_df, eval_mode = False)
# train_loader = DataLoader(train_dataset, batch_size = bs, shuffle = True, num_workers = 6)
# val_dataset = MyDataset(df = val_df, eval_mode = True, )
# val_loader = DataLoader(val_dataset, batch_size = bs, shuffle = False, num_workers = 6)

In [62]:
# #min max scaler

# train_dataset = MyDataset(df = train_mm_df, eval_mode = False)
# train_loader = DataLoader(train_dataset, batch_size = bs, shuffle = True, num_workers = 6)
# val_dataset = MyDataset(df = val_mm_df, eval_mode = True, )
# val_loader = DataLoader(val_dataset, batch_size = bs, shuffle = False, num_workers = 6)

In [69]:
# Datasets built from the StandardScaler-normalised frames.
train_dataset = MyDataset(train_std_df, eval_mode=False)
val_dataset = MyDataset(val_std_df, eval_mode=True)

# Only the training loader shuffles; validation keeps the original row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=bs, num_workers=6, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=bs, num_workers=6, shuffle=False)

# 1D AutoEncoder

In [70]:
# # Overcomplete - 1

# class AutoEncoder(nn.Module):
#     def __init__(self):
#         super(AutoEncoder, self).__init__() # 부모 클래스(nn.Module)의 __init__()을 호출
#         self.Encoder = nn.Sequential(
#             nn.Linear(30,64), 
#             nn.BatchNorm1d(64),
#             nn.LeakyReLU(),
#             nn.Linear(64,80),
#             nn.BatchNorm1d(80),
#             nn.LeakyReLU(),
#         )
#         self.Decoder = nn.Sequential(
#             nn.Linear(80,64),
#             nn.BatchNorm1d(64),
#             nn.LeakyReLU(),
#             nn.Linear(64,30),
#         )
        
#     def forward(self, x):
#         x = self.Encoder(x)
#         x = self.Decoder(x)
#         return x

In [71]:
# Overcomplete - 2

class AutoEncoder(nn.Module):
    """Fully connected autoencoder that widens 30 -> 64 -> 80 -> 100 and mirrors back to 30.

    Every hidden Linear layer is followed by BatchNorm1d and LeakyReLU; the
    final decoder layer is a plain Linear with no normalisation or activation.
    """

    def __init__(self):
        super().__init__()

        # Encoder: three Linear/BatchNorm/LeakyReLU stages, flattened into one
        # Sequential so the layer indices stay 0..8.
        encoder_widths = [30, 64, 80, 100]
        encoder_layers = []
        for n_in, n_out in zip(encoder_widths[:-1], encoder_widths[1:]):
            encoder_layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.LeakyReLU()]
        self.Encoder = nn.Sequential(*encoder_layers)

        # Decoder: two Linear/BatchNorm/LeakyReLU stages and a bare output layer.
        decoder_layers = []
        for n_in, n_out in ((100, 80), (80, 64)):
            decoder_layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.LeakyReLU()]
        decoder_layers.append(nn.Linear(64, 30))
        self.Decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.Decoder(self.Encoder(x))

In [72]:
# # Under-complete

# class AutoEncoder(nn.Module):
#     def __init__(self):
#         super(AutoEncoder, self).__init__() # 부모 클래스(nn.Module)의 __init__()을 호출
#         self.Encoder = nn.Sequential(
#             nn.Linear(30,20), 
#             nn.BatchNorm1d(20),
#             nn.LeakyReLU(),
#             nn.Linear(20,10),
#             nn.BatchNorm1d(10),
#             nn.LeakyReLU(),
#         )
#         self.Decoder = nn.Sequential(
#             nn.Linear(10,20),
#             nn.BatchNorm1d(20),
#             nn.LeakyReLU(),
#             nn.Linear(20,30),
#             nn.LeakyReLU()
#         )
        
#     def forward(self, x):
#         x = self.Encoder(x)
#         x = self.Decoder(x)
#         return x

# Train

In [73]:
class Trainer():
    """Trains an autoencoder with L1 reconstruction loss and scores it on labelled validation data.

    Args:
        model: Network to train (optionally wrapped in ``nn.DataParallel``).
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Iterable of feature batches (no labels).
        val_loader: Iterable of ``(features, labels)`` batches.
        scheduler: Scheduler stepped with the validation score, or None.
        device: Device the model and batches are moved to.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)

    @staticmethod
    def _unwrap(model):
        """Return the underlying module of a (Distributed)DataParallel wrapper, else ``model`` itself."""
        if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            return model.module
        return model

    def fit(self, epochs=None, thr=0.95, save_path='./best_model.pth'):
        """Train the model and checkpoint the weights with the best validation score.

        Args:
            epochs: Number of epochs; defaults to the notebook-level
                ``num_epochs`` when None.
            thr: Cosine-similarity threshold forwarded to ``validation``.
            save_path: File the best ``state_dict`` is written to.
        """
        n_epochs = num_epochs if epochs is None else epochs
        self.model.to(self.device)
        best_score = 0
        for epoch in range(n_epochs):
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # Reconstruction first, target second (L1 is symmetric, so the
                # value is the same as with the arguments swapped).
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, thr)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model (not a notebook global) and strip
                # any DataParallel wrapper so the checkpoint keys carry no
                # 'module.' prefix and load into a plain model.
                torch.save(self._unwrap(self.model).state_dict(), save_path,
                           _use_new_zipfile_serialization=False)

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A sample is predicted as class 1 when the cosine similarity between
        the input and its reconstruction is below ``thr``, otherwise class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold.

        Returns:
            Macro-averaged F1 score of the predictions against the labels.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)

                # Evaluate the model that was passed in, not self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff) < thr, 1, 0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [74]:
# Wrap the autoencoder in DataParallel (multi-GPU); use the commented line
# below instead for a plain single-device model.
model = nn.DataParallel(AutoEncoder())
# model = AutoEncoder()
model.eval()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Halve the learning rate when the validation score has not improved for
# more than `patience` epochs ('max' mode: higher score is better).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=5,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    device=device,
)
trainer.fit()

Epoch : [0] Train loss : [0.35034520211427106] Val Score : [0.3001897201998244])
Epoch : [1] Train loss : [0.19269854802152384] Val Score : [0.45158446126645474])
Epoch : [2] Train loss : [0.16323774469935376] Val Score : [0.4635140037488198])
Epoch : [3] Train loss : [0.15250836539527643] Val Score : [0.4733703567624148])
Epoch : [4] Train loss : [0.14345311341078384] Val Score : [0.47715083240333417])
Epoch : [5] Train loss : [0.1357568406864353] Val Score : [0.4785726476862576])
Epoch : [6] Train loss : [0.130210907238981] Val Score : [0.47972152636628146])
Epoch : [7] Train loss : [0.12438459791567015] Val Score : [0.479972755497491])
Epoch : [8] Train loss : [0.12071265507003535] Val Score : [0.4828148944109286])
Epoch : [9] Train loss : [0.11323496926089992] Val Score : [0.48848160527612944])
Epoch : [10] Train loss : [0.10938162761537926] Val Score : [0.4908885353858123])
Epoch : [11] Train loss : [0.1050485361205495] Val Score : [0.49069967782787927])
Epoch : [12] Train loss : 

Exception ignored in: <function _releaseLock at 0x7faa2cfec8c0>
Traceback (most recent call last):
  File "/usr/lib/python3.7/logging/__init__.py", line 221, in _releaseLock
    def _releaseLock():
KeyboardInterrupt


KeyboardInterrupt: ignored

# 1st nn : max 85
# 2nd nn : 

## Inference : Test set

In [75]:
# Load the unlabelled test split and preview its first five rows.
test_df = pd.read_csv('/content/drive/MyDrive/dacon/open/test.csv')
test_df.head(5)

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,AAAA0x1,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,-0.994983
1,AAAA0x2,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.269825,-0.994983
2,AAAA0x5,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.670579,-0.99496
3,AAAA0x7,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,-0.237686,-0.994937
4,AAAA0xc,0.384978,0.616109,-0.8743,-0.094019,2.924584,3.317027,0.470455,0.538247,-0.558895,...,0.049924,0.238422,0.00913,0.99671,-0.767315,-0.492208,0.042472,-0.054337,-0.167819,-0.994866


In [76]:
# The model is trained and validated on StandardScaler-transformed features,
# so the test features must go through the same fitted scaler before inference.
test_x = test_df.drop(columns=['ID'])
test_std = pd.DataFrame(std_scaler.transform(test_x), columns=test_x.columns, index=test_df.index)
test_std_df = pd.concat([test_df[['ID']], test_std], axis=1)[test_df.columns]

test_dataset = MyDataset(test_std_df, False)
test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False, num_workers=6)

In [77]:
def prediction(model, thr, test_loader, device):
    """Label every sample yielded by ``test_loader`` as 0 or 1.

    A sample gets label 1 when the cosine similarity between the input and
    the model's reconstruction is below ``thr``, otherwise 0.

    Args:
        model: Network used for reconstruction; moved to ``device`` and put in eval mode.
        thr: Cosine-similarity threshold.
        test_loader: Iterable of feature batches (no labels).
        device: Device the model and batches are moved to.

    Returns:
        List of int labels, one per sample, in loader order.
    """
    similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    model.to(device)
    model.eval()

    flags = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            reconstruction = model(batch)
            scores = np.array(similarity(batch, reconstruction).cpu().tolist())
            flags.extend(np.where(scores < thr, 1, 0).tolist())
    return flags

In [78]:
# Training checkpoints the best-scoring weights to ./best_model.pth; predict
# with those instead of whatever weights the in-memory model ended up with
# (e.g. after an interrupted or degraded final epoch).
best_model_path = './best_model.pth'
if os.path.exists(best_model_path):
    inference_model = AutoEncoder()
    inference_model.load_state_dict(torch.load(best_model_path, map_location=device))
else:
    # No checkpoint on disk: fall back to the model currently in memory.
    inference_model = model

preds = prediction(inference_model, 0.95, test_loader, device)

In [79]:
# Fill the sample submission's Class column with the predictions and write it out.
submit = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/dacon/open/sample_submission.csv')
submit['Class'] = preds
submit.to_csv(path_or_buf='./submit_autoencoder_2.csv', index=False)

In [35]:
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


## Submission