In [1]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [2]:
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
train_data = pd.read_csv('./data/train.csv').drop([379, 24598], axis=0)
test_data = pd.read_csv('./data/test_x.csv')
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
             'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
             'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

In [4]:
train_y = train_data['voted']
train_y = 2 - train_y.to_numpy()

train_x = train_data.drop(drop_list + ['voted'], axis=1)
train_x = train_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)
train_x = train_x.to_numpy()

test_x = test_data.drop(drop_list, axis=1)
test_x = test_x.astype(replace_dict)
test_x = pd.get_dummies(test_x)
test_x = test_x.to_numpy()


In [5]:
train_x[:, :20] = (train_x[:, :20] - 3.) / 2.
test_x[:, :20] = (test_x[:, :20] - 3.) / 2
train_x[:, 20] = (train_x[:, 20] - 5.) / 5.
test_x[:, 20] = (test_x[:, 20] - 5.) / 5.
train_x[:, 21:31] = (train_x[:, 21:31] - 3.5) / 3.5
test_x[:, 21:31] = (test_x[:, 21:31] - 3.5) / 3.5

In [6]:
train_x

array([[ 0. ,  0.5,  1. , ...,  1. ,  0. ,  0. ],
       [ 1. ,  1. ,  0. , ...,  0. ,  0. ,  1. ],
       [ 0.5, -1. , -1. , ...,  0. ,  1. ,  0. ],
       ...,
       [ 0.5, -1. , -1. , ...,  0. ,  1. ,  0. ],
       [-1. ,  0. ,  0.5, ...,  0. ,  0. ,  0. ],
       [ 0. ,  1. ,  1. , ...,  0. ,  1. ,  0. ]])

In [9]:
train_y = torch.tensor(train_y, dtype=torch.float32, device=DEVICE)
train_x = torch.tensor(train_x, dtype=torch.float32, device=DEVICE)
test_x = torch.tensor(test_x, dtype=torch.float32, device=DEVICE)
train_len, test_len = len(train_x), len(test_x)

N_MODEL = 18
N_EPOCH = 105
BATCH_SIZE = 128
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 0,
    'pin_memory': False
}
prediction = np.zeros((11383, 1), dtype=np.float32)

with torch.cuda.device(0):
    for no in range(N_MODEL):

        train_loader = DataLoader(TensorDataset(train_x, train_y),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        test_loader = DataLoader(TensorDataset(test_x, torch.zeros((test_len,), dtype=torch.float32, device=DEVICE)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(91, 96, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            
            nn.Dropout(0.5),
            nn.Linear(96, 36, bias=False),
            nn.ReLU(inplace=True),
            
            nn.Linear(36, 1)
        ).to(DEVICE)
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

        model.train()
        for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
            for idx, (xx, yy) in enumerate(train_loader):
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                scheduler.step(epoch + idx / len(train_loader))

        model.eval()
        with torch.no_grad():
            for idx, (xx, _) in enumerate(test_loader):
                xx = xx.to(DEVICE)
                pred = (2. - torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
                prediction[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction)), :] += pred[:, :] / N_MODEL

df = pd.read_csv('./data/sample_submission.csv')
df.iloc[:, 1:] = prediction
df.to_csv('./result/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
01/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [01:58<00:00,  1.12s/it]
02/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [01:55<00:00,  1.10s/it]
03/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [01:59<00:00,  1.13s/it]
04/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [02:13<00:00,  1.27s/it]
05/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [03:20<00:00,  1.91s/it]
06/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [02:16<00:00,  1.30s/it]
07/18: 100%|█████████████████████████████████████████████████████████████████████████| 105/105 [02:20<00:00,  1.33s/it]
08/18: 100%|█████████████

## Early stopping tutorial

https://www.kaggle.com/akhileshrai/tutorial-early-stopping-vanilla-rnn-pytorch

### 정규화하기
(정규화하고자 하는 값 - 데이터 값들 중 최소값) / (데이터 값들 중 최대값 - 데이터 값들 중 최소값)
