In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): later cells read from the relative path './drive/My Drive/...',
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm


In [None]:

# Seed every RNG used by the notebook and ask cuDNN for deterministic kernels
# so repeated runs are as reproducible as possible.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer the first GPU, but fall back to the CPU so the notebook still runs on
# a runtime without CUDA instead of failing at the first `.to(DEVICE)`.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# NOTE(review): rows with index labels 379 and 24598 are removed from the
# training set without explanation — presumably outliers; confirm.
train_data = pd.read_csv('./drive/My Drive/Colab Notebooks/daycon1/train.csv').drop([379, 24598], axis=0)
test_data = pd.read_csv('./drive/My Drive/Colab Notebooks/daycon1/test_x.csv')

# Columns excluded from the feature set for both train and test.
drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
             'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
             'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']

# Columns that are cast to str later on (column name -> target dtype).
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

# Split the target column off the training frame; the test frame has no target.
train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)

In [None]:
# Cast the columns listed in replace_dict to str so that get_dummies treats
# them as categorical and one-hot encodes them.
train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)

# Convert all data to numeric form (one-hot encode the string columns).
train_x = pd.get_dummies(train_x)
# Encoding the two frames independently produces different column sets (or a
# different order) whenever a category value is missing from one split, which
# would silently misalign features. Align test to the training layout:
# columns absent from test are filled with 0, columns unseen in train are dropped.
test_x = pd.get_dummies(test_x).reindex(columns=train_x.columns, fill_value=0)

# Turn the target into `2 - voted`.
# NOTE(review): presumably `voted` is coded 1/2, making this a 1/0 label — confirm.
train_y = 2 - train_y.to_numpy()

# Force a float array: without an explicit dtype the result can be an object
# or integer array (e.g. bool dummy columns on recent pandas), which breaks the
# in-place float scaling and the torch.tensor conversion that follow.
train_x = train_x.to_numpy(dtype=np.float64)
test_x = test_x.to_numpy(dtype=np.float64)

In [None]:
# For the Machiavellianism questionnaire items (first 20 columns): rescale
# in place with (x - 3) / 2, then rescale column 20 with (x - 5) / 5.
# NOTE(review): the centre/scale constants presumably reflect the value ranges
# of those columns — confirm against the dataset description.
for features in (train_x, test_x):
    features[:, :20] = (features[:, :20] - 3.) / 2.
    features[:, 20] = (features[:, 20] - 5.) / 5.


In [None]:
#train_x[ :20].shape[1]

In [None]:
# Rescale columns 21..30 of both feature matrices in place with (x - 3.5) / 3.5.
scaled_cols = slice(21, 31)
for features in (train_x, test_x):
    features[:, scaled_cols] = (features[:, scaled_cols] - 3.5) / 3.5

In [None]:

# Convert the NumPy feature/target arrays into float32 tensors (copies), then
# record how many rows each split has.
train_y, train_x, test_x = (
    torch.tensor(array, dtype=torch.float32)
    for array in (train_y, train_x, test_x)
)
train_len = len(train_x)
test_len = len(test_x)

In [None]:
# Number of models to train for the ensemble.
N_MODEL = 10
# Epochs: how many complete forward/backward passes over the full training set.
N_EPOCH = 100
# Batch size: number of samples fed to the model per step. Because of memory
# limits the whole dataset cannot be fed at once within an epoch, so it is fed
# in batch-size chunks over as many iterations as needed.
BATCH_SIZE = 128
# Keyword arguments shared by the train and test DataLoaders.
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 4,
    'pin_memory': True
}
# Accumulator for the averaged ensemble prediction, one row per test sample.
# Sized from the test set instead of a hard-coded row count so it stays
# correct if the test file changes.
prediction = np.zeros((test_len, 1), dtype=np.float32)



In [None]:
# Train N_MODEL independently initialised networks and accumulate the average
# of their test-set predictions into `prediction`.

# Derive the input width from the data instead of hard-coding it, so a change
# in the one-hot encoded columns does not break the first Linear layer.
n_features = train_x.shape[1]

# Start from a clean accumulator so re-running this cell does not stack a
# second ensemble on top of the previous result.
prediction.fill(0.)

for no in range(N_MODEL):
    # Each model trains one epoch longer than the previous one
    # (N_EPOCH + 1, N_EPOCH + 2, ...). A local is used so the global N_EPOCH is
    # not mutated; mutating it made every re-run of this cell train longer.
    n_epoch = N_EPOCH + no + 1

    train_loader = DataLoader(TensorDataset(train_x, train_y),
                              shuffle=True, drop_last=True, **LOADER_PARAM)
    # The test loader needs a target tensor only to satisfy TensorDataset;
    # the zeros are never read.
    test_loader = DataLoader(TensorDataset(test_x, torch.zeros((test_len,), dtype=torch.float32)),
                             shuffle=False, drop_last=False, **LOADER_PARAM)
    model = nn.Sequential(
        nn.Dropout(0.05),
        nn.Linear(n_features, 96, bias=False),  # first argument is the number of input features
        nn.LeakyReLU(0.05, inplace=True),
        nn.Dropout(0.5),
        nn.Linear(96, 36, bias=False),
        nn.ReLU(inplace=True),
        nn.Linear(36, 1)
    ).to(DEVICE)
    # NOTE(review): pos_weight=1.20665 is presumably the negative/positive
    # class ratio of the training labels — confirm.
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=n_epoch // 4, eta_min=1.2e-5)

    model.train()
    for epoch in tqdm(range(n_epoch), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
        for idx, (xx, yy) in enumerate(train_loader):
            optimizer.zero_grad()
            xx, yy = xx.to(DEVICE), yy.to(DEVICE)
            # Drop only the trailing singleton dimension so the logits match
            # the shape of the targets.
            pred = model(xx).squeeze(1)
            loss = criterion(pred, yy)
            loss.backward()
            optimizer.step()
            # Fractional epoch so the cosine schedule advances within an epoch.
            scheduler.step(epoch + idx / len(train_loader))

    model.eval()
    with torch.no_grad():
        for idx, (xx, _) in enumerate(test_loader):
            xx = xx.to(DEVICE)
            # 2 - sigmoid(logit) mirrors the `2 - train_y` target encoding
            # applied earlier in the notebook.
            pred = (2. - torch.sigmoid(model(xx).to('cpu'))).numpy()
            # The loader is not shuffled, so batch `idx` covers consecutive
            # rows starting at BATCH_SIZE * idx. The accumulation is kept on a
            # single statement; splitting it before `+=` is a syntax error.
            start = BATCH_SIZE * idx
            prediction[start:start + len(pred), :] += pred / N_MODEL


01/10: 100%|██████████| 101/101 [02:59<00:00,  1.78s/it]
02/10: 100%|██████████| 102/102 [02:58<00:00,  1.75s/it]
03/10: 100%|██████████| 103/103 [02:57<00:00,  1.72s/it]
04/10: 100%|██████████| 104/104 [02:58<00:00,  1.72s/it]
05/10: 100%|██████████| 105/105 [02:53<00:00,  1.65s/it]
06/10: 100%|██████████| 106/106 [02:55<00:00,  1.66s/it]
07/10: 100%|██████████| 107/107 [02:56<00:00,  1.65s/it]
08/10: 100%|██████████| 108/108 [02:58<00:00,  1.65s/it]
09/10: 100%|██████████| 109/109 [03:01<00:00,  1.67s/it]
10/10: 100%|██████████| 110/110 [03:03<00:00,  1.67s/it]


In [None]:

# Load the sample submission, overwrite every column after the first with the
# averaged ensemble prediction, and save the result next to the input data.
df = pd.read_csv('./drive/My Drive/Colab Notebooks/daycon1/sample_submission.csv')
df.iloc[:, 1:] = prediction
# The file name previously contained no '{}' placeholder, so .format() silently
# discarded the timestamp and every run overwrote the same file. The
# placeholder makes each run write a distinct, time-stamped file.
df.to_csv('./drive/My Drive/Colab Notebooks/daycon1/nn_result_epoch100plus1N_model10_{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)