In [1]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [2]:
# Reproducibility: seed every RNG this notebook draws from. The cuDNN flags
# below only make individual kernels deterministic; without a seed the
# train/valid permutation, the weight initialisation and the DataLoader
# shuffling would still differ on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the raw train / test tables.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test_x.csv')

# Columns removed from both tables before encoding: the twenty 'Q?E' columns
# (QaE ... QtE) plus 'index' and 'hand'.
# Variants tried earlier: the same list without 'hand', and ['index'] alone.
drop_list = ['Q{}E'.format(letter) for letter in 'abcdefghijklmnopqrst'] + ['index', 'hand']

# Discard training rows whose familysize is 20 or more.
oversized_rows = train_data.index[train_data.familysize >= 20]
train_data = train_data.drop(index=oversized_rows)

# Columns cast to str before one-hot encoding, so that pd.get_dummies
# expands them instead of leaving them numeric.
replace_dict = dict.fromkeys(['education', 'engnat', 'married', 'urban'], str)

In [4]:
import re

#### 설문 소요시간 정규화

In [5]:
# Disabled experiment: per-row scaling of the survey elapsed-time columns.
# It collects every column whose name matches 'Q[a-z]E' and divides each row's
# values by 10x that row's maximum (apply(..., axis=1)), first for the train
# table and then for the test table.
# survey_time = []
# for col in list(train_data.columns):
#     if re.match('Q[a-z]E', col):
#         survey_time.append(col)
# train_data[survey_time] = train_data[survey_time].apply(lambda x: x / (x.max()*10), axis=1)   

# survey_time = []
# for col in list(test_data.columns):
#     if re.match('Q[a-z]E', col):
#         survey_time.append(col)
# test_data[survey_time] = test_data[survey_time].apply(lambda x: x / (x.max()*10), axis=1)   

In [6]:
# Target: 2 - voted, so a `voted` value of 1 becomes label 1 and a value of 2
# becomes label 0 (presumably `voted` only takes the values 1 and 2 — confirm
# against the data description).
train_y = train_data['voted']
train_y = 2 - train_y.to_numpy()

# Features: drop unused columns, cast the listed columns to str, one-hot encode.
train_x = train_data.drop(drop_list + ['voted'], axis=1)
train_x = train_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)

test_x = test_data.drop(drop_list, axis=1)
test_x = test_x.astype(replace_dict)
test_x = pd.get_dummies(test_x)

# get_dummies only creates columns for categories present in the frame it is
# given, so train and test can end up with different column sets or orders.
# Later cells address features by position, so force the test frame onto the
# exact train layout: missing dummies become 0, test-only dummies are dropped.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)


In [7]:
# Disabled helper: print each feature column with its positional index
# (useful for checking the column slices used in the normalisation cells).
# for i, d in enumerate(train_x.columns):
#     print(i, d)

In [8]:
# Convert the encoded frames to float arrays with an explicit dtype.
# Without it the result dtype depends on the column mix: an all-integer frame
# would silently truncate the in-place float normalisation that follows, and
# boolean dummy columns would produce an object array that torch.from_numpy
# cannot consume.
train_x = train_x.to_numpy(dtype=np.float64)
test_x = test_x.to_numpy(dtype=np.float64)

In [9]:
## When the survey elapsed-time columns are included... (disabled variant)
# NOTE(review): the first pair below slices train_x with `:40` but indexes
# test_x with the single column `40`, so train and test would be scaled
# differently — confirm the intended slice before re-enabling.

# train_x[:, :40] = (train_x[:, :40] - 1.) / 4.
# test_x[:, 40] = (test_x[:, 40] - 1.) / 4.

# train_x[:, 41] = (train_x[:, 41]) / 3
# test_x[:, 41] = (test_x[:, 41]) / 3

# train_x[:, 42:52] = (train_x[:, 42:52] - 3.5) / 3.5
# test_x[:, 42:52] = (test_x[:, 42:52] - 3.5) / 3.5

In [10]:
## When the survey elapsed-time columns are not included... (disabled variant)
# NOTE(review): the first pair below slices train_x with `:20` but indexes
# test_x with the single column `20`, so train and test would be scaled
# differently — confirm the intended slice before re-enabling.

# train_x[:, :20] = (train_x[:, :20] - 1.) / 4.
# test_x[:, 20] = (test_x[:, 20] - 1.) / 4.

# train_x[:, 21] = (train_x[:, 21]) / 3
# test_x[:, 21] = (test_x[:, 21]) / 3

# train_x[:, 22:32] = (train_x[:, 22:32] - 3.5) / 3.5
# test_x[:, 22:32] = (test_x[:, 22:32] - 3.5) / 3.5

In [11]:
## Survey elapsed-time columns not included: alternative normalisation.
# The same three in-place rescalings are applied to the train matrix and
# then to the test matrix. The column groups are addressed by position.
# (A "/ 3" scaling of column 21 was tried earlier and is disabled.)
for matrix in (train_x, test_x):
    # Columns 0-19: centre on 3 and divide by 2.
    matrix[:, :20] = (matrix[:, :20] - 3.) / 2.
    # Column 20: centre on 5 and divide by 5.
    matrix[:, 20] = (matrix[:, 20] - 5.) / 5.
    # Columns 21-30: centre on 3.5 and divide by 3.5.
    matrix[:, 21:31] = (matrix[:, 21:31] - 3.5) / 3.5

In [12]:
# Wrap the NumPy feature matrix and label vector as float32 tensors.
train_x, train_y = (
    torch.from_numpy(array).to(torch.float32) for array in (train_x, train_y)
)

# Keep references to the full training set; `train_x` / `train_y` are
# re-bound to a train/valid split further down.
real_train_x, real_train_y = train_x, train_y

In [13]:
# Train / validation proportions and the row counts they translate to.
ratios = [.7, .3]

n_total = train_x.shape[0]
train_cnt = int(n_total * ratios[0])
valid_cnt = n_total - train_cnt  # remainder, so the two counts always sum to n_total
cnts = [train_cnt, valid_cnt]
(train_cnt, valid_cnt)

(31863, 13657)

In [14]:
# Shuffle the rows with one random permutation shared by features and labels.
indices = torch.randperm(train_x.size(0))
train_x, train_y = train_x[indices], train_y[indices]

# Cut the shuffled tensors into (train, valid) pieces of sizes `cnts`.
# From here on train_x / train_y are tuples: [0] is train, [1] is valid.
train_x = torch.split(train_x, cnts, dim=0)
train_y = torch.split(train_y, cnts, dim=0)

# The test features go straight to the compute device as float32.
test_x = torch.tensor(test_x, dtype=torch.float32, device=DEVICE)

for x_i, y_i in zip(train_x, train_y):
    print(x_i.shape, y_i.shape)

torch.Size([31863, 91]) torch.Size([31863])
torch.Size([13657, 91]) torch.Size([13657])


In [15]:
# Sanity check: size of the full (unsplit) training tensor.
real_train_x.size()

torch.Size([45520, 91])

In [16]:
from copy import deepcopy

# Best-checkpoint bookkeeping: the loss and epoch start at +inf so that any
# real value compares lower, and no model state is stored yet.
lowest_loss, best_model, lowest_epoch = np.inf, None, np.inf

In [17]:
# Row counts. `train_x` is a (train, valid) tuple at this point, so
# len(train_x) would be 2; take the count from the full training tensor.
train_len, test_len = real_train_x.size(0), len(test_x)

# Training hyper-parameters.
early_stop = 30      # epochs without improvement tolerated before stopping
print_interval = 5   # epochs between progress printouts
N_MODEL = 15         # number of independently trained models that are averaged
N_EPOCH = 100
BATCH_SIZE = 128
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 0,
    'pin_memory': False
}

# Accumulators for the averaged predictions, sized from the data rather than
# hard-coded so they always match the validation split and the test set.
prediction_valid = np.zeros((valid_cnt, 1), dtype=np.float32)
prediction_test = np.zeros((test_len, 1), dtype=np.float32)

In [18]:
from sklearn.metrics import roc_auc_score

## train - valid

In [19]:
# NOTE(review): disabled train/validation experiment, kept for reference.
# Before re-enabling, check the training loop further down in this cell: it
# iterates `real_train_loader` rather than the `train_loader` built here, no
# `real_train_loader` is defined inside this cell, and a debugging
# `print(xx.size())` is still present in the batch loop.
# with torch.cuda.device(0):
#     for no in range(N_MODEL):

#         train_loader = DataLoader(TensorDataset(train_x[0], train_y[0]),
#                                   shuffle=True, drop_last=True, **LOADER_PARAM)
#         valid_loader = DataLoader(TensorDataset(train_x[1], train_y[1]),
#                                   shuffle=False, drop_last=True, **LOADER_PARAM)
        
#         model = nn.Sequential(
#             nn.Dropout(0.05),
#             nn.Linear(91, 96, bias=False),
#             nn.LeakyReLU(0.05, inplace=True),
            
#             nn.Dropout(0.5),
#             nn.Linear(96, 36, bias=False),
#             nn.ReLU(inplace=True),
            
#             nn.Linear(36, 1)
#         ).to(DEVICE)
#         criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
#         optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
#         scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
#             optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

#         for epoch in range(N_EPOCH):
#             train_loss, valid_loss = 0, 0
#             y_hat = []
        
#             model.train()
#             for idx, (xx, yy) in enumerate(real_train_loader):
#                 optimizer.zero_grad()
#                 print(xx.size())
#                 xx, yy = xx.to(DEVICE), yy.to(DEVICE)
#                 pred = model(xx).squeeze()
#                 loss = criterion(pred, yy)
#                 loss.backward()
#                 optimizer.step()
#                 scheduler.step(epoch + idx / len(real_train_loader))
#                 train_loss += float(loss)
                
#             model.eval()
#             with torch.no_grad():
#                 valid_loss = 0
#                 for idx, (xx, yy) in enumerate(valid_loader):
#                     xx, yy = xx.to(DEVICE), yy.to(DEVICE)
#                     pred = model(xx).squeeze()
#                     loss = criterion(pred, yy)

#                     valid_loss += float(loss)
#                     y_hat += [pred]

#             valid_loss = valid_loss / len(valid_loader)

#             if (epoch + 1) % print_interval == 0:
#                 print('Epoch %d: train loss=%.4e  valid_loss=%.4e  lowest_loss=%.4e' % (
#                     epoch + 1,
#                     train_loss,
#                     valid_loss,
#                     lowest_loss,
#                 ))

#             if valid_loss <= lowest_loss:
#                 lowest_loss = valid_loss
#                 lowest_epoch = epoch

#                 best_model = deepcopy(model.state_dict())
#             else:
#                 if early_stop > 0 and lowest_epoch + early_stop < epoch + 1:
#                     print("There is no improvement during last %d epochs." % early_stop)
#                     break

#         print("The best validation loss from epoch %d: %.4e" % (lowest_epoch + 1, lowest_loss))
#         model.load_state_dict(best_model)

#         model.eval()
#         with torch.no_grad():
#             for idx, (xx, _) in enumerate(valid_loader):
#                 xx = xx.to(DEVICE)
#                 pred = (torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
#                 prediction_valid[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction_valid)), :] += pred[:, :] / N_MODEL


# roc_auc_score(train_y[1], prediction_valid)

### valid auc score
1. 0.7664986978004593
2. 0.7675935061471474  (설문조사답변 정규화 다르게..)
3. 0.7710123794060724  ~ 0.7738559380625569 (hand변수 포함 + 정규화)
4. 0.7657323921703704  (familysize 정규화) --> 제외!!
5. 0.7611448774735629 ~ 0.7618059057348474 (설문시간 포함+ 정규화)
6. 0.77171753378888 ~ 0.7727322346127534 (설문시간 포함+ 정규화 + layer늘림)
7. 0.7677365629016805 ~ 0.7741115155537603 (설문시간 포함+ 정규화 + layer더 늘림)
8. 0.7635808020457102 ~ 0.776610620093519 (설문시간 포함+ 정규화 + layer더 늘림 + batchnorm)
9. 0.7643981540072351 ~ 0.7708937758928313 (설문시간 포함+ 정규화 + layer더 늘림 + batchnorm + 설문마키아벨리) --> 제외!!

## train - test

In [20]:
# Train N_MODEL independently initialised networks on the full training set,
# average their test-set predictions and write them to a timestamped CSV.
#
# Every tensor and module is placed explicitly on DEVICE, so no
# `torch.cuda.device(0)` context is used: entering that context fails on a
# machine without CUDA, which would defeat the CPU fallback chosen for DEVICE.

# Input width comes from the data instead of a hard-coded constant.
n_features = real_train_x.size(1)

# The loaders do not depend on the model, so they are built once.
real_train_loader = DataLoader(TensorDataset(real_train_x, real_train_y),
                               shuffle=True, drop_last=True, **LOADER_PARAM)
test_loader = DataLoader(TensorDataset(test_x),
                         shuffle=False, drop_last=False, **LOADER_PARAM)
n_batches = len(real_train_loader)

# Start from zeros on every execution so that re-running this cell does not
# add onto predictions accumulated by a previous run.
prediction_test = np.zeros((test_x.size(0), 1), dtype=np.float32)

for no in range(N_MODEL):
    model = nn.Sequential(
        nn.Dropout(0.05),
        nn.Linear(n_features, 96, bias=False),
        nn.LeakyReLU(0.05, inplace=True),

        nn.Dropout(0.5),
        nn.Linear(96, 36, bias=False),
        nn.ReLU(inplace=True),

        nn.Linear(36, 1)
    ).to(DEVICE)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

    model.train()
    for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
        for idx, (xx, yy) in enumerate(real_train_loader):
            optimizer.zero_grad()
            xx, yy = xx.to(DEVICE), yy.to(DEVICE)
            # squeeze(1) drops only the output dimension, never the batch one.
            pred = model(xx).squeeze(1)
            loss = criterion(pred, yy)
            loss.backward()
            optimizer.step()
            # Fractional epoch, so the cosine schedule advances per batch.
            scheduler.step(epoch + idx / n_batches)

    model.eval()
    with torch.no_grad():
        logits = torch.cat([model(xx.to(DEVICE)) for (xx,) in test_loader], dim=0)

    # The label was built as 2 - voted, so 2 - sigmoid(logit) puts the
    # prediction back on the original `voted` scale. Each model contributes
    # 1 / N_MODEL of the final average.
    prediction_test += (2. - torch.sigmoid(logits)).cpu().numpy() / N_MODEL

df = pd.read_csv('./data/sample_submission.csv')
df.iloc[:, 1:] = prediction_test
# Create the output directory first so a long training run is not lost to a
# missing folder at the very last step.
os.makedirs('./result', exist_ok=True)
df.to_csv('./result/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)

01/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:26<00:00,  1.16it/s]
02/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:26<00:00,  1.16it/s]
03/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:25<00:00,  1.17it/s]
04/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:27<00:00,  1.14it/s]
05/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:26<00:00,  1.16it/s]
06/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:28<00:00,  1.13it/s]
07/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [01:25<00:00,  1.17it/s]
08/15: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [02:06<00:00,  1.26s/it]
09/15: 100%|████████████████████████████