In [1]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [2]:
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
train_data = pd.read_csv('./data/train.csv').drop([379, 24598], axis=0)
test_data = pd.read_csv('./data/test_x.csv')
# drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
#              'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
#              'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
#              'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
#              'index', 'hand']
drop_list = ['index']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

### 설문조사 소요시간을 사람별로 정규화

In [4]:
import re

In [5]:
survey_time = []
for col in list(train_data.columns):
    if re.match('Q[a-z]E', col):
        survey_time.append(col)
train_data[survey_time] = train_data[survey_time].apply(lambda x: x / x.max(), axis=1)   

survey_time = []
for col in list(test_data.columns):
    if re.match('Q[a-z]E', col):
        survey_time.append(col)
test_data[survey_time] = test_data[survey_time].apply(lambda x: x / x.max(), axis=1)    

In [6]:
train_data.head()

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
0,0,3.0,0.230184,4.0,0.868738,5.0,0.632213,1.0,0.649334,2.0,...,0,1,0,1,1,0,1,0,1,1
1,1,5.0,0.149457,5.0,0.303303,3.0,0.782398,5.0,0.68584,1.0,...,1,1,0,1,1,0,1,0,1,1
2,2,4.0,0.481031,1.0,0.438648,1.0,0.302608,4.0,1.0,5.0,...,1,1,0,1,1,1,1,0,1,1
3,3,3.0,0.021116,3.0,0.096824,4.0,0.041562,3.0,0.135956,1.0,...,0,0,0,0,1,0,1,0,1,1
4,4,1.0,0.490736,1.0,0.374272,5.0,0.294336,2.0,0.562202,1.0,...,1,1,1,1,1,0,1,1,1,1


In [7]:
Answers = ['QaA', 'QbA', 'QcA', 'QdA', 'QeA',
             'QfA', 'QgA', 'QhA', 'QiA', 'QjA', 
             'QkA', 'QlA', 'QmA', 'QnA', 'QoA', 
             'QpA', 'QqA', 'QrA', 'QsA', 'QtA']

In [8]:
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA", "QaA", "QdA", "QgA", "QiA", "QnA"]
for flip in flipping_columns: 
    train_data[flip] = 6 - train_data[flip]
    test_data[flip] = 6 - test_data[flip]
train_data['Mach_score'] = train_data[Answers].mean(axis = 1)
test_data['Mach_score'] = test_data[Answers].mean(axis = 1)

In [9]:
train_data.head()

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,Mach_score
0,0,3.0,0.230184,4.0,0.868738,5.0,0.632213,5.0,0.649334,4.0,...,1,0,1,1,0,1,0,1,1,2.95
1,1,1.0,0.149457,5.0,0.303303,3.0,0.782398,1.0,0.68584,5.0,...,1,0,1,1,0,1,0,1,1,2.6
2,2,2.0,0.481031,1.0,0.438648,1.0,0.302608,2.0,1.0,1.0,...,1,0,1,1,1,1,0,1,1,1.9
3,3,3.0,0.021116,3.0,0.096824,4.0,0.041562,3.0,0.135956,5.0,...,0,0,0,1,0,1,0,1,1,3.35
4,4,5.0,0.490736,1.0,0.374272,5.0,0.294336,4.0,0.562202,5.0,...,1,1,1,1,0,1,1,1,1,3.0


### 설문 정규화하기

In [10]:
# (정규화하고자 하는 값 - 데이터 값들 중 최소값) / (데이터 값들 중 최대값 - 데이터 값들 중 최소값)
for col in list(train_data.columns):
    if re.match('Q[a-z]A', col):
        train_data[col] = (train_data[col] - 1.) / 4.
        test_data[col] = (test_data[col] - 1.) / 4.

In [11]:
train_y = train_data['voted']
train_y = 2 - train_y.to_numpy()

train_x = train_data.drop(drop_list + ['voted'], axis=1)
train_x = train_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)
train_x = train_x.to_numpy()

test_x = test_data.drop(drop_list, axis=1)
test_x = test_x.astype(replace_dict)
test_x = pd.get_dummies(test_x)
test_x = test_x.to_numpy()


In [12]:
# for i, d in enumerate(train_x.columns):
#     print(i, d)

In [13]:
train_x[:, 42:52] = (train_x[:, 42:52] - 3.5) / 3.5
test_x[:, 42:52] = (test_x[:, 42:52] - 3.5) / 3.5

In [14]:
train_x = torch.from_numpy(train_x).float()
train_y = torch.from_numpy(train_y).float()

real_train_x = train_x
real_train_y = train_y

In [15]:
ratios = [.7, .3]

train_cnt = int(train_x.size(0) * ratios[0])
valid_cnt = train_x.size(0) - train_cnt
cnts = [train_cnt, valid_cnt]
train_cnt, valid_cnt

(31870, 13660)

In [16]:
indices = torch.randperm(train_x.size(0))

train_x = torch.index_select(train_x, dim=0, index=indices)
train_y = torch.index_select(train_y, dim=0, index=indices)

train_x = train_x.split(cnts, dim=0)
train_y = train_y.split(cnts, dim=0)

test_x = torch.tensor(test_x, dtype=torch.float32, device=DEVICE)

for x_i, y_i in zip(train_x, train_y):
    print(x_i.size(), y_i.size())

torch.Size([31870, 113]) torch.Size([31870])
torch.Size([13660, 113]) torch.Size([13660])


In [17]:
real_train_x.shape

torch.Size([45530, 113])

In [18]:
train_len, test_len = len(train_x), len(test_x)

N_MODEL = 5
N_EPOCH = 50
BATCH_SIZE = 128
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 0,
    'pin_memory': False
}
prediction_valid = np.zeros((13660, 1), dtype=np.float32)
prediction_test = np.zeros((11383, 1), dtype=np.float32)

for no in range(N_MODEL):

    real_train_loader = DataLoader(TensorDataset(real_train_x, real_train_y),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
    
    train_loader = DataLoader(TensorDataset(train_x[0], train_y[0]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
    valid_loader = DataLoader(TensorDataset(train_x[1], train_y[1]),
                                  shuffle=False, drop_last=True, **LOADER_PARAM)
    
    test_loader = DataLoader(TensorDataset(test_x, torch.zeros((test_len,), dtype=torch.float32, device=DEVICE)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
    model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(113, 92, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.05),
            nn.Linear(92, 74, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.05),
            nn.Linear(74, 52, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.05),
            nn.Linear(52, 36, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            
            nn.Dropout(0.5),
            nn.Linear(36, 20, bias=False),
            nn.ReLU(inplace=True),
            
            nn.Linear(20, 1)
    ).to(DEVICE)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

    model.train()
    for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
        for idx, (xx, yy) in enumerate(train_loader): ## real_train_loader 교체 
            optimizer.zero_grad()
            xx, yy = xx.to(DEVICE), yy.to(DEVICE)
            pred = model(xx).squeeze()
            loss = criterion(pred, yy)
            loss.backward()
            optimizer.step()
            scheduler.step(epoch + idx / len(train_loader))

    model.eval()
    with torch.no_grad():
        for idx, (xx, _) in enumerate(valid_loader): ## valid_loader 교체
            xx = xx.to(DEVICE)
            pred = (torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
            prediction_valid[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction_valid)), :] += pred[:, :] / N_MODEL
            ## prediction_test 교체
    

# df = pd.read_csv('./data/sample_submission.csv')
# df.iloc[:, 1:] = prediction_test
# df.to_csv('./result/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)

01/05: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:39<00:00,  1.25it/s]
02/05: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.31it/s]
03/05: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.30it/s]
04/05: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.29it/s]
05/05: 100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:38<00:00,  1.29it/s]


In [21]:
from sklearn.metrics import roc_auc_score

In [22]:
roc_auc_score(train_y[1], prediction_valid)

0.7654264569879532