In [1]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [2]:
# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
train_data = pd.read_csv('./data/train.csv').drop([379, 24598], axis=0)
test_data = pd.read_csv('./data/test_x.csv')
# drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
#              'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
#              'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
#              'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
#              'index', 'hand']
drop_list = ['index']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}

### 설문조사 소요시간을 사람별로 정규화

In [4]:
import re

In [5]:
survey_time = []
for col in list(train_data.columns):
    if re.match('Q[a-z]E', col):
        survey_time.append(col)
train_data[survey_time] = train_data[survey_time].apply(lambda x: x / (x.max()*10), axis=1)   

survey_time = []
for col in list(test_data.columns):
    if re.match('Q[a-z]E', col):
        survey_time.append(col)
test_data[survey_time] = test_data[survey_time].apply(lambda x: x / (x.max()*10), axis=1)    

In [6]:
train_data.head()

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_04,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13
0,0,3.0,0.023018,4.0,0.086874,5.0,0.063221,1.0,0.064933,2.0,...,0,1,0,1,1,0,1,0,1,1
1,1,5.0,0.014946,5.0,0.03033,3.0,0.07824,5.0,0.068584,1.0,...,1,1,0,1,1,0,1,0,1,1
2,2,4.0,0.048103,1.0,0.043865,1.0,0.030261,4.0,0.1,5.0,...,1,1,0,1,1,1,1,0,1,1
3,3,3.0,0.002112,3.0,0.009682,4.0,0.004156,3.0,0.013596,1.0,...,0,0,0,0,1,0,1,0,1,1
4,4,1.0,0.049074,1.0,0.037427,5.0,0.029434,2.0,0.05622,1.0,...,1,1,1,1,1,0,1,1,1,1


In [7]:
Answers = ['QaA', 'QbA', 'QcA', 'QdA', 'QeA',
             'QfA', 'QgA', 'QhA', 'QiA', 'QjA', 
             'QkA', 'QlA', 'QmA', 'QnA', 'QoA', 
             'QpA', 'QqA', 'QrA', 'QsA', 'QtA']

In [8]:
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA", "QaA", "QdA", "QgA", "QiA", "QnA"]
for flip in flipping_columns: 
    train_data[flip] = 6 - train_data[flip]
    test_data[flip] = 6 - test_data[flip]
train_data['Mach_score'] = train_data[Answers].mean(axis = 1) / 5
test_data['Mach_score'] = test_data[Answers].mean(axis = 1) / 5

In [9]:
train_data['Mach_score'].max()

1.0

In [10]:
train_data.head()

Unnamed: 0,index,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,...,wr_05,wr_06,wr_07,wr_08,wr_09,wr_10,wr_11,wr_12,wr_13,Mach_score
0,0,3.0,0.023018,4.0,0.086874,5.0,0.063221,5.0,0.064933,4.0,...,1,0,1,1,0,1,0,1,1,0.59
1,1,1.0,0.014946,5.0,0.03033,3.0,0.07824,1.0,0.068584,5.0,...,1,0,1,1,0,1,0,1,1,0.52
2,2,2.0,0.048103,1.0,0.043865,1.0,0.030261,2.0,0.1,1.0,...,1,0,1,1,1,1,0,1,1,0.38
3,3,3.0,0.002112,3.0,0.009682,4.0,0.004156,3.0,0.013596,5.0,...,0,0,0,1,0,1,0,1,1,0.67
4,4,5.0,0.049074,1.0,0.037427,5.0,0.029434,4.0,0.05622,5.0,...,1,1,1,1,0,1,1,1,1,0.6


### 설문 정규화하기

In [11]:
# (정규화하고자 하는 값 - 데이터 값들 중 최소값) / (데이터 값들 중 최대값 - 데이터 값들 중 최소값)
for col in list(train_data.columns):
    if re.match('Q[a-z]A', col):
        train_data[col] = (train_data[col] - 1.) / 4.
        test_data[col] = (test_data[col] - 1.) / 4.

In [12]:
train_y = train_data['voted']
train_y = 2 - train_y.to_numpy()

train_x = train_data.drop(drop_list + ['voted'], axis=1)
train_x = train_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)

test_x = test_data.drop(drop_list, axis=1)
test_x = test_x.astype(replace_dict)
test_x = pd.get_dummies(test_x)


In [13]:
train_x['familysize'] = train_x['familysize'].apply(lambda x : 1 if x > 20 else x/20)

In [14]:
for i, d in enumerate(train_x.columns):
    print(i, d)

0 QaA
1 QaE
2 QbA
3 QbE
4 QcA
5 QcE
6 QdA
7 QdE
8 QeA
9 QeE
10 QfA
11 QfE
12 QgA
13 QgE
14 QhA
15 QhE
16 QiA
17 QiE
18 QjA
19 QjE
20 QkA
21 QkE
22 QlA
23 QlE
24 QmA
25 QmE
26 QnA
27 QnE
28 QoA
29 QoE
30 QpA
31 QpE
32 QqA
33 QqE
34 QrA
35 QrE
36 QsA
37 QsE
38 QtA
39 QtE
40 familysize
41 hand
42 tp01
43 tp02
44 tp03
45 tp04
46 tp05
47 tp06
48 tp07
49 tp08
50 tp09
51 tp10
52 wf_01
53 wf_02
54 wf_03
55 wr_01
56 wr_02
57 wr_03
58 wr_04
59 wr_05
60 wr_06
61 wr_07
62 wr_08
63 wr_09
64 wr_10
65 wr_11
66 wr_12
67 wr_13
68 Mach_score
69 age_group_+70s
70 age_group_10s
71 age_group_20s
72 age_group_30s
73 age_group_40s
74 age_group_50s
75 age_group_60s
76 education_0
77 education_1
78 education_2
79 education_3
80 education_4
81 engnat_0
82 engnat_1
83 engnat_2
84 gender_Female
85 gender_Male
86 married_0
87 married_1
88 married_2
89 married_3
90 race_Arab
91 race_Asian
92 race_Black
93 race_Indigenous Australian
94 race_Native American
95 race_Other
96 race_White
97 religion_Agnostic
98 religion

In [15]:
train_x = train_x.to_numpy()
test_x = test_x.to_numpy()

In [16]:
# train_x[:, 21:31] = (train_x[:, 21:31] - 3.5) / 3.5
# test_x[:, 21:31] = (test_x[:, 21:31] - 3.5) / 3.5

In [17]:
train_x[:, 42:52] = (train_x[:, 42:52] - 3.5) / 3.5
train_x[:, 41] = (train_x[:, 41]) / 3

test_x[:, 41] = (test_x[:, 41]) / 3
test_x[:, 42:52] = (test_x[:, 42:52] - 3.5) / 3.5

In [18]:
train_x = torch.from_numpy(train_x).float()
train_y = torch.from_numpy(train_y).float()

real_train_x = train_x
real_train_y = train_y

In [19]:
ratios = [.7, .3]

train_cnt = int(train_x.size(0) * ratios[0])
valid_cnt = train_x.size(0) - train_cnt
cnts = [train_cnt, valid_cnt]
train_cnt, valid_cnt

(31870, 13660)

In [20]:
indices = torch.randperm(train_x.size(0))

train_x = torch.index_select(train_x, dim=0, index=indices)
train_y = torch.index_select(train_y, dim=0, index=indices)

train_x = train_x.split(cnts, dim=0)
train_y = train_y.split(cnts, dim=0)

test_x = torch.tensor(test_x, dtype=torch.float32, device=DEVICE)

for x_i, y_i in zip(train_x, train_y):
    print(x_i.size(), y_i.size())

torch.Size([31870, 113]) torch.Size([31870])
torch.Size([13660, 113]) torch.Size([13660])


In [21]:
real_train_x.shape

torch.Size([45530, 113])

In [22]:
from copy import deepcopy

lowest_loss = np.inf
best_model = None
lowest_epoch = np.inf

In [25]:
train_len, test_len = len(train_x), len(test_x)
early_stop = 30
print_interval = 5
N_MODEL = 15
N_EPOCH = 100
BATCH_SIZE = 128
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 0,
    'pin_memory': False
}
prediction_valid = np.zeros((13660, 1), dtype=np.float32)
prediction_test = np.zeros((11383, 1), dtype=np.float32)

for no in range(N_MODEL):

    real_train_loader = DataLoader(TensorDataset(real_train_x, real_train_y),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
    
    train_loader = DataLoader(TensorDataset(train_x[0], train_y[0]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
    valid_loader = DataLoader(TensorDataset(train_x[1], train_y[1]),
                                  shuffle=False, drop_last=True, **LOADER_PARAM)
    
    test_loader = DataLoader(TensorDataset(test_x, torch.zeros((test_len,), dtype=torch.float32, device=DEVICE)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
    model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(113, 120, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.05),
            nn.Linear(120, 90, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.05),
            nn.Linear(90, 60, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
        
            nn.Dropout(0.5),
            nn.Linear(60, 30, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            
            nn.Linear(30, 1)
    ).to(DEVICE)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=4e-2)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=N_EPOCH // 4, eta_min=1.2e-5)

    for epoch in range(N_EPOCH):
#     for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(no + 1, N_MODEL)):
        
        train_loss, valid_loss = 0, 0
        y_hat = []
    
        model.train()
        for idx, (xx, yy) in enumerate(real_train_loader):
            optimizer.zero_grad()
            xx, yy = xx.to(DEVICE), yy.to(DEVICE)
            pred = model(xx).squeeze()
            loss = criterion(pred, yy)
            loss.backward()
            optimizer.step()
            scheduler.step(epoch + idx / len(real_train_loader))
            
            train_loss += float(loss)
            
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            for idx, (xx, yy) in enumerate(valid_loader):
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                
                valid_loss += float(loss)
                y_hat += [pred]
                
        valid_loss = valid_loss / len(valid_loader)
#         valid_history += [valid_loss]
        
        if (epoch + 1) % print_interval == 0:
            print('Epoch %d: train loss=%.4e  valid_loss=%.4e  lowest_loss=%.4e' % (
                epoch + 1,
                train_loss,
                valid_loss,
                lowest_loss,
            ))
        
        if valid_loss <= lowest_loss:
            lowest_loss = valid_loss
            lowest_epoch = epoch

            best_model = deepcopy(model.state_dict())
        else:
            if early_stop > 0 and lowest_epoch + early_stop < epoch + 1:
                print("There is no improvement during last %d epochs." % early_stop)
                break
                
    print("The best validation loss from epoch %d: %.4e" % (lowest_epoch + 1, lowest_loss))
    model.load_state_dict(best_model)

    #### validation ####
#     model.eval()
#     with torch.no_grad():
#         for idx, (xx, _) in enumerate(valid_loader): ## valid_loader 교체
#             xx = xx.to(DEVICE)
#             pred = (torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
#             prediction_valid[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction_valid)), :] += pred[:, :] / N_MODEL
#             ## prediction_test 교체
            
    
    #### test ####
    model.eval()
    with torch.no_grad():
        for idx, (xx, _) in enumerate(test_loader): ## valid_loader 교체
            xx = xx.to(DEVICE)
            pred = (torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
            prediction_test[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction_test)), :] += pred[:, :] / N_MODEL
    

df = pd.read_csv('./data/sample_submission.csv')
df.iloc[:, 1:] = prediction_test
df.to_csv('./result/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)

Epoch 5: train loss=2.1696e+02  valid_loss=5.9867e-01  lowest_loss=4.8739e-01
Epoch 10: train loss=2.1273e+02  valid_loss=5.8520e-01  lowest_loss=4.8739e-01
Epoch 15: train loss=2.0831e+02  valid_loss=5.7071e-01  lowest_loss=4.8739e-01
Epoch 20: train loss=2.0494e+02  valid_loss=5.5755e-01  lowest_loss=4.8739e-01
Epoch 25: train loss=2.0396e+02  valid_loss=5.5549e-01  lowest_loss=4.8739e-01
Epoch 30: train loss=2.0669e+02  valid_loss=5.5910e-01  lowest_loss=4.8739e-01
Epoch 35: train loss=2.0210e+02  valid_loss=5.4537e-01  lowest_loss=4.8739e-01
Epoch 40: train loss=1.9679e+02  valid_loss=5.2010e-01  lowest_loss=4.8739e-01
Epoch 45: train loss=1.9219e+02  valid_loss=5.0445e-01  lowest_loss=4.8739e-01
Epoch 50: train loss=1.9108e+02  valid_loss=5.0025e-01  lowest_loss=4.8739e-01
Epoch 55: train loss=1.9831e+02  valid_loss=5.3350e-01  lowest_loss=4.8739e-01
Epoch 60: train loss=1.9476e+02  valid_loss=5.0717e-01  lowest_loss=4.8739e-01
Epoch 65: train loss=1.8819e+02  valid_loss=4.9193e-0

Epoch 10: train loss=2.1339e+02  valid_loss=5.8683e-01  lowest_loss=4.2532e-01
Epoch 15: train loss=2.0959e+02  valid_loss=5.7372e-01  lowest_loss=4.2532e-01
Epoch 20: train loss=2.0561e+02  valid_loss=5.6133e-01  lowest_loss=4.2532e-01
Epoch 25: train loss=2.0380e+02  valid_loss=5.5873e-01  lowest_loss=4.2532e-01
Epoch 30: train loss=2.0732e+02  valid_loss=5.6459e-01  lowest_loss=4.2532e-01
Epoch 35: train loss=2.0349e+02  valid_loss=5.4474e-01  lowest_loss=4.2532e-01
Epoch 40: train loss=1.9761e+02  valid_loss=5.2553e-01  lowest_loss=4.2532e-01
Epoch 45: train loss=1.9301e+02  valid_loss=5.0491e-01  lowest_loss=4.2532e-01
Epoch 50: train loss=1.9094e+02  valid_loss=5.0075e-01  lowest_loss=4.2532e-01
Epoch 55: train loss=1.9810e+02  valid_loss=5.2159e-01  lowest_loss=4.2532e-01
Epoch 60: train loss=1.9513e+02  valid_loss=5.0801e-01  lowest_loss=4.2532e-01
Epoch 65: train loss=1.8895e+02  valid_loss=4.8479e-01  lowest_loss=4.2532e-01
Epoch 70: train loss=1.8362e+02  valid_loss=4.6454e-

Epoch 15: train loss=2.0927e+02  valid_loss=5.7444e-01  lowest_loss=4.2114e-01
Epoch 20: train loss=2.0566e+02  valid_loss=5.6249e-01  lowest_loss=4.2114e-01
Epoch 25: train loss=2.0452e+02  valid_loss=5.5913e-01  lowest_loss=4.2114e-01
Epoch 30: train loss=2.0706e+02  valid_loss=5.6291e-01  lowest_loss=4.2114e-01
Epoch 35: train loss=2.0284e+02  valid_loss=5.4545e-01  lowest_loss=4.2114e-01
Epoch 40: train loss=1.9718e+02  valid_loss=5.2358e-01  lowest_loss=4.2114e-01
Epoch 45: train loss=1.9295e+02  valid_loss=5.0736e-01  lowest_loss=4.2114e-01
Epoch 50: train loss=1.9175e+02  valid_loss=5.0253e-01  lowest_loss=4.2114e-01
Epoch 55: train loss=1.9869e+02  valid_loss=5.3056e-01  lowest_loss=4.2114e-01
Epoch 60: train loss=1.9434e+02  valid_loss=5.1414e-01  lowest_loss=4.2114e-01
Epoch 65: train loss=1.8890e+02  valid_loss=4.8392e-01  lowest_loss=4.2114e-01
Epoch 70: train loss=1.8434e+02  valid_loss=4.6762e-01  lowest_loss=4.2114e-01
Epoch 75: train loss=1.8205e+02  valid_loss=4.5929e-

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(train_y[1], prediction_valid)

In [26]:
prediction_test = 2 - prediction_test

In [27]:
df = pd.read_csv('./data/sample_submission.csv')
df.iloc[:, 1:] = prediction_test
df.to_csv('./result/{}.csv'.format(datetime.now().strftime('%m%d-%H%M')), index=False)