# **Model1 : RandomForest/ Lgbm/ GradientBoosting Ensemble**


## 1. Libraries

In [1]:
from pycaret.classification import *

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from string import ascii_lowercase
from itertools import combinations

import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import  GradientBoostingClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

## 2. Loading the data


In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test_x.csv')

## 4. Feature Engineering

In [4]:
x_train = train.copy()

In [5]:
dataset = [x_train, test]

### 마키아밸리 테스트 FE

In [6]:
questions = [i for i in list(ascii_lowercase)[:20]]
answers = [('Q'+i+'A') for i in questions]

In [7]:
# for data in dataset:
#   data['T'] = data['QcA'] - data['QfA'] + data['QoA'] - data['QrA'] + data['QsA']
#   data['V'] = data['QbA'] - data['QeA'] + data['QhA'] + data['QjA'] + data['QmA'] - data['QqA']
#   data['M'] = - data['QkA']

Tactic/ Morality/ View에 따라 feature 항목을 나눠보았습니다.

In [8]:
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA"]
for data in dataset:
  for flip in flipping_columns: 
    data[flip] = 6 - data[flip]

In [9]:
flipping_secret_columns = ["QaA", "QdA", "QgA", "QiA", "QnA"]
for data in dataset:
  for flip in flipping_secret_columns: 
    data[flip] = 6 - data[flip]

In [10]:
for data in dataset:
  data['Mach_score'] = data[answers].mean(axis = 1)

In [11]:
for data in dataset:
  data['delay'] = data[[('Q'+i+'E') for i in questions]].sum(axis=1)
  data['delay'] = data['delay'] ** (1/10)

In [12]:
Ancoms = list(combinations(answers, 2))
for data in dataset:
  for a,b in Ancoms:
    data['%s_dv_%s'%(a,b)] = data[a]/data[b]

In [13]:
for data in dataset:
  data.drop([('Q'+i+'A') for i in questions], axis = 1, inplace = True)
  data.drop([('Q'+i+'E') for i in questions], axis = 1, inplace = True)

### 나머지 Features


In [14]:
for data in dataset:
  data.drop('hand', axis=1, inplace = True)

In [15]:
wr_list = [('wr_0'+str(i)) for i in range(1,10)]
wr_list.extend([('wr_'+str(i)) for i in range(10,14)])
wr_no_need = [i for i in wr_list if i not in ['wr_01', 'wr_03', 'wr_06', 'wr_09', 'wr_11']]

EDA에서 결과에 큰 영향이 없다고 판단된 feature들을 제거해주었습니다.

In [16]:
for data in dataset:
  data.drop(wr_no_need, axis=1, inplace = True)

In [17]:
for data in dataset:
  data['Ex'] = (data['tp01']+data['tp06'])/2
  data['Ag'] = (data['tp07']+data['tp02'])/2
  data['Con'] = (data['tp03']+data['tp08'])/2
  data['Es'] =(data['tp09']+data['tp04'])/2
  data['Op'] =(data['tp05']+data['tp10'])/2

TIPI test에 따라 feature 항목을 나눠놓았는데, 이때는 tipi feature들이 flip된 형태로 저장되어있는지 몰라서 따로 전처리를 해주지 않았었습니다.

In [18]:
for data in dataset:
  data.drop([('tp0'+str(i)) for i in range(1,10)], axis=1, inplace = True)
  data.drop('tp10', axis = 1, inplace = True)

In [19]:
index = test['index']
for data in dataset:
  data.drop('index', axis = 1, inplace = True)

In [20]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
needenco = ['age_group', 'gender', 'race', 'religion']
for i in needenco:
  x_train[i] = encoder.fit_transform(x_train[i])
  test[i] = encoder.transform(test[i])

## 5. Model

In [21]:
x_train_temp = x_train.copy()

In [22]:
clf = setup(data = x_train_temp, target = 'voted')

Unnamed: 0,Description,Value
0,Session id,1028
1,Target,voted
2,Target type,Binary
3,Target mapping,"1: 0, 2: 1"
4,Original data shape,"(45532, 215)"
5,Transformed data shape,"(45532, 215)"
6,Transformed train set shape,"(31872, 215)"
7,Transformed test set shape,"(13660, 215)"
8,Numeric features,214
9,Preprocess,True


In [23]:
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6936,0.7625,0.6489,0.7563,0.6985,0.3908,0.3956,5.483
lightgbm,Light Gradient Boosting Machine,0.6918,0.7604,0.6405,0.7583,0.6944,0.388,0.3936,0.852
ada,Ada Boost Classifier,0.6856,0.7535,0.6509,0.7425,0.6937,0.3737,0.3771,1.253
lda,Linear Discriminant Analysis,0.6701,0.7405,0.7253,0.6882,0.7062,0.3306,0.3313,0.337
lr,Logistic Regression,0.6652,0.7366,0.7381,0.6781,0.7068,0.3183,0.3199,1.03
rf,Random Forest Classifier,0.6715,0.736,0.6598,0.717,0.6872,0.3427,0.344,0.563
et,Extra Trees Classifier,0.6671,0.7321,0.6974,0.695,0.6961,0.3281,0.3283,0.971
nb,Naive Bayes,0.4652,0.633,0.0712,0.0591,0.0646,0.012,0.0123,0.18
qda,Quadratic Discriminant Analysis,0.4872,0.6191,0.1445,0.6926,0.1931,0.0424,0.0837,0.34
dt,Decision Tree Classifier,0.6016,0.5978,0.6389,0.6349,0.6368,0.1957,0.1957,0.462


Processing:   0%|          | 0/63 [00:00<?, ?it/s]

In [24]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6921,0.7599,0.6354,0.762,0.6929,0.3892,0.3957
1,0.6841,0.7577,0.644,0.7439,0.6903,0.3713,0.3753
2,0.6981,0.7641,0.652,0.7617,0.7026,0.3999,0.4049
3,0.6991,0.7699,0.6531,0.7625,0.7036,0.4017,0.4067
4,0.6967,0.7616,0.6399,0.7669,0.6976,0.3984,0.405
Mean,0.694,0.7626,0.6449,0.7594,0.6974,0.3921,0.3975
Std,0.0055,0.0042,0.0068,0.008,0.0052,0.0113,0.0118


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
pred_holdout = predict_model(blended)
final_model = finalize_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6958,0.7634,0,0,0,0.3954,0.4006


In [26]:
predictions = predict_model(final_model, data = test)

In [27]:
predictions

Unnamed: 0,age_group,education,engnat,familysize,gender,married,race,religion,urban,wf_01,...,QrA_dv_QsA,QrA_dv_QtA,QsA_dv_QtA,Ex,Ag,Con,Es,Op,Label,Score
0,1.0,2.0,2.0,3.0,1.0,1.0,6.0,6.0,2.0,0.0,...,2.000000,1.000000,0.500000,2.0,1.5,2.0,4.0,2.5,2,0.6336
1,1.0,2.0,2.0,2.0,1.0,1.0,5.0,3.0,3.0,0.0,...,0.333333,0.250000,0.750000,7.0,7.0,7.0,7.0,7.0,2,0.7556
2,3.0,3.0,2.0,7.0,1.0,2.0,5.0,5.0,1.0,0.0,...,0.666667,0.400000,0.600000,3.5,2.0,2.5,2.0,3.5,1,0.5757
3,6.0,4.0,1.0,4.0,1.0,2.0,6.0,3.0,2.0,0.0,...,1.250000,5.000000,4.000000,3.0,2.5,3.0,3.5,3.0,1,0.6864
4,1.0,2.0,1.0,3.0,1.0,1.0,6.0,0.0,2.0,0.0,...,0.500000,0.666667,1.333333,3.0,3.0,2.0,3.0,3.0,2,0.6665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11378,2.0,3.0,2.0,3.0,0.0,1.0,5.0,3.0,0.0,0.0,...,2.000000,1.000000,0.500000,2.5,0.5,0.0,3.5,2.5,1,0.5462
11379,1.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,...,1.000000,1.250000,1.250000,4.0,0.0,1.0,3.0,2.0,2,0.7327
11380,3.0,4.0,1.0,4.0,1.0,2.0,6.0,1.0,1.0,0.0,...,1.000000,2.000000,2.000000,3.0,1.5,3.5,3.0,3.0,1,0.6729
11381,4.0,2.0,1.0,3.0,0.0,2.0,6.0,1.0,2.0,0.0,...,1.000000,2.000000,2.000000,3.0,3.5,3.5,3.5,2.5,1,0.6062


In [32]:
index = pd.read_csv('./data/test_x.csv')['index']

submission = pd.DataFrame({
    'index': index,
    'voted': predictions['Score']
    })

submission.to_csv('./data/model1_22.csv', index=False)

# **Model2: Lgbm Ensemble with different features**

## 1. Libraries

In [None]:
import pandas as pd
import numpy as np

from string import ascii_lowercase
from itertools import combinations

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
import eli5
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt

import warnings
import gc
warnings.filterwarnings("ignore")

## 2. Loading the data

In [None]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test_x.csv')

## 3. Feature Engineering

In [None]:
x_train = train.copy()
x_train.drop('voted', axis=1, inplace = True)
y_train = train['voted']

In [None]:
dataset = [x_train, test]





### 마키아밸리 테스트 FE

In [None]:
questions = [i for i in list(ascii_lowercase)[:20]]
answers = [('Q'+i+'A') for i in questions]

In [None]:
# for data in dataset:
#   data['T'] = data['QcA'] - data['QfA'] + data['QoA'] - data['QrA'] + data['QsA']
#   data['V'] = data['QbA'] - data['QeA'] + data['QhA'] + data['QjA'] + data['QmA'] - data['QqA']
#   data['M'] = - data['QkA']

In [None]:
flipping_columns = ["QeA", "QfA", "QkA", "QqA", "QrA"]
for data in dataset:
  for flip in flipping_columns: 
    data[flip] = 6 - data[flip]

In [None]:
flipping_secret_columns = ["QaA", "QdA", "QgA", "QiA", "QnA"]
for data in dataset:
  for flip in flipping_secret_columns: 
    data[flip] = 6 - data[flip]

In [None]:
for data in dataset:
  data['Mach_score'] = data[answers].mean(axis = 1)

In [None]:
for data in dataset:
  data['delay'] = data[[('Q'+i+'E') for i in questions]].sum(axis=1)
  data['delay'] = data['delay'] ** (1/10)
  data['delay_var'] = data['delay'].var()

In [None]:

Ancoms = list(combinations(answers, 2))
for data in dataset:
  for a,b in Ancoms:
    data['mach_%s_dv_%s'%(a,b)] = data[a]/data[b]

In [None]:
for data in dataset:
  data['mach_var'] = data[answers].var(axis=1)


### 나머지 Features


In [None]:
tps = ['tp01', 'tp02', 'tp03', 'tp04', 'tp05', 'tp06', 'tp07', 'tp08', 'tp09', 'tp10']
for data in dataset:
  for tp in tps:
    data[tp] = 7 - data[tp]

tipi feature들을 일반적인 형태로 복구시켜줬습니다.

In [None]:
for data in dataset:
  for tp in tps:
    data[tp] = data[tp].replace(0, np.nan)
    mean = data[tp].mean(axis=0)
    data[tp] = data[tp].replace(np.nan , mean)


tp중 무응답 값들을 평균값으로 대체했습니다.

In [None]:
for data in dataset:
  data['Ex'] = (data['tp01']+data['tp06'])/2
  data['Ag'] = (data['tp07']+data['tp02'])/2
  data['Con'] = (data['tp03']+data['tp08'])/2
  data['Es'] =(data['tp09']+data['tp04'])/2
  data['Op'] =(data['tp05']+data['tp10'])/2

In [None]:
index = test['index']
for data in dataset:
  data.drop('index', axis = 1, inplace = True)

In [None]:
import numpy as np
for data in dataset:
  teenager_ox = 1*np.array(data['age_group'] == '10s')
  data['teenager_ox'] = teenager_ox

10대인지 아닌지의 여부가 투표 여부에 큰 영향을 미칠 것 같아 하나의 column을 더 만들어주었습니다. 

In [None]:
tpcoms = list(combinations(tps, 2))
for data in dataset:
  for a,b in tpcoms:
    data['tp_%s_dv_%s'%(a,b)] = data[a]/data[b]

tp 값들끼리 나눈 feature들을 생성해주었습니다.

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
needenco = ['age_group', 'gender', 'race', 'religion']
for i in needenco:
  x_train[i] = encoder.fit_transform(x_train[i])
  test[i] = encoder.transform(test[i])

In [None]:
for data in dataset:
  data['Es_gender'] = data['Es']*data['gender']
  data['Con_gender'] = data['Con']*data['gender']
  data['Op_gender'] = data['Op']*data['gender']

EDA 결과, 성별에 따라 Emotional Stability/ Conscience/ Open Minded가 투표 여부에 미치는 영향이 크다고 판단되어 feature를 추가해주었습니다.

정보 출처: https://www.sciencedirect.com/science/article/abs/pii/S0261379413001613

## 4. Feature Selection 1 & Model 2-1

In [None]:
def lgbm_rfe_4040(x_data, y_data, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    while True:
        model = LGBMClassifier(objective = 'binary', num_iterations=10**4)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=4040)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
        val_pred = model.predict_proba(x_val)
        val_pred = val_pred[:,1]
        score = roc_auc_score(y_val, val_pred)
        n_feats = len(feats)
        print(n_feats, score)
        archive = archive.append({'model': model, 'n_feats': n_feats, 'feats': feats, 'score': score}, ignore_index=True)
        feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)        
        next_n_feats = int(n_feats * ratio)
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    return archive


In [None]:
lgbm_archive_4040 = lgbm_rfe_4040(x_train, y_train)

In [None]:
model = LGBMClassifier(objective="binary", num_iterations= 10**3)

x_train_1 = x_train[lgbm_archive_4040.iloc[7,2]]

model.fit(x_train_1, y_train)

pred_y1 = model.predict_proba(test[lgbm_archive_4040.iloc[7,2]])
pred_y1 = pred_y1[:,1]

## 5. Feature Selection 2 & Model 2-2

In [None]:
def lgbm_rfe_1234(x_data, y_data, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    while True:
        model = LGBMClassifier(objective = 'binary', num_iterations=10**4)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=1234)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
        val_pred = model.predict_proba(x_val)
        val_pred = val_pred[:,1]
        score = roc_auc_score(y_val, val_pred)
        n_feats = len(feats)
        print(n_feats, score)
        archive = archive.append({'model': model, 'n_feats': n_feats, 'feats': feats, 'score': score}, ignore_index=True)
        feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)
        next_n_feats = int(n_feats * ratio)
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    return archive


In [None]:
lgbm_archive_1234 = lgbm_rfe_1234(x_train, y_train)

In [None]:
model2 = LGBMClassifier(objective="binary", num_iterations= 10**3)

x_train_2 = x_train[lgbm_archive_1234.iloc[14,2]]

model2.fit(x_train_2, y_train)

pred_y2 = model2.predict_proba(test[lgbm_archive_1234.iloc[14,2]])
pred_y2 = pred_y2[:,1]

## 6. Feature Selection 3 & Model 2-3

In [None]:
def lgbm_rfe_99087(x_data, y_data, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    while True:
        model = LGBMClassifier(objective = 'binary', num_iterations=10**4)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=99087)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
        val_pred = model.predict_proba(x_val)
        val_pred = val_pred[:,1]
        score = roc_auc_score(y_val, val_pred)
        n_feats = len(feats)
        print(n_feats, score)
        archive = archive.append({'model': model, 'n_feats': n_feats, 'feats': feats, 'score': score}, ignore_index=True)
        feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)
        next_n_feats = int(n_feats * ratio)
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    return archive


In [None]:
lgbm_archive_99087 = lgbm_rfe_99087(x_train, y_train)

In [None]:
model3 = LGBMClassifier(objective="binary", num_iterations= 10**3)

x_train_3 = x_train[lgbm_archive_99087.iloc[7,2]]

model3.fit(x_train_3, y_train)

pred_y3 = model3.predict_proba(test[lgbm_archive_99087.iloc[7,2]])
pred_y3 = pred_y3[:,1]

## 7. Feature Selection 4 & Model 2-4 

In [None]:
def lgbm_rfe_42(x_data, y_data, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    while True:
        model = LGBMClassifier(objective = 'binary', num_iterations=10**4)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=42)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
        val_pred = model.predict_proba(x_val)
        val_pred = val_pred[:,1]
        score = roc_auc_score(y_val, val_pred)
        n_feats = len(feats)
        print(n_feats, score)
        archive = archive.append({'model': model, 'n_feats': n_feats, 'feats': feats, 'score': score}, ignore_index=True)
        feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)
        next_n_feats = int(n_feats * ratio)
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    return archive


In [None]:
lgbm_archive_42 = lgbm_rfe_42(x_train, y_train)

In [None]:
model4 = LGBMClassifier(objective="binary", num_iterations= 10**3)

x_train_4 = x_train[lgbm_archive_42.iloc[8,2]]

model4.fit(x_train_4, y_train)

pred_y4 = model4.predict_proba(test[lgbm_archive_42.iloc[8,2]])
pred_y4 = pred_y4[:,1]

## 8. Ensemble

In [None]:
pred_all = (pred_y + pred_y2 + pred_y3 + pred_y4) * (1/4)

submission = pd.DataFrame({
    "index" : index,
    "voted" : pred_all
})
submission.to_csv('./data/model2.csv', index=False)


# **Model3: NN**

3번째 모델은 Junho Sun 님께서 공유해주신 코드를 그대로 활용하였습니다.

좋은 모델을 공유해주신 덕분에 public score도 0.78대로 올라갈 수 있었습니다. 
정말 감사합니다!

In [None]:
import random
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

drop_list = ['QaE', 'QbE', 'QcE', 'QdE', 'QeE',
             'QfE', 'QgE', 'QhE', 'QiE', 'QjE',
             'QkE', 'QlE', 'QmE', 'QnE', 'QoE',
             'QpE', 'QqE', 'QrE', 'QsE', 'QtE',
             'index', 'hand']
replace_dict = {'education': str, 'engnat': str, 'married': str, 'urban': str}
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test_x.csv')
train_data = train_data.drop(train_data[train_data.familysize > 50].index)
train_y = train_data['voted']
train_x = train_data.drop(drop_list + ['voted'], axis=1)
test_x = test_data.drop(drop_list, axis=1)
train_x = train_x.astype(replace_dict)
test_x = test_x.astype(replace_dict)
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)
train_y = 2 - train_y.to_numpy()
train_x = train_x.to_numpy()
test_x = test_x.to_numpy()

train_y_t = torch.tensor(train_y, dtype=torch.float32)
train_x_t = torch.tensor(train_x, dtype=torch.float32)
test_x_t = torch.tensor(test_x, dtype=torch.float32)
train_x_t[:, :20] = (train_x_t[:, :20] - 3.) / 2.
test_x_t[:, :20] = (test_x_t[:, :20] - 3.) / 2
train_x_t[:, 20] = (train_x_t[:, 20] - 5.) / 4.
test_x_t[:, 20] = (test_x_t[:, 20] - 5.) / 4.
train_x_t[:, 21:31] = (train_x_t[:, 21:31] - 3.5) / 3.5
test_x_t[:, 21:31] = (test_x_t[:, 21:31] - 3.5) / 3.5
test_len = len(test_x_t)

N_REPEAT = 5
N_SKFOLD = 7
N_EPOCH = 48
BATCH_SIZE = 72000
LOADER_PARAM = {
    'batch_size': BATCH_SIZE,
    'num_workers': 4,
    'pin_memory': True
}
prediction = np.zeros((test_len, 1), dtype=np.float32)

for repeat in range(N_REPEAT):

    skf, tot = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat, shuffle=True), 0.
    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x, train_y)):
        train_idx, valid_idx = list(train_idx), list(valid_idx)
        train_loader = DataLoader(TensorDataset(train_x_t[train_idx, :], train_y_t[train_idx]),
                                  shuffle=True, drop_last=True, **LOADER_PARAM)
        valid_loader = DataLoader(TensorDataset(train_x_t[valid_idx, :], train_y_t[valid_idx]),
                                  shuffle=False, drop_last=False, **LOADER_PARAM)
        test_loader = DataLoader(TensorDataset(test_x_t, torch.zeros((test_len,), dtype=torch.float32)),
                                 shuffle=False, drop_last=False, **LOADER_PARAM)
        model = nn.Sequential(
            nn.Dropout(0.05),
            nn.Linear(91, 180, bias=False),
            nn.LeakyReLU(0.05, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(180, 32, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(32, 1)
        ).to(DEVICE)
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.20665], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=5e-3, weight_decay=7.8e-2)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=N_EPOCH // 6, eta_min=4e-4)
        prediction_t, loss_t = np.zeros((test_len, 1), dtype=np.float32), 1.

        # for epoch in range(N_EPOCH):
        for epoch in tqdm(range(N_EPOCH), desc='{:02d}/{:02d}'.format(skfold + 1, N_SKFOLD)):
            model.train()
            for idx, (xx, yy) in enumerate(train_loader):
                optimizer.zero_grad()
                xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                pred = model(xx).squeeze()
                loss = criterion(pred, yy)
                loss.backward()
                optimizer.step()
                scheduler.step(epoch + idx / len(train_loader))

            with torch.no_grad():
                model.eval()
                running_acc, running_loss, running_count = 0, 0., 0
                for xx, yy in valid_loader:
                    xx, yy = xx.to(DEVICE), yy.to(DEVICE)
                    pred = model(xx).squeeze()
                    loss = criterion(pred, yy)
                    running_loss += loss.item() * len(yy)
                    running_count += len(yy)
                    running_acc += ((torch.sigmoid(pred) > 0.5).float() == yy).sum().item()
                # print('R{:02d} S{:02d} E{:02d} | {:6.4f}, {:5.2f}%'
                #       .format(repeat + 1, skfold + 1, epoch + 1, running_loss / running_count,
                #               running_acc / running_count * 100))

                if running_loss / running_count < loss_t:
                    loss_t = running_loss / running_count
                    for idx, (xx, _) in enumerate(test_loader):
                        xx = xx.to(DEVICE)
                        pred = (2. - torch.sigmoid(model(xx).detach().to('cpu'))).numpy()
                        prediction_t[BATCH_SIZE * idx:min(BATCH_SIZE * (idx + 1), len(prediction)), :] \
                            = pred[:, :].copy()
        prediction[:, :] += prediction_t[:, :].copy() / (N_REPEAT * N_SKFOLD)
        tot += loss_t
    print('R{} -> {:6.4f}'.format(repeat + 1, tot / N_SKFOLD))

df = pd.read_csv('./data/sample_submission.csv')
df.iloc[:, 1:] = prediction

In [None]:
torch.save(model, './data/model.pt')

In [None]:
df.to_csv('./data/model3.csv', index=False)

# Final Ensemble

In [33]:
model1 = pd.read_csv('./data/model1_22.csv', index_col = 'index')
model2 = pd.read_csv('./data/model2.csv', index_col='index')

pred_y = (model1)*(0.7) + (model2)*(0.3)

test = pd.read_csv('./data/test_x.csv')
index = test['index']

submission = pd.DataFrame({
    'index': index,
    'voted': pred_y['voted']
    })

submission.to_csv('./data/combined_model1_model2.csv', index=False)

In [34]:

combined_12 = pd.read_csv('./data/combined_model1_model2.csv', index_col = 'index')
model3 = pd.read_csv('./data/model3.csv', index_col='index')
model3['voted'] = model3['voted']-1

다른 모델과 같이 [0,1]의 범위(voted가 2일 확률)를 맞춰주기 위해 1을 빼주었습니다.

In [35]:
pred_y = (model3)*(0.8) + (combined_12)*(0.2)

test = pd.read_csv('./data/test_x.csv')
index = test['index']

submission = pd.DataFrame({
    'index': index,
    'voted': pred_y['voted']
    })

submission.to_csv('./data/submission_final_automl.csv', index=False)

In [37]:
pd.read_csv('./data/submission_final.csv')

Unnamed: 0,index,voted
0,0,0.531823
1,1,0.549404
2,2,0.497554
3,3,0.459782
4,4,0.536788
...,...,...
11378,11378,0.493869
11379,11379,0.556635
11380,11380,0.460075
11381,11381,0.476615


In [36]:
submission

Unnamed: 0,index,voted
0,0,0.534243
1,1,0.559904
2,2,0.508408
3,3,0.505558
4,4,0.543140
...,...,...
11378,11378,0.502594
11379,11379,0.560327
11380,11380,0.506495
11381,11381,0.504787
