In [1]:
import json
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [3]:
# Directory and file names of the preprocessed training arrays.
DATA_IN_PATH = './data/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

# Pass the path straight to np.load so numpy opens and closes the file itself.
# Wrapping the path in a bare open(...) left three file handles that were never
# closed explicitly.
train_q1_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q1_DATA_FILE))
train_q2_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_Q2_DATA_FILE))
train_labels = np.load(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA_FILE))

In [4]:
# Preview the question-1 array loaded from train_q1.npy (numpy abbreviates the repr).
# NOTE(review): the trailing zeros look like padding -- confirm against the preprocessing step.
train_q1_data

array([[  108,   244,  1138, ...,     0,     0,     0],
       [   11, 23037,  1578, ...,     0,     0,     0],
       [    4,     3, 25447, ...,     0,     0,     0],
       ...,
       [    4,    21,     7, ...,     0,     0,     0],
       [    2,    21,  7591, ...,     0,     0,     0],
       [    9,    15,   317, ...,     0,     0,     0]])

In [5]:
# Preview the question-2 array loaded from train_q2.npy (numpy abbreviates the repr).
train_q2_data

array([[ 108,  244, 1138, ...,    0,    0,    0],
       [  22, 2599,    3, ...,    0,    0,    0],
       [   4,    9,    6, ...,    0,    0,    0],
       ...,
       [   4,   11,  134, ...,    0,    0,    0],
       [   2,   21, 7591, ...,    0,    0,    0],
       [   3,   19,  243, ...,    0,    0,    0]])

In [6]:
# Stacking along axis=1 places row i of each question array side by side,
# so the result has one (q1, q2) pair per sample.
train_input = np.stack((train_q1_data, train_q2_data), axis=1)
# Combine the two questions into a single pair.
# np.stack((question_A, question_B), axis=1) gives [[question_A], [question_B]] per sample.

In [7]:
# Preview the stacked array: each entry holds a question-1 row followed by its question-2 row.
train_input

array([[[  108,   244,  1138, ...,     0,     0,     0],
        [  108,   244,  1138, ...,     0,     0,     0]],

       [[   11, 23037,  1578, ...,     0,     0,     0],
        [   22,  2599,     3, ...,     0,     0,     0]],

       [[    4,     3, 25447, ...,     0,     0,     0],
        [    4,     9,     6, ...,     0,     0,     0]],

       ...,

       [[    4,    21,     7, ...,     0,     0,     0],
        [    4,    11,   134, ...,     0,     0,     0]],

       [[    2,    21,  7591, ...,     0,     0,     0],
        [    2,    21,  7591, ...,     0,     0,     0]],

       [[    9,    15,   317, ...,     0,     0,     0],
        [    3,    19,   243, ...,     0,     0,     0]]])

In [8]:
# Confirm the layout of the stacked array: (samples, 2 questions, tokens per question).
print(train_input.shape)
# 298526 samples, each holding two questions of length 31.

(298526, 2, 31)


In [9]:
# Hold out 20% of the question pairs for validation; the fixed random_state
# keeps the split reproducible. train_test_split is already imported in the
# notebook's first cell, so it is not imported again here.
#
# The pairs are rebuilt from the loaded q1/q2 arrays instead of feeding
# `train_input` back into the call that rebinds it. With the self-referential
# form, a second run of this cell would pass the already-split 80% subset
# together with the full-length `train_labels`, so the cell was not re-runnable.
paired_questions = np.stack((train_q1_data, train_q2_data), axis=1)

train_input, eval_input, train_label, eval_label = train_test_split(
    paired_questions, train_labels, test_size=0.2, random_state=4242
)

In [10]:
# xgboost (as `xgb`) is imported with the other dependencies at the top of the
# notebook. If the package is missing, install it into the kernel's environment
# before running, e.g. `%pip install xgboost`.

# Summing over axis=1 merges the two questions of each pair into a single
# feature row, turning (samples, 2, length) into (samples, length).
train_data = xgb.DMatrix(train_input.sum(axis=1), label=train_label)
eval_data = xgb.DMatrix(eval_input.sum(axis=1), label=eval_label)

# (DMatrix, name) pairs handed to xgb.train as `evals`; the names appear as
# prefixes in the training log ("train-rmse", "valid-rmse").
data_list = [(train_data, 'train'), (eval_data, 'valid')]

In [11]:
# Preview the summed features that are passed to the training DMatrix.
train_input.sum(axis=1)

array([[   8,   22,    2, ...,    0,    0,    0],
       [  14, 1043, 7055, ...,    0,    0,    0],
       [   4,   12,   48, ...,    0,    0,    0],
       ...,
       [  77,   16, 3758, ...,    0,    0,    0],
       [   4,   22,  174, ...,    0,    0,    0],
       [   8,   18,   10, ...,    0,    0,    0]])

In [12]:
# Summing over axis=1 removes the pair axis, leaving (training samples, tokens per question).
train_input.sum(axis=1).shape

(238820, 31)

In [13]:
# Display the validation DMatrix; its repr shows only the object type and address, not the data.
eval_data

<xgboost.core.DMatrix at 0x21ddb688ba8>

In [14]:
# Booster settings: binary logistic objective, monitored with root mean
# squared error on every dataset listed in `data_list`.
# NOTE(review): if the target leaderboard metric is log loss, 'logloss' may be
# a better early-stopping signal than 'rmse' -- confirm before changing.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
}

# xgb.train(parameters, training data, number of boosting rounds,
#           datasets to evaluate each round, early-stopping patience)
# Per the training log, early stopping watches 'valid-rmse' and ends training
# once it has not improved for 10 rounds.
bst = xgb.train(
    params,
    train_data,
    num_boost_round=1000,
    evals=data_list,
    early_stopping_rounds=10,
)

[0]	train-rmse:0.483702	valid-rmse:0.484399
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[1]	train-rmse:0.473436	valid-rmse:0.474737
[2]	train-rmse:0.466772	valid-rmse:0.468407
[3]	train-rmse:0.46219	valid-rmse:0.464227
[4]	train-rmse:0.458387	valid-rmse:0.460723
[5]	train-rmse:0.455753	valid-rmse:0.458389
[6]	train-rmse:0.453708	valid-rmse:0.456577
[7]	train-rmse:0.450985	valid-rmse:0.453965
[8]	train-rmse:0.449342	valid-rmse:0.452487
[9]	train-rmse:0.448037	valid-rmse:0.451503
[10]	train-rmse:0.446811	valid-rmse:0.450417
[11]	train-rmse:0.446027	valid-rmse:0.449821
[12]	train-rmse:0.444147	valid-rmse:0.448285
[13]	train-rmse:0.442851	valid-rmse:0.44708
[14]	train-rmse:0.442276	valid-rmse:0.446602
[15]	train-rmse:0.441669	valid-rmse:0.446207
[16]	train-rmse:0.441148	valid-rmse:0.445845
[17]	train-rmse:0.440668	valid-rmse:0.445569
[18]	train-rmse:0.440405	valid-rmse:0.445393
[19]	train-r

[179]	train-rmse:0.402312	valid-rmse:0.426108
[180]	train-rmse:0.401962	valid-rmse:0.425865
[181]	train-rmse:0.401766	valid-rmse:0.425805
[182]	train-rmse:0.401695	valid-rmse:0.425779
[183]	train-rmse:0.401436	valid-rmse:0.425754
[184]	train-rmse:0.401204	valid-rmse:0.425751
[185]	train-rmse:0.401091	valid-rmse:0.425734
[186]	train-rmse:0.400857	valid-rmse:0.42567
[187]	train-rmse:0.400677	valid-rmse:0.425632
[188]	train-rmse:0.400637	valid-rmse:0.425642
[189]	train-rmse:0.400467	valid-rmse:0.425587
[190]	train-rmse:0.400453	valid-rmse:0.425582
[191]	train-rmse:0.4002	valid-rmse:0.425535
[192]	train-rmse:0.399885	valid-rmse:0.425436
[193]	train-rmse:0.399744	valid-rmse:0.425404
[194]	train-rmse:0.399399	valid-rmse:0.425192
[195]	train-rmse:0.399158	valid-rmse:0.425167
[196]	train-rmse:0.399073	valid-rmse:0.425158
[197]	train-rmse:0.399063	valid-rmse:0.425146
[198]	train-rmse:0.398986	valid-rmse:0.425147
[199]	train-rmse:0.398945	valid-rmse:0.425126
[200]	train-rmse:0.398934	valid-rmse:

In [15]:
# File names of the preprocessed test arrays (read from DATA_IN_PATH).
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# Pass the path straight to np.load so numpy opens and closes the file itself.
# Wrapping the path in a bare open(...) left three file handles that were never
# closed explicitly.
test_q1_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q1_DATA_FILE))
test_q2_data = np.load(os.path.join(DATA_IN_PATH, TEST_Q2_DATA_FILE))
test_id_data = np.load(os.path.join(DATA_IN_PATH, TEST_ID_DATA_FILE))

In [17]:
# Build the test features the same way as the training features:
# pair q1/q2 along a new axis, then sum each pair element-wise.
test_input = np.stack((test_q1_data, test_q2_data), axis=1) 
test_data = xgb.DMatrix(test_input.sum(axis=1))
# NOTE(review): `bst` was trained with early stopping; depending on the installed
# xgboost version, predict() may use every boosted round rather than only the
# rounds up to the best iteration -- confirm against the xgboost docs.
test_predict = bst.predict(test_data)

In [18]:
# Inspect the float32 array of predicted scores returned by the booster.
test_predict

array([0.2603645 , 0.5800228 , 0.56899005, ..., 0.37042218, 0.20800486,
       0.51456386], dtype=float32)

In [19]:
# Directory that receives the submission file.
DATA_OUT_PATH = './output/'

# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists-check and avoiding the gap between checking and creating.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Two-column submission frame: the test ids next to their predicted scores.
output = pd.DataFrame({'test_id': test_id_data, 'is_duplicate': test_predict})
# index=False keeps the pandas row index out of the CSV.
output.to_csv(os.path.join(DATA_OUT_PATH, 'simple_xgb.csv'), index=False)

In [20]:
# Show the first five rows of the submission frame as a sanity check.
output.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.260365
1,1,0.580023
2,2,0.56899
3,3,0.164245
4,4,0.641351


### 캐글에서 0.57503 으로 2720위 (private점수가 공개되는 캐글) 정도 나온다