In [28]:
# Input data files are available in the "./data/" directory.
# For example, running this cell (by clicking run or pressing Shift+Enter) lists all files under that directory.

import os
# Recursively print the path of every file found under ./data.
for root, _subdirs, names in os.walk('./data'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

./data\sample_submission.csv
./data\test.csv
./data\train.csv


In [31]:
# Quick look at the training CSV's shape (rows, columns).
pd.read_csv('./data/train.csv').shape

(7613, 5)

In [30]:
# Quick look at the test CSV's shape (rows, columns).
pd.read_csv('./data/test.csv').shape

(3263, 4)

In [4]:
# Download the BERT tokenization script so a later cell can `import tokenization`.
# NOTE(review): the URL tracks the `master` branch, so the file may move or change;
# consider pinning a specific commit for reproducibility.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [7]:
import tokenization

In [8]:
# max_len must be <= 512.
# The combined length of all tokens (including [CLS] and [SEP]) must not exceed 512 (to avoid OOM).

def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated so the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with zeros up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2 so ``[CLS]`` and ``[SEP]`` always fit.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of integer arrays, each
        of shape ``(number of texts, max_len)``. ``pad_masks`` is 1 on real
        tokens and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With max_len < 2 the slice below would go negative and rows would
        # end up with inconsistent lengths instead of failing loudly.
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    token_rows, mask_rows, segment_rows = [], [], []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        token_rows.append(token_ids + [0] * pad_len)  # zero-pad on the right
        mask_rows.append([1] * len(input_sequence) + [0] * pad_len)
        segment_rows.append([0] * max_len)  # single-segment input: all zeros

    # Explicit dtype and shape keep the result a 2-D integer array even when
    # `texts` is empty (a bare np.array([]) would be 1-D float64).
    shape = (len(token_rows), max_len)
    return (
        np.array(token_rows, dtype=int).reshape(shape),
        np.array(mask_rows, dtype=int).reshape(shape),
        np.array(segment_rows, dtype=int).reshape(shape),
    )

#### Reference code: start

In [62]:
max_len = 512

# One int32 Keras Input of length max_len for each of the three BERT inputs.
input_word_ids, input_mask, segment_ids = (
    Input(shape=(max_len,), dtype=tf.int32, name=layer_name)
    for layer_name in ("input_word_ids", "input_mask", "segment_ids")
)

In [63]:
# Call the hub layer on the three inputs to inspect what it returns
# (the output shown is a list of two tensors).
# NOTE(review): `bert_layer` is only created in a later cell of this notebook,
# so this reference cell depends on out-of-order execution.
bert_layer([input_word_ids, input_mask, segment_ids])

[<KerasTensor: shape=(None, 1024) dtype=float32 (created by layer 'keras_layer')>,
 <KerasTensor: shape=(None, None, 1024) dtype=float32 (created by layer 'keras_layer')>]

In [64]:
# Take the second returned tensor and keep only position 0 along axis 1,
# giving one vector per example (position 0 is where [CLS] is placed by bert_encode).
bert_layer([input_word_ids, input_mask, segment_ids])[1][:,0,:]

<KerasTensor: shape=(None, 1024) dtype=float32 (created by layer 'tf.__operators__.getitem_1')>

#### Reference code: end

In [9]:
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT hub layer.

    Args:
        bert_layer: Callable layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Fixed sequence length of the three integer inputs.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with three int32 inputs of length
        ``max_len`` and a single sigmoid output, trained with binary
        cross-entropy and reporting accuracy.
    """
    # The three inputs the BERT layer expects, in this order.
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")  # token ids
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")  # padding mask
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")  # segment ids

    # The first output is discarded; only the per-token sequence output is used.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Use the vector at position 0 of each sequence as the classification feature.
    clf_output = sequence_output[:, 0, :]

    # Single sigmoid unit: the task is a yes/no decision on whether a tweet
    # describes a real disaster.
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `lr` is deprecated in tf.keras optimizers; `learning_rate` is the supported name.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [10]:
# Bert Large 사용.
# It uses L=24 hidden layers (i.e., Transformer blocks), a hidden size of H=1024, and A=16 attention heads.

%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

Wall time: 2min 24s


In [11]:
# Load the training set, the test set and the submission template.
train, test, submission = map(
    pd.read_csv,
    ("./data/train.csv", "./data/test.csv", "./data/sample_submission.csv"),
)

In [12]:
# Read the tokenizer settings stored with the loaded hub module so that
# tokenization matches the pretrained model.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() # path of the module's vocab file
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy() # lower-casing flag exported by the module

tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case) # create the BERT tokenizer

In [76]:
# Sanity check: tokenize a sample sentence and map the tokens to vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("don't be so judgmental"))

[2123, 1005, 1056, 2022, 2061, 8689, 2389]

In [13]:
# Encode both splits to the same fixed length (160) so they share one model input shape.
train_input, test_input = (
    bert_encode(frame.text.values, tokenizer, max_len=160)
    for frame in (train, test)
)
train_labels = train.target.values

In [59]:
# Longest training text measured with len() on the raw value
# (i.e. characters, not tokens); 0 if there are no rows.
temp = max(map(len, train.text.values), default=0)
print(temp)

157


In [14]:
# Build the classifier with the same sequence length (160) used when encoding the data,
# then print its layer summary.
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [22]:
# Fine-tune for 3 epochs, holding out 20% of the training data for validation.
# NOTE(review): batch_size=2 is presumably chosen so BERT Large fits in memory — confirm.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    batch_size=2
)

# Persist the trained model to model.h5 in the working directory.
model.save('model.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
# Run the trained model on the encoded test inputs.
test_pred = model.predict(test_input)

In [24]:
# Round each prediction to the nearest integer label and write the submission file
# without the DataFrame index.
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)

In [25]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
