In [1]:
# We will use the official tokenization script created by the Google team
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
!pip install sentencepiece



In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

# Model Implement

In [4]:
def bert_encode(texts, tokenizer, max_len=128):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [5]:
def build_model(bert_layer, max_len=128):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [23]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 5.66 s, sys: 878 ms, total: 6.54 s
Wall time: 6.29 s


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!ls '/content/drive/My Drive/boaz_study/miniproj'

'한겨레_1차 전처리.csv'       'BERT Model.ipynb'
'동아일보_1차 전처리.csv'     '중앙일보 미니프로젝트 데이터(전처리 ver_1).csv'
'1차 전처리 데이터 종합.csv'  '경향신문 미니프로젝트 데이터(전처리 ver_1).csv'


In [9]:
data = pd.read_csv('/content/drive/My Drive/boaz_study/miniproj/1차 전처리 데이터 종합.csv')

In [10]:
data.isnull().sum()

기사 제목      0
기사 내용    186
label      0
dtype: int64

In [11]:
data.head()

Unnamed: 0,기사 제목,기사 내용,label
0,하태경 임을 위한 행진곡 은 민주주의 한류 보수가 앞장서서 수출해야,하태경 미래통합당 의원이 18일 임을 위한 행진곡 은 자랑스러운 민주주의 한류로...,0
1,단독 여야 과거사법 배상 조항 빼기로 합의 20일 마무리 본회의서 민생...,여야가 20대 국회 마지막 본회의를 오는 20일에 열고 코로나19 대응 관련 법안과...,0
2,정총리 5 18의 실체적 진실 역사의 심판대 위에 올려야,정세균 국무총리는 18일 아직 숨겨진 5 18민주화운동의 실체적 진실을 역사의 심...,0
3,정세균 총리 민주유공자 유족 가슴 아프게 하는 왜곡 폄훼 없어야,정세균 국무총리가 소설가 한강의 작품 소년이 온다 를 인용하면서 5 18 민주유공...,0
4,광주 간 잠룡들,김부겸 보수가 좋아 찍었다고 하는 게 나아 지역감정 비판유승민 보수 5 18 ...,0


In [12]:
data = data.sample(frac = 1)
data.reset_index(drop = True, inplace = True)

In [13]:
train = data.iloc[:80000]
test = data.iloc[80000:]

In [14]:
train['label'].value_counts()

1    45898
0    34102
Name: label, dtype: int64

In [15]:
train["기사 제목"] = train["기사 제목"].astype("string")
test["기사 제목"] = test["기사 제목"].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

train_labels = train["label"]
test_labels = test["label"]

train = train.drop("label", axis = 1)
test = test.drop("label", axis = 1)

In [None]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [18]:
train_input = bert_encode(train["기사 제목"].values, tokenizer, max_len=128)
test_input = bert_encode(test["기사 제목"].values, tokenizer, max_len=128)

# Model: Build, Train, Predict, Submit

In [24]:
model = build_model(bert_layer, max_len=128)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [25]:
 checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.3,
    epochs=5,
    callbacks=[checkpoint],
    batch_size=16
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
model.load_weights('model.h5')
model.evaluate(test_input, test_labels, batch_size = 16)



[0.46226948499679565, 0.7785264849662781]