In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [51]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, Conv1D, Flatten, GlobalMaxPool1D, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert import tokenization
tf.gfile = tf.io.gfile

In [52]:
train = pd.read_csv('./train.csv', dtype={'id': np.int16, 'target': np.int8})
# train = pd.read_csv('./forHiper', dtype={'id': np.int16, 'target': np.int8})
test = pd.read_csv('./test.csv', dtype={'id': np.int16})
# test = pd.read_csv('./testWithFeatures.csv', dtype={'id': np.int16})
STOPWORDS = set(stopwords.words('english'))

In [53]:
test['text'] = test['text'].fillna('')

In [54]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [63]:
max_len = 160
print(train.text.values[0])
prueba = tokenizer.tokenize(train.text.values[0])
input_sequence = ["[CLS]"] + prueba + ["[SEP]"]
pad_len = max_len - len(input_sequence)
print(input_sequence)
print(pad_len)
tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train.text.values[0]))
tokens += [0] * pad_len
pad_masks = [1] * len(input_sequence) + [0] * pad_len
segment_ids = [0] * max_len
print(tokens)
print(pad_masks)
print(segment_ids)

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
['[CLS]', 'our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all', '[SEP]']
144
[2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [33]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 10.8 s, sys: 2.05 s, total: 12.8 s
Wall time: 14.3 s


In [64]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [65]:
train_input = bert_encode(train.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.text.values, tokenizer, max_len=160)
train_labels = train.target.values

In [69]:
train_input

(array([[  101,  2256, 15616, ...,     0,     0,     0],
        [  101,  3224,  2543, ...,     0,     0,     0],
        [  101,  2035,  3901, ...,     0,     0,     0],
        ...,
        [  101, 23290,  1012, ...,     0,     0,     0],
        [  101,  2610, 11538, ...,     0,     0,     0],
        [  101,  1996,  6745, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [36]:
def build_model1(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def build_model2(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out_first_layer    = Dense(16, activation='relu')(clf_output)
    out_first_dropout  = Dropout(0.2)(out_first_layer)
    out_second_layer   = Dense(8, activation='relu')(out_first_dropout)
    out_second_dropout = Dropout(0.2)(out_second_layer)
    out = Dense(1, activation='sigmoid')(out_second_dropout)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def build_model2_prima(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out_first_layer    = Dense(128, activation='relu')(clf_output)
    out_first_dropout  = Dropout(0.2)(out_first_layer)
    out_second_layer   = Dense(64, activation='relu')(out_first_dropout)
    out_second_dropout = Dropout(0.2)(out_second_layer)
    out = Dense(1, activation='sigmoid')(out_second_dropout)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def build_model3(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    print(sequence_output.shape)
    clf_output = sequence_output[:, 0, :]
    reshape            = Reshape((1024,1))(clf_output)
    out_first_filter   = Conv1D(32, kernel_size=3, activation='relu', input_shape=(1024,1))(reshape)
    out_second_filter  = Conv1D(64, kernel_size=3, activation='relu')(out_first_filter)
    out_max_pooling    = GlobalMaxPool1D()(out_second_filter)
    out_flatten        = Flatten()(out_max_pooling)
    out_first_layer    = Dense(16, activation='relu')(out_flatten)
    out_second_dropout = Dropout(0.2)(out_first_layer)
    # out_second_layer   = Dense(8, activation='relu')(out_second_dropout)
    # out_third_dropout  = Dropout(0.2)(out_second_layer)
    out = Dense(1, activation='sigmoid')(out_second_dropout)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# def build_model4(bert_layer, max_len=512):
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
#     segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

#     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
#     clf_output         = sequence_output[:, 0, :]
#     clf_output = Reshape((1024, 1, 1))(clf_output)
#     out_first_filter   = Conv2D(32, kernel_size=(1, 1), activation='relu', input_shape=(1024, 1024, 3))(clf_output)
#     out_second_filter  = Conv2D(64, (1, 1), activation='relu')(out_first_filter)
#     out_max_pooling    = MaxPooling2D(pool_size=(1,1))(out_second_filter)
#     out_first_dropout  = Dropout(0.2)(out_max_pooling)
#     out_flatten        = Flatten()(out_first_dropout)
#     out_first_layer    = Dense(16, activation='relu')(out_flatten)
#     out_second_dropout = Dropout(0.2)(out_first_layer)
#     out_second_layer   = Dense(8, activation='relu')(out_second_dropout)
#     out_third_dropout  = Dropout(0.2)(out_second_layer)
#     out = Dense(1, activation='sigmoid')(out_third_dropout)
    
#     model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
#     model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [37]:
model = build_model2_prima(bert_layer, max_len=160)
model.summary()


Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]     

In [38]:
checkpoint = ModelCheckpoint('modelv4.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint],
    batch_size=16
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [47]:
model.load_weights('modelv4.h5')

In [48]:
test_pred = model.predict(test_input)

In [49]:
submission = pd.read_csv('./sample_submission.csv')
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [50]:
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission.csv', index=False)