In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
import ssl
# SECURITY: this swaps the default HTTPS context factory for the unverified one, so
# HTTPS connections created through the default context skip certificate validation
# for the rest of the kernel session.
# NOTE(review): presumably a workaround for certificate errors while downloading the
# TF-Hub model -- confirm, and prefer fixing the local CA bundle over keeping this.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Read the labelled tweets (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Column dtypes and non-null counts. Per the summary below, `keyword` and `location`
# have missing values while `text` and `target` are complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Pick the feature and label columns by name instead of by position, so the
# selection cannot silently shift to the wrong columns if the CSV layout changes.
# (`text` holds the tweet body, `target` the 0/1 label.)
X = df['text']
y = df['target']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. Stratifying on the target keeps the class ratio the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=df.target.values,
    random_state=42,
)

In [6]:
with tf.device('/cpu:0'):
    # Wrap the raw (text, label) arrays as tf.data datasets.
    train_data = tf.data.Dataset.from_tensor_slices(
        (X_train.values, y_train.values))
    valid_data = tf.data.Dataset.from_tensor_slices(
        (X_test.values, y_test.values))

    # Peek at one raw training element before any preprocessing is applied.
    for text, label in train_data.take(1):
        print(text, label, sep='\n')

tf.Tensor(b'Sassy city girl country hunk stranded in Smoky Mountain snowstorm #AoMS http://t.co/nkKcTttsD9 #ibooklove #bookboost', shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int64)


In [7]:
# Preprocessing / training constants.
label_list = [0, 1]
max_seq_length = 128
train_batch_size = 32

# Pre-trained uncased BERT-base encoder from TF-Hub, fine-tuned end to end.
bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2',
    trainable=True,
)

# Build the tokenizer from the vocabulary and casing flag shipped with the hub module,
# so the text preprocessing uses the same assets as the encoder.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [8]:
# NOTE(review): this calls the WordPiece sub-tokenizer directly on the raw string, which
# is why punctuation appears glued on as '##,' / '##?' in the output below. The feature
# pipeline passes the full `tokenizer` object instead -- presumably that splits
# punctuation first; confirm with `tokenizer.tokenize(...)` before treating these
# tokens as representative of the real preprocessing.
tokenizer.wordpiece_tokenizer.tokenize('hi, how are you doing?')

['hi', '##,', 'how', 'are', 'you', 'doing', '##?']

In [9]:
# Map the same WordPiece tokens to their integer vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.wordpiece_tokenizer.tokenize('hi, how are you doing?'))

[7632, 29623, 2129, 2024, 2017, 2725, 29632]

In [10]:
def to_feature(text, label, label_list = label_list, max_seq_length = max_seq_length, tokenizer = tokenizer):
    """Convert one eager (text, label) pair into BERT input features.

    Args:
        text: eager tensor holding the example text; read via ``.numpy()``.
        label: eager tensor holding the example label; read via ``.numpy()``.
        label_list: allowed label values, forwarded to ``convert_single_example``.
        max_seq_length: sequence length forwarded to ``convert_single_example``.
        tokenizer: tokenizer forwarded to ``convert_single_example``.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from the
        converted feature object.
    """
    raw_text = text.numpy()
    raw_label = label.numpy()

    single_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=raw_text,
        text_b=None,
        label=raw_label,
    )
    # The example index (first argument) is fixed at 0 for every call.
    converted = classifier_data_lib.convert_single_example(
        0, single_example, label_list, max_seq_length, tokenizer)

    return (
        converted.input_ids,
        converted.input_mask,
        converted.segment_ids,
        converted.label_id,
    )

In [11]:
def to_feature_map(text, label):
    """Graph-mode wrapper around ``to_feature`` for use in ``Dataset.map``.

    Runs ``to_feature`` through ``tf.py_function`` and then restores the static
    shapes on its outputs: ``[max_seq_length]`` for the three sequence tensors
    and a scalar shape for the label.

    Returns:
        ``(features, label_id)`` where ``features`` is a dict with the keys
        ``'input_word_ids'``, ``'input_mask'`` and ``'input_type_ids'``.
    """
    py_outputs = tf.py_function(to_feature,
                                inp=[text, label],
                                Tout=[tf.int32] * 4)
    input_ids, input_mask, segment_ids, label_id = py_outputs

    # Set the static shapes explicitly on every output.
    for seq_tensor in (input_ids, input_mask, segment_ids):
        seq_tensor.set_shape([max_seq_length])
    label_id.set_shape([])

    features = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids,
    }
    return (features, label_id)

In [14]:
# Build the batched input pipelines from the original arrays rather than from
# `train_data` / `valid_data` themselves. Re-running this cell then rebuilds the
# pipelines cleanly instead of mapping an already-mapped dataset a second time.
# Batch size comes from `train_batch_size` rather than a repeated literal.
with tf.device('cpu:0'):
    train_data = (tf.data.Dataset.from_tensor_slices((X_train.values,
                                                      y_train.values))
      .map(to_feature_map,
           num_parallel_calls = tf.data.experimental.AUTOTUNE)
      .shuffle(1000)
      .batch(train_batch_size, drop_remainder = True)
      .prefetch(tf.data.experimental.AUTOTUNE))

    # NOTE(review): drop_remainder=True also discards the final partial validation
    # batch, so a few held-out examples are never evaluated. It is kept here so the
    # static batch dimension stays fixed; consider drop_remainder=False for validation.
    valid_data = (tf.data.Dataset.from_tensor_slices((X_test.values,
                                                      y_test.values))
      .map(to_feature_map,
           num_parallel_calls = tf.data.experimental.AUTOTUNE)
      .batch(train_batch_size, drop_remainder = True)
      .prefetch(tf.data.experimental.AUTOTUNE))

In [15]:
# Inspect the structure of one training batch: a dict of three int32 feature tensors
# plus an int32 label vector (see the spec printed below).
train_data.element_spec

({'input_word_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_mask': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_type_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None)},
 TensorSpec(shape=(32,), dtype=tf.int32, name=None))

In [16]:
# Same check for the validation pipeline; its spec should match the training one.
valid_data.element_spec

({'input_word_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_mask': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None),
  'input_type_ids': TensorSpec(shape=(32, 128), dtype=tf.int32, name=None)},
 TensorSpec(shape=(32,), dtype=tf.int32, name=None))

In [21]:
def create_model():
    """Assemble the BERT-based binary classifier (uncompiled).

    Three int32 inputs of length ``max_seq_length`` (word ids, mask, type ids)
    feed ``bert_layer``; its first output then passes through dropout, two ELU
    dense layers (300 and 100 units) and a single sigmoid output unit.

    Returns:
        A ``tf.keras.Model`` whose inputs are keyed ``'input_word_ids'``,
        ``'input_mask'`` and ``'input_type_ids'``.
    """
    input_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    inputs = {
        name: tf.keras.layers.Input(shape=(max_seq_length,),
                                    dtype=tf.int32,
                                    name=name)
        for name in input_names
    }

    # Only the first (pooled) output is used; the per-token output is ignored.
    pooled_output, _ = bert_layer([inputs[name] for name in input_names])

    hidden = tf.keras.layers.Dropout(0.2)(pooled_output)
    for units in (300, 100):
        hidden = tf.keras.layers.Dense(units,
                                       activation='elu',
                                       kernel_initializer='he_normal')(hidden)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(hidden)

    return tf.keras.Model(inputs=inputs, outputs=output)

In [22]:
# Instantiate the classifier and configure training: Adam with a small learning
# rate, binary cross-entropy loss, and accuracy as the reported metric.
model = create_model()
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',       

In [23]:
# Render the layer graph with tensor shapes. This needs the optional `pydot` package
# and Graphviz; without them only the install hint shown below is produced.
tf.keras.utils.plot_model(model = model, show_shapes = True, dpi = 76)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [25]:
# Fine-tune for a fixed number of epochs, validating on the held-out split after
# each one; verbose=2 prints one summary line per epoch.
epochs = 5
history = model.fit(
    train_data,
    epochs=epochs,
    validation_data=valid_data,
    verbose=2,
)

Epoch 1/5


KeyboardInterrupt: 