In [1]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 23.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 30.9 MB/s eta 0:00:01[K     |▉                               | 30 kB 37.5 MB/s eta 0:00:01[K     |█                               | 40 kB 39.5 MB/s eta 0:00:01[K     |█▍                              | 51 kB 37.0 MB/s eta 0:00:01[K     |█▋                              | 61 kB 33.2 MB/s eta 0:00:01[K     |██                              | 71 kB 18.5 MB/s eta 0:00:01[K     |██▏                             | 81 kB 18.6 MB/s eta 0:00:01[K     |██▍                             | 92 kB 19.1 MB/s eta 0:00:01[K     |██▊                             | 102 kB 20.3 MB/s eta 0:00:01[K     |███                             | 112 kB 20.3 MB/s eta 0:00:01[K     |███▎                            | 122 kB 20.3 MB/s eta 0:00:01[K     |██

In [7]:
import tokenization
from wordcloud import STOPWORDS

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import pandas as pd
import numpy as np

In [19]:
train = pd.read_csv("cleaned_train.csv")
test = pd.read_csv("cleaned_test.csv")
all_data = [train, test]

In [16]:
FullTokenizer = tokenization.FullTokenizer

In [17]:
ans = input("Which Bert should I use? \n a. Base uncased \n b. Large uncased \n c. Basic cased \n d. Large cased \n")

if ans is 'a':
    BERT_MODEL_HUB = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
    disc = 'Base_uncased'
elif ans is 'b':
    BERT_MODEL_HUB = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2' 
    disc = 'Large_uncased'
elif ans is 'c':
    BERT_MODEL_HUB = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/2'
    disc = 'Base_cased'
elif ans is 'd':
    BERT_MODEL_HUB = 'https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/2'
    disc = 'Large_cased'

bert_layer = hub.KerasLayer(BERT_MODEL_HUB, trainable=True)
print('Bert layer is ready to use!')


if ans =='a' or ans =='b':
    to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    tokenizer = FullTokenizer(vocabulary_file, to_lower_case)
    
    
elif ans =='c' or ans =='d':
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    tokenizer = FullTokenizer(vocabulary_file, do_lower_case=False)

print('Bert Tokenizer is ready!!!')

def tokenize_tweets(text_):
    return tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(text_) + ['[SEP]'])

Which Bert should I use? 
 a. Base uncased 
 b. Large uncased 
 c. Basic cased 
 d. Large cased 
a
Bert layer is ready to use!
Bert Tokenizer is ready!!!


In [20]:
for dataframe in all_data:
  dataframe['tokenized_tweets'] = dataframe.cleaned_text.apply(lambda x: tokenize_tweets(x))

In [22]:
test.head()

Unnamed: 0,cleaned_text,tokenized_tweets
0,happened terrible car crash,"[101, 3047, 6659, 2482, 5823, 102]"
1,heard earthquake different cities stay safe ev...,"[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071..."
2,forest fire spot pond geese fleeing across str...,"[101, 3224, 2543, 3962, 8644, 28519, 14070, 24..."
3,apocalypse lighting spokane wildfires,"[101, 16976, 7497, 21878, 3748, 26332, 102]"
4,typhoon soudelor kills 28 china taiwan,"[101, 15393, 2061, 12672, 10626, 8563, 2654, 2..."


In [26]:
train_lenght = len(max(train.tokenized_tweets, key = len))
test_lenght = len(max(test.tokenized_tweets, key = len))
max_lenght = max(train_lenght, test_lenght)
train_lenght, test_lenght, max_lenght

(53, 47, 53)

In [27]:
for dataframe in all_data:
  dataframe['padded_tweets'] = dataframe.tokenized_tweets.apply(lambda x: x + [0] * (max_lenght - len(x)))


In [28]:
train.head()

Unnamed: 0,target,cleaned_text,tokenized_tweets,padded_tweets
0,1,deeds reason earthquake may allah forgive us,"[101, 15616, 3114, 8372, 2089, 16455, 9641, 21...","[101, 15616, 3114, 8372, 2089, 16455, 9641, 21..."
1,1,forest fire near la ronge sask canada,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187..."
2,1,residents asked shelter place notified officer...,"[101, 3901, 2356, 7713, 2173, 19488, 3738, 139...","[101, 3901, 2356, 7713, 2173, 19488, 3738, 139..."
3,1,13000 people receive wildfires evacuation orde...,"[101, 19527, 2692, 2111, 4374, 3748, 26332, 13...","[101, 19527, 2692, 2111, 4374, 3748, 26332, 13..."
4,1,got sent photo ruby alaska smoke wildfires pou...,"[101, 2288, 2741, 6302, 10090, 7397, 5610, 374...","[101, 2288, 2741, 6302, 10090, 7397, 5610, 374..."


In [47]:
class TweetClassifier:
    
    def __init__(self, tokenizer, bert_layer, max_len, lr = 0.0001,
                 epochs = 15, batch_size = 32,
                 activation = 'sigmoid', optimizer = 'SGD',
                 beta_1=0.9, beta_2=0.999, epsilon=1e-07,
                 metrics = 'accuracy', loss = 'binary_crossentropy'):
        
        self.lr = lr
        self.epochs = epochs
        self.max_len = max_len
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.bert_layer = bert_layer
        self.activation = activation
        self.optimizer = optimizer
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon =epsilon
        self.metrics = metrics
        self.loss = loss

        
    def encode(self, texts): 
        all_tokens = []
        masks = []
        segments = []
        for text in texts:
            tokenized = self.tokenizer.convert_tokens_to_ids(['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]'])
            len_zeros = self.max_len - len(tokenized)
            padded = tokenized + [0] * len_zeros
            mask = [1] * len(tokenized) + [0] * len_zeros
            segment = [0] * self.max_len
            all_tokens.append(padded)
            masks.append(mask)
            segments.append(segment)
        
        return np.array(all_tokens), np.array(masks), np.array(segments)


    def make_model(self):
        # Shaping the inputs to our model
        input_ids = Input(shape = (self.max_len, ), dtype = tf.int32, name = 'input_ids')
        input_mask = Input(shape = (self.max_len, ), dtype = tf.int32, name = 'input_mask')
        segment_ids = Input(shape = (self.max_len, ), dtype = tf.int32,  name = 'segment_ids')
        pooled_output, sequence_output = bert_layer([input_ids, input_mask, segment_ids] )
        clf_output = sequence_output[:, 0, :]
        out = tf.keras.layers.Dense(1, activation = self.activation)(clf_output)
        model = Model(inputs = [input_ids, input_mask, segment_ids], outputs = out)
        
        # define the optimizer
        if self.optimizer is 'SGD':
            optimizer = SGD(learning_rate = self.lr)
        elif self.optimizer is 'Adam': 
            optimizer = Adam(learning_rate = self.lr, beta_1=self.beta_1, beta_2=self.beta_2, epsilon=self.epsilon)
        model.compile(loss = self.loss, optimizer = self.optimizer, metrics = [self.metrics])
        print('Model is compiled with {} optimizer'.format(self.optimizer))
        return model
    
    def train(self, x_train, y_train):    
        checkpoint = ModelCheckpoint('model.h5', monitor='val_loss',
                                     save_best_only=True)
        
        model = self.make_model()
        X = self.encode(x_train)
        Y = y_train
        model.fit(X, Y, shuffle = True, validation_split = 0.2, 
                  batch_size=self.batch_size, epochs = self.epochs,
                  callbacks=[checkpoint])    
        print('Model is fit!')
        
    def predict(self, x_test):
        X_test_encoded = self.encode(x_test)
        best_model = tf.keras.models.load_model('model.h5',custom_objects={'KerasLayer':hub.KerasLayer})
        y_pred = best_model.predict(X_test_encoded)
        return y_pred

In [37]:
model = TweetClassifier(tokenizer = tokenizer, bert_layer = bert_layer,
                              max_len = max_lenght, lr = 0.0001,
                              epochs = 3,  activation = 'sigmoid',
                              batch_size = 32,optimizer = 'SGD',
                              beta_1=0.9, beta_2=0.999, epsilon=1e-07)

In [38]:
train.head()

Unnamed: 0,target,cleaned_text,tokenized_tweets,padded_tweets
0,1,deeds reason earthquake may allah forgive us,"[101, 15616, 3114, 8372, 2089, 16455, 9641, 21...","[101, 15616, 3114, 8372, 2089, 16455, 9641, 21..."
1,1,forest fire near la ronge sask canada,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187..."
2,1,residents asked shelter place notified officer...,"[101, 3901, 2356, 7713, 2173, 19488, 3738, 139...","[101, 3901, 2356, 7713, 2173, 19488, 3738, 139..."
3,1,13000 people receive wildfires evacuation orde...,"[101, 19527, 2692, 2111, 4374, 3748, 26332, 13...","[101, 19527, 2692, 2111, 4374, 3748, 26332, 13..."
4,1,got sent photo ruby alaska smoke wildfires pou...,"[101, 2288, 2741, 6302, 10090, 7397, 5610, 374...","[101, 2288, 2741, 6302, 10090, 7397, 5610, 374..."


In [39]:
x_train = train["cleaned_text"]
y_train = train["target"]

In [40]:
model.train(x_train, y_train)

Model is compiled with SGD optimizer
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model is fit!


In [46]:
test.head()

Unnamed: 0,cleaned_text,tokenized_tweets,padded_tweets
0,happened terrible car crash,"[101, 3047, 6659, 2482, 5823, 102]","[101, 3047, 6659, 2482, 5823, 102, 0, 0, 0, 0,..."
1,heard earthquake different cities stay safe ev...,"[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071...","[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071..."
2,forest fire spot pond geese fleeing across str...,"[101, 3224, 2543, 3962, 8644, 28519, 14070, 24...","[101, 3224, 2543, 3962, 8644, 28519, 14070, 24..."
3,apocalypse lighting spokane wildfires,"[101, 16976, 7497, 21878, 3748, 26332, 102]","[101, 16976, 7497, 21878, 3748, 26332, 102, 0,..."
4,typhoon soudelor kills 28 china taiwan,"[101, 15393, 2061, 12672, 10626, 8563, 2654, 2...","[101, 15393, 2061, 12672, 10626, 8563, 2654, 2..."


In [49]:
y_pred = model.predict(test)

In [52]:
test.shape

(3262, 3)

In [55]:
y_pred[1]

array([0.99403477], dtype=float32)

In [56]:
prediction = np.round(y_pred)

In [61]:
test.head()

Unnamed: 0,cleaned_text,tokenized_tweets,padded_tweets
0,happened terrible car crash,"[101, 3047, 6659, 2482, 5823, 102]","[101, 3047, 6659, 2482, 5823, 102, 0, 0, 0, 0,..."
1,heard earthquake different cities stay safe ev...,"[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071...","[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071..."
2,forest fire spot pond geese fleeing across str...,"[101, 3224, 2543, 3962, 8644, 28519, 14070, 24...","[101, 3224, 2543, 3962, 8644, 28519, 14070, 24..."
3,apocalypse lighting spokane wildfires,"[101, 16976, 7497, 21878, 3748, 26332, 102]","[101, 16976, 7497, 21878, 3748, 26332, 102, 0,..."
4,typhoon soudelor kills 28 china taiwan,"[101, 15393, 2061, 12672, 10626, 8563, 2654, 2...","[101, 15393, 2061, 12672, 10626, 8563, 2654, 2..."


In [63]:
submission = test.copy()
submission["id"] = 0
submission["target"] = prediction

In [65]:
submission.head(20)

Unnamed: 0,cleaned_text,tokenized_tweets,padded_tweets,id,target
0,happened terrible car crash,"[101, 3047, 6659, 2482, 5823, 102]","[101, 3047, 6659, 2482, 5823, 102, 0, 0, 0, 0,...",0,1.0
1,heard earthquake different cities stay safe ev...,"[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071...","[101, 2657, 8372, 2367, 3655, 2994, 3647, 3071...",0,1.0
2,forest fire spot pond geese fleeing across str...,"[101, 3224, 2543, 3962, 8644, 28519, 14070, 24...","[101, 3224, 2543, 3962, 8644, 28519, 14070, 24...",0,1.0
3,apocalypse lighting spokane wildfires,"[101, 16976, 7497, 21878, 3748, 26332, 102]","[101, 16976, 7497, 21878, 3748, 26332, 102, 0,...",0,1.0
4,typhoon soudelor kills 28 china taiwan,"[101, 15393, 2061, 12672, 10626, 8563, 2654, 2...","[101, 15393, 2061, 12672, 10626, 8563, 2654, 2...",0,1.0
5,re shaking s earthquake,"[101, 2128, 5513, 1055, 8372, 102]","[101, 2128, 5513, 1055, 8372, 102, 0, 0, 0, 0,...",0,1.0
6,d probably still show life arsenal yesterday e...,"[101, 1040, 2763, 2145, 2265, 2166, 9433, 7483...","[101, 1040, 2763, 2145, 2265, 2166, 9433, 7483...",0,0.0
7,hey,"[101, 4931, 102]","[101, 4931, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0.0
8,nice hat,"[101, 3835, 6045, 102]","[101, 3835, 6045, 102, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0.0
9,fuck,"[101, 6616, 102]","[101, 6616, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0,0.0


In [66]:
submission.to_csv("submission.csv", index=False)