In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import swifter
pd.options.display.max_colwidth = 1000

In [3]:
# Load Training dataset
# dataset download link https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
columns = ['sentiment','id','date','query','user','text']
df = pd.read_csv('train.csv',header=None,names=columns,engine='python',encoding='latin1')

# using only 50% of the data due to RAM and GPU limitation and reducing traning time
df = df.sample(frac=0.5)
df.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
83818,0,1753447776,Sun May 10 01:41:09 PDT 2009,NO_QUERY,Sherribobs,Wants to go to London... its everywhere!! ahh well 20 days 2 go ... im sure it wont kill me lol... gna try and do revision 2day :S:S
1503202,4,2071891520,Sun Jun 07 20:04:50 PDT 2009,NO_QUERY,djchemical,@Taryn_Itup ur awesome
1556770,4,2185444193,Mon Jun 15 17:23:27 PDT 2009,NO_QUERY,grouchpotato,@firegirlpj I have plenty! Come on over and we'll knit!


In [4]:
df['text'].sample(3)

727107                                                            Just got up. Still tired after last night. Not gonna see my boyfriend today 
313829    my friend's bedroom caught on fire 2 nights ago. thank gawd she's okay. i noticed my bad luck is upon other people this week. GASP. 
242353                                                                                                 cabelodealgodÃ£odoceisthenewblack -not 
Name: text, dtype: object

In [5]:
# function to clean tweet
def clean_tweet(tweet):
    # convert html content to raw text
    tweet = BeautifulSoup(tweet, "lxml").get_text()

    # remove @ mentions
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)

    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)

    # remove extra spaces
    tweet = " ".join(tweet.split())

    return tweet

In [6]:
df['clean_tweet'] = df['text'].swifter.apply(clean_tweet)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=800000.0, style=ProgressStyle(descript…




In [7]:
df[['text','clean_tweet']].sample(3)

Unnamed: 0,text,clean_tweet
512127,aaaaaaaah ramen im sorry but im full,aaaaaaaah ramen im sorry but im full
1419640,@BBCTravelAlert Goodnight you,Goodnight you
1293546,has three competition running at the moment - gig tickets &amp; sunglasses check out www.onesmallseed.net to find out all the details.,has three competition running at the moment - gig tickets & sunglasses check out www.onesmallseed.net to find out all the details.


In [8]:
df.shape

(800000, 7)

In [9]:
# create binary target variable
df['target'] = (df['sentiment'] == 4).astype(int)
df['target'].value_counts(normalize=True)

0    0.500287
1    0.499713
Name: target, dtype: float64

In [10]:
# create tokenizer
FullTokenizer = bert.bert_tokenization.FullTokenizer

# donwload bert small model from tensroflow hub
url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(url,trainable=False)

# get model assets
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [11]:
# bert uses word piece tokenizer which can out of vocab word by breaking into small chunks
tokenizer.tokenize("i love itttt coefee")

['i', 'love', 'it', '##tt', '##t', 'coe', '##fe', '##e']

In [12]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("i love itttt coefee"))

[1045, 2293, 2009, 4779, 2102, 24873, 7959, 2063]

In [13]:
# function to tokenize sentence using bert tokenizer
# bert needs [CLS] token at the begnining of the sentence and [SEP] token at the end
def bert_tokenize(sent,tokenizer):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [14]:
df['tokenize'] = df['clean_tweet'].swifter.apply(bert_tokenize,tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=800000.0, style=ProgressStyle(descript…




In [15]:
df['tokenize'].sample(3)

948563     [[CLS], on, the, exchange, fi, ##c, ?, i, ', d, been, planning, on, commenting, ,, i, just, forgot, *, hug, *, [SEP]]
817504                                            [[CLS], yeah, ., ., ., little, location, button, on, the, home, screen, [SEP]]
1139336         [[CLS], finished, entertainment, center, last, night, ,, now, enjoying, it, by, watching, tv, tonight, ., [SEP]]
Name: tokenize, dtype: object

In [16]:
# Preparing input for BERT
# BERT Needs three input
# one is tokens ID
# second is mask token i.e. which token to ignore...we will ignore [PAD] tokens for embedding
# third is sentence id which seq of 1 and 0
# 1 is when token is from first sent
# 0 is when token is from next sent
# then 1 and 0 series of id for alternating sent

# function to get first bert input
def token_id(tokens):
    return  tokenizer.convert_tokens_to_ids(tokens)

# function to get second bert input
def mask_token(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

# function to get third bert input
def segment_id(tokens):
    seg_id = []
    current_id = 0
    for token in tokens:
        seg_id.append(current_id)
        if token =="[SEP]":
            current_id = 1-current_id  # toggle between 1 and 0
    return seg_id

In [17]:
# computing len of each tokens
df['length'] = df['tokenize'].swifter.apply(lambda x : len(x))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=800000.0, style=ProgressStyle(descript…




In [18]:
df['length'].describe()

count    800000.000000
mean         19.880009
std           9.898869
min           2.000000
25%          12.000000
50%          19.000000
75%          27.000000
max         177.000000
Name: length, dtype: float64

In [19]:
# sort df by the length so that while feeding into batches we have all obs in batch with same length
# this will will ensure we are adding same padding in each batch to improve training time performance
# shuffling the data set  as well 
df = df.sample(frac=1,random_state=42)
df.sort_values(by='length',inplace=True)
df.head(3)

Unnamed: 0,sentiment,id,date,query,user,text,clean_tweet,target,tokenize,length
1266427,4,1999648846,Mon Jun 01 20:34:57 PDT 2009,NO_QUERY,l0RENElAiNE,@CaLips,,1,"[[CLS], [SEP]]",2
461203,0,2174392538,Sun Jun 14 22:27:31 PDT 2009,NO_QUERY,tyblackdude,@durtynate24,,0,"[[CLS], [SEP]]",2
718322,0,2260261993,Sat Jun 20 18:56:10 PDT 2009,NO_QUERY,mallycakes,@wintermourning,,0,"[[CLS], [SEP]]",2


In [20]:
df.shape

(800000, 10)

In [21]:
# keeping tweets with 10+ words for training
MIN_LEN = 10
sub_df = df[df['length'] >= MIN_LEN].copy()
sub_df.shape

(673589, 10)

In [22]:
sub_df['token_ids'] = sub_df['tokenize'].swifter.apply(token_id)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=673589.0, style=ProgressStyle(descript…




In [23]:
sub_df['mask_token'] = sub_df['tokenize'].swifter.apply(mask_token)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=673589.0, style=ProgressStyle(descript…




In [24]:
sub_df['segment_ids'] = sub_df['tokenize'].swifter.apply(segment_id)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=673589.0, style=ProgressStyle(descript…




In [25]:
sub_df[['token_ids','mask_token','segment_ids']].sample(3)

Unnamed: 0,token_ids,mask_token,segment_ids
560354,"[101, 3160, 1011, 1019, 3286, 20116, 2102, 2003, 1015, 9737, 13938, 2102, 1029, 2009, 1005, 1055, 2397, 1998, 1045, 2342, 2062, 3255, 15872, 2869, 1045, 3984, 2065, 2061, 1010, 2009, 2052, 2022, 23409, 2213, 20008, 3367, 1025, 2488, 4638, 2077, 11360, 1025, 102]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
963110,"[101, 4931, 3071, 999, 999, 999, 2092, 10047, 8670, 2243, 2188, 999, 999, 999, 2074, 2404, 7629, 7680, 2047, 2774, 2006, 5003, 26322, 9061, 4402, 22038, 2595, 102]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
773680,"[101, 1996, 6077, 2562, 19372, 2012, 1996, 7381, 1012, 2644, 13847, 2033, 15068, 2226, 2226, 2226, 4904, 1010, 28844, 3111, 999, 999, 999, 102]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [26]:
# convert all three input into tuple
input_ids = list(zip(sub_df['token_ids'],sub_df['mask_token'],sub_df['segment_ids'])    )

# convert input and output in list of tuple
gen = list(zip(input_ids,sub_df['target']))

In [27]:
# create dataset generator for training
dataset = tf.data.Dataset.from_generator(lambda:gen,output_types=(tf.int32, tf.int32))

In [28]:
# example of data gen
sample = next(iter(dataset))
sample

(<tf.Tensor: shape=(3, 10), dtype=int32, numpy=
 array([[ 101, 1011, 1045, 3335, 2017, 2525, 2026, 7710,  999,  102],
        [   1,    1,    1,    1,    1,    1,    1,    1,    1,    1],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [29]:
sample[0][1]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>

In [30]:
# Bert Embedding layers example
# Bert needs three input
# bert gives two output
# word embedding for each token
# sentence embedding for complete sentence
sent_emb, word_emb = bert_layer([tf.expand_dims(sample[0][0],axis=0),tf.expand_dims(sample[0][1],axis=0),tf.expand_dims(sample[0][2],axis=0)])       

In [31]:
# bert sentence embedding
sent_emb

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.76737165, -0.41097584, -0.9254713 ,  0.65501165,  0.5293678 ,
        -0.21975581,  0.7349396 ,  0.28127664, -0.84252703, -0.9999422 ,
        -0.40607893,  0.92529947,  0.96100897,  0.67281246,  0.92393225,
        -0.5952092 , -0.19115444, -0.5368186 ,  0.4108936 , -0.00423791,
         0.6886967 ,  0.9999906 ,  0.18431018,  0.18238546,  0.5212475 ,
         0.9894527 , -0.8066786 ,  0.9052497 ,  0.92836547,  0.5342323 ,
        -0.49583486,  0.10000655, -0.9786017 , -0.353842  , -0.9704756 ,
        -0.98830414,  0.32899258, -0.62806594, -0.02852749,  0.09683861,
        -0.8435362 ,  0.25194606,  0.9999431 ,  0.19922172,  0.4086358 ,
        -0.36052614, -0.99999976,  0.2834919 , -0.80205107,  0.8751598 ,
         0.8829856 ,  0.8694397 ,  0.17662226,  0.50954425,  0.4441942 ,
        -0.17182097, -0.07987472,  0.10378689, -0.2412517 , -0.5843585 ,
        -0.49333397,  0.4459089 , -0.88422734, -0.84868956,  0.9129511 ,
 

In [32]:
# bert word emmbeddings
word_emb

<tf.Tensor: shape=(1, 10, 768), dtype=float32, numpy=
array([[[-0.06861989,  0.15293457,  0.03007803, ..., -0.4028083 ,
          0.1657426 ,  0.44624186],
        [ 0.33549878,  0.34711733,  0.34191743, ...,  0.01532548,
          0.429602  ,  0.40605515],
        [ 0.14023837,  0.310545  ,  0.42587104, ..., -0.15992254,
          0.11591094,  0.94670236],
        ...,
        [-0.7463669 , -0.34005493,  0.4941042 , ..., -0.06445795,
          0.07051917,  1.055614  ],
        [-0.27299654,  0.13023369,  0.04562852, ...,  0.28933996,
          0.2239339 ,  0.06370082],
        [ 0.41037756, -0.01631321, -0.12888537, ..., -0.17260408,
         -0.5833017 , -0.29110155]]], dtype=float32)>

In [33]:
BATCH_SIZE = 32

# get padded batches using padded batch 
batch = dataset.padded_batch(BATCH_SIZE, padded_shapes=((3,None), ()),padding_values=(0,0))

In [34]:
# test size for splitting batches into train and test batch
TEST_SIZE = 0.2

# Total  number of batch
N_BATCH = int(np.ceil(sub_df.shape[0]/BATCH_SIZE))

# shuffle batches before dividing into train and test
batch.shuffle(N_BATCH)

# Test batch count
N_TEST_BATCH = int(N_BATCH * TEST_SIZE)


# train and test batches
test = batch.take(N_TEST_BATCH)
train = batch.skip(N_TEST_BATCH)


In [35]:
# build Text Classifier model using 1D CBB

class classifier(tf.keras.Model):
    
    def __init__(self,vocab_size,nb_filters=50, FFN_units=512,dropout_rate=0.1,training=False):
        
        super().__init__()
        
        # define bert embedding layer
        url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
        self.bert_layer =  hub.KerasLayer(url,trainable=False)
        
        # biagram 1D CNN Layer
        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size= 2, padding="valid", activation="relu")
        
        # triagram 1D CNN Layer
        self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3, padding="valid",activation="relu")
        
        # fourgram 1D CNN Layer
        self.fourgram = layers.Conv1D(filters=nb_filters,kernel_size=4, padding="valid", activation="relu")
        
        # global maxpooling layer
        self.pool = layers.GlobalMaxPool1D()
        
        # final full connected dense layer
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        
        # drop out layer
        self.dropout = layers.Dropout(rate=dropout_rate)

        # final output layer     
        self.last_dense = layers.Dense(units=1,activation="sigmoid")

        return

    def bert_embedding(self,tokens):
        sent_emb, word_emb = self.bert_layer([tokens[:,0,:],tokens[:,1,:], tokens[:,2,:]])
        return word_emb
    
    def call(self, inputs, training):
        x = self.bert_embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)

        # Dropout layer, have different behaviors during training and inference. 
        # For such layers, it is standard practice to expose a training (boolean) argument in the call() method.
        merged = self.dropout(merged, training)
        
        output = self.last_dense(merged)
        
        return output

In [36]:
# define hyperparameters
VOCAB_SIZE = len(tokenizer.vocab)
NB_FILTERS = 10
FFN_UNITS = 64
DROPOUT_RATE = 0.2
NB_EPOCHS = 3

In [37]:
VOCAB_SIZE

30522

In [38]:
# create model
model = classifier(vocab_size=VOCAB_SIZE,nb_filters=NB_FILTERS,FFN_units=FFN_UNITS,dropout_rate=DROPOUT_RATE)

In [39]:
# compile model
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [40]:
# save checkpoints
checkpoint_path = "ckpt_token"

ckpt = tf.train.Checkpoint(model=model)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [41]:
# callback to save checkpoint
class save_checkpoint_callback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [42]:
# train the model
model.fit(train, epochs=NB_EPOCHS,callbacks=[save_checkpoint_callback()])

Epoch 1/3
  16840/Unknown - 2020s 120ms/step - loss: 0.3890 - accuracy: 0.8258Checkpoint saved at ckpt_token.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f25e3f7d898>

In [43]:
# evaluate on test dataset
eval = model.evaluate(test)
print(eval)

[0.3570321500301361, 0.8447223901748657]


In [44]:
predict = model.predict(test)

In [45]:
predict

array([[0.14979693],
       [0.9642483 ],
       [0.97849065],
       ...,
       [0.82095337],
       [0.25183102],
       [0.40005735]], dtype=float32)