In [88]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import itertools
pd.options.display.max_colwidth = 1000

In [39]:
# Load Training dataset
# dataset download link https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
columns = ['sentiment','id','date','query','user','text']
df = pd.read_csv('train.csv',header=None,names=columns,engine='python',encoding='latin1')
df.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds


In [40]:
df['text'].sample(3)

1016144    @DistinQue Thanks for the mention in your #followfriday 
576268                           Very, very tired  crappy weather..
1358562               @heroz: huzzah! all yours, i'll set it aside 
Name: text, dtype: object

In [41]:
# function to clean tweet
def clean_tweet(tweet):
    # convert html content to raw text
    tweet = BeautifulSoup(tweet, "lxml").get_text()

    # remove @ mentions
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)

    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)

    # remove extra spaces
    tweet = " ".join(tweet.split())

    return tweet

In [42]:
df['clean_tweet'] = df['text'].apply(clean_tweet)

In [43]:
df[['text','clean_tweet']].sample(3)

Unnamed: 0,text,clean_tweet
1483894,"@Hattz_4_Lifee i wish but i only found it on yt ,, but it still awesomee im going to buy her album when it comes out in my town","_4_Lifee i wish but i only found it on yt ,, but it still awesomee im going to buy her album when it comes out in my town"
696961,i'd rather spend my saturday evening reading Blink by Malcolm Gladwell. This honestly sucks!,i'd rather spend my saturday evening reading Blink by Malcolm Gladwell. This honestly sucks!
256968,@LilianTheNerd Buuuuu como crees,Buuuuu como crees


In [44]:
df.shape

(1600000, 7)

In [45]:
# create binary target variable
df['target'] = (df['sentiment'] == 4).astype(int)
df['target'].value_counts(normalize=True)

1    0.5
0    0.5
Name: target, dtype: float64

In [46]:
# create tokenizer
FullTokenizer = bert.bert_tokenization.FullTokenizer

# donwload bert small model from tensroflow hub
url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(url,trainable=False)

# get model assets
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [47]:
# bert uses word piece tokenizer which can out of vocab word by breaking into small chunks
tokenizer.tokenize("i love itttt coefee")

['i', 'love', 'it', '##tt', '##t', 'coe', '##fe', '##e']

In [48]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("i love itttt coefee"))

[1045, 2293, 2009, 4779, 2102, 24873, 7959, 2063]

In [51]:
# function to tokenize sentence using bert tokenizer and return the integer id for each token
def bert_tokenize(sent,tokenizer):
    return np.array(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)))

In [52]:
df['tokenize'] = df['clean_tweet'].apply(bert_tokenize,tokenizer=tokenizer)

In [53]:
df['tokenize'].sample(3)

1130704                                                                   [2204, 2305, 10474, 1010, 2265, 2589, 1011, 2261, 6385, 2125, 2077, 5958, 6385, 2265]
1305693                                                                                                       [2000, 5470, 1024, 3398, 2632, 4140, 1997, 20929]
51947      [2129, 2116, 2111, 2024, 2025, 2183, 2000, 15216, 29247, 1029, 2017, 2064, 2022, 2035, 14777, 1998, 24067, 2100, 2007, 2033, 1012, 1026, 1064, 1017]
Name: tokenize, dtype: object

In [54]:
# computing len of each tokens
df['length'] = df['tokenize'].apply(lambda x : len(x))

In [55]:
df['length'].describe()

count    1.600000e+06
mean     1.787814e+01
std      9.902549e+00
min      0.000000e+00
25%      1.000000e+01
50%      1.700000e+01
75%      2.500000e+01
max      2.280000e+02
Name: length, dtype: float64

In [56]:
# sort df by the length so that while feeding into batches we have all obs in batch with same length
# this will will ensure we are adding same padding in each batch to improve training time performance
# shuffling the data set  as well 
df = df.sample(frac=1,random_state=42)
df.sort_values(by='length',inplace=True)
df.head(3)

Unnamed: 0,sentiment,id,date,query,user,text,clean_tweet,target,tokenize,length
556869,0,2204382319,Wed Jun 17 02:04:11 PDT 2009,NO_QUERY,gcdevine,@judithkeane,,0,[],0
391703,0,2054974194,Sat Jun 06 08:27:16 PDT 2009,NO_QUERY,soniasierra,@evieeelove,,0,[],0
1458013,4,2063629791,Sun Jun 07 03:26:41 PDT 2009,NO_QUERY,karmified,@vishaltom,,1,[],0


In [57]:
df.shape

(1600000, 10)

In [58]:
# keeping tweets with 5+ words for training
MIN_LEN = 5
sub_df = df[df['length'] >= MIN_LEN]
sub_df.shape

(1507385, 10)

In [109]:
# convert input and output in list of tuple
gen = list(zip(sub_df['tokenize'],sub_df['target']))

In [None]:
gen

In [111]:
# create dataset generator for training
dataset = tf.data.Dataset.from_generator(lambda:gen,output_types=(tf.int32, tf.int32))

In [112]:
# example of data gen
next(iter(dataset))

(<tf.Tensor: shape=(5,), dtype=int32, numpy=array([2009, 2515, 4757, 4757, 2015], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [115]:
BATCH_SIZE = 64

# get padded batches using padded batch 
batch = dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [116]:
# test size for splitting batches into train and test batch
TEST_SIZE = 0.2

# Total  number of batch
N_BATCH = int(np.ceil(sub_df.shape[0]/BATCH_SIZE))

# shuffle batches before dividing into train and test
batch.shuffle(N_BATCH)

# Test batch count
N_TEST_BATCH = int(N_BATCH * TEST_SIZE)


# train and test batches
test = batch.take(N_TEST_BATCH)
train = batch.skip(N_TEST_BATCH)


In [117]:
# build Text Classifier model using 1D CBB

class classifier(tf.keras.Model):
    
    def __init__(self,vocab_size,emb_dim=128, nb_filters=50, FFN_units=512,dropout_rate=0.1,training=False):
        
        super().__init__()
        
        # define embedding layer
        self.embedding = layers.Embedding(vocab_size,emb_dim)
        
        # biagram 1D CNN Layer
        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size= 2, padding="valid", activation="relu")
        
        # triagram 1D CNN Layer
        self.trigram = layers.Conv1D(filters=nb_filters,kernel_size=3, padding="valid",activation="relu")
        
        # fourgram 1D CNN Layer
        self.fourgram = layers.Conv1D(filters=nb_filters,kernel_size=4, padding="valid", activation="relu")
        
        # global maxpooling layer
        self.pool = layers.GlobalMaxPool1D()
        
        # final full connected dense layer
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        
        # drop out layer
        self.dropout = layers.Dropout(rate=dropout_rate)

        # final output layer     
        self.last_dense = layers.Dense(units=1,activation="sigmoid")

        return
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)

        # Dropout layer, have different behaviors during training and inference. 
        # For such layers, it is standard practice to expose a training (boolean) argument in the call() method.
        merged = self.dropout(merged, training)
        
        output = self.last_dense(merged)
        
        return output

In [118]:
# define hyperparameters
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 128
NB_FILTERS = 10
FFN_UNITS = 64
DROPOUT_RATE = 0.2
NB_EPOCHS = 3

In [119]:
VOCAB_SIZE

30522

In [120]:
# create model
model = classifier(vocab_size=VOCAB_SIZE,emb_dim=EMB_DIM,nb_filters=NB_FILTERS,FFN_units=FFN_UNITS,dropout_rate=DROPOUT_RATE)

In [121]:
# compile model
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [122]:
# save checkpoints
checkpoint_path = "ckpt_token"

ckpt = tf.train.Checkpoint(model=model)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [123]:
# callback to save checkpoint
class save_checkpoint_callback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [124]:
# train the model
model.fit(train, epochs=NB_EPOCHS,callbacks=[save_checkpoint_callback()])

Epoch 1/3
  18843/Unknown - 1097s 58ms/step - loss: 0.3531 - accuracy: 0.8446Checkpoint saved at ckpt_token.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f9b48b91dd8>

In [128]:
# evaluate on test dataset
eval = model.evaluate(test)
print(eval)

[0.43240100145339966, 0.821888267993927]


In [132]:
predict = model.predict(test)

In [133]:
predict

array([[0.43792996],
       [0.01830937],
       [0.85869336],
       ...,
       [0.00756258],
       [0.03378687],
       [0.08181273]], dtype=float32)