In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Concatenate,Dropout,Bidirectional
from tensorflow.keras.optimizers import Adam

import re
import nltk
from nltk.corpus import stopwords


In [2]:
data_train=pd.read_csv('twitter_training.csv')
data_test=pd.read_csv('twitter_validation.csv')

In [3]:
data_train.columns=['no1','no2','sentiment','text']
data_train.drop('no1',axis=1,inplace=True)
data_train.drop('no2',axis=1,inplace=True)
data_train.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [4]:
data_test.columns=['no1','no2','sentiment','text']
data_test.drop('no1',axis=1,inplace=True)
data_test.drop('no2',axis=1,inplace=True)
data_test

Unnamed: 0,sentiment,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...
994,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,Positive,Today sucked so it’s time to drink wine n play...
997,Positive,Bought a fraction of Microsoft today. Small wins.


In [5]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

In [6]:
data_train['sentiment']=data_train['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})
data_test['sentiment']=data_test['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})


In [7]:
x_train_sentiment=np.array(data_train.iloc[:,0])
x_train_sentiment=tf.one_hot(x_train_sentiment, 4)
x_train_sentiment

<tf.Tensor: shape=(73995, 4), dtype=float32, numpy=
array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)>

In [8]:
x_test_sentiment=np.array(data_test.iloc[:,0])
x_test_sentiment=tf.one_hot(x_test_sentiment, 4)
x_test_sentiment

<tf.Tensor: shape=(999, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [10]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''

    return TAG_RE.sub('', text)

In [11]:
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''

    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [12]:
x_train_text = []
sentences = list(data_train['text'])
for sen in sentences:
    x_train_text.append(preprocess_text(sen))


x_test_text = []
sentences = list(data_test['text'])
for sen in sentences:
    x_test_text.append(preprocess_text(sen))

In [13]:
x_train=x_train_text
y_train=x_train_sentiment
x_test=x_test_text
y_test=x_test_sentiment

In [14]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [15]:
from keras.layers import Layer
import keras.backend as K
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")
        super(attention, self).build(input_shape)

    def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        return super(attention,self).get_config()

In [16]:
import tensorflow_hub as hub
import tensorflow_text as text

In [17]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2",trainable=True)

In [18]:
# text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string),
#                ...] # This SavedModel accepts up to 2 text inputs.
# tokenize = hub.KerasLayer(preprocessor.tokenize)
# tokenized_inputs = [tokenize(segment) for segment in text_inputs]

# # Step 2 (optional): modify tokenized inputs.
# pass

# # Step 3: pack input sequences for the Transformer encoder.
# seq_length = 32 # Your choice here.
# bert_pack_inputs = hub.KerasLayer(
#     preprocessor.bert_pack_inputs,
#     arguments=dict(seq_length=seq_length))  # Optional argument.
# encoder_inputs = bert_pack_inputs(tokenized_inputs)

In [19]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)


# CNN layer
num_filters = 128
kernel_size = 5
cnn_layer = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu')(outputs["sequence_output"])
# cnn_layer=Dropout(0.2)(cnn_layer)
cnn_layer = MaxPooling1D()(cnn_layer)

# CNN layer
num_filters = 256
kernel_size = 5
cnn_layer2 = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu')(cnn_layer)
# cnn_layer2=Dropout(0.2)(cnn_layer2)
cnn_layer2 = MaxPooling1D()(cnn_layer2)
# LSTM layer

lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(cnn_layer2)
lstm_layer = Bidirectional(LSTM(256, return_sequences=True))(lstm_layer)
attention_layer=attention()(lstm_layer)


# Neural network layers
# l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(4, activation='sigmoid', name="output")(attention_layer)


# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [20]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
x_train=np.array(x_train)
x_test=np.array(x_test)


In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [23]:
model.fit(x_train, y_train, batch_size=128, epochs=10,validation_data=(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x211b98c3280>