In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Concatenate,Dropout,Bidirectional
from tensorflow.keras.optimizers import Adam

import re
import nltk
from nltk.corpus import stopwords

In [2]:
data_train=pd.read_csv('twitter_training.csv')
data_test=pd.read_csv('twitter_validation.csv')

In [3]:
data_train.columns=['no1','no2','sentiment','text']
data_train.drop('no1',axis=1,inplace=True)
data_train.drop('no2',axis=1,inplace=True)
data_train.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [4]:
data_test.columns=['no1','no2','sentiment','text']
data_test.drop('no1',axis=1,inplace=True)
data_test.drop('no2',axis=1,inplace=True)
data_test

Unnamed: 0,sentiment,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...
994,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,Positive,Today sucked so it’s time to drink wine n play...
997,Positive,Bought a fraction of Microsoft today. Small wins.


In [5]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

In [6]:
data_train['sentiment']=data_train['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})
data_test['sentiment']=data_test['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})

In [7]:
x_train_sentiment=np.array(data_train.iloc[:,0])
x_train_sentiment=tf.one_hot(x_train_sentiment, 4)
x_train_sentiment

<tf.Tensor: shape=(73995, 4), dtype=float32, numpy=
array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)>

In [8]:
x_test_sentiment=np.array(data_test.iloc[:,0])
x_test_sentiment=tf.one_hot(x_test_sentiment, 4)
x_test_sentiment

<tf.Tensor: shape=(999, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

In [9]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''

    return TAG_RE.sub('', text)

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [11]:
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [12]:
x_train_text = []
sentences = list(data_train['text'])
for sen in sentences:
    x_train_text.append(preprocess_text(sen))
    
    
x_test_text = []
sentences = list(data_test['text'])
for sen in sentences:
    x_test_text.append(preprocess_text(sen))

In [13]:
x_train=x_train_text
y_train=x_train_sentiment
x_test=x_test_text
y_test=x_test_sentiment

In [14]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [15]:
max_length = 32
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
x_train = bert_tokenizer.batch_encode_plus(
    x_train,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
    truncation=True
)

max_length = 32
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
x_test = bert_tokenizer.batch_encode_plus(
    x_test,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    padding='max_length',
    max_length=max_length,
    return_tensors='tf',
    truncation=True
)

In [16]:
input_ids = x_train['input_ids'].numpy()
attention_masks = x_train['attention_mask'].numpy()


In [17]:
x_train=[input_ids,attention_masks]

In [18]:
x_train

[array([[  101,  2746,  6645, ...,     0,     0,     0],
        [  101, 10047,  2893, ...,     0,     0,     0],
        [  101, 10047,  2746, ...,     0,     0,     0],
        ...,
        [  101,  3651,  3645, ...,     0,     0,     0],
        [  101,  3651,  3645, ...,     0,     0,     0],
        [  101,  2066,  3645, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])]

In [19]:
input_ids2 = x_test['input_ids'].numpy()
attention_masks2 = x_test['attention_mask'].numpy()

In [20]:
x_test=[input_ids2,attention_masks2]

In [21]:
x_test

[array([[  101,  4035,  2739, ...,     0,     0,     0],
        [  101,  7513,  3477, ...,     0,     0,     0],
        [  101, 20116,  3995, ...,     0,     0,     0],
        ...,
        [  101,  2651,  8631, ...,     0,     0,     0],
        [  101,  4149, 12884, ...,     0,     0,     0],
        [  101,  3779,  3779, ...,     0,     0,     0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])]

In [22]:
y_train

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [28]:
from keras.layers import Layer
import keras.backend as K
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

    def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        return super(attention,self).get_config()

In [29]:
# Load the BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze the BERT layers
bert_model.trainable = True

# BERT embeddings
input_ids_input = Input(shape=(max_length,), dtype='int32')
attention_mask_input = Input(shape=(max_length,), dtype='int32')
bert_output = bert_model([input_ids_input, attention_mask_input])[0]

# CNN layer
num_filters = 32
kernel_size = 5
cnn_layer = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu')(bert_output)
cnn_layer=Dropout(0.2)(cnn_layer)
cnn_layer = MaxPooling1D()(cnn_layer)

# CNN layer
num_filters = 64
kernel_size = 5
cnn_layer2 = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu')(cnn_layer)
cnn_layer2=Dropout(0.2)(cnn_layer2)
cnn_layer2 = MaxPooling1D()(cnn_layer2)
# LSTM layer

lstm_layer = Bidirectional(LSTM(32, return_sequences=True))(cnn_layer2)
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(lstm_layer)
attention_layer=attention()(lstm_layer)
# Output layer
output_layer = Dense(4, activation='sigmoid')(attention_layer)

# Create the model
model = Model(inputs=[input_ids_input, attention_mask_input], outputs=output_layer)

# Compile the model
optimizer = Adam(lr=2e-5)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [30]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 tf_bert_model_2 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_5[0][0]',                
                                thPoolingAndCrossAt               'input_6[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 32,                                          

In [None]:
model.fit(x_train, y_train, batch_size=256, epochs=2,validation_data=(x_test,y_test))

Epoch 1/2
  6/290 [..............................] - ETA: 9:05:31 - loss: 0.6766 - accuracy: 0.2760

In [None]:
y_test_pred=model.predict(x_test)

In [None]:
input_array=y_test_pred
binary_array = np.zeros_like(input_array)
max_indices = np.argmax(input_array, axis=1)
binary_array[np.arange(len(input_array)), max_indices] = 1
y_test_pred=binary_array
print(binary_array)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_test_pred))