In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,Conv1D,GlobalMaxPooling1D,MaxPooling1D,Dense,AdditiveAttention,MultiHeadAttention,Bidirectional,LSTM,Flatten,Cropping3D,UpSampling3D,Attention,Dropout
import tensorflow.compat.v1 as tf

In [2]:
# importing data
data_train=pd.read_csv('twitter_training.csv')
data_test=pd.read_csv('twitter_validation.csv')

In [3]:
data_train.shape,data_test.shape

((74681, 4), (999, 4))

In [4]:
data_train.columns=['no1','no2','sentiment','text']

In [5]:
data_train.drop('no1',axis=1,inplace=True)
data_train.drop('no2',axis=1,inplace=True)
data_train.head()

Unnamed: 0,sentiment,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [6]:
data_test.columns=['no1','no2','sentiment','text']

In [7]:
data_test.drop('no1',axis=1,inplace=True)
data_test.drop('no2',axis=1,inplace=True)
data_test

Unnamed: 0,sentiment,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...
994,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,Positive,Today sucked so it’s time to drink wine n play...
997,Positive,Bought a fraction of Microsoft today. Small wins.


In [8]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

In [9]:
data_train.shape,data_test.shape

((73995, 2), (999, 2))

In [10]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''

    return TAG_RE.sub('', text)

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [13]:
x_train_text = []
sentences = list(data_train['text'])
for sen in sentences:
    x_train_text.append(preprocess_text(sen))
    
    
x_test_text = []
sentences = list(data_test['text'])
for sen in sentences:
    x_test_text.append(preprocess_text(sen))

In [14]:
data_train['sentiment']=data_train['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})
data_test['sentiment']=data_test['sentiment'].map({'Positive':0,'Negative':2,'Neutral':1,'Irrelevant':3})

In [15]:
data_train['sentiment'].head(30)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    1
12    1
13    1
14    1
15    1
16    1
17    0
18    0
19    0
20    0
21    0
22    0
23    2
24    2
25    2
26    2
27    2
28    2
29    0
Name: sentiment, dtype: int64

In [16]:
x_train_sentiment=np.array(data_train.iloc[:,0])
x_train_sentiment=tf.one_hot(x_train_sentiment, 4)
x_train_sentiment

<tf.Tensor: shape=(73995, 4), dtype=float32, numpy=
array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)>

In [17]:
x_test_sentiment=np.array(data_test.iloc[:,0])
x_test_sentiment=tf.one_hot(x_test_sentiment, 4)
x_test_sentiment

<tf.Tensor: shape=(999, 4), dtype=float32, numpy=
array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)>

In [18]:
x_train=x_train_text
y_train=x_train_sentiment
x_test=x_test_text
y_test=x_test_sentiment

In [19]:
y_train.shape,y_test.shape

(TensorShape([73995, 4]), TensorShape([999, 4]))

In [20]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(x_train)

x_train = word_tokenizer.texts_to_sequences(x_train)
x_test = word_tokenizer.texts_to_sequences(x_test)

In [21]:
vocab_length = len(word_tokenizer.word_index) +1

vocab_length


29131

In [22]:
maxlen = 100

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [23]:
x_train.shape,y_train.shape

((73995, 100), TensorShape([73995, 4]))

In [24]:
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('a2_glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [25]:
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [26]:
embedding_matrix.shape

(29131, 100)

In [27]:
model=Sequential()

In [28]:
model.add(Embedding(vocab_length,100,weights=[embedding_matrix], input_length=maxlen))
model.add(Conv1D(64,5,padding ="same",activation="relu"))
model.add(MaxPooling1D())
model.add(Dropout(0.2))
model.add(Conv1D(128,5,padding="same",activation="relu"))
#model.add(MultiHeadAttention(key_dim=128, value_dim=128, num_heads=1,value=10))
model.add(MaxPooling1D())
model.add(Dropout(0.2))
#model.add(Bidirectional(LSTM(20)))
model.add(Bidirectional(LSTM(64)))
#model.add(Flatten())
# model.add(Cropping3D(cropping=((1, 1), (1, 1), (1, 1))))
# model.add(UpSampling3D(size=(2, 2, 2)))
# model.add(AdditiveAttention()) 
#model.add(MultiHeadAttention(num_heads=2, key_dim=128, value=128))
# model.add(Flatten())

model.add(Dense(units=4,activation="sigmoid"))

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          2913100   
                                                                 
 conv1d (Conv1D)             (None, 100, 64)           32064     
                                                                 
 max_pooling1d (MaxPooling1  (None, 50, 64)            0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 50, 64)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 128)           41088     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 25, 128)           0         
 g1D)                                                   

In [30]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1a36bccc7f0>

In [44]:
y_test_pred=model.predict(x_test)
model.evaluate(x_test,y_test) 



[0.08856090158224106, 0.9609609842300415]

In [45]:
y_test_pred

array([[1.4531445e-05, 9.9995512e-01, 1.7946973e-05, 3.3229655e-05],
       [1.1366004e-05, 2.2129741e-05, 9.9997234e-01, 5.8337064e-06],
       [1.9460191e-05, 3.8436567e-05, 9.9993527e-01, 1.1972564e-05],
       ...,
       [9.9993145e-01, 3.8174359e-05, 2.5950721e-05, 4.8653212e-05],
       [9.9992901e-01, 4.6977199e-05, 2.9451974e-05, 5.4563512e-05],
       [6.6681505e-06, 9.9998879e-01, 6.2872941e-06, 9.0375979e-06]],
      dtype=float32)

In [46]:
input_array=y_test_pred
binary_array = np.zeros_like(input_array)
max_indices = np.argmax(input_array, axis=1)
binary_array[np.arange(len(input_array)), max_indices] = 1
y_test_pred=binary_array
print(binary_array)

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]


In [47]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       277
           1       0.95      0.96      0.96       285
           2       0.96      0.97      0.96       266
           3       0.98      0.95      0.97       171

   micro avg       0.96      0.96      0.96       999
   macro avg       0.96      0.96      0.96       999
weighted avg       0.96      0.96      0.96       999
 samples avg       0.96      0.96      0.96       999

