In [6]:
!pip install kaggle



In [7]:
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification


Dataset URL: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification
License(s): copyright-authors
Downloading covid-19-nlp-text-classification.zip to /content
  0% 0.00/4.38M [00:00<?, ?B/s]
100% 4.38M/4.38M [00:00<00:00, 146MB/s]


In [8]:
!unzip covid-19-nlp-text-classification.zip


Archive:  covid-19-nlp-text-classification.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


In [74]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN, LSTM, GRU, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [2]:
import plotly.express as px

In [3]:
tf.random.set_seed(42)
np.random.seed(42)

In [11]:
df = pd.read_csv('Corona_NLP_train.csv', encoding='latin-1')

In [13]:
df.sample(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
7060,10859,55811,9893 Bring The Heat Boulevard,19-03-2020,On a positive note Covid 19 got them gas price...,Positive
18282,22081,67033,"Abuja & Makurdi, Nigeria",23-03-2020,The real challenge with isolation in Nigeria a...,Extremely Negative
38538,42337,87289,Ireland,11-04-2020,Today's poll: Are you doing more online shoppi...,Extremely Negative
32265,36064,81016,Global,07-04-2020,In Our latest response to COVID-19: 6 April vi...,Extremely Positive
20138,23937,68889,,24-03-2020,"Our sister company, @LRWonline, asked over 44,...",Neutral
21124,24923,69875,,25-03-2020,I was at the grocery store today and I heard s...,Positive
21109,24908,69860,Los Angeles,25-03-2020,Nice! Gas prices drop under $2 at this Jurupa ...,Negative
33120,36919,81871,,07-04-2020,@KagutaMuseveni I wish commend and appreciate ...,Extremely Positive
18327,22126,67078,,23-03-2020,Mum works in the most popular supermarket in t...,Positive
39539,43338,88290,YouTube: EG EdmGamer,13-04-2020,Japan s favorite emergency food gains new fans...,Positive


In [14]:
df = df[['OriginalTweet', 'Sentiment']]

In [15]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [16]:
df['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [20]:
fig = px.histogram(df['Sentiment'], x='Sentiment')
fig.show()

In [38]:
df['target'] = df['Sentiment'].astype('category').cat.codes

In [39]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment,target
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,3
1,advice Talk to your neighbours family to excha...,Positive,4
2,Coronavirus Australia: Woolworths to give elde...,Positive,4
3,My food stock is not the only one which is emp...,Positive,4
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,0


In [41]:
label_mapping = dict(enumerate(df['Sentiment'].astype('category').cat.categories))
label_mapping

{0: 'Extremely Negative',
 1: 'Extremely Positive',
 2: 'Negative',
 3: 'Neutral',
 4: 'Positive'}

In [42]:
df_train, df_test = train_test_split(df, test_size=0.3)

In [43]:
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(df_train['OriginalTweet'])

In [44]:
Xtrain = tokenizer.texts_to_sequences(df_train['OriginalTweet'])
Xtest = tokenizer.texts_to_sequences(df_test['OriginalTweet'])

In [45]:
Xtrain = pad_sequences(Xtrain)
dim = Xtrain.shape[1]

In [46]:
dim

68

In [47]:
Xtest = pad_sequences(Xtest, maxlen=dim)

In [48]:
Xtrain.shape

(28809, 68)

In [49]:
Xtest.shape

(12348, 68)

In [50]:
word2idx = tokenizer.word_index
V = len(word2idx)
V = V + 1
V

66576

In [85]:
input = Input(shape=(dim,))
x = Embedding(V, 50)(input)
x = SimpleRNN(100, return_sequences=True)(x)
# x = SimpleRNN(100, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
output = Dense(5, activation = 'softmax')(x)
model = Model(input, output, name='SimpleRNN')

In [86]:
model.summary()

Model: "SimpleRNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 68)]              0         
                                                                 
 embedding_6 (Embedding)     (None, 68, 50)            3328800   
                                                                 
 simple_rnn_12 (SimpleRNN)   (None, 68, 100)           15100     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 100)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_5 (Dense)             (None, 5)                 505       
                                                                 
Total params: 3344405 (12.76 MB)
Trainable params: 3344405 (12.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________

In [87]:
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = Adam(learning_rate = 0.01),
    metrics = ['accuracy']
)

In [88]:
result = model.fit(
    Xtrain, df_train['target'],
    validation_data = (Xtest, df_test['target']),
    epochs = 20,
    batch_size = 128
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [91]:
fig = px.line(result.history, y=['loss', 'val_loss'], labels={'x': 'Epoch', 'value': 'Loss'})
fig.show()

In [92]:
fig = px.line(result.history, y=['accuracy', 'val_accuracy'], labels={'x': 'Epoch', 'value': 'Accuracy'})
fig.show()

In [93]:
input = Input(shape = (dim,))
x = Embedding(V, 50)(input)
x = LSTM(100, return_sequences=True)(x)
# x = LSTM(100, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
output = Dense(5, activation = 'softmax')(x)
model = Model(input, output, name='LSTM')

In [94]:
model.summary()

Model: "LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 68)]              0         
                                                                 
 embedding_7 (Embedding)     (None, 68, 50)            3328800   
                                                                 
 lstm (LSTM)                 (None, 68, 100)           60400     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 100)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_6 (Dense)             (None, 5)                 505       
                                                                 
Total params: 3389705 (12.93 MB)
Trainable params: 3389705 (12.93 MB)
Non-trainable params: 0 (0.00 Byte)
______________________

In [97]:
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = Adam(learning_rate = 0.01),
    metrics = ['accuracy']
)

In [98]:
result_lstm = model.fit(
    Xtrain, df_train['target'],
    validation_data = (Xtest, df_test['target']),
    epochs = 20,
    batch_size = 128
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [105]:
fig = px.line(result_lstm.history, y=['loss', 'val_loss'], labels={'x': 'Epoch', 'value': 'Loss'})
fig.show()

In [100]:
fig = px.line(result_lstm.history, y=['accuracy', 'val_accuracy'], labels={'x': 'Epoch', 'value': 'Accuracy'})
fig.show()

In [101]:
input = Input(shape = (dim,))
x = Embedding(V, 50)(input)
x = GRU(100, return_sequences=True)(x)
# x = LSTM(100, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
output = Dense(5, activation = 'softmax')(x)
model = Model(input, output, name='LSTM')

In [102]:
model.summary()

Model: "LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 68)]              0         
                                                                 
 embedding_8 (Embedding)     (None, 68, 50)            3328800   
                                                                 
 gru (GRU)                   (None, 68, 100)           45600     
                                                                 
 global_max_pooling1d_3 (Gl  (None, 100)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_7 (Dense)             (None, 5)                 505       
                                                                 
Total params: 3374905 (12.87 MB)
Trainable params: 3374905 (12.87 MB)
Non-trainable params: 0 (0.00 Byte)
______________________

In [103]:
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = Adam(learning_rate = 0.01),
    metrics = ['accuracy']
)

In [106]:
result_gru = model.fit(
    Xtrain, df_train['target'],
    validation_data = (Xtest, df_test['target']),
    epochs = 20,
    batch_size = 128
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [107]:
fig = px.line(result_gru.history, y=['loss', 'val_loss'], labels={'x': 'Epoch', 'value': 'Loss'})
fig.show()

In [108]:
fig = px.line(result_gru.history, y=['accuracy', 'val_accuracy'], labels={'x': 'Epoch', 'value': 'Accuracy'})
fig.show()