# Loading data

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.layers as layer
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords

df_train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding = 'latin')
df_test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding = 'latin')
x = df_train['OriginalTweet']
y = df_train['Sentiment']

#Cleaning the texts and getting rid of unclear symbols or characters
#This function is from: https://www.kaggle.com/code/shahraizanwar/covid19-tweets-sentiment-prediction-rnn-85-acc
import re
def text_cleaner(tweet):   
    # remove urls
    tweet = re.sub(r'http\S+', ' ', tweet)  
    # remove html tags
    tweet = re.sub(r'<.*?>',' ', tweet)
    # remove digits
    tweet = re.sub(r'\d+',' ', tweet)    
    # remove hashtags
    tweet = re.sub(r'#\w+',' ', tweet)    
    # remove mentions
    tweet = re.sub(r'@\w+',' ', tweet)    
    #removing stop words
    tweet = tweet.split()
    tweet = " ".join([word for word in tweet if not word in stop_words])   
    return tweet

stop_words = stopwords.words('english')

X_c = x.apply(text_cleaner)
X_c.head()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_c)

X = tokenizer.texts_to_sequences(X_c)
vocab_size = len(tokenizer.word_index)+1

print("Vocabulary size: {}".format(vocab_size))
X = pad_sequences(X, padding='post')


sentiments = {'Extremely Negative': 0,
            'Negative': 0,
            'Neutral': 1,
            'Positive':2,
            'Extremely Positive': 2
           }
y = y.map(sentiments)
labels = ['Negative', 'Neutral', 'Positive']

X_test = df_test['OriginalTweet'].copy()
y_test = df_test['Sentiment'].copy()

X_test = X_test.apply(text_cleaner)
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, padding='post', maxlen = X.shape[1])

y_test = y_test.map(sentiments)
print(len(df_train))
print(len(df_test))



/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv
Vocabulary size: 36117
41157
3798


# Model training

In [2]:
def training(model, eps, bs, model_name):

    callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

    model.compile(loss=SparseCategoricalCrossentropy(from_logits = False),
              optimizer='adam',metrics=['accuracy'])
            
    history = model.fit(X, y, epochs=eps, 
                        validation_split=0.15, batch_size=bs,
                        callbacks = [callback])
    
    return model
def perdicting(model):
    pred = np.argmax(np.round(model.predict(X_test)),axis = 1)
    cr = classification_report(y_true=y_test, y_pred=pred, digits=4)
    print(cr)
    
    
def generic_model(model_layer,units, epochs, bs):
    EPOCHS = epochs
    BATCH_SIZE = bs
    embedding_dim = 16
    model= tf.keras.Sequential([
        layer.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]),
        model_layer(units,return_sequences = True),
        layer.GlobalMaxPooling1D(),
        layer.Dense(3,activation = 'softmax')
    ])
    model.summary()
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
    model.compile(loss=SparseCategoricalCrossentropy(from_logits = False),
              optimizer='adam',metrics=['accuracy'])   
    history = model.fit(X, y, epochs=EPOCHS, 
                        validation_split=0.15, batch_size=BATCH_SIZE,
                        callbacks = [callback])
    hist_model = pd.DataFrame(history.history)    
    return model

# CNN model

In [3]:
embedding_dim = 16
#Input size is 54x1
i= layer.Input(shape=(X.shape[1],),)
x = layer.Embedding(vocab_size, embedding_dim, input_length=X.shape[1])(i)

#Output of this next layer is (54-3+1,32) = (52,32)
x = layer.Conv1D(32, 3, activation='relu')(x)
#Output of this next layer is (52/2, 32) = (26,32)
x = layer.MaxPooling1D(2)(x)

#Output of this next layer is (26-3+1, 64) = (24,64)
x = layer.Conv1D(64, 3, activation='relu')(x)
#Output of this next layer is (24/2, 64) = (12,64)
x = layer.MaxPooling1D(2)(x)

#Output of this next layer is (12-3+1,128) = (10,128) 
x = layer.Conv1D(128, 3, activation='relu')(x)
#Output of this next layer is (10/2,128) = (5,128)
x = layer.MaxPooling1D(2)(x)

#Output of this next layer is (1,128) 
x = layer.GlobalMaxPooling1D()(x)
outputs = layer.Dense(3, activation='softmax')(x)
model = tf.keras.Model(i, outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 54)]              0         
                                                                 
 embedding (Embedding)       (None, 54, 16)            577872    
                                                                 
 conv1d (Conv1D)             (None, 52, 32)            1568      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 26, 32)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 24, 64)            6208      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 12, 64)           0         
 1D)                                                         

In [4]:
cnn = training(model, 5, 32, 'CNN')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [5]:
perdicting(cnn)

              precision    recall  f1-score   support

           0     0.8018    0.8573    0.8286      1633
           1     0.8190    0.7383    0.7766       619
           2     0.8467    0.8182    0.8322      1546

    accuracy                         0.8220      3798
   macro avg     0.8225    0.8046    0.8125      3798
weighted avg     0.8229    0.8220    0.8216      3798



# RNN model

In [6]:
model_RNN = generic_model(layer.SimpleRNN, 20, 5, 32)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 54, 16)            577872    
                                                                 
 simple_rnn (SimpleRNN)      (None, 54, 20)            740       
                                                                 
 global_max_pooling1d_1 (Glo  (None, 20)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_1 (Dense)             (None, 3)                 63        
                                                                 
Total params: 578,675
Trainable params: 578,675
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
perdicting(model_RNN)

              precision    recall  f1-score   support

           0     0.8037    0.8347    0.8189      1633
           1     0.8018    0.7060    0.7509       619
           2     0.8253    0.8312    0.8282      1546

    accuracy                         0.8123      3798
   macro avg     0.8103    0.7906    0.7993      3798
weighted avg     0.8122    0.8123    0.8116      3798



In [8]:
# LSTM model

In [9]:
model_LSTM = generic_model(layer.LSTM,15, 5, 32)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 54, 16)            577872    
                                                                 
 lstm (LSTM)                 (None, 54, 15)            1920      
                                                                 
 global_max_pooling1d_2 (Glo  (None, 15)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 3)                 48        
                                                                 
Total params: 579,840
Trainable params: 579,840
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
perdicting(model_LSTM)

              precision    recall  f1-score   support

           0     0.8117    0.8500    0.8304      1633
           1     0.8249    0.7076    0.7617       619
           2     0.8388    0.8448    0.8418      1546

    accuracy                         0.8246      3798
   macro avg     0.8251    0.8008    0.8113      3798
weighted avg     0.8249    0.8246    0.8238      3798



In [11]:
# GRU model

In [12]:
model_GRU = generic_model(layer.GRU,15, 5, 32)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 54, 16)            577872    
                                                                 
 gru (GRU)                   (None, 54, 15)            1485      
                                                                 
 global_max_pooling1d_3 (Glo  (None, 15)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_3 (Dense)             (None, 3)                 48        
                                                                 
Total params: 579,405
Trainable params: 579,405
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
perdicting(model_GRU)

              precision    recall  f1-score   support

           0     0.8137    0.8481    0.8306      1633
           1     0.8694    0.7205    0.7880       619
           2     0.8219    0.8415    0.8316      1546

    accuracy                         0.8246      3798
   macro avg     0.8350    0.8034    0.8167      3798
weighted avg     0.8261    0.8246    0.8240      3798

