# Text Classification

We used Kaggle data sets to experiement with text classification. The data set we've chosen classifies tweets based on their sentiment related to COVID-19 and the pandemic. Sentiment varies from extremely positive to extremely negative.

In [12]:
import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import datasets, layers, models
from sklearn.preprocessing import LabelEncoder

## Seeing what data we're working with

In [13]:
data_test = pd.read_csv('Kaggle_Data/Corona_NLP_test.csv', header=0, usecols=[4,5], encoding='latin-1')
data_train = pd.read_csv('Kaggle_Data/Corona_NLP_train.csv', header=0, usecols=[4,5], encoding='latin-1')[:16000]

data_test.groupby(['Sentiment'])['Sentiment'].count()
data_train.groupby(['Sentiment'])['Sentiment'].count()

Sentiment
Extremely Negative    2464
Extremely Positive    2423
Negative              4014
Neutral               2813
Positive              4286
Name: Sentiment, dtype: int64

## Vectorizing the tweets

In [14]:
# set up X and Y
num_labels = 5
vocab_size = 25000
batch_size = 100

# fit the tokenizer on the training data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data_train.OriginalTweet)

x_train = tokenizer.texts_to_matrix(data_train.OriginalTweet, mode='tfidf')
x_test = tokenizer.texts_to_matrix(data_test.OriginalTweet, mode='tfidf')

encoder = LabelEncoder()
encoder.fit(data_train.Sentiment)

y_train = encoder.transform(data_train.Sentiment)
y_test = encoder.transform(data_test.Sentiment)

# check shape
print("train shapes:", x_train.shape, y_train.shape)
print("test shapes:", x_test.shape, y_test.shape)
print("test first five labels:", y_test[:5])

from keras.utils.np_utils import to_categorical
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)

train shapes: (16000, 25000) (16000,)
test shapes: (3798, 25000) (3798,)
test first five labels: [0 4 1 2 3]


## Sequential Model

In [15]:
seq_model = models.Sequential()
seq_model.add(layers.Dense(24, input_dim=vocab_size, kernel_initializer='normal', activation='relu'))
seq_model.add(layers.Dropout(.35))
seq_model.add(layers.Dense(12, kernel_initializer='normal', activation='relu'))
seq_model.add(layers.Dropout(.35))
seq_model.add(layers.Dense(5, kernel_initializer='normal', activation='softmax'))


seq_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = seq_model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
score = seq_model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Accuracy: ', score[1])

Accuracy:  0.5250131487846375


## GRU (RNN) Architecture

We train a GRU and use one-hot encoding to vectorize the words of each tweet.

In [17]:
LSTM_VOCAB_SIZE = 500

maxlen = 500
batch_size = 32

from tensorflow import keras


encoder = LabelEncoder()
encoder.fit(data_train.Sentiment)
y_train = encoder.transform(data_train.Sentiment)
y_test = encoder.transform(data_test.Sentiment)

encoder = keras.layers.TextVectorization(max_tokens=LSTM_VOCAB_SIZE)
encoder.adapt(data_train.OriginalTweet)

x_train = encoder(data_train.OriginalTweet)
x_test = encoder(data_test.OriginalTweet)

x_train = tf.keras.utils.to_categorical(x_train, num_classes = LSTM_VOCAB_SIZE)
x_test = tf.keras.utils.to_categorical(x_test, num_classes = LSTM_VOCAB_SIZE)

from keras.utils.np_utils import to_categorical
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)

In [21]:
lstm = keras.Sequential()
lstm.add(layers.Bidirectional(layers.GRU(48, input_dim=LSTM_VOCAB_SIZE)))
lstm.add(layers.Dense(10, activation='relu'))
lstm.add(layers.Dense(5, activation='softmax'))

lstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = lstm.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
score = lstm.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Accuracy: ', score[1])

Accuracy:  0.4539231061935425


### Applying an embedding layer to our GRU model

In [25]:
LSTM_VOCAB_SIZE = 10000

maxlen = 500
batch_size = 32

from tensorflow import keras

encoder = LabelEncoder()
encoder.fit(data_train.Sentiment)
y_train = encoder.transform(data_train.Sentiment)
y_test = encoder.transform(data_test.Sentiment)

encoder = keras.layers.TextVectorization(max_tokens=LSTM_VOCAB_SIZE)
encoder.adapt(data_train.OriginalTweet)

x_train = encoder(data_train.OriginalTweet)
x_test = encoder(data_test.OriginalTweet)

from keras.utils.np_utils import to_categorical
y_train = to_categorical(y_train, num_classes=None)
y_test = to_categorical(y_test, num_classes=None)

In [26]:
lstm = keras.Sequential()
lstm.add(layers.Embedding(input_dim=LSTM_VOCAB_SIZE, output_dim=64))
lstm.add(layers.Bidirectional(layers.GRU(64, input_dim=LSTM_VOCAB_SIZE)))
lstm.add(layers.Dense(10, activation='relu'))
lstm.add(layers.Dense(5, activation='softmax'))

lstm.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = lstm.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
score = lstm.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Accuracy: ', score[1])

Accuracy:  0.622169554233551
