# Problem: RNN Text Classification

In [3]:
# Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, GRU
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

### Tweet_emotions Dataset

- X contains 40000 sentences (strings)
- Y contains string labels in 13 classes

Dataset link:

https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text?resource=download


In [4]:
# Read csv file and split data
data_frame = pd.read_csv("/content/drive/MyDrive/dataset/tweet_emotions.csv")

data_train = data_frame[:30000]
data_test = data_frame[30000:]

# Get all classes
all_classes = data_train.sentiment.unique().tolist()
print(all_classes)

# Get max length of sentences
max_len = len(max(np.array(data_frame["content"]), key=len).split(" "))
max_len

['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger']


27

In [8]:
# Convert labels to number
le = LabelEncoder()

Y_train = le.fit_transform(data_train["sentiment"])
Y_test = le.fit_transform(data_test["sentiment"])
Y_train, len(Y_train), Y_test, len(Y_test)

(array([ 2, 10, 10, ...,  5,  4,  5]),
 30000,
 array([ 8, 12,  8, ...,  7,  5,  7]),
 10000)

In [9]:
# Get data from dataframe
X_train = data_train["content"]
X_test = data_test["content"]

In [12]:
# Replace labels with related emoji
def label_to_emoji(label):
    emojies = ["ü•î", "üòû", "üòÑ", "üôÇ", "üòß", "ü§©", "‚ù§Ô∏è", "üòÇ", "ü§¢", "üòÜ", "üòì", "üòå", "üò°"]
    return emojies[label]

index = 200
print(X_train[index], label_to_emoji(Y_train[index]))

@AlexanderGWhite daaammmnnnnn I do wish I was there. üòì


## Emojifier-V1

Each word has some feature, and in Emojifier-V1 we want to classify sentences using multilayer perceptron:

- We get the average of words in each sentence and then forward it to the multilayer perceptron with 50 input neurons(each word has 50 features, then the average of words in the sentence has 50 features) and an output layer of softmax with 5 neurons.

- For feature vectors, we can get from this link: http://nlp.stanford.edu/data/glove.6B.zip

<br>

<center>
<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/image_1.png?raw=1" style="width:900px;height:300px;">
</center>


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Convert labels to one hot
num_classes = len(np.unique(Y_train))

Y_train_oh = tf.keras.utils.to_categorical(Y_train, num_classes)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, num_classes)
num_classes

13

In [15]:
index = 5
print(Y_train[index], "is converted into one hot", Y_train_oh[index])

12 is converted into one hot [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [16]:
# Download feature vectors and extract
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glov.6B

--2022-11-10 20:16:26--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-10 20:16:27--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-10 20:16:27--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‚Äòglove.6B.zip‚Äô



In [17]:
# Read feature vectors and save them
"""
In the text file, in each line,
the word comes first, and then the feature vectors(each word is in one line).
"""
def read_glov_vectors(glove_file):
  f = open(glove_file, encoding="utf8")
  words = set()
  words_to_vec = dict()
  for line in f:
    line = line.strip().split()
    word = line[0]
    vec = line[1:]
    words.add(word)
    words_to_vec[word] = np.array(vec, dtype=np.float64)
  return words_to_vec

In [18]:
words_to_vec = read_glov_vectors("/content/glov.6B/glove.6B.50d.txt")

# Test the output of read_glov_vectors function
words_to_vec["hello"]

array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ])

In [19]:
#  Convert sentences to the average of the word vectors
def sentence_to_avg(sentence):
  words = sentence.lower().split() # Convert uppercase to lowercase
  sum_vectors = np.zeros((50, ))
  for w in words:
    # if w.startswith('@'):
    #   continue
    # else
    try:
      sum_vectors += words_to_vec[w]
    except:
      pass
  avg_vectors = sum_vectors / len(words)
  return avg_vectors

In [20]:
# Test sentence_to_avg function
sentence_to_avg("Pasta is my favorite food")

array([ 0.242832  ,  0.370774  , -0.524396  ,  0.018644  ,  0.568756  ,
        0.0219878 , -0.48206322, -0.152204  ,  0.235412  ,  0.1979466 ,
       -0.178818  ,  0.3203976 ,  0.3379962 ,  0.1399654 ,  0.56775044,
        0.118648  , -0.04531252,  0.335416  ,  0.149832  , -0.522814  ,
        0.095746  , -0.0468764 ,  0.5508066 ,  0.39369132,  0.275182  ,
       -1.275018  , -0.76076   ,  0.449102  ,  0.7542772 , -0.2332608 ,
        2.82554   ,  0.287742  , -0.325976  ,  0.608572  , -0.020543  ,
        0.286476  , -0.24984   ,  0.899408  ,  0.38995   , -0.270266  ,
        0.3004734 ,  0.315962  , -0.2408782 ,  0.1586226 ,  0.5400462 ,
        0.412066  , -0.1657008 , -0.253566  ,  0.3091806 ,  0.371192  ])

In [21]:
# Get the average of all sentences
X_train_avg = []
for i in range(X_train.shape[0]):
  X_train_avg.append(sentence_to_avg(X_train[i].strip()))

X_train_avg = np.array(X_train_avg)

X_train_avg.shape, Y_train_oh.shape

((30000, 50), (30000, 13))

In [22]:
# Create model(using perceptron)
class EmojiNet_V1(Model):
    def __init__(self):
        super().__init__()
        self.dense = Dense(num_classes, input_shape=(50,), activation='softmax')

    def call(self, x):
        x = self.dense(x)
        return x

In [23]:
# Compile and fit the model
model = EmojiNet_V1()

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train_avg, Y_train_oh, batch_size=64, epochs=300, shuffle=True)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7fb27058b990>

In [24]:
# Evaluation
X_test_avg = []
for i in range(X_test.shape[0]):
    X_test_avg.append(sentence_to_avg(X_test[i+30000].strip()))

X_test_avg = np.array(X_test_avg)
model.evaluate(X_test_avg, Y_test_oh)



[2.1332316398620605, 0.24240000545978546]

In [25]:
# Inference
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])
X_me_avg = []

for x in X_me:
    X_me_avg.append(sentence_to_avg(x))

X_me_avg = np.array(X_me_avg)
pred = model.predict(X_me_avg)

for i in range(X_me.shape[0]):
    print(X_me[i], label_to_emoji(np.argmax(pred[i])))

not sad üòì
i adore you üòÇ
i love you üòÇ
funny lol üòß
lets play with a ball ü§¢
food is ready ü§¢
not feeling happy and funny üòÇ


## Emojifier-V2: Using RNNs: 

Let's build an LSTM model that takes as input word sequences. This model will be able to take word ordering into account. Emojifier-V2 will continue to use pre-trained word embeddings to represent words, but will feed them into an LSTM, whose job it is to predict the most appropriate emoji. 

Run the following cell to load the Keras packages.

<br>

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/emojifier-v2.png?raw=1" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/embedding1.png?raw=1" style="width:700px;height:250px;">
<caption><center> **Figure 4**: Embedding layer. This example shows the propagation of two examples through the embedding layer. Both have been zero-padded to a length of `max_len=5`. The final dimension of the representation is  `(2,max_len,50)` because the word embeddings we are using are 50 dimensional. </center></caption>

In [29]:
# Define model
class EmojiNet_V2(Model):
    def __init__(self):
        super().__init__()
        
        self.lstm_1 = GRU(128, return_sequences=True)
        self.dropout_1 = Dropout(0.3)
        self.lstm_2 = GRU(256)
        self.dropout_2 = Dropout(0.5)
        self.dense = Dense(num_classes, activation='softmax')

    def call(self, x):
        x = self.lstm_1(x)
        # x = self.dropout_1(x)
        x = self.lstm_2(x)
        x = self.dropout_2(x)
        x = self.dense(x)
        return x

In [30]:
# Compile model
model = EmojiNet_V2()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [41]:
# Fix the size of all sentences to max_len
def convert_sentences_to_embeddings(X):
    emb_dim = words_to_vec["cucumber"].shape[0]  # define dimensionality of your GloVe word vectors (= 50)
    emb_matrix = np.zeros((X.shape[0], max_len, emb_dim))
    for i in range(X.shape[0]):
        words = X[i].lower().split()
        for j in range(len(words)):
          try:
            emb_matrix[i, j, :] = words_to_vec[words[j]]
          except:
            pass
    return emb_matrix

In [32]:
# Test convert_sentences_to_embeddings function
X_me = np.array(["funny lol", "lets play baseball", "food is ready for you"])
print(X_me)
print(convert_sentences_to_embeddings(X_me))

['funny lol' 'lets play baseball' 'food is ready for you']
[[[-0.014547 -0.20208  -0.75278  ... -0.13429   0.21133   1.5368  ]
  [-0.54289   0.053743 -0.46978  ...  0.20745  -0.074958  0.080575]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.30423  -0.24405   1.0303   ... -0.43296  -0.096168  0.43463 ]
  [-0.73571   0.19937  -0.89408  ... -0.075279 -0.44448   0.47437 ]
  [-1.9327    1.0421   -0.78515  ...  0.55667  -0.70315   0.17157 ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.47222  -0.44545  -0.51833  ...  0.34932   0.33934   0.25499 ]
  [ 0.6185    0.64254  -0.46552  ... -0.2

In [33]:
# Run convert_sentences_to_embeddings function for training data 
X_train_embs =convert_sentences_to_embeddings(X_train)
X_train_embs.shape

(30000, 27, 50)

In [None]:
model.fit(X_train_embs, Y_train_oh, epochs=100, batch_size=64, shuffle=True)

In [38]:
# Fix the size of all sentences to max_len
def convert_sentences_to_embeddings(X):
    emb_dim = words_to_vec["cucumber"].shape[0]  # define dimensionality of your GloVe word vectors (= 50)
    emb_matrix = np.zeros((X.shape[0], max_len, emb_dim))
    for i in range(X.shape[0]):
        words = X[i+30000].lower().split()
        for j in range(len(words)):
          try:
            emb_matrix[i, j, :] = words_to_vec[words[j]]
          except:
            pass
    return emb_matrix

In [39]:
# Evaluation
X_test_embs = convert_sentences_to_embeddings(X_test)
print(X_test_embs.shape)
model.evaluate(X_test_embs, Y_test_oh)

(10000, 27, 50)


[2.365351915359497, 0.29420000314712524]

In [None]:
# Inference
X_me = np.array(["not happy", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])
X_me_embed = convert_sentences_to_embeddings(X_me) 

pred = model.predict(X_me_embed)

for i in range(X_me.shape[0]):
    print(X_me[i], label_to_emoji(np.argmax(pred[i])))