In [1]:
## imports and configuration
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn

%matplotlib inline
%load_ext autoreload
%autoreload 2
torch.set_printoptions(linewidth=200)

# Seed every RNG this notebook relies on (weight initialisation, DataLoader
# shuffling, dropout) so that "Restart & Run All" reproduces the same numbers.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 1 - Baseline model: Emojifier-V1

### 1.1 - Dataset EMOJISET

Let's start by building a simple baseline classifier. 

You have a tiny dataset (X, Y) where:
- X contains 132 sentences (strings)
- Y contains an integer label between 0 and 4 corresponding to an emoji for each sentence

<img src="images/data_set.png" style="width:700px;height:300px;">
<caption><center> **Figure 1**: EMOJISET - a classification problem with 5 classes. A few examples of sentences are given here. </center></caption>

Let's load the dataset using the code below. We split the dataset between training (132 examples) and testing (56 examples).

In [2]:
# Load the (sentence, label) pairs of the training and test splits.
# NOTE(review): 'tesss.csv' looks like a typo, but it may be the real name of
# the file shipped with the dataset — confirm before renaming.
X_train, Y_train = read_csv('data/train_emoji.csv')
X_test, Y_test = read_csv('data/tesss.csv')

In [3]:
# Largest number of words in any training sentence. Counting words per sentence
# (rather than splitting the sentence with the most characters) guarantees that
# this is the true maximum sequence length.
maxLen = max(len(sentence.split()) for sentence in X_train)

Run the following cell to print sentences from X_train and corresponding labels from Y_train. Change `index` to see different examples. Because of the font the iPython notebook uses, the heart emoji may be colored black rather than red.

In [4]:
# Show one training sentence next to the emoji its label maps to.
index = 1
print(X_train[index], label_to_emoji(Y_train[index]))

I am proud of your achievements 😄


### 1.2 - Overview of the Emojifier-V1

In this part, you are going to implement a baseline model called "Emojifier-v1".  

<center>
<img src="images/image_1.png" style="width:900px;height:300px;">
<caption><center> **Figure 2**: Baseline model (Emojifier-V1).</center></caption>
</center>

The input of the model is a string corresponding to a sentence (e.g. "I love you"). In the code, the output will be a probability vector of shape (1,5), that you then pass in an argmax layer to extract the index of the most likely emoji output.

To get our labels into a format suitable for training a softmax classifier, let's convert $Y$ from its current shape $(m, 1)$ into a "one-hot representation" $(m, 5)$, where each row is a one-hot vector giving the label of one example. You can do so using the next code snippet. Here, `Y_oh` stands for "Y-one-hot" in the variable names `Y_oh_train` and `Y_oh_test`:


In [5]:
# One-hot encode the integer labels of both splits (5 emoji classes).
Y_oh_train, Y_oh_test = (
    convert_to_one_hot(split_labels, C=5) for split_labels in (Y_train, Y_test)
)

In [6]:
# Compare one integer label with its one-hot encoded row.
index = 50
print(Y_train[index], "is converted into one hot", Y_oh_train[index])

0 is converted into one hot [1. 0. 0. 0. 0.]


All the data is now ready to be fed into the Emojifier-V1 model. Let's implement the model!

### 1.3 - Implementing Emojifier-V1

As shown in Figure (2), the first step is to convert an input sentence into the word vector representation, which then get averaged together. Similar to the previous exercise, we will use pretrained 50-dimensional GloVe embeddings. Run the following cell to load the `word_to_vec_map`, which contains all the vector representations.

In [7]:
# Load the pretrained 50-dimensional GloVe file: the two vocabulary lookup
# tables plus the word -> vector map used by every model below.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

You've loaded:
- `word_to_index`: dictionary mapping from words to their indices in the vocabulary (400,001 words, with the valid indices ranging from 0 to 400,000)
- `index_to_word`: dictionary mapping from indices to their corresponding words in the vocabulary
- `word_to_vec_map`: dictionary mapping words to their GloVe vector representation.

Run the following cell to check if it works.

In [8]:
# Sanity check of both lookup directions: word -> index and index -> word.
word = "cucumber"
index = 289846
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])

the index of cucumber in the vocabulary is 113317
the 289846th word in the vocabulary is potatos


**Exercise**: Implement `sentence_to_avg()`. You will need to carry out two steps:
1. Convert every sentence to lower-case, then split the sentence into a list of words. `X.lower()` and `X.split()` might be useful. 
2. For each word in the sentence, access its GloVe representation. Then, average all these values.

In [9]:
def sentence_to_avg(sentence, word_to_vec_map):
    """Average the word vectors of a sentence into one feature vector.

    The sentence is lower-cased and split on whitespace; every word is looked
    up in ``word_to_vec_map`` and the vectors are averaged element-wise.

    Args:
        sentence: the input string.
        word_to_vec_map: mapping from a lower-case word to its 1-D vector.

    Returns:
        A 1-D ``torch.float32`` tensor with one entry per embedding dimension.
        A sentence without any word yields the zero vector.

    Raises:
        KeyError: if a word of the sentence is missing from ``word_to_vec_map``.
        ValueError: if the sentence has no words and the map is empty, so the
            embedding dimension cannot be inferred.
    """
    words = sentence.lower().split()

    if not words:
        # torch.stack() rejects an empty list, so handle "no words" explicitly.
        # The embedding size is read off an arbitrary entry of the map.
        sample_vector = next(iter(word_to_vec_map.values()), None)
        if sample_vector is None:
            raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")
        return torch.zeros(len(sample_vector), dtype=torch.float32)

    # Shape (num_words, embedding_dim); averaging over dim 0 collapses the words.
    vectors = torch.stack(
        [torch.tensor(word_to_vec_map[word], dtype=torch.float32) for word in words],
        dim=0,
    )
    return vectors.mean(dim=0)

In [10]:
class Emo_Dataset(Dataset):
    """Dataset yielding (averaged sentence embedding, label) pairs.

    Each item is computed on access by passing the sentence at that position
    through ``sentence_to_avg`` together with the word-vector map given to the
    constructor.
    """

    def __init__(self, X, Y, word_to_vec_map):
        """Store the sentences ``X``, their labels ``Y`` and the word-vector map."""
        super().__init__()
        self.word_to_vec_map = word_to_vec_map
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        """Return ``(features, label)`` for the example at ``index``."""
        # Use the map owned by this dataset, not whatever global happens to be
        # called ``word_to_vec_map`` in the notebook namespace.
        x = sentence_to_avg(self.X[index], self.word_to_vec_map)
        y = self.Y[index]
        return x, y

    def __len__(self):
        """Number of examples; works for any sized container, not only arrays."""
        return len(self.X)

In [11]:
# Mini-batch loaders for the averaged-embedding baseline.
batch_size = 8
trn_ds = Emo_Dataset(X_train, Y_train, word_to_vec_map)
trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
test_ds = Emo_Dataset(X_test, Y_test, word_to_vec_map)
# Evaluation does not benefit from shuffling; a fixed order keeps the batches
# reproducible from run to run.
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

#### Model

You now have all the pieces to finish implementing the `model()` function.

**Exercise**: Implement the `model()` function described in Figure (2). Assuming here that $Yoh$ ("Y one hot") is the one-hot encoding of the output labels, the equations you need to implement in the forward pass and to compute the cross-entropy cost are:
$$ z^{(i)} = W . avg^{(i)} + b$$
$$ a^{(i)} = softmax(z^{(i)})$$
$$ \mathcal{L}^{(i)} = - \sum_{k = 0}^{n_y - 1} Yoh^{(i)}_k * log(a^{(i)}_k)$$

It is possible to come up with a more efficient vectorized implementation. But since we are using a for-loop to convert the sentences one at a time into the $avg^{(i)}$ representation anyway, let's not bother this time. 

In [12]:
# Baseline classifier: one linear layer from the 50-d averaged embedding to 5 class scores.
model = nn.Linear(in_features=50, out_features=5).to(device)
# Only the weight matrix is re-initialised here; the bias keeps nn.Linear's own default.
nn.init.xavier_uniform_(model.weight)  # Initialize parameters using Xavier initialization
loss_fn = nn.CrossEntropyLoss()  # already contains softmax in CrossEntropyLoss
# Plain SGD over all parameters of the linear layer.
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [13]:
def compute_accuracy(model, dl):
    """Return the fraction of examples in ``dl`` that ``model`` classifies correctly.

    Args:
        model: module mapping a batch of inputs to class scores of shape
            ``(batch, num_classes)``.
        dl: iterable of ``(inputs, labels)`` tensor batches.

    Returns:
        ``correct / total`` as a Python float.

    Raises:
        ZeroDivisionError: if ``dl`` yields no examples.

    The model is evaluated in eval mode without gradient tracking and is put
    back into whichever mode (train or eval) it was in before the call.
    """
    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    correct_preds = 0
    total_preds = 0
    try:
        with torch.no_grad():
            for x, y in dl:
                x, y = x.to(target_device), y.to(target_device)
                # argmax over raw scores equals argmax over softmax(scores),
                # so no softmax is needed to pick the predicted class.
                pred_cls = model(x).argmax(dim=1)
                correct_preds += (y == pred_cls).sum().item()
                total_preds += y.shape[0]
    finally:
        model.train(was_training)
    return correct_preds / total_preds

In [14]:
def train(model, loss_fn, optimizer, trn_dl, num_epochs=400, log_every=100):
    """Train ``model`` on ``trn_dl`` for ``num_epochs`` passes over the data.

    Args:
        model: module to optimise; it is left in train mode.
        loss_fn: callable ``loss_fn(scores, labels)`` returning a scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        trn_dl: iterable of ``(inputs, labels)`` tensor batches.
        num_epochs: number of passes over ``trn_dl``.
        log_every: print the loss of the epoch's last batch and the training
            accuracy every ``log_every`` epochs (starting with epoch 0).
            ``0`` or ``None`` disables logging.
    """
    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for e in range(num_epochs):
        loss = None  # stays None when trn_dl yields no batch
        for x, y in trn_dl:
            x, y = x.to(target_device), y.to(target_device)
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
        if log_every and e % log_every == 0 and loss is not None:
            # The printed "cost" is the loss of the last mini-batch of this epoch.
            print(f'Epoch: {e} --- cost = {loss.item()}')
            accuracy = compute_accuracy(model, trn_dl)
            print(f"Accuracy: {accuracy}")

In [15]:
# Train the baseline with the default settings of train() defined above.
train(model, loss_fn, optimizer, trn_dl)

Epoch: 0 --- cost = 2.171079158782959
Accuracy: 0.2727272727272727
Epoch: 100 --- cost = 1.025998592376709
Accuracy: 0.7272727272727273
Epoch: 200 --- cost = 0.655185341835022
Accuracy: 0.8333333333333334
Epoch: 300 --- cost = 0.6623440384864807
Accuracy: 0.8409090909090909


Great! Your model has pretty high accuracy on the training set. Let's now see how it does on the test set.

In [16]:
# Final accuracy of the trained baseline on both splits.
print("Training set accuracy:", compute_accuracy(model, trn_dl))
print("Test set accuracy:", compute_accuracy(model, test_dl))

Training set accuracy: 0.8863636363636364
Test set accuracy: 0.8571428571428571


Random guessing would have had 20% accuracy given that there are 5 classes. This is pretty good performance after training on only 132 examples. 

In the training set, the algorithm saw the sentence "*I love you*" with the label ❤️. You can check however that the word "adore" does not appear in the training set. Nonetheless, let's see what happens if you write "*I adore you*."



In [17]:
def test_custom_sentences(model, sentences, labels, word_to_vec_map):
    """Print the predicted emoji for each sentence and the overall accuracy.

    Args:
        model: classifier taking one averaged sentence embedding and returning
            one score per class.
        sentences: sequence of input strings.
        labels: sequence of integer class labels, one per sentence.
        word_to_vec_map: mapping from word to vector, passed to ``sentence_to_avg``.

    Raises:
        ValueError: if ``sentences`` is empty or its length differs from
            ``labels`` (otherwise ``zip`` would silently drop the surplus and
            the reported accuracy would be wrong).

    The model is evaluated in eval mode and put back into its previous mode.
    """
    num_examples = len(sentences)
    if num_examples == 0:
        raise ValueError("sentences must not be empty")
    if len(labels) != num_examples:
        raise ValueError(
            f"got {num_examples} sentences but {len(labels)} labels; they must match"
        )

    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    correct_preds = 0
    y = torch.tensor(labels).to(target_device)
    try:
        with torch.no_grad():
            for (sentence, label) in zip(sentences, y):
                x = sentence_to_avg(sentence, word_to_vec_map).to(target_device)
                # argmax of the raw scores is the same class softmax would pick.
                pred_cls = model(x).argmax()
                correct_preds += (label == pred_cls).item()
                print(sentence, label_to_emoji(pred_cls.item()))
    finally:
        model.train(was_training)

    print('\nAccuracy:', correct_preds/num_examples)

In [18]:
# Hand-written sentences and their expected class indices (same order).
# NOTE(review): the text above talks about "I adore you", which is not in this
# list — confirm which first sentence was intended.
X_my_sentences = ["you are a cute dog", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy"]
Y_my_labels = [0, 0, 2, 1, 4, 3]

In [19]:
# Predict an emoji for every hand-written sentence and print the accuracy.
test_custom_sentences(model, X_my_sentences, Y_my_labels, word_to_vec_map)

you are a cute dog ❤️
i love you ❤️
funny lol 😄
lets play with a ball ⚾
food is ready 🍴
not feeling happy 😄

Accuracy: 0.8333333333333334


## 2 - Emojifier-V2: Using LSTMs in PyTorch: 

Let's build an LSTM model that takes as input word sequences. This model will be able to take word ordering into account. Emojifier-V2 will continue to use pre-trained word embeddings to represent words, but will feed them into an LSTM, whose job it is to predict the most appropriate emoji. 

Run the following cell to load the PyTorch packages.

In [20]:
import torch.nn.functional as F

### 2.1 - Overview of the model

Here is the Emojifier-v2 you will implement:

<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>



### 2.2 PyTorch and mini-batching 

In this exercise, we want to train our model using mini-batches. However, most deep learning frameworks require that all sequences in the same mini-batch have the same length. This is what allows vectorization to work: If you had a 3-word sentence and a 4-word sentence, then the computations needed for them are different (one takes 3 steps of an LSTM, one takes 4 steps) so it's just not possible to do them both at the same time.

The common solution to this is to use padding. Specifically, set a maximum sequence length, and pad all sequences to the same length. For example, if the maximum sequence length is 20, we could pad every sentence with "0"s so that each input sentence is of length 20. Thus, a sentence "i love you" would be represented as $(e_{i}, e_{love}, e_{you}, \vec{0}, \vec{0}, \ldots, \vec{0})$. In this example, any sentences longer than 20 words would have to be truncated. One simple way to choose the maximum sequence length is to just pick the length of the longest sentence in the training set. 


In [21]:
def sentence_padding(sentence, word_to_vec_map, max_len):
    """Turn a sentence into a fixed-length sequence of word vectors.

    The sentence is lower-cased and split on whitespace. At most ``max_len``
    words are kept; shorter sentences are padded at the end with zero vectors.

    Args:
        sentence: the input string.
        word_to_vec_map: mapping from a lower-case word to its 1-D vector.
        max_len: number of time steps in the returned sequence (>= 0).

    Returns:
        A ``torch.float32`` tensor of shape ``(max_len, embedding_dim)``.

    Raises:
        KeyError: if one of the kept words is missing from ``word_to_vec_map``.
        ValueError: if ``max_len`` is negative, or if the sentence has no words
            and the map is empty so the embedding dimension cannot be inferred.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Truncate first so that words beyond max_len are never looked up.
    words = sentence.lower().split()[:max_len]
    vectors = [torch.tensor(word_to_vec_map[word], dtype=torch.float32) for word in words]

    # Take the embedding size from the data instead of hard-coding it.
    if vectors:
        embedding_dim = vectors[0].shape[0]
    else:
        sample_vector = next(iter(word_to_vec_map.values()), None)
        if sample_vector is None:
            raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")
        embedding_dim = len(sample_vector)

    # Start from all-zero padding and overwrite the leading rows with the words.
    padded = torch.zeros(max_len, embedding_dim, dtype=torch.float32)
    if vectors:
        padded[:len(vectors)] = torch.stack(vectors, dim=0)

    return padded

In [22]:
class Emo_Dataset_V2(Dataset):
    """Dataset yielding (padded word-vector sequence, label) pairs.

    Each item is computed on access by passing the sentence at that position
    through ``sentence_padding`` with the word-vector map and ``max_len``
    given to the constructor.
    """

    def __init__(self, X, Y, word_to_vec_map, max_len):
        """Store the sentences ``X``, labels ``Y``, word-vector map and sequence length."""
        super().__init__()
        self.word_to_vec_map = word_to_vec_map
        self.X = X
        self.Y = Y
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(sequence, label)`` for the example at ``index``."""
        # Use the map owned by this dataset, not whatever global happens to be
        # called ``word_to_vec_map`` in the notebook namespace.
        x = sentence_padding(self.X[index], self.word_to_vec_map, self.max_len)
        y = self.Y[index]
        return x, y

    def __len__(self):
        """Number of examples; works for any sized container, not only arrays."""
        return len(self.X)

### 2.3 Building the Emojifier-V2

Let's now build the Emojifier-V2 model. You will do so by feeding the padded sequences of pre-trained word embeddings you have built into an LSTM network. 

<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-v2. A 2-layer LSTM sequence classifier. </center></caption>


**Exercise:** Implement `LSTMModel`, a PyTorch `nn.Module` with the architecture shown in Figure 3. The model takes as input a batch of padded sentences of shape (`m`, `max_len`, `embedding_dim`). It should output class scores of shape (`m`, `C = 5`); the softmax that turns them into probabilities is applied inside `nn.CrossEntropyLoss`, so the model does not need its own. You may need [nn.LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html), [nn.Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html), and [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html).

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMModel(nn.Module):
    """Stacked-LSTM sequence classifier: LSTM -> dropout -> linear layer.

    Args:
        embedding_dim: size of each input word vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of classes (size of the returned score vector).
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used between LSTM layers and before the
            final linear layer. Defaults to 0.5.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers, dropout=0.5):
        super().__init__()

        # nn.LSTM only applies dropout *between* layers and warns when a
        # non-zero value is passed for a single-layer network, so disable it there.
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0.0)

        # Dropout applied to the LSTM output that feeds the classifier
        self.dropout = nn.Dropout(dropout)

        # Final fully-connected layer producing one score per class
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Xavier-uniform for the input-hidden and hidden-hidden weight
        # matrices, zeros for every LSTM bias.
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name or 'weight_hh' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)

        # Same scheme for the fully connected layer
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        """Return class scores of shape ``[batch_size, output_dim]``.

        ``x`` has shape ``[batch_size, seq_len, embedding_dim]``. No softmax is
        applied; ``nn.CrossEntropyLoss`` expects raw scores.
        """
        # lstm_out: [batch_size, seq_len, hidden_dim]; the final states are unused.
        lstm_out, _ = self.lstm(x)

        # Keep only the output at the last position of the sequence, which
        # includes any padding positions the caller appended.
        last_timestep_out = lstm_out[:, -1, :]

        out = self.dropout(last_timestep_out)
        return self.fc(out)

In [24]:
# Parameters
# NOTE: `loss_fn`, `model` and `optimizer` rebind the names used by the
# baseline in section 1; from this cell on they refer to the LSTM model.
embedding_dim = 50  # Dimension of word embeddings
hidden_dim = 128    # LSTM output size
output_dim = 5      # Final output size (number of classes)
num_layers = 2      # Number of LSTM layers
loss_fn = nn.CrossEntropyLoss()  # already contains softmax in CrossEntropyLoss

# Model instance
model = LSTMModel(embedding_dim, hidden_dim, output_dim, num_layers).to(device)
# Adam over all LSTM and linear-layer parameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
# Mini-batch loaders for the LSTM model. Sequences are padded/truncated to
# `maxLen`, the longest training sentence computed earlier, instead of a
# hard-coded length.
batch_size = 32
trn_ds = Emo_Dataset_V2(X_train, Y_train, word_to_vec_map, maxLen)
trn_dl = DataLoader(trn_ds, batch_size=batch_size, shuffle=True)
test_ds = Emo_Dataset_V2(X_test, Y_test, word_to_vec_map, maxLen)
# Evaluation does not benefit from shuffling; a fixed order keeps the batches
# reproducible from run to run.
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [26]:
def compute_accuracy(model, dl):
    """Return the fraction of examples in ``dl`` that ``model`` classifies correctly.

    Args:
        model: module mapping a batch of inputs to class scores of shape
            ``(batch, num_classes)``.
        dl: iterable of ``(inputs, labels)`` tensor batches.

    Returns:
        ``correct / total`` as a Python float.

    Raises:
        ZeroDivisionError: if ``dl`` yields no examples.

    The model is evaluated in eval mode (dropout disabled) without gradient
    tracking and is put back into whichever mode it was in before the call.
    """
    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    correct_preds = 0
    total_preds = 0
    try:
        with torch.no_grad():
            for x, y in dl:
                x, y = x.to(target_device), y.to(target_device)
                # argmax over raw scores equals argmax over softmax(scores),
                # so no softmax is needed to pick the predicted class.
                pred_cls = model(x).argmax(dim=1)
                correct_preds += (y == pred_cls).sum().item()
                total_preds += y.shape[0]
    finally:
        model.train(was_training)
    return correct_preds / total_preds

In [27]:
def train(model, loss_fn, optimizer, trn_dl, num_epochs=50, log_every=20):
    """Train ``model`` on ``trn_dl`` for ``num_epochs`` passes over the data.

    Args:
        model: module to optimise; it is left in train mode.
        loss_fn: callable ``loss_fn(scores, labels)`` returning a scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        trn_dl: iterable of ``(inputs, labels)`` tensor batches.
        num_epochs: number of passes over ``trn_dl``.
        log_every: print the loss of the epoch's last batch and the training
            accuracy every ``log_every`` epochs (starting with epoch 0).
            ``0`` or ``None`` disables logging.
    """
    # Run on the device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for e in range(num_epochs):
        loss = None  # stays None when trn_dl yields no batch
        for x, y in trn_dl:
            x, y = x.to(target_device), y.to(target_device)
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
        if log_every and e % log_every == 0 and loss is not None:
            # The printed "cost" is the loss of the last mini-batch of this epoch.
            print(f'Epoch: {e} --- cost = {loss.item()}')
            accuracy = compute_accuracy(model, trn_dl)
            print(f"Accuracy: {accuracy}")

In [28]:
# Train the LSTM model with the default settings of the train() redefined above.
train(model, loss_fn, optimizer, trn_dl)

Epoch: 0 --- cost = 1.5565693378448486
Accuracy: 0.3333333333333333
Epoch: 20 --- cost = 0.4079158306121826
Accuracy: 0.8560606060606061
Epoch: 40 --- cost = 0.04211046174168587
Accuracy: 0.946969696969697


In [29]:
# Final accuracy of the trained LSTM model on both splits.
print("Training set accuracy:", compute_accuracy(model, trn_dl))
print("Test set accuracy:", compute_accuracy(model, test_dl))

Training set accuracy: 0.9924242424242424
Test set accuracy: 0.8392857142857143
