In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q emoji

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m286.7/433.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### emo_utils.py

In [2]:
import csv
import numpy as np
import emoji
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def read_csv(filename = 'data/emojify_data.csv'):
    phrase = []
    emoji = []

    with open (filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)

        for row in csvReader:
            phrase.append(row[0])
            emoji.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(emoji, dtype=int)

    return X, Y

def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y


emoji_dictionary = {#"0": ":red_heart:",    # :heart: prints a black instead of red heart depending on the font
                    "0": "\u2764\ufe0f",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji code (string) ready to be printed
    """
    return emoji.emojize(emoji_dictionary[str(label)])


def print_predictions(X, pred):
    print()
    for i in range(X.shape[0]):
        print(X[i], label_to_emoji(int(pred[i])))


def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):

    df_confusion = pd.crosstab(y_actu, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)

    df_conf_norm = df_confusion / df_confusion.sum(axis=1)

    plt.matshow(df_confusion, cmap=cmap) # imshow
    #plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(df_confusion.columns))
    plt.xticks(tick_marks, df_confusion.columns, rotation=45)
    plt.yticks(tick_marks, df_confusion.index)
    #plt.tight_layout()
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)



def predict(X, Y, W, b, word_to_vec_map):
    """
    Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.

    Arguments:
    X -- input data containing sentences, numpy array of shape (m, None)
    Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)

    Returns:
    pred -- numpy array of shape (m, 1) with your predictions
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    any_word = list(word_to_vec_map.keys())[0]
    # number of classes
    n_h = word_to_vec_map[any_word].shape[0]

    for j in range(m):                       # Loop over training examples

        # Split jth test example (sentence) into list of lower case words
        words = X[j].lower().split()

        # Average words' vectors
        avg = np.zeros((n_h,))
        count = 0
        for w in words:
            if w in word_to_vec_map:
                avg += word_to_vec_map[w]
                count += 1

        if count > 0:
            avg = avg / count

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)

    print("Accuracy: "  + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))

    return pred


### test_utils.py

In [4]:


from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import ZeroPadding2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RepeatVector


# Compare the two inputs
def comparator(learner, instructor):
    for a, b in zip(learner, instructor):
        if tuple(a) != tuple(b):
            print(colored("Test failed", attrs=['bold']),
                  "\n Expected value \n\n", colored(f"{b}", "green"),
                  "\n\n does not match the input value: \n\n",
                  colored(f"{a}", "red"))
            raise AssertionError("Error in test")
    print(colored("All tests passed!", "green"))

# extracts the description of a given model
def summary(model):
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    result = []
    for layer in model.layers:
        descriptors = [layer.__class__.__name__, layer.output_shape, layer.count_params()]
        if (type(layer) == Conv2D):
            descriptors.append(layer.padding)
            descriptors.append(layer.activation.__name__)
            descriptors.append(layer.kernel_initializer.__class__.__name__)
        if (type(layer) == MaxPooling2D):
            descriptors.append(layer.pool_size)
            descriptors.append(layer.strides)
            descriptors.append(layer.padding)
        if (type(layer) == Dropout):
            descriptors.append(layer.rate)
        if (type(layer) == ZeroPadding2D):
            descriptors.append(layer.padding)
        if (type(layer) == Dense):
            descriptors.append(layer.activation.__name__)
        if (type(layer) == LSTM):
            descriptors.append(layer.input_shape)
            descriptors.append(layer.activation.__name__)
            descriptors.append(layer.return_sequences)
        if (type(layer) == RepeatVector):
            descriptors.append(layer.n)
        result.append(descriptors)
    return result

# Utility Function

In [3]:
path="/content/drive/MyDrive/NLP/glove.6B.100d.txt"

In [4]:
import numpy as np

def read_glove_vecs(path):
  words = set()
  word_to_vec_map = {}
  with open(path,'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      words.add(word)
      word_to_vec_map[word] = coefs

  i = 1
  words_to_index = {}
  index_to_words = {}
  for word in words:
    words_to_index[word] = i
    index_to_words[i] = word

  return words,word_to_vec_map,words_to_index, index_to_words,

words, word_to_vec_map,word_to_index, index_to_word = read_glove_vecs(path)

In [5]:
# Define sentences_to_indices function
def sentences_to_indices(X, word_to_index, max_len):
    m = len(X)
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        sentence_words = X[i].lower().split()
        j = 0
        for w in sentence_words:
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
                j += 1
    return X_indices


# tensorflow Version

In [6]:
import numpy as np
import tensorflow
np.random.seed(0)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
np.random.seed(1)



from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import ZeroPadding2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RepeatVector

In [8]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """

    vocab_size = len(word_to_index) + 1              # adding 1 to fit Keras embedding (requirement)
    any_word = list(word_to_vec_map.keys())[0]
    emb_dim = word_to_vec_map[any_word].shape[0]    # define dimensionality of your GloVe word vectors (= 50)

    ### START CODE HERE ###
    # Step 1
    # Initialize the embedding matrix as a numpy array of zeros.
    # See instructions above to choose the correct shape.
    emb_matrix = np.zeros((vocab_size, emb_dim))

    # Step 2
    # Set each row "idx" of the embedding matrix to be
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Step 3
    # Define Keras embedding layer with the correct input and output sizes
    # Make it non-trainable.
    embedding_layer = Embedding(vocab_size, emb_dim)
    ### END CODE HERE ###

    # Step 4 (already done for you; please do not modify)
    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,)) # Do not modify the "None".  This line of code is complete as-is.

    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [9]:
def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """

    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph.
    # It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through your embedding layer
    # (See additional hints in the instructions).
    embeddings = embedding_layer(sentence_indices)

    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # The returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # The returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with 5 units
    X = Dense(5)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)

    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices, outputs=X)

    ### END CODE HERE ###

    return model

In [10]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/NLP/train_emoji.csv')
X_train, Y_train = df.iloc[:,0].values,df.iloc[:,1].values

df = pd.read_csv('/content/drive/MyDrive/NLP/tesss.csv')
X_test, Y_test = df.iloc[:,0].values,df.iloc[:,1].values

maxLen = len(max(X_train, key=len).split())
print(f"maxLen : {maxLen}")

X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = np.eye(5)[Y_train.reshape(-1)] # 5 classes , 1D array

<class 'numpy.ndarray'> 131
<class 'numpy.ndarray'> 131
<class 'numpy.ndarray'> 55
<class 'numpy.ndarray'> 55
maxLen : 10


In [11]:
model = Emojify_V2((maxLen,), word_to_vec_map, word_to_index)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x78e8ad52dcc0>

In [12]:

X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = np.eye(5)[Y_test.reshape(-1)]
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.290909081697464


In [13]:
C = 5
y_test_oh = np.eye(C)[Y_test.reshape(-1)]
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    x = X_test_indices
    num = np.argmax(pred[i])
    if(num != Y_test[i]):
        print('Expected emoji:'+ label_to_emoji(Y_test[i]) + ' prediction: '+ X_test[i] + label_to_emoji(num).strip())

Expected emoji::disappointed: prediction: he did not answer	:smile:
Expected emoji::smile: prediction: he got a very nice raise	:disappointed:
Expected emoji::smile: prediction: she got me a nice present	:disappointed:
Expected emoji::smile: prediction: ha ha ha it was so funny	:disappointed:
Expected emoji::disappointed: prediction: I am upset	:smile:
Expected emoji::smile: prediction: We had such a lovely dinner tonight	:disappointed:
Expected emoji:🍴 prediction: where is the food	:smile:
Expected emoji::smile: prediction: Stop making this joke ha ha ha	:disappointed:
Expected emoji:⚾ prediction: where is the ball	:smile:
Expected emoji::disappointed: prediction: work is hard	:smile:
Expected emoji::disappointed: prediction: are you serious:smile:
Expected emoji:⚾ prediction: Let us go play baseball	:smile:
Expected emoji::disappointed: prediction: work is horrible	:smile:
Expected emoji::disappointed: prediction: stop pissing me off:smile:
Expected emoji:🍴 prediction: any suggestion

In [14]:
# Change the sentence below to see your prediction. Make sure all the words are in the Glove embeddings.
x_test = np.array(['I cannot play'])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
print(x_test[0] +' '+  label_to_emoji(np.argmax(model.predict(X_test_indices))))

I cannot play :smile:


# pytorch version

In [20]:
import gc

gc.collect()

264

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

In [20]:
# Loading data
df = pd.read_csv('/content/drive/MyDrive/NLP/train_emoji.csv')
X_train, Y_train = df.iloc[:,0].values, df.iloc[:,1].values


df = pd.read_csv('/content/drive/MyDrive/NLP/tesss.csv')
X_test, Y_test = df.iloc[:,0].values, df.iloc[:,1].values


# Preprocessing
maxLen = len(max(X_train, key=len).split())
print(f"maxLen : {maxLen}")

X_train_indices = torch.from_numpy(sentences_to_indices(X_train, word_to_index, maxLen)).long()
Y_train_oh = torch.from_numpy(np.eye(5)[Y_train.reshape(-1)]).float()  # 5 classes, 1D array


maxLen : 10


In [34]:
# Embedding layer
class EmbeddingLayer(nn.Module):
    def __init__(self, word_to_vec_map, word_to_index):
        super(EmbeddingLayer, self).__init__()
        vocab_size = len(word_to_index) + 1
        any_word = list(word_to_vec_map.keys())[0]
        emb_dim = word_to_vec_map[any_word].shape[0]

        emb_matrix = np.zeros((vocab_size, emb_dim))
        for word, idx in word_to_index.items():
            emb_matrix[idx, :] = word_to_vec_map[word]

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(emb_matrix))
        self.embedding.weight.requires_grad = False

    def forward(self, inputs):
        return self.embedding(inputs)


In [32]:
class Emojify(nn.Module):
    def __init__(self, input_size, embedding_layer):
        super(Emojify, self).__init__()
        self.embedding = embedding_layer
        self.lstm1 = nn.LSTM(input_size, 128, batch_first=True, bidirectional=True)
        self.dropout1 = nn.Dropout(0.5)
        self.lstm2 = nn.LSTM(256, 128, batch_first=True, bidirectional=True)
        self.dropout2 = nn.Dropout(0.5)
        self.lstm3 = nn.LSTM(256, 128, batch_first=True)
        self.dropout3 = nn.Dropout(0.5)
        self.fc = nn.Linear(128, 5) # output has 5 classes C=5

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.lstm1(embeddings)
        output = self.dropout1(output)
        output, _ = self.lstm2(output)
        output = self.dropout2(output)
        output, (hidden, _) = self.lstm3(output)
        output = self.dropout3(hidden.squeeze(0))
        output = self.fc(output)
        return output


In [35]:
# Instantiate model
embedding_layer = EmbeddingLayer(word_to_vec_map, word_to_index)
model = Emojify(word_to_vec_map[list(word_to_vec_map.keys())[0]].shape[0],
                embedding_layer)

# Model Details
print(model)

Emojify(
  (embedding): EmbeddingLayer(
    (embedding): Embedding(400001, 100)
  )
  (lstm1): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (lstm2): LSTM(256, 128, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (lstm3): LSTM(256, 128, batch_first=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)


In [None]:
# Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

for epoch in range(50):
    optimizer.zero_grad()
    outputs = model(X_train_indices)
    loss = criterion(outputs, Y_train_oh.argmax(dim=1))
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/50], Loss: {loss.item():.4f}')

In [24]:

# Testing
X_test_indices = torch.from_numpy(sentences_to_indices(X_test, word_to_index, maxLen)).long()
Y_test_oh = torch.from_numpy(np.eye(5)[Y_test.reshape(-1)]).float()
Y_test_tensor = torch.from_numpy(Y_test)  # Convert Y_test to a PyTorch Tensor

model.eval()
with torch.no_grad():
    outputs = model(X_test_indices)
    loss = criterion(outputs, Y_test_oh.argmax(dim=1))
    _, predicted = torch.max(outputs.data, 1)
    total = Y_test_tensor.size(0)  # Use Y_test_tensor instead of Y_test
    correct = (predicted == Y_test_oh.argmax(dim=1)).sum().item()
    acc = correct / total

print(f'Test Accuracy: {acc:.4f}')

Test Accuracy: 0.2545


In [26]:

C = 5
y_test_oh = torch.from_numpy(np.eye(C)[Y_test.reshape(-1)]).float()

X_test_indices = torch.from_numpy(sentences_to_indices(X_test, word_to_index, maxLen)).long()

model.eval()
with torch.no_grad():
    pred = model(X_test_indices)
    _, predicted = torch.max(pred.data, 1)

for i in range(len(X_test)):
    if predicted[i] != Y_test[i]:
        print('Expected emoji: ' + label_to_emoji(Y_test[i]) + ' prediction: ' + X_test[i] + label_to_emoji(predicted[i].item()).strip())

Expected emoji: :smile: prediction: he got a very nice raise	:disappointed:
Expected emoji: :smile: prediction: she got me a nice present	:disappointed:
Expected emoji: :smile: prediction: ha ha ha it was so funny	:disappointed:
Expected emoji: :smile: prediction: he is a good friend	:disappointed:
Expected emoji: :disappointed: prediction: I am upset	:smile:
Expected emoji: :smile: prediction: We had such a lovely dinner tonight	:disappointed:
Expected emoji: 🍴 prediction: where is the food	:disappointed:
Expected emoji: :smile: prediction: Stop making this joke ha ha ha	:disappointed:
Expected emoji: ⚾ prediction: where is the ball	:disappointed:
Expected emoji: :disappointed: prediction: work is hard	:smile:
Expected emoji: :disappointed: prediction: are you serious:smile:
Expected emoji: ⚾ prediction: Let us go play baseball	:disappointed:
Expected emoji: :disappointed: prediction: work is horrible	:smile:
Expected emoji: :smile: prediction: Congratulation for having a baby	:disapp

In [27]:
# Prediction
x_test = np.array(['I cannot play'])
X_test_indices = torch.from_numpy(sentences_to_indices(x_test, word_to_index, maxLen)).long()

with torch.no_grad():
    output = model(X_test_indices)
    _, predicted = torch.max(output.data, 1)
    print(x_test[0] + ' ' + label_to_emoji(predicted.item()))

I cannot play :smile:


In [None]:
#