In [None]:
# Colab-only: mount Google Drive at /content/drive so the GloVe file and the
# CSV datasets referenced in later cells can be read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the GloVe embeddings text file on the mounted Drive; this is the
# argument later passed to read_glove_vecs.
path="/content/drive/MyDrive/NLP/glove.6B.100d.txt"

In [None]:
import numpy as np

def read_glove_vecs(path):
  """Load GloVe vectors from a text file and build vocabulary lookup tables.

  Each non-blank line is expected to hold a token followed by its
  whitespace-separated float components.

  Args:
    path: Path to the GloVe text file (opened as UTF-8).

  Returns:
    A tuple ``(words, word_to_vec_map, words_to_index, index_to_words)``:
      words: set of every token read from the file.
      word_to_vec_map: dict mapping token -> float32 numpy vector.
      words_to_index: dict mapping token -> unique integer index, numbered
        from 1 in sorted token order (index 0 is never assigned, so it stays
        available as a padding value).
      index_to_words: inverse of ``words_to_index``.
  """
  words = set()
  word_to_vec_map = {}
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      word = values[0]
      word_to_vec_map[word] = np.asarray(values[1:], dtype='float32')
      words.add(word)

  # Every word must receive its own index: the counter has to advance on each
  # iteration, otherwise all words collapse onto index 1. Sorting makes the
  # numbering reproducible, since set iteration order is not guaranteed.
  words_to_index = {}
  index_to_words = {}
  for i, word in enumerate(sorted(words), start=1):
    words_to_index[word] = i
    index_to_words[i] = word

  return words, word_to_vec_map, words_to_index, index_to_words

# Parse the GloVe file once; the resulting lookup tables are module-level
# globals used by the later cells (note the singular names word_to_index /
# index_to_word chosen here for the returned dictionaries).
words, word_to_vec_map,word_to_index, index_to_word = read_glove_vecs(path)

In [None]:
# Define sentences_to_indices function
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace. Words that are not
    keys of ``word_to_index`` are skipped and do not consume a column.
    Sentences with more known words than ``max_len`` are truncated.

    Args:
        X: Sequence of sentence strings (length m).
        word_to_index: Dict mapping a word to its integer index; words are
            lower-cased before the lookup.
        max_len: Number of columns of the returned matrix.

    Returns:
        numpy array of shape (m, max_len) with numpy's default float dtype.
        Row i holds the indices of the known words of X[i] in order, followed
        by zeros.
    """
    m = len(X)
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        j = 0
        for w in X[i].lower().split():
            if j >= max_len:
                # Row is full: truncate rather than index past the last column.
                break
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
                j += 1
    return X_indices


In [None]:
import gc

# Force a full garbage-collection pass; the value shown as the cell output is
# the number of unreachable objects the collector found.
# NOTE(review): presumably intended to free temporaries from the GloVe load — confirm.
gc.collect()

44

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

In [None]:
# Read the train and test CSVs: the first column is used as the sentences and
# the second column as the labels.
# NOTE(review): read_csv treats the first row as a header by default — confirm
# both files actually start with a header row.
df = pd.read_csv('/content/drive/MyDrive/NLP/train_emoji.csv')
X_train = df.iloc[:, 0].values
Y_train = df.iloc[:, 1].values


df = pd.read_csv('/content/drive/MyDrive/NLP/tesss.csv')
X_test = df.iloc[:, 0].values
Y_test = df.iloc[:, 1].values


# Preprocessing
# maxLen is the largest number of whitespace-separated words in any training
# sentence. It must be measured in words: the sentence with the most
# characters is not necessarily the one with the most words.
maxLen = max(len(sentence.split()) for sentence in X_train)
print(f"maxLen : {maxLen}")

# Zero-padded matrix of word indices, converted to int64 for the embedding lookup.
X_train_indices = torch.from_numpy(sentences_to_indices(X_train, word_to_index, maxLen)).long()
# One-hot encode the labels by selecting rows of a 5x5 identity matrix.
Y_train_oh = torch.from_numpy(np.eye(5)[Y_train.reshape(-1)]).float()  # 5 classes, 1D array


maxLen : 10


In [None]:
# Embedding layer
class EmbeddingLayer(nn.Module):
    """Frozen embedding lookup initialised from pre-trained word vectors.

    Row ``idx`` of the weight matrix is the vector of the word whose index is
    ``idx`` in ``word_to_index``. Rows that no word maps to (row 0 when indices
    start at 1) stay all-zero. The weights are excluded from gradient updates.

    Args:
        word_to_vec_map: Dict mapping word -> 1-D numpy vector. The embedding
            dimension is read from the first vector; all vectors are assumed
            to share that length.
        word_to_index: Dict mapping word -> row index. Every key must also be a
            key of ``word_to_vec_map`` and every index must be smaller than
            ``len(word_to_index) + 1``.

    Raises:
        ValueError: If ``word_to_vec_map`` is empty.
    """

    def __init__(self, word_to_vec_map, word_to_index):
        super().__init__()
        if not word_to_vec_map:
            raise ValueError("word_to_vec_map must contain at least one vector")

        # One extra row so that index 0 has a (zero) vector of its own.
        vocab_size = len(word_to_index) + 1
        # Peek at a single vector instead of materialising the whole key list.
        emb_dim = next(iter(word_to_vec_map.values())).shape[0]

        # Build the table directly in float32: the values are cast to the
        # embedding weight's dtype on copy anyway, and a float64 staging
        # matrix would need twice the memory.
        emb_matrix = np.zeros((vocab_size, emb_dim), dtype=np.float32)
        for word, idx in word_to_index.items():
            emb_matrix[idx, :] = word_to_vec_map[word]

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        with torch.no_grad():
            self.embedding.weight.copy_(torch.from_numpy(emb_matrix))
        # Keep the pre-trained vectors fixed during training.
        self.embedding.weight.requires_grad = False

    def forward(self, inputs):
        """Return the embedding vectors for a tensor of integer word indices."""
        return self.embedding(inputs)


In [None]:
class Emojify(nn.Module):
    """Sentence classifier: embedding -> BiLSTM -> BiLSTM -> LSTM -> linear.

    Dropout is applied after each LSTM. The final hidden state of the third
    (unidirectional) LSTM is fed to the linear layer, which returns one raw
    score (logit) per class.

    Args:
        input_size: Size of the vectors produced by ``embedding_layer``.
        embedding_layer: Module applied to the input index tensor; its output
            is fed to the first LSTM (batch-first layout).
        hidden_size: Hidden units per direction in every LSTM (default 128).
        num_classes: Number of output classes (default 5).
        dropout: Dropout probability used after each LSTM (default 0.5).
    """

    def __init__(self, input_size, embedding_layer, hidden_size=128, num_classes=5, dropout=0.5):
        super().__init__()
        self.embedding = embedding_layer
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout1 = nn.Dropout(dropout)
        # A bidirectional LSTM concatenates both directions: 2 * hidden_size features.
        self.lstm2 = nn.LSTM(2 * hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout2 = nn.Dropout(dropout)
        self.lstm3 = nn.LSTM(2 * hidden_size, hidden_size, batch_first=True)
        self.dropout3 = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch, num_classes) for index inputs."""
        embeddings = self.embedding(inputs)
        output, _ = self.lstm1(embeddings)
        output = self.dropout1(output)
        output, _ = self.lstm2(output)
        output = self.dropout2(output)
        _, (hidden, _) = self.lstm3(output)
        # lstm3 has one layer and one direction, so hidden is (1, batch, hidden_size);
        # hidden[-1] selects that single final state as (batch, hidden_size).
        output = self.dropout3(hidden[-1])
        output = self.fc(output)
        return output


In [None]:
# Build the network. The LSTM input width is the length of one word vector,
# read from the first entry of word_to_vec_map.
input_size = len(word_to_vec_map[list(word_to_vec_map)[0]])
embedding_layer = EmbeddingLayer(word_to_vec_map, word_to_index)
model = Emojify(input_size, embedding_layer)

# Show the layer structure
print(model)

Emojify(
  (embedding): EmbeddingLayer(
    (embedding): Embedding(400001, 100)
  )
  (lstm1): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (lstm2): LSTM(256, 128, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (lstm3): LSTM(256, 128, batch_first=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)


In [None]:
# Training
NUM_EPOCHS = 50

criterion = nn.CrossEntropyLoss()
# Only trainable parameters go to the optimizer; the embedding weights are frozen.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad])

# The loss is fed class indices, so recover them once from the one-hot labels
# instead of recomputing the argmax on every epoch.
train_targets = Y_train_oh.argmax(dim=1)

# Enable dropout explicitly: if an evaluation cell ran earlier the model is
# still in eval mode, and re-running this cell would train without dropout.
model.train()
for epoch in range(NUM_EPOCHS):
    # Full-batch update: the whole training set is passed through in one step.
    optimizer.zero_grad()
    outputs = model(X_train_indices)
    loss = criterion(outputs, train_targets)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.4f}')

Epoch [10/50], Loss: 1.5570
Epoch [20/50], Loss: 1.5641
Epoch [30/50], Loss: 1.5391
Epoch [40/50], Loss: 1.5045
Epoch [50/50], Loss: 1.4997


In [None]:

# Evaluate on the test set
X_test_indices = torch.from_numpy(
    sentences_to_indices(X_test, word_to_index, maxLen)
).long()
Y_test_oh = torch.from_numpy(np.eye(5)[Y_test.reshape(-1)]).float()
Y_test_tensor = torch.from_numpy(Y_test)  # labels as a tensor; only its length is used below
test_targets = Y_test_oh.argmax(dim=1)    # class indices recovered from the one-hot rows

model.eval()  # disable dropout for inference
with torch.no_grad():
    outputs = model(X_test_indices)
    loss = criterion(outputs, test_targets)
    _, predicted = outputs.data.max(1)
    total = Y_test_tensor.size(0)
    correct = (predicted == test_targets).sum().item()
    acc = correct / total

print(f'Test Accuracy: {acc:.4f}')

Test Accuracy: 0.2545


In [None]:
!pip install -q emoji

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/433.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m430.1/433.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import emoji

# Maps a class label (as a string) to either a literal emoji or an emoji short-code.
emoji_dictionary = {#"0": ":red_heart:",    # :heart: prints a black instead of red heart depending on the font
                    "0": "\u2764\ufe0f",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji (string) ready to be printed.

    Args:
        label: Class label; it is converted with str() and looked up in
            emoji_dictionary, so a KeyError is raised for unknown labels.

    Returns:
        The emoji string for the label.
    """
    # ':smile:' and ':disappointed:' are alias short-codes rather than the
    # canonical English emoji names, so they are only translated when the alias
    # table is requested; without it they are printed back verbatim.
    return emoji.emojize(emoji_dictionary[str(label)], language='alias')

In [None]:

C = 5
y_test_oh = torch.from_numpy(np.eye(C)[Y_test.reshape(-1)]).float()

X_test_indices = torch.from_numpy(sentences_to_indices(X_test, word_to_index, maxLen)).long()

model.eval()  # disable dropout for inference
with torch.no_grad():
    pred = model(X_test_indices)
    _, predicted = torch.max(pred.data, 1)


# Print every test sentence whose predicted class differs from its label.
for i in range(len(X_test)):
    predicted_label = predicted[i].item()
    if predicted_label != Y_test[i]:
        # The sentences can carry trailing whitespace from the CSV, so strip the
        # sentence itself and separate it from the predicted emoji with a space;
        # otherwise the two run together in the output.
        print('Expected emoji: ' + label_to_emoji(Y_test[i]) + ' prediction: ' + X_test[i].strip() + ' ' + label_to_emoji(predicted_label))

Expected emoji: :smile: prediction: he got a very nice raise	:disappointed:
Expected emoji: :smile: prediction: she got me a nice present	:disappointed:
Expected emoji: :smile: prediction: ha ha ha it was so funny	:disappointed:
Expected emoji: :smile: prediction: he is a good friend	:disappointed:
Expected emoji: :disappointed: prediction: I am upset	:smile:
Expected emoji: :smile: prediction: We had such a lovely dinner tonight	:disappointed:
Expected emoji: 🍴 prediction: where is the food	:disappointed:
Expected emoji: :smile: prediction: Stop making this joke ha ha ha	:disappointed:
Expected emoji: ⚾ prediction: where is the ball	:disappointed:
Expected emoji: :disappointed: prediction: work is hard	:smile:
Expected emoji: :disappointed: prediction: are you serious:smile:
Expected emoji: ⚾ prediction: Let us go play baseball	:disappointed:
Expected emoji: :disappointed: prediction: work is horrible	:smile:
Expected emoji: :smile: prediction: Congratulation for having a baby	:disapp

In [None]:
# Prediction
# NOTE: this cell overwrites the globals X_test_indices and predicted with
# single-sentence values; the evaluation cells recompute them when re-run.
x_test = np.array(['I cannot play'])
X_test_indices = torch.from_numpy(sentences_to_indices(x_test, word_to_index, maxLen)).long()

# Switch to inference mode here as well, so dropout is disabled even when this
# cell is run directly after training rather than after an evaluation cell.
model.eval()
with torch.no_grad():
    output = model(X_test_indices)
    _, predicted = torch.max(output.data, 1)
    print(x_test[0] + ' ' + label_to_emoji(predicted.item()))

I cannot play :smile:


In [None]:
#