In [40]:
import numpy as np
import pandas as pd
import emoji
import os.path as path
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [42]:
data = pd.read_csv('emoji_data.csv', header=None)

emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:"
}

In [43]:
print(data.head())

                             0  1
0  French macaroon is so tasty  4
1             work is horrible  3
2                   I am upset  3
3               throw the ball  1
4                    Good joke  2


In [44]:
def label_to_emoji(label):
    return emoji.emojize(emoji_dict[label])

In [45]:
x = data[0].values
y = data[1].values

In [35]:
print(x[:10])
print(y[:10])

['French macaroon is so tasty' 'work is horrible' 'I am upset'
 'throw the ball' 'Good joke' 'what is your favorite baseball game'
 'I cooked meat' 'stop messing around' 'I want chinese food'
 'Let us go play baseball']
tensor([4, 3, 3, 1, 2, 1, 4, 3], device='cuda:0')


Apply Tokenizer and convert sentence to tokens with Inbuilt Function ====================================================

In [46]:
# initialize the token
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
word2index = tokenizer.word_index
print(word2index)

{'i': 1, 'you': 2, 'is': 3, 'the': 4, 'a': 5, 'so': 6, 'am': 7, 'my': 8, 'to': 9, 'this': 10, 'are': 11, 'ha': 12, 'for': 13, 'she': 14, 'he': 15, 'me': 16, 'not': 17, 'love': 18, 'your': 19, 'want': 20, 'have': 21, 'it': 22, 'got': 23, 'like': 24, 'did': 25, 'baseball': 26, 'food': 27, 'was': 28, 'do': 29, 'joke': 30, 'stop': 31, 'will': 32, 'miss': 33, 'life': 34, 'ball': 35, 'good': 36, 'what': 37, 'go': 38, 'job': 39, 'funny': 40, 'bad': 41, 'day': 42, 'great': 43, 'dinner': 44, 'that': 45, 'with': 46, 'at': 47, 'of': 48, 'game': 49, 'we': 50, 'again': 51, 'said': 52, 'yes': 53, 'lol': 54, 'and': 55, 'down': 56, 'had': 57, 'her': 58, 'fun': 59, 'smile': 60, 'lot': 61, 'working': 62, 'him': 63, 'cute': 64, 'on': 65, 'lets': 66, 'messing': 67, 'us': 68, 'play': 69, 'exercise': 70, 'lost': 71, 'never': 72, 'where': 73, 'can': 74, 'well': 75, 'much': 76, 'valentine': 77, 'restaurant': 78, 'awesome': 79, 'likes': 80, 'such': 81, 'shouting': 82, 'proud': 83, 'bravo': 84, 'two': 85, 'fore

In [48]:
sentence_to_token = tokenizer.texts_to_sequences(x)
print(sentence_to_token)

[[103, 104, 3, 6, 105], [106, 3, 107], [1, 7, 108], [109, 4, 35], [36, 30], [37, 3, 19, 110, 26, 49], [1, 111, 112], [31, 67, 113], [1, 20, 114, 27], [115, 68, 38, 69, 26], [2, 11, 116, 10, 70], [117, 50, 71, 51], [36, 39], [12, 12, 12, 22, 28, 6, 40], [1, 32, 21, 5, 118, 119], [120, 11, 2, 121, 41], [1, 20, 9, 30], [1, 72, 52, 53, 13, 10], [4, 122, 3, 123], [73, 3, 4, 35], [1, 7, 124], [12, 12, 12, 54], [14, 52, 53], [15, 23, 5, 125], [126, 3, 127, 1, 21], [15, 74, 128, 129, 75], [1, 18, 9, 4, 130, 55, 131], [29, 2, 24, 132], [2, 133, 134, 10, 135], [1, 33, 2, 6, 76], [1, 24, 19, 136], [14, 23, 16, 5, 137], [32, 2, 138, 8, 77], [2, 139, 4, 140], [141, 3, 56, 13, 5, 78], [77, 42, 3, 142], [43, 6, 79], [29, 2, 21, 5, 35], [15, 74, 17, 29, 143], [15, 80, 26], [50, 57, 81, 5, 144, 44, 145], [146, 11, 147], [15, 3, 5, 36, 148], [72, 149, 9, 16, 51], [1, 33, 58], [27, 3, 34], [1, 7, 150, 59], [6, 41, 45, 2, 151, 152, 46, 68], [29, 2, 20, 9, 153, 16, 13, 44], [1, 24, 9, 60], [15, 25, 154, 15

In [49]:
# Find maxlen
maxlen = max(len(seq) for seq in sentence_to_token)
sentence_to_token = pad_sequences(sentence_to_token, maxlen=maxlen, padding='post', truncating='post')

print(maxlen)
print(sentence_to_token[0:5])

10
[[103 104   3   6 105   0   0   0   0   0]
 [106   3 107   0   0   0   0   0   0   0]
 [  1   7 108   0   0   0   0   0   0   0]
 [109   4  35   0   0   0   0   0   0   0]
 [ 36  30   0   0   0   0   0   0   0   0]]


In [50]:
print(type(sentence_to_token))
print(type(sentence_to_token[0]))
print(type(sentence_to_token[0][0]))

# as the number in the array and the inner and outside array are in np.array .... so I in my custom tokinezation I need to convert in the same way

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.int32'>


Apply Tokenizer and convert sentence to tokens Manually ====================================================

In [11]:
word2index_temp = set()
for sent in x: 
    data = sent.split(" ")
    for tt in data: 
        word2index_temp.add(tt) if tt!="" else "" 

# print(word2index_temp)
# convert it to Dictionary with index number to tokenize

word2index = {}
for i , data in enumerate(word2index_temp):
    word2index[data] = i
    
    
print(word2index)
print(word2index['i'], word2index['you'], word2index['is'], word2index['the'])

{'ha': 0, 'long': 1, 'awful': 2, 'Are': 3, 'during': 4, 'said': 5, 'she': 6, 'seen': 7, 'want': 8, 'not': 9, 'chinese': 10, 'prize': 11, 'were': 12, 'dear': 13, 'hours': 14, 'breaks': 15, 'new': 16, 'Stop': 17, 'good': 18, 'your': 19, 'player': 20, 'made': 21, 'Do': 22, 'It': 23, 'stupidity': 24, 'smiles': 25, 'Lets': 26, 'dog': 27, 'Who': 28, 'cutest': 29, 'so': 30, 'The': 31, 'the': 32, 'end': 33, 'mean': 34, 'cookies': 35, 'bad': 36, 'sister': 37, 'give': 38, 'worked': 39, 'what': 40, 'smile': 41, 'awesome': 42, 'am': 43, 'suggestions': 44, 'feeling': 45, 'go': 46, 'Bravo': 47, 'suck': 48, 'poorly': 49, 'assignment': 50, 'when': 51, 'love': 52, 'her': 53, 'amazing': 54, 'man': 55, 'cooked': 56, 'but': 57, 'intelligent': 58, 'me': 59, 'valentine': 60, 'wrong': 61, 'vegetables': 62, 'excited': 63, 'birthday': 64, 'horrible': 65, 'happy': 66, 'totally': 67, 'few': 68, 'lets': 69, 'grader': 70, 'miss': 71, 'such': 72, 'She': 73, 'hate': 74, 'at': 75, 'French': 76, 'announcement': 77, 'd

In [12]:
# find max sentence length 
max_Sentence_Len = 0
for d in x : 
    data = d.split(" ")
    max_Sentence_Len = len(data) if max_Sentence_Len<len(data) else max_Sentence_Len
print(f"max legth of sentene in terms of word: {max_Sentence_Len}")
    

max legth of sentene in terms of word: 10


In [13]:
# now you need to convert the word to tokens and the sentence length less than 10 will be replaced by 0 
sentence_to_token = []

for sent in x: 
    words = sent.split(" ")
    temp =[]
    for index, t in enumerate(words): 
        if t == "" or t == " ": 
            continue
        else: 
            temp.append(np.int32(word2index[t]))

    while len(temp)<10: 
        temp.append(np.int32(0))
        
    sentence_to_token.append(np.array(temp))
    
    
sentence_to_token = np.array(sentence_to_token)
print(sentence_to_token[0:5])
print(sentence_to_token.shape)



[[ 76 166 152  30 203   0   0   0   0   0]
 [286 152  65   0   0   0   0   0   0   0]
 [207  43 328   0   0   0   0   0   0   0]
 [205  32 261   0   0   0   0   0   0   0]
 [240 316   0   0   0   0   0   0   0   0]]
(183, 10)


In [51]:
# Convert labels to one-hot encoding
y_train = torch.tensor(y, dtype=torch.long)
print(y_train)

tensor([4, 3, 3, 1, 2, 1, 4, 3, 4, 1, 3, 3, 2, 2, 4, 3, 2, 3, 3, 1, 3, 2, 2, 2,
        0, 1, 0, 4, 2, 0, 2, 0, 0, 3, 4, 0, 2, 1, 3, 1, 0, 4, 0, 3, 0, 4, 2, 3,
        4, 2, 2, 3, 0, 2, 2, 3, 2, 3, 2, 2, 3, 3, 0, 2, 3, 0, 2, 0, 0, 2, 3, 2,
        4, 1, 3, 3, 0, 0, 3, 2, 0, 3, 0, 2, 2, 4, 2, 2, 0, 0, 2, 3, 0, 4, 2, 1,
        2, 3, 3, 2, 3, 0, 3, 0, 2, 0, 2, 3, 4, 3, 1, 3, 4, 3, 2, 3, 3, 3, 1, 4,
        4, 2, 2, 1, 1, 2, 3, 2, 3, 4, 2, 3, 0, 2, 0, 0, 4, 3, 4, 2, 3, 2, 3, 4,
        2, 1, 2, 4, 3, 1, 3, 2, 3, 2, 2, 3, 3, 2, 4, 0, 0, 0, 3, 0, 0, 1, 1, 2,
        2, 2, 0, 3, 2, 3, 3, 1, 2, 2, 4, 2, 3, 1, 2])


Now Read and Prepare the Pre Trained Embadding ===============================================

In [52]:
embeddings = {}
with open(path.abspath("./trained_biagram_model/glove.6B.50d.txt"), 'r', encoding='utf8') as file:
    for line in file:
        line = line.split()
        embeddings[line[0]] = np.array(line[1:], dtype=np.float32)

In [79]:
# word2index => functional/manually converted Vocab
# here I will create a new vocab which is just stores the pre-trained embadding in matrix .. its having same orser of orders as vocab=word2index

embed_size = 50
embedding_matrix = np.zeros((len(word2index) + 1, embed_size))

for word, i in word2index.items():
    if word in embeddings:
        embedding_matrix[i] = embeddings[word]

print("embedding matrix shaep(num of vocab , pre trained embedding dim): ", embedding_matrix.shape)
        
        

embedding atrix shaep:  (313, 50)


convert the "sentence_to_token" (x_train) and y_train in tensor 

In [54]:
x_train = torch.tensor(sentence_to_token , dtype=torch.long).to("cuda")
y_train = torch.tensor(y_train , dtype=torch.long).to("cuda")

  y_train = torch.tensor(y_train , dtype=torch.long).to("cuda")


Now batchify the x_train and y_train data

In [61]:
train_dataset = TensorDataset(x_train,y_train)
print(train_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=32)
for X, y in train_dataloader:
  print(X.shape, y.shape)
  break

<torch.utils.data.dataset.TensorDataset object at 0x00000217F8FBBEF0>
torch.Size([32, 10]) torch.Size([32])


Now create the model ===================================================

In [62]:
class EmojiClassifier(nn.Module):
    def __init__(self, embed_size, embedding_matrix):
        super(EmojiClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False
        )
        self.lstm1 = nn.LSTM(embed_size, 16, batch_first=True, bidirectional=False)
        self.lstm2 = nn.LSTM(16, 4, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(4, 5) # cause you have 5 types of emoji and from this 5 output you need to choose argmax of them

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x[:, -1, :]  # Take the last output
        x = self.fc(x)
        return x
    
model = EmojiClassifier(embed_size, embedding_matrix).to("cuda")

In [63]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [64]:

# Training loop
for epoch in range(10000):
    total_loss = 0
    model.train()
    for X_batch, Y_batch in train_dataloader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, Y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Epoch 1, Loss: 1.6108000079790752
Epoch 2, Loss: 1.6005067825317383
Epoch 3, Loss: 1.590661923090617
Epoch 4, Loss: 1.5807783007621765
Epoch 5, Loss: 1.5707005659739177
Epoch 6, Loss: 1.560509939988454
Epoch 7, Loss: 1.5505135655403137
Epoch 8, Loss: 1.5407959818840027
Epoch 9, Loss: 1.5304746429125469
Epoch 10, Loss: 1.5181093215942383
Epoch 11, Loss: 1.503238320350647
Epoch 12, Loss: 1.4843122164408367
Epoch 13, Loss: 1.460449715455373
Epoch 14, Loss: 1.4320523142814636
Epoch 15, Loss: 1.3995856642723083
Epoch 16, Loss: 1.3627193371454875
Epoch 17, Loss: 1.3234346310297649
Epoch 18, Loss: 1.2840856512387593
Epoch 19, Loss: 1.2444238265355427
Epoch 20, Loss: 1.2069388628005981
Epoch 21, Loss: 1.1697565913200378
Epoch 22, Loss: 1.1387657125790913
Epoch 23, Loss: 1.115756909052531
Epoch 24, Loss: 1.095685601234436
Epoch 25, Loss: 1.075851837793986
Epoch 26, Loss: 1.057805319627126
Epoch 27, Loss: 1.0428460836410522
Epoch 28, Loss: 1.0302159388860066
Epoch 29, Loss: 1.0185115834077199
Ep

In [65]:
torch.save(model.state_dict(), "emojiPredict_1.pth")
print("Model weights saved to emojiPredict_1.pth")

Model weights saved to emojiPredict_1.pth


In [77]:
# Testing
test = ["I feel good", "I feel very bad", "lets eat dinner", "I hate you", "lets play", "I am angry", "I am mad at you"]
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')
Xtest = torch.tensor(Xtest, dtype=torch.long).to("cuda")

In [78]:
model.eval()
with torch.no_grad():
    y_pred = model(Xtest)
    y_pred = torch.argmax(y_pred, dim=1)

for i in range(len(test)):
    print(test[i], label_to_emoji(y_pred[i].item()))
    
# as my model have just 5 emoji ... so it may not detect the emoji not in my list

I feel good 😃
I feel very bad 😞
lets eat dinner 🍽️
I hate you 😞
lets play ⚾
I am angry 😃
I am mad at you ❤️
