<a href="https://colab.research.google.com/github/King-Rian/Project-3/blob/main/project_3_Lou.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
# Install emoji library
!pip install emoji
!pip install torch
from google.colab import files
import zipfile
import json
import emoji
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, AdamW, T5Tokenizer, T5ForConditionalGeneration



In [101]:
# Retrieve the 'emojis.json' zippped file

# Step 1: Upload the file
uploaded = files.upload()

# Step 2: Extract the ZIP file
with zipfile.ZipFile('emojis.json.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Step 3: Load the extracted JSON file
emoji_file = 'emojis.json'

# Using pandas
emoji_df = pd.read_json(emoji_file)
print("Dataset loaded successfully. Sample data:")
print(emoji_df.head())


Saving emojis.json.zip to emojis.json (5).zip
Dataset loaded successfully. Sample data:
                                            category  \
0  Miscellaneous Symbols And Pictographs -> Emoji...   
1  Miscellaneous Symbols And Pictographs -> Emoji...   
2                                               None   
3  Miscellaneous Symbols And Pictographs -> Emoji...   
4                                               None   

                                            keywords  \
0  [dark skin tone, hand, forbidden, gesture, wom...   
1                     [dark skin tone, woman, guard]   
2                 [racing, running, woman, marathon]   
3  [gymnastics, medium-light skin tone, woman, ca...   
4                                      [woman, golf]   

                                          definition  \
0  The Woman Gesturing Not OK, Type-6 emoji is a ...   
1  The Female Guard, Type-6 emoji is a sequence o...   
2  The female version of the ?? Runner emoji. The...   
3  The Woman D

In [102]:
# Check the shape, features, and datatypes of the df
emoji_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2389 entries, 0 to 2388
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   category    1988 non-null   object
 1   keywords    2389 non-null   object
 2   definition  2389 non-null   object
 3   unicode     2389 non-null   object
 4   name        2389 non-null   object
 5   shortcode   845 non-null    object
 6   senses      2389 non-null   object
dtypes: object(7)
memory usage: 130.8+ KB


In [103]:
# Set options to display all rows and columns
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

# Create a new emojis DataFrame with 'name' and 'unicode'
key_value_pairs_df = emoji_df[['name', 'unicode']]

# Process the 'name' column to include : at the beginning and end of the emoji name
key_value_pairs_df['name'] = key_value_pairs_df['name'].apply(
    lambda x: f":{x.split(':')[0].strip()}:"  # Remove everything after the first colon and add colons
)

# Convert the unicode column to emoji symbols
key_value_pairs_df['emoji'] = key_value_pairs_df['name'].apply(emoji.emojize)

# Filter out rows where 'unicode' contains more than one Unicode value
# We check for the number of spaces and ensure it is less than 1
# Eliminating complex emojis and ZWJ sequence values
filtered_df = key_value_pairs_df[key_value_pairs_df['unicode'].str.count(' ') < 1]

# Remove rows where 'emoji' contains text with a beginning and ending ':' - which means that emoji was not found for that :name:
filtered_df = key_value_pairs_df[~key_value_pairs_df['emoji'].str.match(r'^:.*:$')]

# Create a new df and rename features to read for concatination with the next data set
new_filtered_df = filtered_df[['name', 'emoji']]
new_filtered_df = new_filtered_df.rename(columns={'name': 'label'})

# Display the DataFrame with emoji symbols
new_filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_value_pairs_df['name'] = key_value_pairs_df['name'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  key_value_pairs_df['emoji'] = key_value_pairs_df['name'].apply(emoji.emojize)


Unnamed: 0,label,emoji
15,:squid:,🦑
16,:shrimp:,🦐
17,:rhinoceros:,🦏
18,:lizard:,🦎
19,:gorilla:,🦍


In [104]:
# Create an emoji dictionary with labels and their Unicode values from https://carpedm20.github.io/emoji/docs/index.html
emoji_dict = {emoji.demojize(e): e for e in emoji.EMOJI_DATA}

# Show how many key value pairs we retrieved from emoji library
records = len(emoji_dict)
print(f"Number of records: {records}")

# Create new df for use in concatenation the two data frames
emoji_dictionary_df = pd.DataFrame(emoji_dict.items(), columns=['label', 'emoji'])
emoji_dictionary_df.head()
#emoji_dictionary_df.info()

Number of records: 3790


Unnamed: 0,label,emoji
0,:1st_place_medal:,🥇
1,:2nd_place_medal:,🥈
2,:3rd_place_medal:,🥉
3,:AB_button_(blood_type):,🆎
4,:ATM_sign:,🏧


In [105]:
# Concatentate both dataframes / sources and drop the index
combined_emoji_df = pd.concat([emoji_dictionary_df, new_filtered_df], ignore_index=True)

# Show the key pair data frame with training data
combined_emoji_df.head()

Unnamed: 0,label,emoji
0,:1st_place_medal:,🥇
1,:2nd_place_medal:,🥈
2,:3rd_place_medal:,🥉
3,:AB_button_(blood_type):,🆎
4,:ATM_sign:,🏧


In [106]:
# Remove : character used for outputting emoji (vs. unicode) and replace underscores with spaces
combined_emoji_df['label'] = combined_emoji_df['label'].str.replace(':', '').str.replace('_', ' ')

# Print output
combined_emoji_df.head()


Unnamed: 0,label,emoji
0,1st place medal,🥇
1,2nd place medal,🥈
2,3rd place medal,🥉
3,AB button (blood type),🆎
4,ATM sign,🏧


In [92]:
# Augment label data with synomyms
# !pip install nltk==3.8.1
'''
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

def augment_text(phrase):
  words = phrase.split()
  augmented_words = []
  for word in words:
    synonyms = wordnet.synsets(word)
    if synonyms:
        synonym = synonyms[0].lemmas()[0].name()
        augmented_words.append(synonym)
    else:
         augmented_words.append(word)
    return ' '.join(augmented_words)

combined_emoji_df['label'] = combined_emoji_df['label'].apply(augment_text)

combined_emoji_df.head()
'''

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,label,emoji,emoji_index
0,first,🥇,0
1,second,🥈,1
2,third,🥉,2
3,Bachelor_of_Arts,🆎,3
4,standard_atmosphere,🏧,4


In [107]:
# Step 1: Prepare the Data
# Create a mapping of emojis to indices
emoji_to_index = {emoji: idx for idx, emoji in enumerate(combined_emoji_df['emoji'].unique())}
index_to_emoji = {idx: emoji for emoji, idx in emoji_to_index.items()}

print("Emoji to Index Mapping:")
print(emoji_to_index)

print("\nIndex to Emoji Mapping:")
print(index_to_emoji)

Emoji to Index Mapping:
{'🥇': 0, '🥈': 1, '🥉': 2, '🆎': 3, '🏧': 4, '🅰': 5, '🇦🇫': 6, '🇦🇱': 7, '🇩🇿': 8, '🇦🇸': 9, '🇦🇩': 10, '🇦🇴': 11, '🇦🇮': 12, '🇦🇶': 13, '🇦🇬': 14, '♒': 15, '🇦🇷': 16, '♈': 17, '🇦🇲': 18, '🇦🇼': 19, '🇦🇨': 20, '🇦🇺': 21, '🇦🇹': 22, '🇦🇿': 23, '🔙': 24, '🅱': 25, '🇧🇸': 26, '🇧🇭': 27, '🇧🇩': 28, '🇧🇧': 29, '🇧🇾': 30, '🇧🇪': 31, '🇧🇿': 32, '🇧🇯': 33, '🇧🇲': 34, '🇧🇹': 35, '🇧🇴': 36, '🇧🇦': 37, '🇧🇼': 38, '🇧🇻': 39, '🇧🇷': 40, '🇮🇴': 41, '🇻🇬': 42, '🇧🇳': 43, '🇧🇬': 44, '🇧🇫': 45, '🇧🇮': 46, '🆑': 47, '🆒': 48, '🇰🇭': 49, '🇨🇲': 50, '🇨🇦': 51, '🇮🇨': 52, '♋': 53, '🇨🇻': 54, '♑': 55, '🇧🇶': 56, '🇰🇾': 57, '🇨🇫': 58, '🇪🇦': 59, '🇹🇩': 60, '🇨🇱': 61, '🇨🇳': 62, '🇨🇽': 63, '🎄': 64, '🇨🇵': 65, '🇨🇨': 66, '🇨🇴': 67, '🇰🇲': 68, '🇨🇬': 69, '🇨🇩': 70, '🇨🇰': 71, '🇨🇷': 72, '🇭🇷': 73, '🇨🇺': 74, '🇨🇼': 75, '🇨🇾': 76, '🇨🇿': 77, '🇨🇮': 78, '🇩🇰': 79, '🇩🇬': 80, '🇩🇯': 81, '🇩🇲': 82, '🇩🇴': 83, '🔚': 84, '🇪🇨': 85, '🇪🇬': 86, '🇸🇻': 87, '🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f': 88, '🇬🇶': 89, '🇪🇷': 90, '🇪🇪': 91, '🇸🇿': 92, '🇪🇹': 93, '🇪

In [108]:
# Step 2: Text Encoding
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_data(phrases):
    # T5 expects the input to be formatted as "translate English to emoji: <text>"
    formatted_phrases = ["translate English to emoji: " + phrase for phrase in phrases.tolist()]
    encoded = tokenizer(formatted_phrases, padding=True, truncation=True, return_tensors='pt')
    return encoded['input_ids'], encoded['attention_mask']

input_ids, attention_masks = tokenize_data(combined_emoji_df['label'])

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [109]:
# Step 3: Prepare Emoji Sequences to Indices
combined_emoji_df['emoji_index'] = combined_emoji_df['emoji'].map(emoji_to_index)
target_indices = torch.tensor(combined_emoji_df['emoji_index'].tolist())  # Convert to tensor

In [110]:
# Step 4: Create DataLoader
dataset = TensorDataset(input_ids, attention_masks, target_indices)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [111]:
# Step 5: Build the Model
class T5EmojiClassifier(nn.Module):
    def __init__(self, num_classes):
        super(T5EmojiClassifier, self).__init__()
        self.t5 = T5ForConditionalGeneration.from_pretrained('t5-small')
        self.num_classes = num_classes

    def forward(self, input_ids, attention_mask):
        outputs = self.t5(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        return outputs.loss  # Return the loss for training

# Initialize the model
num_classes = len(emoji_to_index)
model = T5EmojiClassifier(num_classes)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Step 6: Training the Model
criterion = nn.CrossEntropyLoss()  # Note: This may not be used directly with T5
optimizer = AdamW(model.parameters(), lr=1e-5)  # Use AdamW instead of Adam

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_masks, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        loss = model(input_ids, attention_masks)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/3, Loss: 0.6928215632620984


In [None]:
# Step 7: Evaluate the Model
model.eval()
val_accuracy = 0
total = 0

with torch.no_grad():
    for batch in train_loader:
        input_ids, attention_masks, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_masks)

        _, predicted = torch.max(outputs, 1)
        val_accuracy += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Validation Accuracy: {val_accuracy / total:.4f}")

In [None]:
# Step 8: Make Predictions
def predict_emoji(phrase):
    model.eval()
    inputs = tokenizer(phrase, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        _, predicted_index = torch.max(outputs, 1)

    return index_to_emoji[predicted_index.item()]

In [None]:
# Step 9: Test Predictions
new_phrase = "sad"
predicted_emoji = predict_emoji(new_phrase)
print(f"The predicted emoji for '{new_phrase}' is: {predicted_emoji}")

In [None]:
# Step 10: Build out a user interface that accepts text entry and uses the model to predict output
