<a href="https://colab.research.google.com/github/King-Rian/Project-3/blob/main/Attempts__Other_Options.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install emoji library
!pip install emoji
!pip install torch
from google.colab import files
import zipfile
import json
import emoji
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, AdamW, T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Fetch the zipped emoji catalogue and load it into a DataFrame

# 1) Ask the user to upload the archive through the Colab upload helper
uploaded = files.upload()

# 2) Unpack the archive; extractall() without a path writes into the current working directory
with zipfile.ZipFile('emojis.json.zip', mode='r') as archive:
    archive.extractall()

# 3) Parse the extracted JSON file with pandas
emoji_file = 'emojis.json'
emoji_df = pd.read_json(emoji_file)

# Confirm the load and show the first rows as plain text
print("Dataset loaded successfully. Sample data:", emoji_df.head(), sep="\n")


In [None]:
# Inspect the loaded frame: DataFrame.info() prints the row count, each column's
# name, non-null count and dtype, plus the approximate memory usage
emoji_df.info()

In [None]:
# Set options to display all rows and columns
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

# Work on an independent copy of the two columns we need. Without .copy() the
# column assignments below would write into a slice of emoji_df and trigger
# pandas' SettingWithCopyWarning.
key_value_pairs_df = emoji_df[['name', 'unicode']].copy()

# Keep only the text before the first colon of each name, trim it, and wrap it
# in colons so it has the ":name:" shape that emoji.emojize looks for.
# NOTE(review): the emoji package's aliases appear to use underscores (they are
# replaced with spaces later in this notebook) -- if the JSON names contain
# spaces they may never resolve and will be dropped below. Confirm whether
# spaces should be converted to underscores here.
key_value_pairs_df['name'] = key_value_pairs_df['name'].apply(
    lambda x: f":{x.split(':')[0].strip()}:"
)

# Resolve each ":name:" alias to its emoji symbol; aliases that are not
# recognised come back unchanged (still wrapped in colons)
key_value_pairs_df['emoji'] = key_value_pairs_df['name'].apply(emoji.emojize)

# Filter 1: keep rows whose 'unicode' value holds a single code point, i.e.
# contains no space separator. This removes complex emojis and ZWJ sequences.
filtered_df = key_value_pairs_df[key_value_pairs_df['unicode'].str.count(' ') < 1]

# Filter 2: drop rows whose 'emoji' is still ":text:", meaning no emoji was
# found for that name. This must chain from filtered_df (not from
# key_value_pairs_df) so that filter 1 stays in effect.
filtered_df = filtered_df[~filtered_df['emoji'].str.match(r'^:.*:$')]

# Keep the two columns needed downstream and rename 'name' -> 'label' so the
# frame lines up with the emoji-library frame it is concatenated with
new_filtered_df = filtered_df[['name', 'emoji']].rename(columns={'name': 'label'})

# Display the first rows of the cleaned label/emoji pairs
new_filtered_df.head()

In [None]:
# Build an alias -> emoji mapping from every entry the `emoji` package knows about
# (reference: https://carpedm20.github.io/emoji/docs/index.html)
emoji_dict = dict((emoji.demojize(symbol), symbol) for symbol in emoji.EMOJI_DATA)

# Report how many alias/emoji pairs the library provided
records = len(emoji_dict)
print(f"Number of records: {records}")

# Turn the mapping into a two-column frame so it can be concatenated with the JSON-derived frame
emoji_dictionary_df = pd.DataFrame(list(emoji_dict.items()), columns=['label', 'emoji'])
emoji_dictionary_df.head()

In [None]:
# Stack the library-derived pairs and the JSON-derived pairs into a single frame,
# renumbering the rows from zero
combined_emoji_df = pd.concat(
    objs=[emoji_dictionary_df, new_filtered_df],
    axis=0,
    ignore_index=True,
)

# Preview the merged label/emoji pairs that serve as training data
combined_emoji_df.head()

In [None]:
# Turn ":some_alias:" labels into plain words: strip the ':' delimiters first,
# then swap every '_' for a space
colon_free_labels = combined_emoji_df['label'].str.replace(':', '')
combined_emoji_df['label'] = colon_free_labels.str.replace('_', ' ')

# Preview the cleaned labels
combined_emoji_df.head()


In [None]:
# Persist the cleaned label/emoji pairs to CSV, leaving out the row index
processed_csv_path = 'processed_emojis.csv'
combined_emoji_df.to_csv(processed_csv_path, index=False)

# Echo the first rows as plain text
preview = combined_emoji_df.head()
print(preview)

In [None]:
# Import necessary libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the pretrained "t5-small" checkpoint and its tokenizer from Hugging Face.
# T5 (Text-to-Text Transfer Transformer) casts every task as text in -> text out.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare data for T5: each 'label' is a model input, each 'emoji' the matching target
input_texts = combined_emoji_df['label'].tolist()
target_texts = combined_emoji_df['emoji'].tolist()

# Teach the tokenizer the emoji symbols before encoding the targets.
# The stock t5-small SentencePiece vocabulary has no emoji pieces, so without
# this step every target is encoded as <unk>; the model then learns to emit
# <unk>, which decodes to an empty string once special tokens are skipped.
# Registering each distinct emoji as an added token gives it its own id, and
# the embedding matrix is resized so those new ids have rows to train.
emoji_tokens = sorted({text for text in target_texts if isinstance(text, str) and text})
tokenizer.add_tokens(emoji_tokens)
model.resize_token_embeddings(len(tokenizer))

# Tokenize inputs and targets into padded tensors of uniform shape.
# padding=True pads to the longest sequence, truncation=True cuts anything
# longer than max_length (32 tokens for labels, 8 for emoji targets).
input_encodings = tokenizer(input_texts, padding=True, truncation=True, max_length=32, return_tensors="pt")
target_encodings = tokenizer(target_texts, padding=True, truncation=True, max_length=8, return_tensors="pt")

# Dataset and DataLoader
# Define a custom dataset class for organizing the input and target encodings
class EmojiDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Both arguments are mappings indexed by field name; `inputs` must provide
    'input_ids' and 'attention_mask', `targets` must provide 'input_ids'.
    """

    def __init__(self, inputs, targets):
        # Keep references to the encodings; nothing is copied
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # One sample per row of the input ids
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Pull the idx-th row of every input field, then attach the target ids as 'labels'
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = self.targets['input_ids'][idx]
        return sample

# Create a dataset and data loader
# EmojiDataset serves one (input_ids, attention_mask, labels) sample per index;
# the DataLoader shuffles the samples and groups them into batches of 16
dataset = EmojiDataset(input_encodings, target_encodings)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# Training setup
# AdamW updates the model's parameters; lr is the learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Set the model to training mode
# This activates layers such as dropout that behave differently during training
model.train()

# Training loop
for epoch in range(3):  # Train for 3 epochs
    total_loss = 0  # Sum of the per-batch losses seen in this epoch
    for batch in data_loader:
        # Zero the gradients to prevent accumulation from previous steps
        optimizer.zero_grad()

        # Extract input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Padding positions in the targets must not contribute to the loss.
        # The model's cross-entropy ignores label value -100, so replace every
        # pad id with -100. masked_fill returns a new tensor, leaving the
        # dataset's stored encodings untouched.
        labels = batch['labels'].masked_fill(batch['labels'] == tokenizer.pad_token_id, -100)

        # Perform a forward pass; passing labels makes the model compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate the gradients
        loss.backward()

        # Update the model's parameters
        optimizer.step()

        # Accumulate the loss for this batch
        total_loss += loss.item()

    # Print the summed loss for the current epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Save the model and tokenizer
# This allows reloading the model later for inference or fine-tuning
model.save_pretrained("emoji_t5_model")
tokenizer.save_pretrained("emoji_t5_model")

# Translate input text to emoji
# Define a function to generate emoji translations from text inputs
def translate_to_emoji(text):
    """Generate an emoji translation for `text` with the module-level model.

    Args:
        text: Phrase to translate.

    Returns:
        The decoded text of the first generated sequence, with special tokens removed.
    """
    # Inference only: switch off training-specific layers such as dropout
    model.eval()

    # Encode the phrase as a tensor, truncated to at most 32 tokens
    encoding_options = dict(return_tensors="pt", max_length=32, truncation=True)
    encoded_prompt = tokenizer.encode(text, **encoding_options)

    # Beam search (4 beams) over at most 8 output tokens
    generation_options = dict(max_length=8, num_beams=4, early_stopping=True)
    generated = model.generate(encoded_prompt, **generation_options)

    # Decode the first returned sequence into a plain string
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke-test the fine-tuned model on a few single-word prompts
# (expected: an emoji for happiness, for sadness and for love, in that order)
for sample_prompt in ("happy", "sad", "love"):
    print(translate_to_emoji(sample_prompt))


In [None]:
def translate_to_emoji_with_fallback(text):
    """
    Translate `text` to emoji with the trained T5 model, falling back to a
    dataset lookup when the model produces nothing usable.

    Args:
        text: Phrase to translate.

    Returns:
        The model's decoded output when it is non-blank. Otherwise the emoji of
        every dataset row whose label contains one of the words of `text`
        (case-insensitive, literal substring match), de-duplicated in first-seen
        order and joined by spaces; or "No matching emoji found." when no label
        matches.
    """
    model.eval()
    try:
        # Generate output with the model
        input_ids = tokenizer.encode(text, return_tensors="pt", max_length=32, truncation=True)
        outputs = model.generate(input_ids, max_length=8, num_beams=4, early_stopping=True)
        emoji_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception:
        # Deliberate best effort: a failed generation is treated like an empty
        # answer so the dataset lookup below can still respond
        emoji_output = ""

    # A non-blank model answer wins
    if emoji_output.strip():
        return emoji_output

    # Fallback logic: match each word of the phrase against the dataset labels
    labels = combined_emoji_df['label']
    matched_emojis = []
    for word in text.lower().split():
        # regex=False: the word is raw user input and must be matched literally;
        # as a regex, input such as "(" or "c++" would raise re.error
        hits = labels.str.contains(word, na=False, case=False, regex=False)
        matched_emojis.extend(combined_emoji_df.loc[hits, 'emoji'].tolist())

    # The dataset can hold the same pair more than once; keep first occurrences only
    matched_emojis = list(dict.fromkeys(matched_emojis))

    if matched_emojis:
        return " ".join(matched_emojis)
    return "No matching emoji found."

# Interactive loop with fallback
def interactive_emoji_translator_with_fallback():
    """
    Run a prompt loop that translates each entered phrase to emoji.

    Reads phrases with input() until the user types 'exit' (any letter case,
    surrounding whitespace ignored) or the input stream ends / is interrupted.
    Blank lines are skipped. Each phrase is passed to
    translate_to_emoji_with_fallback and the result is printed; a translation
    error is reported and the loop continues.

    Returns:
        None.
    """
    print("Welcome to the Enhanced Emoji Translator!")
    print("Type any phrase and press Enter to get an emoji.")
    print("Type 'exit' to quit the program.")

    while True:
        # Get user input; treat end-of-input or an interrupt as a request to quit
        # instead of letting it escape as a traceback
        try:
            phrase = input("Enter a phrase: ")
        except (EOFError, KeyboardInterrupt):
            print("Exiting the Emoji Translator. Goodbye!")
            break

        command = phrase.strip()

        # Nothing to translate on a blank line; prompt again
        if not command:
            continue

        # Exit condition (tolerates stray spaces around the command)
        if command.lower() == 'exit':
            print("Exiting the Emoji Translator. Goodbye!")
            break

        # Translate the phrase to an emoji
        try:
            emoji_output = translate_to_emoji_with_fallback(phrase)
            print(f"Emoji: {emoji_output}")
        except Exception as e:
            print(f"Error: Could not translate the phrase. {e}")

# Start the interactive session; this call blocks on input() and keeps
# prompting until the user types 'exit'
interactive_emoji_translator_with_fallback()

Welcome to the Enhanced Emoji Translator!
Type any phrase and press Enter to get an emoji.
Type 'exit' to quit the program.
Emoji: No matching emoji found.
Emoji: 🇧🇭 🇺🇦 🧠 🚅 ⛈ 🌧 🚄 🌈 🏳‍🌈 🌦 🚆 ☔ 🇧🇭 🚆 🇺🇦 🌈
Emoji: 🇧🇶 🇲🇬 🇳🇮 🗃 📇 🗂 🎠 🎏 🪚 🥕 💳 🎴 🪪 🤸‍♂ 🤸🏿‍♂ 🤸🏻‍♂ 🤸🏾‍♂ 🤸🏼‍♂ 🤸🏽‍♂ 🚔 🤸 🤸🏿 🤸🏻 🤸🏾 🤸🏼 🤸🏽 🪧 🚓 🚨 🏎 🚃 🧣 🛒 🚋 🤸‍♀ 🤸🏿‍♀ 🤸🏻‍♀ 🤸🏾‍♀ 🤸🏼‍♀ 🤸🏽‍♀ 🧕 🧕🏿 🧕🏻 🧕🏾 🧕🏼 🧕🏽 🥕 🇲🇬 🇳🇮
Emoji: 🥖 🍞 🫓 🥙 🍞
Emoji: 🪺
Emoji: ✈ 🛬 🛫 🪐 🛩 ✈️
Emoji: 🚏 ⏹ 🛑 ⏱ ⏱️
