<a href="https://colab.research.google.com/github/King-Rian/Project-3/blob/main/Copy_of_Project_3_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install emoji library
!pip install emoji
!pip install torch
from google.colab import files
import zipfile
import json
import emoji
import pandas as pd
import torch
import re
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, AdamW, T5Tokenizer, T5ForConditionalGeneration
from google.colab import files


Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [2]:
# Define a function to test the text to emoji output to see matches
def check_output_for_match(text, emoji_dict):
  """Checks if a given text matches any emoji labels in the dictionary.

  Args:
    text: The text to match against emoji labels.
    emoji_dict: A DataFrame containing emoji labels and their corresponding emojis.

  Returns:
    A DataFrame containing matched emojis.
  """

  matched_emojis = emoji_dict[emoji_dict['label'].str.contains(text, case=False)]
  return matched_emojis

# Data source number 1: [EmojiNet](https://www.kaggle.com/datasets/rtatman/emojinet?select=emojis.json)

In [None]:
# Retrieve the 'emojis.json' zippped file from your local drive

# Step 1: Upload the file
uploaded = files.upload()

# Step 2: Extract the ZIP file
with zipfile.ZipFile('emojis.json.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Step 3: Load the extracted JSON file
emoji_file = 'emojis.json'

# Using pandas
emoji_df = pd.read_json(emoji_file)
print("Dataset loaded successfully. Sample data:")
emoji_df.head(5)


In [None]:
# Check the shape, features, and datatypes of the df
emoji_df.info()

In [None]:
# Clean up data set # 1
# Set options to display all rows and columns
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns

# Create a new emojis DataFrame with 'name' and 'unicode'
key_value_pairs_df = emoji_df[['name', 'unicode']].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Process the 'name' column to include : at the beginning and end of the emoji name
key_value_pairs_df['name'] = key_value_pairs_df['name'].apply(
    lambda x: f":{x.split(':')[0].strip()}:"  # Remove everything after the first colon and add colons
)

# Convert the unicode column to emoji symbols
key_value_pairs_df['emoji'] = key_value_pairs_df['name'].apply(emoji.emojize)

# Remove rows where 'emoji' contains text with a beginning and ending ':' - which means that emoji was not found for that :name:
filtered_df = key_value_pairs_df[~key_value_pairs_df['emoji'].str.match(r'^:.*:$')].copy()  # Use .copy() here as well

# Remove rows where 'unicode' contains multiple Unicode values (separated by a space)
filtered_df = filtered_df[~filtered_df['unicode'].str.contains(' ', na=False)]

# Create a new df and rename features to read for concatenation with the next data set
new_filtered_df = filtered_df[['name', 'emoji']].copy()  # Use .copy() here as well
new_filtered_df = new_filtered_df.rename(columns={'name': 'label'})

# Display the DataFrame with emoji symbols
new_filtered_df.head()

In [None]:
# Test output with customer function
test_df= check_output_for_match('car', new_filtered_df)
test_df

# Data set number 2: [Full Emoji List, v16.0](https://unicode.org/emoji/charts/full-emoji-list.html)

In [24]:
# Try to upload the data set from local before proceding with web scraping
try:
    # Attempt to upload the file
    uploaded = files.upload()
    # Read the uploaded CSV file into a DataFrame
    full_emoji_list_df = pd.read_csv('full_emoji_list.csv')
    print("File 'full_emoji_list.csv' uploaded and loaded successfully.")

except FileNotFoundError:
    print("File 'full_emoji_list.csv' not found. Proceeding to web scraping.")

    # If the file is not found, run your web scraping code
    import requests
    from bs4 import BeautifulSoup

    def extract_tables(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')
        combined_data = []

        for table in tables:
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all(['td', 'th'])
                cols = [col.text.strip() for col in cols]
                combined_data.append(cols)

        return combined_data

    # Run the main function and display the DataFrame
    url = "http://www.unicode.org/emoji/charts/full_emoji_list.html"
    combined_data = extract_tables(url)

    if combined_data:
        full_emoji_list_df = pd.DataFrame(combined_data)
        full_emoji_list_df.rename(columns={0: "No", 1: "Code", 2: "Browser", 3: "Sample", 4: "GMail", 5: "SB", 6: "DCM", 7: "KDDI", 8: "CLDR Short Name"}, inplace=True)
        full_emoji_list_df = full_emoji_list_df.iloc[3:]
        full_emoji_list_df = full_emoji_list_df[['Browser', 'CLDR Short Name']].rename(columns={'Browser': 'emoji', 'CLDR Short Name': 'label'})

    full_emoji_list_df.head()

Saving full_emoji_list.csv to full_emoji_list (1).csv
File 'full_emoji_list.csv' uploaded and loaded successfully.


In [31]:
# Check for null values
full_emoji_list_df.isnull().sum()

# Get rid of null values
full_emoji_list_df = full_emoji_list_df.dropna()
full_emoji_list_df.isnull().sum()

# Delete label = 'CLDR Short Name'
full_emoji_list_df = full_emoji_list_df[full_emoji_list_df['label'] != 'CLDR Short Name']

full_emoji_list_df.tail()


Unnamed: 0,emoji,label
2102,🇿🇲,flag Zambia
2103,🇿🇼,flag Zimbabwe
2106,🏴󠁧󠁢󠁥󠁮󠁧󠁿,flag England
2107,🏴󠁧󠁢󠁳󠁣󠁴󠁿,flag Scotland
2108,🏴󠁧󠁢󠁷󠁬󠁳󠁿,flag Wales


In [32]:
# Get rid of special characters
full_emoji_list_df['label'] = full_emoji_list_df['label'].str.replace(r'[^\w\s]', '', regex=True)

# Convert the DataFrame to a CSV file
full_emoji_list_df.to_csv('full_emoji_list.csv', index=False)

# Download the CSV file
files.download('full_emoji_list.csv')

full_emoji_list_df.info()
full_emoji_list_df.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<class 'pandas.core.frame.DataFrame'>
Index: 1843 entries, 0 to 2108
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   emoji   1843 non-null   object
 1   label   1843 non-null   object
dtypes: object(2)
memory usage: 43.2+ KB


Unnamed: 0,emoji,label
0,🤥,lying face
1,🫨,shaking face
2,🙂‍↔️,head shaking horizontally
3,🙂‍↕️,head shaking vertically
6,😌,relieved face


# Data set number 3: [emoji Python Library, V2.14.0](https://carpedm20.github.io/emoji/docs/index.html)

In [None]:
# Create an emoji dictionary with labels and their Unicode values from https://carpedm20.github.io/emoji/docs/index.html
emoji_dict = {emoji.demojize(e): e for e in emoji.EMOJI_DATA}

# Show how many key value pairs we retrieved from emoji library
records = len(emoji_dict)
print(f"Number of records: {records}")

# Create new df for use in concatenation the two data frames
emoji_dictionary_df = pd.DataFrame(emoji_dict.items(), columns=['label', 'emoji'])
emoji_dictionary_df.head()
#emoji_dictionary_df.info()

In [None]:
# Test output with customer function
test_df= check_output_for_match('car', emoji_dictionary_df)
test_df

In [None]:
# Concatenate both dataframes / sources and drop the index
combined_emoji_df = pd.concat([emoji_dictionary_df, full_emoji_list_df, new_filtered_df], ignore_index=True)

# Show the key pair data frame with training data
combined_emoji_df.info()
combined_emoji_df.head()

In [None]:
# Make sure 'label' field is lower case, contains no special characters, and has a space between words
# Ensure all values in 'label' column are strings before applying regex
combined_emoji_df['label'] = combined_emoji_df['label'].astype(str)

# Modify the 'label' column: convert to lowercase, strip spaces, remove non-alphanumeric characters, and replace underscores with spaces
combined_emoji_df['label'] = combined_emoji_df['label'].str.lower().str.strip()

# Use regex to remove non-alphanumeric characters, but keep spaces and replace underscores with spaces
combined_emoji_df['label'] = combined_emoji_df['label'].apply(lambda x: re.sub(r'[^a-z0-9\s_]', '', x))  # Remove non-alphanumeric except _

# Replace underscores with spaces
combined_emoji_df['label'] = combined_emoji_df['label'].apply(lambda x: x.replace('_', ' '))

# Ensure a single space between words (in case there are multiple spaces)
combined_emoji_df['label'] = combined_emoji_df['label'].apply(lambda x: re.sub(r'\s+', ' ', x))

# Display the modified dataframe
combined_emoji_df.info()
print(combined_emoji_df.head())


In [None]:
# Define a function to count the number of words in a string
def count_words(text):
    return len(text.split())

# Apply the function to the 'label' column to filter rows with more than 2 words
## combined_emoji_df = combined_emoji_df[combined_emoji_df['label'].apply(count_words) <= 1] ## test without

# Display the filtered DataFrame
print(combined_emoji_df)


In [None]:
# Test output with customer function
test_df= check_output_for_match('car', combined_emoji_df)
test_df

In [None]:
# Use simple random oversampling to improve model output

# Count the frequency of each emoji
emoji_counts = combined_emoji_df['emoji'].value_counts()

# Find the emoji with the maximum count
max_count = emoji_counts.max()

# Create an empty list to store the oversampled data
oversampled_data = []

# Iterate through each emoji class and oversample
for emoji, count in emoji_counts.items():
    # Calculate the number of repetitions needed
    n_repeats = max_count - count
    # Get the rows corresponding to this emoji
    emoji_data = combined_emoji_df[combined_emoji_df['emoji'] == emoji]
    # Sample (with replacement) to match the maximum count
    oversampled_data.append(emoji_data.sample(n=n_repeats, replace=True))

# Concatenate the oversampled data back together
oversampled_df = pd.concat(oversampled_data, axis=0)

# Combine the original dataset with the oversampled data
combined_emoji_df = pd.concat([combined_emoji_df, oversampled_df], axis=0)

# Shuffle the final dataset
combined_emoji_df = combined_emoji_df.sample(frac=1).reset_index(drop=True)

print(f"Oversampled dataset shape: {combined_emoji_df.shape}")
combined_emoji_df.head()


In [None]:
# Test output with customer function
test_df= check_output_for_match('car', combined_emoji_df)
test_df

# Augment the data by getting synonyms of the labels to help train the model

In [41]:
import pandas as pd
import nltk
from nltk.corpus import wordnet

# Download the wordnet data
nltk.download('wordnet')

# Function to find synonyms
def get_synonyms(word, limit=2):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() not in synonyms:
                synonyms.append(lemma.name())
            if len(synonyms) == limit:
                break
        if len(synonyms) == limit:
            break
    return synonyms

# Create a new DataFrame to store the results
new_data = []

# Loop through each row in the original DataFrame
for index, row in combined_emoji_df.iterrows():
    label = row['label']
    emoji = row['emoji']
    synonyms = get_synonyms(label)

    # For each synonym, create a new record
    for synonym in synonyms:
        if synonym != label:
            new_data.append({'label': synonym, 'emoji': emoji})

# Convert the new data into a DataFrame
df_new = pd.DataFrame(new_data)

# Function to replace underscores with spaces
def replace_underscores(text):
  return text.replace('_', ' ')

# Apply the function to the 'label' column
df_new['label'] = df_new['label'].apply(replace_underscores)

# Display the new DataFrame
print(df_new)


[nltk_data] Downloading package wordnet to /root/nltk_data...


                                          label    emoji
0                                     sea squab        🐡
1                               basketball game        🏀
2                                          info        ℹ
3                              Christian church        ⛪
4                                      Aquarius        ♒
5                                  Water Bearer        ♒
6                                         orang        🦧
7                                      doorbell        🔔
8                                         irons       ⛓️
9                                        clinch        🗜
10                                       lingua        👅
11                          Meleagris gallopavo        🦃
12                                        phone       ☎️
13                                     Djibouti       🇩🇯
14                          capital of Djibouti       🇩🇯
15                                      wild ox        🐂
16                             

In [42]:
# Concatenate the two DataFrames
combined_emoji_df = pd.concat([combined_emoji_df, df_new], ignore_index=True)

# Display the concatenated DataFrame
print(combined_emoji_df)


                                                   label      emoji
0                                             mouse face          🐭
1                                               blowfish          🐡
2                woman technologist mediumdark skin tone       👩🏾‍💻
3                                             billed cap          🧢
4                                       light blue heart          🩵
5                          woman artist medium skin tone       👩🏽‍🎨
6                     kiss man man mediumlight skin tone  👨🏼‍❤‍💋‍👨🏼
7               person playing handball medium skin tone         🤾🏽
8                           raising hands dark skin tone         🙌🏿
9                                 person getting haircut          💇
10                    construction worker dark skin tone         👷🏿
11              couple with heart man man dark skin tone    👨🏿‍❤‍👨🏿
12                                   boy light skin tone         👦🏻
13                                              

In [None]:
'''
# Apply the function to the 'label' column to filter rows with more than 2 words
combined_emoji_df = combined_emoji_df[combined_emoji_df['label'].apply(count_words) <= 1]

# Display the filtered DataFrame
print(combined_emoji_df)

                   label    emoji
0              detective        🕵
1                  mouse        🐁
2               lipstick        💄
3             paintbrush       🖌️
4               honduras       🇭🇳
5                 norway       🇳🇴
6                customs        🛃
7                 spider        🕷
8                    rat        🐀
9             montenegro       🇲🇪
10                 chile       🇨🇱
11            fourthirty        🕟
12                  niue       🇳🇺
13            uzbekistan       🇺🇿
14                shovel        🪏
15              maldives       🇲🇻
16              motorway        🛣
17                 snail        🐌
18                 olive        🫒
19           firefighter      🧑‍🚒
20               avocado        🥑
21          supervillain        🦹
22                trkiye       🇹🇷
23               glasses        👓
24          technologist      🧑‍💻
25                  herb        🌿
26            basketball        🏀
27               postbox        📮
28            

In [43]:
# Test output with customer function
test_df= check_output_for_match('car', combined_emoji_df)
test_df

Unnamed: 0,label,emoji
377,man cartwheeling medium skin tone,🤸🏽‍♂
484,person cartwheeling medium skin tone,🤸🏽
570,carousel horse,🎠
645,card file box,🗃
758,identification card,🪪
820,person cartwheeling mediumlight skin tone,🤸🏼
845,man cartwheeling,🤸‍♂
847,woman cartwheeling dark skin tone,🤸🏿‍♀
1113,person cartwheeling light skin tone,🤸🏻
1116,man cartwheeling dark skin tone,🤸🏿‍♂


In [44]:
# Identify rows where 'label' and 'emoji' have the same values
rows_to_remove = combined_emoji_df[combined_emoji_df['label'] == combined_emoji_df['emoji']]

# Remove these rows
combined_emoji_df = combined_emoji_df.drop(rows_to_remove.index)

# Display the updated DataFrame
print("\nUpdated DataFrame:")
print(combined_emoji_df)



Updated DataFrame:
                                                   label      emoji
0                                             mouse face          🐭
1                                               blowfish          🐡
2                woman technologist mediumdark skin tone       👩🏾‍💻
3                                             billed cap          🧢
4                                       light blue heart          🩵
5                          woman artist medium skin tone       👩🏽‍🎨
6                     kiss man man mediumlight skin tone  👨🏼‍❤‍💋‍👨🏼
7               person playing handball medium skin tone         🤾🏽
8                           raising hands dark skin tone         🙌🏿
9                                 person getting haircut          💇
10                    construction worker dark skin tone         👷🏿
11              couple with heart man man dark skin tone    👨🏿‍❤‍👨🏿
12                                   boy light skin tone         👦🏻
13                          

In [45]:
# Save the DataFrame to a CSV file
combined_emoji_df.to_csv('coombined_emoji_df.csv', index=False)

# Download the CSV file to your local machine
files.download('coombined_emoji_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Upload the combined_df from a file so that you don't have to clean and concat the 3 different data sets

In [46]:
try:
    # Attempt to upload the file
    uploaded = files.upload()
    # Read the uploaded CSV file into a DataFrame
    coombined_emoji_df = pd.read_csv('coombined_emoji_df.csv')
    print("File 'coombined_emoji_df.csv' uploaded and loaded successfully.")

except FileNotFoundError:
    print("File 'coombined_emoji_df.csv' not found. Proceeding to web scraping.")

Saving coombined_emoji_df.csv to coombined_emoji_df (1).csv
File 'coombined_emoji_df.csv' uploaded and loaded successfully.


# Run the model

In [48]:
# Count the number of rows where the 'label' column has more than 1 word
count_multiple_words = combined_emoji_df['label'].apply(lambda x: len(x.split()) > 1).sum()

# Print the count
print(count_multiple_words)
combined_emoji_df.info()


14096
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18782 entries, 0 to 18781
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   18782 non-null  object
 1   emoji   18782 non-null  object
dtypes: object(2)
memory usage: 293.6+ KB


In [None]:
# Initialize the T5 tokenizer and model
# Load the "t5-small" model and tokenizer from Hugging Face's transformers library
# T5 (Text-to-Text Transfer Transformer) is a model capable of various NLP tasks
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare data for T5
# Convert the 'label' column to a list of input texts and the 'emoji' column to a list of target texts
# These will be the input-output pairs for the model training
input_texts = combined_emoji_df['label'].tolist()  # Use 'label' for model input
target_texts = combined_emoji_df['emoji'].tolist()      # Use 'emoji' for model output

# Replace or remove invalid elements (e.g., NaN, None) with empty strings
# This ensures that the tokenizer only processes strings.
target_texts = combined_emoji_df['emoji'].apply(lambda x: str(x) if pd.notna(x) else "").tolist()  # This ensures that the tokenizer only processes strings

# Tokenize inputs and targets
# Tokenize input texts and target texts for the model, ensuring uniform tensor shapes
# Apply padding and truncation to control the sequence length
# max_length specifies the maximum token length for inputs and outputs
input_encodings = tokenizer(input_texts, padding=True, truncation=True, max_length=32, return_tensors="pt")
target_encodings = tokenizer(target_texts, padding=True, truncation=True, max_length=8, return_tensors="pt")

# Dataset and DataLoader
# Define a custom dataset class for organizing the input and target encodings
class EmojiDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        # Initialize with tokenized inputs and targets
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        # Return the total number of samples in the dataset
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        # Retrieve a single sample (input, attention mask, and labels) by index
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.targets['input_ids'][idx]
        }

# Create a dataset and data loader
# Use the custom dataset class to organize tokenized inputs and targets
# DataLoader splits the dataset into manageable batches for training
dataset = EmojiDataset(input_encodings, target_encodings)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

# Training setup
# Define an optimizer (AdamW) to update the model's parameters during training
# Set the learning rate (lr) for the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01) ## increased lr and added weight decay

# Set the model to training mode
# This activates certain layers like dropout that are specific to training
model.train()

# Training loop
# Set number of epochs
num_epochs = 5

# Train the model over multiple epochs
for epoch in range(num_epochs):
    total_loss = 0  # Track the total loss for this epoch
    for batch in data_loader:
        # Zero the gradients to prevent accumulation from previous steps
        optimizer.zero_grad()

        # Extract input_ids, attention_mask, and labels from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Perform a forward pass and compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate the gradients
        loss.backward()

        # Update the model's parameters
        optimizer.step()

        # Accumulate the loss for this batch
        total_loss += loss.item()

    # Print the loss for the current epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Save the model and tokenizer
# This allows reloading the model later for inference or fine-tuning
model.save_pretrained("emoji_t5_model")
tokenizer.save_pretrained("emoji_t5_model")

# Translate input text to emoji
# Define a function to generate emoji translations from text inputs
def translate_to_emoji(text):
    # Set the model to evaluation mode (disables dropout layers)
    model.eval()

    # Tokenize the input text and convert it to a tensor
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=32, truncation=True)

    # Generate output tokens using beam search for better quality translations
    outputs = model.generate(input_ids, max_length=8, num_beams=8, early_stopping=True) ## updated num_beams=8 (from 4)

    # Decode the generated tokens to a human-readable string
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the translation
# Provide sample inputs to test the model's ability to generate emoji translations
print(translate_to_emoji("happy"))  # Expected to output an emoji for happiness
print(translate_to_emoji("sad"))    # Expected to output an emoji for sadness
print(translate_to_emoji("love"))   # Expected to output an emoji for love


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 523.1190851032734


In [None]:
def translate_to_emoji_with_fallback(text):
    """
    Translates the input text to an emoji using the trained T5 model.
    Falls back to direct mapping if model output is empty or invalid.
    Returns only one emoji for each word entered.
    """
    model.eval()
    try:
        # Generate output with the model
        input_ids = tokenizer.encode(text, return_tensors="pt", max_length=32, truncation=True)
        outputs = model.generate(input_ids, max_length=8, num_beams=4, early_stopping=True)
        emoji_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # emoji_output = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

        # Check if the output is valid
        if emoji_output.strip() == "":
            raise ValueError("Empty model output.")

        return emoji_output
    except Exception:
        # Fallback logic: Match words to the emoji dataset
        words = text.lower().split()
        matched_emojis = []

        for word in words:
            # Find the first matching emoji for the word
            emoji_match = combined_emoji_df[combined_emoji_df['label'].str.contains(word, na=False, case=False)]['emoji']
            if not emoji_match.empty:
                matched_emojis.append(emoji_match.values[0])  # Get the first emoji only

        if matched_emojis:
            return " ".join(matched_emojis)
        else:
            return "No matching emoji found."

# Interactive loop with fallback
def interactive_emoji_translator_with_fallback():
    print("Welcome to the Enhanced Emoji Translator!")
    print("Type any phrase and press Enter to get an emoji.")
    print("Type 'exit' to quit the program.")

    while True:
        # Get user input
        phrase = input("Enter a phrase: ")

        # Exit condition
        if phrase.lower() == 'exit':
            print("Exiting the Emoji Translator. Goodbye!")
            break

        # Translate the phrase to an emoji
        try:
            emoji_output = translate_to_emoji_with_fallback(phrase)
            # for idx, emoji_output in enumerate(emoji_output): print(f"Output {idx+1}: {emoji_output}")
            print(f"Emoji: {emoji_output}")
        except Exception as e:
            print(f"Error: Could not translate the phrase. {e}")

# Run the interactive translator with fallback
interactive_emoji_translator_with_fallback()

Welcome to the Enhanced Emoji Translator!
Type any phrase and press Enter to get an emoji.
Type 'exit' to quit the program.
Enter a phrase: dog food
Emoji: 🐕
Enter a phrase: dog dirt
Emoji: 🐕
Enter a phrase: pie
Emoji: 🥧
Enter a phrase: pie face
Emoji: 🥧 🦲
Enter a phrase: apple pie
Emoji: 🍍 🥧
Enter a phrase: exit
Exiting the Emoji Translator. Goodbye!
