In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import torch
from transformers import CLIPTokenizer, CLIPTextModel

# Fetch the NLTK resources used by this notebook; packages that are already
# present locally are reported as up-to-date.
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Register tqdm with pandas so that .progress_apply() shows a progress bar
tqdm.pandas()

# Select the compute device: CUDA first, then Apple MPS, otherwise CPU
"""
Reference: https://pytorch.org/get-started/locally/
"""
if torch.cuda.is_available():
    # NVIDIA GPU
    backend_name, status_message = "cuda", "Using NVIDIA GPU (CUDA)"
elif torch.backends.mps.is_available():
    # Metal Performance Shaders (Mac Silicon GPU)
    backend_name, status_message = "mps", "Using Mac GPU (MPS)"
else:
    # No GPU backend detected
    backend_name, status_message = "cpu", "Using CPU"

device = torch.device(backend_name)
print(status_message)

Using NVIDIA GPU (CUDA)


[nltk_data] Downloading package punkt_tab to C:\Users\Jyoti Prakash
[nltk_data]     Uprety\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jyoti Prakash
[nltk_data]     Uprety\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Jyoti Prakash
[nltk_data]     Uprety\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Reading the dataset: three emoji metadata sources stored under ../data.
# The OpenMoji and Emojipedia files are CSV; the LLM emoji file is Parquet.
openmoji_df = pd.read_csv('../data/openmoji.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')
emojipedia_df = pd.read_csv('../data/emojipedia.csv')

In [3]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').

def unicode_to_hex(unicode_str):
    """
    Convert a Unicode code point string to a hyphen-joined hex code.

    Code points may be separated by commas and/or whitespace and may carry a
    "U+" prefix, e.g. 'U+1F600' -> '1F600' and 'U+263A,FE0F' -> '263A-FE0F'.
    Letter case is left untouched.

    Non-string input (e.g. None or NaN from a missing cell) is returned as-is,
    matching how get_first_sentence and clean_text treat missing values, so a
    single empty cell does not abort the whole .progress_apply() call.
    """
    if not isinstance(unicode_str, str):
        return unicode_str

    # Treat commas like whitespace, then split on any run of whitespace
    code_points = unicode_str.replace(",", " ").split()
    # Drop the "U+" prefix from each code point and join with hyphens
    return "-".join(code_point.replace("U+", "") for code_point in code_points)

# Derive a 'hexcode' column for llm_df and emojipedia_df from their Unicode columns
for source_df, unicode_column in ((llm_df, 'unicode'), (emojipedia_df, 'Codepoints Hex')):
    source_df['hexcode'] = source_df[unicode_column].progress_apply(unicode_to_hex)

100%|██████████| 5034/5034 [00:00<00:00, 1252246.39it/s]
100%|██████████| 1885/1885 [00:00<00:00, 1288714.43it/s]


In [4]:
# Making the hexcode uniform in all 3 dataframes for merging by stripping two
# components from every hexcode column:
#   -FE0F : variation selector-16 (requests emoji rather than text presentation)
#   -200D : zero width joiner (glues several code points into one emoji sequence)
hexcode_columns = (
    (openmoji_df, 'hexcode'),
    (openmoji_df, 'skintone_base_hexcode'),
    (llm_df, 'hexcode'),
    (emojipedia_df, 'hexcode'),
)
for frame, column in hexcode_columns:
    for marker in ('-FE0F', '-200D'):
        frame[column] = frame[column].str.replace(marker, '', regex=True)

In [5]:
# Lower-case every hexcode column so the join keys use one consistent casing
for frame, column in (
    (openmoji_df, 'hexcode'),
    (openmoji_df, 'skintone_base_hexcode'),
    (llm_df, 'hexcode'),
    (emojipedia_df, 'hexcode'),
):
    frame[column] = frame[column].str.lower()

In [6]:
# Keep only the emoji we want to model. The three filters are combined into one
# boolean mask and the result is copied, so the 'skintone' assignment below
# writes to an independent frame rather than to a slice of the original
# (this avoids pandas' SettingWithCopyWarning).

# Dropping symbols, extras-openmoji, extras-unicode and flags categories
excluded_groups = ["symbols", "extras-openmoji", "extras-unicode", "flags"]
# Dropping records of skin colors and hair types
modifier_hexcodes = ["1f3fb", "1f3fc", "1f3fd", "1f3fe", "1f3ff", "1f9b0", "1f9b1", "1f9b3", "1f9b2"]

rows_to_keep = (
    ~openmoji_df["group"].isin(excluded_groups)
    & ~openmoji_df["hexcode"].isin(modifier_hexcodes)
    # Drop all records with multiple skintone combinations
    & ~openmoji_df["skintone_combination"].isin(["multiple"])
)
openmoji_df = openmoji_df[rows_to_keep].copy()

# Setting skintone to 0 where no skintone is specified
openmoji_df['skintone'] = openmoji_df['skintone'].fillna(0)

In [7]:
# Checking for duplicates: show every OpenMoji hexcode that occurs more than once
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [8]:
# Checking for duplicates: show every llm_df hexcode that occurs more than once
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts > 1]
print(duplicates)

hexcode
1f3cc-2640               4
1f575-2640               4
1f6b6-1f3fe-2642-27a1    4
1f3c3-2640-27a1          4
1f3c3-1f3ff-2640-27a1    4
                        ..
261d                     2
267e                     2
2139                     2
26f8                     2
1f470-1f3fb-2640         2
Name: count, Length: 1160, dtype: int64


In [9]:
# Checking for duplicates: show every Emojipedia hexcode that occurs more than once
duplicate_counts = emojipedia_df['hexcode'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [10]:
# Removing duplicates: keep=False discards EVERY row whose hexcode is shared with
# another row, so only hexcodes that were unique in llm_df survive (no
# representative row is kept for a duplicated hexcode).
llm_df = llm_df.drop_duplicates(subset=['hexcode'], keep=False)

In [11]:
# Left-join the LLM and Emojipedia frames onto openmoji_df using 'hexcode', so
# every OpenMoji row is kept even when the other sources have no match
merged_df = (
    openmoji_df
    .merge(llm_df, on='hexcode', how='left')
    .merge(emojipedia_df, on='hexcode', how='left')
)

In [12]:
# Summarise the merged frame: row count, columns, non-null counts and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021 entries, 0 to 3020
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  3021 non-null   object 
 1   hexcode                3021 non-null   object 
 2   group                  3021 non-null   object 
 3   subgroups              3021 non-null   object 
 4   annotation             3021 non-null   object 
 5   tags_x                 1406 non-null   object 
 6   openmoji_tags          0 non-null      object 
 7   openmoji_author        3021 non-null   object 
 8   openmoji_date          3021 non-null   object 
 9   skintone               3021 non-null   object 
 10  skintone_combination   1938 non-null   object 
 11  skintone_base_emoji    1938 non-null   object 
 12  skintone_base_hexcode  1938 non-null   object 
 13  unicode_x              3021 non-null   object 
 14  order                  3021 non-null   float64
 15  char

## Handling Tags and Descriptions

In [13]:
def get_first_sentence(text):
    """
    Return only the leading sentence of ``text``.

    The sentence ends at the first '.', '?' or '!' that is followed by
    whitespace or by the end of the string. Surrounding whitespace is ignored
    while searching. Non-string values, and strings in which no such terminator
    is found, are returned unchanged.
    """
    if isinstance(text, str):
        found = re.match(r'(.+?[.!?])(\s|$)', text.strip())
        if found:
            return found.group(1)
    return text

# Reduce each 'Description' value to its first sentence (non-strings pass through)
merged_df["Description"] = merged_df["Description"].apply(get_first_sentence)

In [14]:
import unicodedata

def clean_text(text):
    """
    Reduce a string to a restricted plain-text character set.

    The text is first decomposed with Unicode NFKD normalization, after which
    every run of characters other than ASCII letters, digits, whitespace and
    the punctuation . , ! ? ' " ( ) : ; - is deleted. Non-string values are
    returned untouched.
    """
    if isinstance(text, str):
        decomposed = unicodedata.normalize("NFKD", text)
        return re.sub(r'[^a-zA-Z0-9\s.,!?\'"():;\-\n]+', '', decomposed)
    return text

In [15]:
# def clean_text(text):
#     if not isinstance(text, str) or pd.isna(text) or text.strip().lower() == "nan":  
#         return ""  # Return empty string for NaN or "nan" strings
#     text = text.lower().strip() # Convert to lowercase and remove unnecessary spaces
#     # Keep only letters, numbers, spaces, * and #
#     text = re.sub(r'[^a-z0-9\s*#]', '', text)
#     # Tokenize the text
#     tokens = word_tokenize(text)
#     # Remove stop words
#     tokens = [word for word in tokens if word not in stopwords.words('english')]
#     return ', '.join(tokens)
# 
# def remove_duplicates(text):
#     words = [word.strip() for word in text.split(",")]  # Split by commas and strip spaces
#     unique_words = list(dict.fromkeys(words))
#     return ', '.join(unique_words)  # Join back into a string

In [16]:
# Run clean_text over the annotation column and both description columns
for text_column in ("annotation", "LLM description", "Description"):
    merged_df[text_column] = merged_df[text_column].apply(clean_text)

In [17]:
# Create a mapping from hexcode to Description (only non-null ones)
hexcode_to_description = merged_df[merged_df['Description'].notnull()] \
    .set_index('hexcode')['Description'].to_dict()

# Fill missing Descriptions from the skin-tone base emoji. Series.map looks up
# each skintone_base_hexcode in the mapping (yielding NaN where there is no base
# or the base has no description), and fillna only touches rows whose
# Description is null. This replaces a row-by-row DataFrame.apply with the
# equivalent vectorized operation.
merged_df['Description'] = merged_df['Description'].fillna(
    merged_df['skintone_base_hexcode'].map(hexcode_to_description)
)

In [18]:
# Merge annotation with a description
# Use LLM description if it exists, otherwise use Description
def generate_prompt(row):
    """
    Build the text prompt for one emoji row.

    The prompt starts with row['annotation']. The first usable value among
    row['LLM description'] and row['Description'] (in that order) is appended
    after a '. ' separator. A value is usable when it is neither null nor
    blank: an empty or whitespace-only LLM description no longer produces a
    dangling '. ' suffix, nor does it hide a valid Description.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            'annotation', 'LLM description' and 'Description'.

    Returns:
        The prompt string.
    """
    # Start with annotation (always exists)
    prompt = row['annotation']

    # Try LLM description first, then Description
    for column in ('LLM description', 'Description'):
        extra = row[column]
        # Skip nulls and blank strings
        if pd.notnull(extra) and str(extra).strip():
            prompt += '. ' + extra
            break

    return prompt

# Build one prompt per row (axis=1 passes each row to generate_prompt)
merged_df['prompt'] = merged_df.apply(generate_prompt, axis=1)

In [19]:
# merged_df['prompt'] = merged_df['annotation']

In [20]:
# # Clean openmoji_annotation column
# # Clean llm_tags
# merged_df["cleaned_llm_tags"] = merged_df["tags_y"].progress_apply(clean_text)
# 
# # List of columns to merge (All tags)
# columns_to_merge = ["tags_x", "cleaned_llm_tags"]
# 
# # Fill NaN with empty strings, then merge columns
# merged_df["merged_tags"] = merged_df[columns_to_merge].fillna("").agg(
#     lambda x: ", ".join(filter(None, map(str, x))), axis=1
# )

In [21]:
# # Remove duplicates
# merged_df["final_tags"] = merged_df["merged_tags"].progress_apply(remove_duplicates)

In [22]:
# # Function to handle concatenation with empty strings and NaN values
# def merge_descriptions(row):
#     parts = []
#     if pd.notna(row['annotation']):
#         annotation = row['annotation'].strip()
#         if annotation:  # Ensure it's not empty after stripping
#             parts.append(annotation)
#     if pd.notna(row['LLM description']):
#         description = row['LLM description'].strip()
#         if description:
#             parts.append(description)
#     if pd.notna(row['final_tags']):
#         tags = row['final_tags'].strip()
#         if tags:
#             parts.append(f"Tags: {tags}")  # Ensure the "Tags: " prefix is only added if valid
# 
#     return ". ".join(parts) if parts else None  # Return None if no valid parts
# 
# # Apply the function to each row
# merged_df['merged_description'] = merged_df.apply(merge_descriptions, axis=1)

In [23]:
# merged_df.info()
# merged_df.to_csv('../data/test_emoji_data.csv', index=False)

In [24]:
# # Duplicate smiley emoji records so that the model has enough to learn from
# 
# # Filter rows with group "smileys-emotion"
# smileys_df = merged_df[merged_df['group'] == 'smileys-emotion']
# 
# # Make 2 extra copies of these rows (3 copies in total once appended to the original)
# duplicated_smileys_df = pd.concat([smileys_df] * 2, ignore_index=True)
# 
# # Append back to the original DataFrame
# merged_df = pd.concat([merged_df, duplicated_smileys_df], ignore_index=True)
# 
# # Shuffle the dataset to mix the records
# merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [25]:
# Re-inspect the frame now that the 'prompt' column has been added
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021 entries, 0 to 3020
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  3021 non-null   object 
 1   hexcode                3021 non-null   object 
 2   group                  3021 non-null   object 
 3   subgroups              3021 non-null   object 
 4   annotation             3021 non-null   object 
 5   tags_x                 1406 non-null   object 
 6   openmoji_tags          0 non-null      object 
 7   openmoji_author        3021 non-null   object 
 8   openmoji_date          3021 non-null   object 
 9   skintone               3021 non-null   object 
 10  skintone_combination   1938 non-null   object 
 11  skintone_base_emoji    1938 non-null   object 
 12  skintone_base_hexcode  1938 non-null   object 
 13  unicode_x              3021 non-null   object 
 14  order                  3021 non-null   float64
 15  char

## Embedding Emoji Condition

In [26]:
"""
Reference: https://huggingface.co/docs/transformers/model_doc/clip
"""

# Load CLIP's tokenizer and text model from the same checkpoint, move the
# model to the selected device and switch it to evaluation mode.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)
clip_model = CLIPTextModel.from_pretrained(clip_checkpoint).to(device)
clip_model.eval()

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), ep

In [27]:
"""
Reference: ChatGPT o3-mini-high
Prompt: Write a code to embed my description column in my df using CLIP-vit-base-patch32. Use Mean Pooling + L2 Normalization method to generate embeddings.

Reason: We're using Mean Pooling + L2 Normalization to retain fine-grained meanings related to gender, skin tone, emotions, and objects. We're also using L2 Normalization because they have a consistent scale, reducing variance in GAN training.
"""

def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, ignoring masked positions.

    ``model_output`` must expose ``last_hidden_state``. ``attention_mask`` is
    expanded over the hidden dimension so positions with mask 0 contribute
    nothing to the sum. The divisor is clamped to at least 1e-9 so a sequence
    with no unmasked tokens does not cause a division by zero.
    """
    hidden_states = model_output.last_hidden_state  # (batch_size, sequence_length, hidden_dim)
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

def embed_text(text):
    """
    Embed one prompt with the CLIP text model.

    The token embeddings produced by clip_model are mean-pooled over the
    attention mask and then L2-normalized. Missing or blank text yields an
    all-zero vector instead.

    Parameters:
        text: the string to embed, or a null value.

    Returns:
        A 1-D float32 NumPy array whose length is the text model's hidden size.
    """
    if pd.isna(text) or text.strip() == "":
        # Size the placeholder from the loaded model rather than hard-coding 512,
        # so it stays consistent if a different CLIP checkpoint is loaded.
        return np.zeros(clip_model.config.hidden_size, dtype=np.float32)

    # Tokenize the input text; anything beyond 77 tokens is truncated
    inputs = clip_tokenizer(text, return_tensors="pt", truncation=True, max_length=77)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Disable gradients for inference
    with torch.no_grad():
        output = clip_model(**inputs)

    # Pool the token embeddings (mean pooling)
    pooled_embedding = mean_pooling(output, inputs["attention_mask"])

    # L2-normalize the pooled embedding (always applied, not optional)
    pooled_embedding = torch.nn.functional.normalize(pooled_embedding, p=2, dim=-1)

    return pooled_embedding.squeeze().cpu().numpy().astype(np.float32)

In [28]:
# Apply CLIP embedding to your dataset: embed_text runs once per prompt with a
# tqdm progress bar, and each 'combined_embedding' cell holds a NumPy vector
merged_df["combined_embedding"] = merged_df["prompt"].progress_apply(embed_text)

100%|██████████| 3021/3021 [00:23<00:00, 127.06it/s]


In [29]:
# Preview the first rows, including the new 'prompt' and 'combined_embedding' columns
merged_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,Group,Subgroup,Emoji,Title,DescribedBy,URL,Description,Codepoints Hex,prompt,combined_embedding
0,😀,1f600,smileys-emotion,face-smiling,grinning face,"face, grin",,Emily Jäger,2018-04-18,0,...,smiley,Smiling & Affectionate,😀,Grinning Face,grinning-face,/grinning-face,"A yellow face with simple, open eyes and a bro...",U+1F600,grinning face. This emoji represents a smiling...,"[-0.009248996, -0.047418017, -0.00524062, 0.03..."
1,😃,1f603,smileys-emotion,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,Emily Jäger,2018-04-18,0,...,smiley,Smiling & Affectionate,😃,Grinning Face with Big Eyes,grinning-face-with-big-eyes,/grinning-face-with-big-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F603,grinning face with big eyes. This emoji repres...,"[0.01590147, -0.03315458, -0.010774368, 0.0427..."
2,😄,1f604,smileys-emotion,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,Emily Jäger,2018-04-18,0,...,smiley,Smiling & Affectionate,😄,Grinning Face with Smiling Eyes,grinning-face-with-smiling-eyes,/grinning-face-with-smiling-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F604,grinning face with smiling eyes. This emoji re...,"[-0.00059326674, -0.04860326, -0.004713649, 0...."
3,😁,1f601,smileys-emotion,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,Emily Jäger,2018-04-18,0,...,smiley,Smiling & Affectionate,😁,Beaming Face with Smiling Eyes,beaming-face-with-smiling-eyes,/beaming-face-with-smiling-eyes,A yellow face with smiling eyes and full-tooth...,U+1F601,beaming face with smiling eyes. This emoji rep...,"[-0.00021087428, -0.035330784, -0.004500972, 0..."
4,😆,1f606,smileys-emotion,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,Emily Jäger,2018-04-18,0,...,smiley,Smiling & Affectionate,😆,Grinning Squinting Face,grinning-squinting-face,/grinning-squinting-face,"A yellow face with a broad, open smile and scr...",U+1F606,grinning squinting face. This emoji represents...,"[0.0041811448, -0.039693132, -0.010858626, 0.0..."


## Linking Images

In [30]:
# Define base image path and brands
image_base_path = "../data/tensor_images/"
# brands = ["GoogleEmoji", "JoyPixelsEmoji", "TwitterEmoji"]
brands = ["GoogleEmoji"]

# Directory listings keyed by brand folder path. Each folder is listed once and
# the result reused, instead of calling os.listdir() for every hexcode.
# Re-run this cell to pick up files added after the first lookup.
_brand_file_cache = {}


def _list_brand_files(brand_path):
    """Return the set of file names in brand_path (None if it does not exist), cached per path."""
    if brand_path not in _brand_file_cache:
        _brand_file_cache[brand_path] = (
            frozenset(os.listdir(brand_path)) if os.path.exists(brand_path) else None
        )
    return _brand_file_cache[brand_path]


# Function to find all available image paths for a given hexcode
def get_image_paths(hexcode, base_path=None, brand_names=None):
    """
    Locate the tensor image file of an emoji for each brand.

    Parameters:
        hexcode: emoji hexcode; the file looked for is "<hexcode>.pt".
        base_path: folder containing one sub-folder per brand. Defaults to the
            module-level image_base_path.
        brand_names: brand sub-folder names to search. Defaults to the
            module-level brands list.

    Returns:
        A dict mapping brand name to the image path, containing only the
        brands whose folder exists and holds the expected file.
    """
    base_path = image_base_path if base_path is None else base_path
    brand_names = brands if brand_names is None else brand_names

    image_paths = {}
    expected_filename = f"{hexcode}.pt"  # Adjust based on actual format

    for brand in brand_names:
        brand_path = os.path.join(base_path, brand)
        available_files = _list_brand_files(brand_path)
        if available_files is None:  # Skip if folder doesn't exist
            continue

        if expected_filename in available_files:
            image_paths[brand] = os.path.join(brand_path, expected_filename)

    return image_paths

# Build one output record per (emoji, image) pair, with a tqdm progress bar
expanded_rows = []
for _, row in tqdm(merged_df.iterrows(), total=len(merged_df), desc="Processing Hexcodes"):
    # Fields shared by every record produced from this emoji
    record_base = {
        "hexcode": row["hexcode"],
        "prompt": row["prompt"],
        "skintone": row["skintone"],
        "combined_embedding": row["combined_embedding"],
    }

    # get_image_paths returns a {brand: path} dict; when it is empty, fall back
    # to a single record whose image_path is None
    found_paths = list(get_image_paths(row["hexcode"]).values()) or [None]
    expanded_rows.extend({**record_base, "image_path": image_path} for image_path in found_paths)

# Convert to DataFrame
expanded_df = pd.DataFrame(expanded_rows)

# Optional: Drop rows where no image is found
# expanded_df = expanded_df.dropna(subset=["image_path"]).reset_index(drop=True)

Processing Hexcodes: 100%|██████████| 3021/3021 [00:07<00:00, 402.39it/s]


In [31]:
# Inspect the expanded frame's columns, non-null counts and dtypes
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021 entries, 0 to 3020
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   hexcode             3021 non-null   object
 1   prompt              3021 non-null   object
 2   skintone            3021 non-null   object
 3   combined_embedding  3021 non-null   object
 4   image_path          3021 non-null   object
dtypes: object(5)
memory usage: 118.1+ KB


In [32]:
# Convert the skintone column to a numeric dtype. errors='raise' makes this cell
# fail on any value that cannot be parsed, rather than silently producing NaN.
expanded_df['skintone'] = pd.to_numeric(expanded_df['skintone'], errors='raise')

In [33]:
output_file = '../data/processed_emoji_dataset.parquet'
temp_file = output_file + '.tmp'

# Write to a temporary file first and only then swap it into place. Deleting the
# existing dataset before writing would leave no dataset at all if to_parquet
# failed part-way (e.g. on a serialization error).
try:
    expanded_df.to_parquet(temp_file, index=False)
    # os.replace overwrites an existing destination file
    os.replace(temp_file, output_file)
finally:
    # Remove a leftover temporary file after a failed write
    if os.path.exists(temp_file):
        os.remove(temp_file)