In [33]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Download NLTK data files (only need to run once)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Apply tqdm to all .apply() functions by using progress_apply
tqdm.pandas()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Reading the dataset
openmoji_df = pd.read_csv('../data/openmoji.csv')
emojipedia_df = pd.read_csv('../data/emojipedia.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')

In [35]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').

def unicode_to_hex(unicode_str):
    unicode_str = unicode_str.replace(",", " ")  # Replace commas with spaces
    # First splitting the input string to a list of substrings
    # Loops though each substring
    # Removes the U+ prefix from each substring
    hex_values = [u.replace("U+", "") for u in unicode_str.split()]
    # Join the values with hyphens
    return "-".join(hex_values)

# Convert 'unicode' column in emojipedia_df and llm_df to 'hexcode'
emojipedia_df['hexcode'] = emojipedia_df['Codepoints Hex'].progress_apply(unicode_to_hex)
llm_df['hexcode'] = llm_df['unicode'].progress_apply(unicode_to_hex)

100%|██████████| 1885/1885 [00:00<00:00, 673217.22it/s]
100%|██████████| 5034/5034 [00:00<00:00, 1202056.72it/s]


In [36]:
# Making the hexcode uniform in all 3 dataframes for merging
# removing -f30f (differentiation between image type emoji and textual type emoji)
# removing -200d (differentiation for emoji with skin-tone)

openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-FE0F', '', regex=True)
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-200D', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-FE0F', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-200D', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-FE0F', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-200D', '', regex=True)

In [37]:
# openmoji_df = openmoji_df[openmoji_df["group"] == "smileys-emotion"]

In [38]:
# Removing Skin tones
# openmoji_df = openmoji_df[~openmoji_df['hexcode'].str.contains('1F3FB|1F3FC|1F3FD|1F3FE|1F3FF', regex=True)]

In [39]:
# Checking for duplicates
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [40]:
# Checking for duplicates
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3C3-1F3FD-2640-27A1           4
1F6B6-1F3FF-2642-27A1           4
1F6B6-1F3FF-2640-27A1           4
1F6B6-1F3FE-2640-27A1           4
1F6B6-1F3FC-2640-27A1           4
                               ..
1F9D1-1F3FC-2764-1F9D1-1F3FD    2
1F469-1F3FF-2764-1F468-1F3FD    2
1F9D1-1F3FD-2764-1F9D1-1F3FF    2
1F327                           2
2601                            2
Name: count, Length: 1160, dtype: int64


In [41]:
duplicate_counts = emojipedia_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [42]:
# Removing duplicates
openmoji_df = openmoji_df[~openmoji_df.duplicated(subset=['hexcode'], keep=False)]
llm_df = llm_df[~llm_df.duplicated(subset=['hexcode'], keep=False)]

In [43]:
# Merge the dataframes on 'hexcode'
merged_df = openmoji_df.merge(emojipedia_df, on='hexcode', how='left')  # Changed outer merge to left
merged_df = merged_df.merge(llm_df, on='hexcode', how='left')  # Changed outer merge to left
# Convert hexcode to lowercase
merged_df['hexcode'] = merged_df['hexcode'].str.lower()

In [44]:
merged_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,Title,DescribedBy,URL,Description,Codepoints Hex,character,unicode_y,short description,tags_y,LLM description
0,😀,1f600,smileys-emotion,face-smiling,grinning face,"face, grin",,Emily Jäger,2018-04-18,,...,Grinning Face,grinning-face,/grinning-face,"A yellow face with simple, open eyes and a bro...",U+1F600,😀,U+1F600,GRINNING FACE,"[smiling, happy, amusement, positive, friendly...",This emoji represents a smiling face with a br...
1,😃,1f603,smileys-emotion,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,Emily Jäger,2018-04-18,,...,Grinning Face with Big Eyes,grinning-face-with-big-eyes,/grinning-face-with-big-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F603,😃,U+1F603,GRINNING FACE WITH BIG EYES,"[smiling, happiness, joy, amusement, face, emo...",This emoji represents a smiling face with a wi...
2,😄,1f604,smileys-emotion,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,Emily Jäger,2018-04-18,,...,Grinning Face with Smiling Eyes,grinning-face-with-smiling-eyes,/grinning-face-with-smiling-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F604,😄,U+1F604,GRINNING FACE WITH SMILING EYES,"[smiling, happy, amusement, joy, laughter, con...",This emoji represents a smiling face with happ...
3,😁,1f601,smileys-emotion,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,Emily Jäger,2018-04-18,,...,Beaming Face with Smiling Eyes,beaming-face-with-smiling-eyes,/beaming-face-with-smiling-eyes,A yellow face with smiling eyes and full-tooth...,U+1F601,😁,U+1F601,BEAMING FACE WITH SMILING EYES,"[happy, joy, delight, smiling, face, emotion]",This emoji represents a beaming face with smil...
4,😆,1f606,smileys-emotion,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,Emily Jäger,2018-04-18,,...,Grinning Squinting Face,grinning-squinting-face,/grinning-squinting-face,"A yellow face with a broad, open smile and scr...",U+1F606,😆,U+1F606,GRINNING SQUINTING FACE,"[smiling, happy, humor, amusement, laughter, f...",This emoji represents a smiling face with squi...


In [45]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  168 non-null    object 
 1   hexcode                168 non-null    object 
 2   group                  168 non-null    object 
 3   subgroups              168 non-null    object 
 4   annotation             168 non-null    object 
 5   tags_x                 168 non-null    object 
 6   openmoji_tags          0 non-null      object 
 7   openmoji_author        168 non-null    object 
 8   openmoji_date          168 non-null    object 
 9   skintone               0 non-null      object 
 10  skintone_combination   0 non-null      object 
 11  skintone_base_emoji    0 non-null      object 
 12  skintone_base_hexcode  0 non-null      object 
 13  unicode_x              168 non-null    object 
 14  order                  168 non-null    float64
 15  Group 

In [46]:
# Select and rename required columns
final_df = merged_df.copy()
final_df = final_df[['emoji', 'hexcode', 'subgroups', 'annotation', 'tags_x', 'openmoji_tags', 'Description', 'tags_y', 'LLM description']]
final_df.columns = [
    'emoji', 'hexcode', 'subgroups', 'openmoji_annotation', 'openmoji_tags_1', 'openmoji_tags_2', 'emojipedia_description', 'llm_tags', 'llm_description'
]

In [47]:
final_df.head()

Unnamed: 0,emoji,hexcode,subgroups,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description
0,😀,1f600,face-smiling,grinning face,"face, grin",,"A yellow face with simple, open eyes and a bro...","[smiling, happy, amusement, positive, friendly...",This emoji represents a smiling face with a br...
1,😃,1f603,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,"A yellow face with smiling eyes and a broad, o...","[smiling, happiness, joy, amusement, face, emo...",This emoji represents a smiling face with a wi...
2,😄,1f604,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,"A yellow face with smiling eyes and a broad, o...","[smiling, happy, amusement, joy, laughter, con...",This emoji represents a smiling face with happ...
3,😁,1f601,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,A yellow face with smiling eyes and full-tooth...,"[happy, joy, delight, smiling, face, emotion]",This emoji represents a beaming face with smil...
4,😆,1f606,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,"A yellow face with a broad, open smile and scr...","[smiling, happy, humor, amusement, laughter, f...",This emoji represents a smiling face with squi...


## Handling Tags and Descriptions

In [48]:
def clean_text(text):
    if not isinstance(text, str) or pd.isna(text) or text.strip().lower() == "nan":  
        return ""  # Return empty string for NaN or "nan" strings
    text = text.lower().strip() # Convert to lowercase and remove unnecessary spaces
    # Keep only letters, numbers, spaces, * and #
    text = re.sub(r'[^a-z0-9\s*#]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize the tokens
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ', '.join(tokens)

def remove_duplicates(text):
    words = [word.strip() for word in text.split(",")]  # Split by commas and strip spaces
    unique_words = sorted(set(word.strip() for word in text.split(",")))
    return ', '.join(unique_words)  # Join back into a string

In [49]:
# Clean openmoji_annotation column
# final_df["cleaned_annotations"] = final_df["openmoji_annotation"].progress_apply(clean_text)
# Clean llm_tags
final_df["cleaned_llm_tags"] = final_df["llm_tags"].progress_apply(clean_text)

# List of columns to merge
columns_to_merge = ["openmoji_tags_1", "openmoji_tags_2", "cleaned_llm_tags"]  # Not merging annotations

# Fill NaN with empty strings, then merge columns
final_df["merged_tags"] = final_df[columns_to_merge].fillna("").agg(
    lambda x: ", ".join(filter(None, map(str, x))), axis=1
)

100%|██████████| 168/168 [00:00<00:00, 601744.72it/s]


In [50]:
# Remove duplicates
final_df["final_tags"] = final_df["merged_tags"].progress_apply(remove_duplicates)

100%|██████████| 168/168 [00:00<00:00, 388233.10it/s]


In [51]:
# # Limiting only 2 sentences because the content is too long with too many references to other emoji, which might be more confusing later for the model to learn.
# 
# """
# Reference: Chat-GPT-4o
# Prompt: My df_images has a emojipedia_description column. Only keep the first two sentences in the column and remove the rest.
# """
# # Function to keep only the first two sentences
# def keep_first_two_sentences(description):
#     if pd.isna(description):  # Handle missing values (NaNs)
#         return description
#     sentences = re.split(r'(?<=[.!?]) +', description.strip())  # Split by sentence-ending punctuation (., !, ?)
#     return ' '.join(sentences[:2])  # Return only the first two sentences
# 
# # Apply the function to the 'llm_description' column
# final_df['emojipedia_description'] = final_df['emojipedia_description'].progress_apply(keep_first_two_sentences)

In [52]:
# # Function to handle concatenation with empty strings and NaN values
# def merge_descriptions(row):
#     parts = []
#     if pd.notna(row['emojipedia_description']) and row['emojipedia_description'].strip():
#         parts.append(row['emojipedia_description'].strip())
#     if pd.notna(row['llm_description']) and row['llm_description'].strip():
#         parts.append(row['llm_description'].strip())
#     if pd.notna(row['final_tags']) and row['final_tags'].strip():
#         parts.append("Tags: " + row['final_tags'].strip())
#     
#     return " ".join(parts)
# 
# # Apply the function to each row
# final_df['merged_description'] = final_df.apply(merge_descriptions, axis=1)

In [53]:
# Function to handle concatenation with empty strings and NaN values
def merge_descriptions(row):
    parts = []
    if pd.notna(row['openmoji_annotation']) and row['openmoji_annotation'].strip():
        parts.append(row['openmoji_annotation'].strip())
    if pd.notna(row['llm_description']) and row['llm_description'].strip():
        parts.append(row['llm_description'].strip())
    if pd.notna(row['final_tags']) and row['final_tags'].strip():
        parts.append("Tags: " + row['final_tags'].strip())
    
    return ". ".join(parts)

# Apply the function to each row
final_df['merged_description'] = final_df.apply(merge_descriptions, axis=1)

In [54]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   emoji                   168 non-null    object
 1   hexcode                 168 non-null    object
 2   subgroups               168 non-null    object
 3   openmoji_annotation     168 non-null    object
 4   openmoji_tags_1         168 non-null    object
 5   openmoji_tags_2         0 non-null      object
 6   emojipedia_description  167 non-null    object
 7   llm_tags                154 non-null    object
 8   llm_description         154 non-null    object
 9   cleaned_llm_tags        168 non-null    object
 10  merged_tags             168 non-null    object
 11  final_tags              168 non-null    object
 12  merged_description      168 non-null    object
dtypes: object(13)
memory usage: 17.2+ KB


## Embedding using Sentence BERT

In [55]:
# Use GPU if available
"""
Reference: https://pytorch.org/get-started/locally/
"""

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [56]:
# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sbert_model = sbert_model.to(device)

# Ensure text columns are strings
# final_df["emojipedia_description"] = final_df["emojipedia_description"].fillna("").astype(str)
# final_df["llm_description"] = final_df["llm_description"].fillna("").astype(str)

# Return SBERT embedding for a given text.
def embed_text(text):
    if pd.isna(text) or text.strip() == "":
        return np.zeros(384, dtype=np.float32)  # Return zero vector for missing values (SBERT output size = 384)
    return sbert_model.encode(text).astype(np.float32)

In [57]:
"""
Reference: https://sbert.net/
"""

# Apply SBERT embeddings to each columns
# print("Embedding final_tags...")
# final_df["tags_embedding"] = final_df["final_tags"].progress_apply(embed_text)
# 
# print("Embedding emojipedia_description...")
# final_df["emojipedia_embedding"] = final_df["emojipedia_description"].progress_apply(embed_text)
# 
# print("Embedding llm_description...")
# final_df["llm_embedding"] = final_df["llm_description"].progress_apply(embed_text)

# Apply SBERT embedding to merged_description
final_df["combined_embedding"] = final_df["merged_description"].progress_apply(embed_text)

100%|██████████| 168/168 [00:03<00:00, 45.90it/s]


In [58]:
# Apply SBERT embedding to subgroups
final_df["group_embedding"] = final_df["subgroups"].progress_apply(embed_text)

100%|██████████| 168/168 [00:01<00:00, 109.36it/s]


In [59]:
# final_df["combined_embedding"] = final_df.apply(
#     lambda row: np.concatenate([row["tags_embedding"], row["emojipedia_embedding"], row["llm_embedding"]]), axis=1
# )

In [60]:
final_df.head()

Unnamed: 0,emoji,hexcode,subgroups,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_llm_tags,merged_tags,final_tags,merged_description,combined_embedding,group_embedding
0,😀,1f600,face-smiling,grinning face,"face, grin",,"A yellow face with simple, open eyes and a bro...","[smiling, happy, amusement, positive, friendly...",This emoji represents a smiling face with a br...,,"face, grin","face, grin",grinning face. This emoji represents a smiling...,"[-0.104532756, 0.0633581, 0.014499624, 0.01156...","[-0.044545308, 0.09074001, -0.034251396, 0.016..."
1,😃,1f603,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,"A yellow face with smiling eyes and a broad, o...","[smiling, happiness, joy, amusement, face, emo...",This emoji represents a smiling face with a wi...,,"face, mouth, open, smile","face, mouth, open, smile",grinning face with big eyes. This emoji repres...,"[-0.041463498, 0.03491759, 0.026930302, 0.0079...","[-0.044545308, 0.09074001, -0.034251396, 0.016..."
2,😄,1f604,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,"A yellow face with smiling eyes and a broad, o...","[smiling, happy, amusement, joy, laughter, con...",This emoji represents a smiling face with happ...,,"eye, face, mouth, open, smile","eye, face, mouth, open, smile",grinning face with smiling eyes. This emoji re...,"[-0.0701911, 0.017480828, 0.00540135, 0.009211...","[-0.044545308, 0.09074001, -0.034251396, 0.016..."
3,😁,1f601,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,A yellow face with smiling eyes and full-tooth...,"[happy, joy, delight, smiling, face, emotion]",This emoji represents a beaming face with smil...,,"eye, face, grin, smile","eye, face, grin, smile",beaming face with smiling eyes. This emoji rep...,"[-0.058498748, 0.035195988, 0.03350492, 0.0154...","[-0.044545308, 0.09074001, -0.034251396, 0.016..."
4,😆,1f606,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,"A yellow face with a broad, open smile and scr...","[smiling, happy, humor, amusement, laughter, f...",This emoji represents a smiling face with squi...,,"face, laugh, mouth, satisfied, smile","face, laugh, mouth, satisfied, smile",grinning squinting face. This emoji represents...,"[-0.07049953, -0.01600013, 0.03081883, 0.00752...","[-0.044545308, 0.09074001, -0.034251396, 0.016..."


## Linking Images

In [61]:
import os
import pandas as pd
from tqdm import tqdm

# Define base image path and brands
image_base_path = "../data/tensor_images/"
brands = ["GoogleEmoji", "JoyPixelsEmoji", "TwitterEmoji"]

# Function to find all available image paths for a given hexcode
def get_image_paths(hexcode):
    image_paths = {}
    
    for brand in brands:
        brand_path = os.path.join(image_base_path, brand)
        if not os.path.exists(brand_path): # Skip if folder doesn't exist
            continue
            
        expected_filename = f"{hexcode}.pt"  # Adjust based on actual format
        
        if expected_filename in os.listdir(brand_path):
            image_paths[brand] = os.path.join(brand_path, expected_filename)

    return image_paths

# Expand dataframe with tqdm progress bar
expanded_rows = []
for _, row in tqdm(final_df.iterrows(), total=len(final_df), desc="Processing Hexcodes"):
    hexcode = row["hexcode"]
    embedding = row["combined_embedding"]
    group_embedding = row["group_embedding"]

    image_paths = get_image_paths(hexcode)  # Get list of image paths
    
    if image_paths:  # If images exist, create multiple rows
        for brand, path in image_paths.items():
            expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "group_embedding": group_embedding, "image_path": path})
    else:
        # If no images exist, optionally add a row with NaN for image_path
        expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": None})

# Convert to DataFrame
expanded_df = pd.DataFrame(expanded_rows)

# Optional: Drop rows where no image is found
expanded_df = expanded_df.dropna(subset=["image_path"]).reset_index(drop=True)

Processing Hexcodes: 100%|██████████| 168/168 [00:00<00:00, 246.30it/s]


In [62]:
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   hexcode             438 non-null    object
 1   combined_embedding  438 non-null    object
 2   group_embedding     438 non-null    object
 3   image_path          438 non-null    object
dtypes: object(4)
memory usage: 13.8+ KB


In [63]:
output_file = '../data/processed_emoji_dataset.parquet'

# Check if the file exists and remove it
if os.path.exists(output_file):
    os.remove(output_file)

# Now save the DataFrame as Parquet
expanded_df[['combined_embedding', 'group_embedding', 'image_path']].to_parquet(output_file, index=False)