In [53]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Fetch the NLTK resources (a repeat run only reports "already up-to-date")
for nltk_package in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Register tqdm with pandas so that .progress_apply() is available and
# shows a progress bar
tqdm.pandas()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
# Reading the dataset: the OpenMoji table (CSV) and the LLM emoji table (Parquet)
openmoji_df = pd.read_csv('../data/openmoji.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')

In [55]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').

def unicode_to_hex(unicode_str):
    """Turn a code-point string into a hyphen-separated hex code.

    Commas are treated like whitespace, the string is split on whitespace,
    every ``U+`` marker is stripped from each piece, and the pieces are
    joined with ``-``.

    Examples: ``'U+1F600'`` -> ``'1F600'``, ``'U+263A,FE0F'`` -> ``'263A-FE0F'``.
    """
    # Commas and whitespace are both accepted as separators
    pieces = unicode_str.replace(",", " ").split()
    # Drop the "U+" marker from every piece before joining with hyphens
    return "-".join(piece.replace("U+", "") for piece in pieces)

# Derive llm_df's 'hexcode' column from its 'unicode' column (with a progress bar)
llm_df['hexcode'] = llm_df['unicode'].progress_apply(unicode_to_hex)

100%|██████████| 5034/5034 [00:00<00:00, 1115202.36it/s]


In [56]:
# Making the hexcode uniform in both dataframes for merging
# removing -FE0F (variation selector-16: differentiation between image type emoji and textual type emoji)
# removing -200D (zero-width joiner: joins the parts of a multi-codepoint emoji sequence)

# Strip the '-FE0F' and '-200D' parts from the hexcodes of both frames,
# in the same order as before: frame by frame, marker by marker
for frame in (openmoji_df, llm_df):
    for marker in ('-FE0F', '-200D'):
        frame['hexcode'] = frame['hexcode'].str.replace(marker, '', regex=True)

In [57]:
# Lower-case the hexcodes of both frames so the merge key uses a single casing
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.lower()
llm_df['hexcode'] = llm_df['hexcode'].str.lower()

In [58]:
# Keep only rows of the two wanted groups and drop the two unwanted subgroups.
# Earlier variant (single group), kept for reference:
# openmoji_df = openmoji_df[openmoji_df["group"] == "smileys-emotion"]
wanted_groups = ["smileys-emotion", "people-body"]
unwanted_subgroups = ["person-symbol", "emotion"]
keep_mask = openmoji_df["group"].isin(wanted_groups) & ~openmoji_df["subgroups"].isin(unwanted_subgroups)
openmoji_df = openmoji_df[keep_mask]
# Removing Skin tones (disabled)
# openmoji_df = openmoji_df[~openmoji_df['hexcode'].str.contains('1f3fb|1f3fc|1f3fd|1f3fe|1f3ff', regex=True)]
# Removing extra emojis

In [59]:
# Count the occurrences of each hexcode in openmoji_df and print those seen more than once
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts.gt(1)]
print(duplicates)

Series([], Name: count, dtype: int64)


In [60]:
# Count the occurrences of each hexcode in llm_df and print those seen more than once
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts.loc[duplicate_counts.gt(1)]
print(duplicates)
# keep=False removes EVERY row whose hexcode is repeated; no representative row is kept.
# NOTE(review): the affected hexcodes therefore get no LLM description after the
# left merge further down - confirm this is intended rather than keep='first'.
llm_df = llm_df.drop_duplicates(subset=['hexcode'], keep=False)

hexcode
1f3c3-1f3fd-2640-27a1           4
1f6b6-1f3ff-2642-27a1           4
1f6b6-1f3ff-2640-27a1           4
1f6b6-1f3fe-2640-27a1           4
1f6b6-1f3fc-2640-27a1           4
                               ..
1f9d1-1f3fc-2764-1f9d1-1f3fd    2
1f469-1f3ff-2764-1f468-1f3fd    2
1f9d1-1f3fd-2764-1f9d1-1f3ff    2
1f327                           2
2601                            2
Name: count, Length: 1160, dtype: int64


In [61]:
# Left-join the llm_df columns onto the openmoji_df rows using 'hexcode' as key
# (this used to be an outer merge)
merged_df = pd.merge(openmoji_df, llm_df, how='left', on='hexcode')
# Make sure the key is lower-case in the merged frame
merged_df['hexcode'] = merged_df['hexcode'].str.lower()

In [62]:
# Column overview of the merged frame (non-null counts and dtypes)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  2404 non-null   object 
 1   hexcode                2404 non-null   object 
 2   group                  2404 non-null   object 
 3   subgroups              2404 non-null   object 
 4   annotation             2404 non-null   object 
 5   tags_x                 529 non-null    object 
 6   openmoji_tags          0 non-null      object 
 7   openmoji_author        2404 non-null   object 
 8   openmoji_date          2404 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode_x              2404 non-null   object 
 14  order                  2404 non-null   float64
 15  char

In [63]:
# Function to handle concatenation with empty strings and NaN values
def merge_descriptions(row):
    """Combine a row's 'annotation' and 'LLM description' into one string.

    Each of the two fields is used only when it is not NaN/None and is not
    blank after stripping whitespace. The stripped fields are joined with
    ". "; an empty string is returned when neither field qualifies.
    """
    usable = []
    for column in ('annotation', 'LLM description'):
        value = row[column]
        if pd.isna(value):
            continue
        text = value.strip()
        if text:
            usable.append(text)
    return ". ".join(usable)

# Apply the function to each row (axis=1) and store the result in 'merged_description'
merged_df['merged_description'] = merged_df.apply(merge_descriptions, axis=1)

In [64]:
# Use GPU if available: CUDA first, then the Mac MPS backend, otherwise CPU.
# Reference: https://pytorch.org/get-started/locally/

# `torch.backends.mps` is missing from PyTorch builds that predate the MPS
# backend, so look it up defensively instead of raising AttributeError there.
_mps_backend = getattr(torch.backends, "mps", None)

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [65]:
# # Load SBERT model
# sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
# sbert_model = sbert_model.to(device)
# 
# # Return SBERT embedding for a given text.
# def embed_text(text):
#     if pd.isna(text) or text.strip() == "":
#         return np.zeros(384, dtype=np.float32)  # Return zero vector for missing values (SBERT output size = 384)
#     return sbert_model.encode(text).astype(np.float32)

In [66]:
# # Apply SBERT embedding to the merged description in merged_df
# merged_df["combined_embedding"] = merged_df["merged_description"].progress_apply(embed_text)

In [67]:
from transformers import CLIPTokenizer, CLIPTextModel
import torch

# Load the tokenizer and the text model of one CLIP checkpoint,
# then move the text model to the selected device.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)
clip_model = CLIPTextModel.from_pretrained(clip_checkpoint).to(device)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over the sequence axis, honouring the mask.

    ``model_output.last_hidden_state`` is read as
    (batch_size, sequence_length, hidden_dim). ``attention_mask`` is broadcast
    to that shape, so positions with mask 0 contribute nothing to the sum.
    The divisor is clamped to at least 1e-9 to avoid dividing by zero when a
    mask is all zeros.
    """
    hidden_states = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

def embed_text(text):
    """Return a float32 NumPy embedding of ``text`` from the CLIP text model.

    Missing (NaN/None) or blank text yields a zero vector of length 512,
    the size this notebook uses for clip-vit-base-patch32. Otherwise the
    text is tokenized (truncated to 77 tokens), run through ``clip_model``
    without gradients, mean-pooled over its tokens and L2-normalised.
    """
    # Guard clause: nothing to encode
    if pd.isna(text) or not text.strip():
        return np.zeros(512, dtype=np.float32)

    # Tokenize and move every input tensor onto the model's device
    encoded = clip_tokenizer(text, return_tensors="pt", truncation=True, max_length=77)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only, so no gradients are tracked
    with torch.no_grad():
        model_output = clip_model(**encoded)

    # Mean-pool the token embeddings, then scale the result to unit L2 norm
    sentence_vector = mean_pooling(model_output, encoded["attention_mask"])
    sentence_vector = torch.nn.functional.normalize(sentence_vector, p=2, dim=-1)

    # Drop the size-1 dimensions and hand back a CPU float32 array
    return sentence_vector.squeeze().cpu().numpy().astype(np.float32)

# Apply CLIP embedding to every 'merged_description' and store the vectors in 'combined_embedding'
merged_df["combined_embedding"] = merged_df["merged_description"].progress_apply(embed_text)

100%|██████████| 2404/2404 [00:48<00:00, 49.08it/s]


In [68]:
# Preview the first rows, including the new 'merged_description' and 'combined_embedding' columns
merged_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,skintone_base_hexcode,unicode_x,order,character,unicode_y,short description,tags_y,LLM description,merged_description,combined_embedding
0,😀,1f600,smileys-emotion,face-smiling,grinning face,"face, grin",,Emily Jäger,2018-04-18,,...,,1.0,1.0,😀,U+1F600,GRINNING FACE,"[smiling, happy, amusement, positive, friendly...",This emoji represents a smiling face with a br...,grinning face. This emoji represents a smiling...,"[-0.009248982, -0.047417954, -0.0052406015, 0...."
1,😃,1f603,smileys-emotion,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,Emily Jäger,2018-04-18,,...,,0.6,2.0,😃,U+1F603,GRINNING FACE WITH BIG EYES,"[smiling, happiness, joy, amusement, face, emo...",This emoji represents a smiling face with a wi...,grinning face with big eyes. This emoji repres...,"[0.015901517, -0.033154536, -0.010774338, 0.04..."
2,😄,1f604,smileys-emotion,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,Emily Jäger,2018-04-18,,...,,0.6,3.0,😄,U+1F604,GRINNING FACE WITH SMILING EYES,"[smiling, happy, amusement, joy, laughter, con...",This emoji represents a smiling face with happ...,grinning face with smiling eyes. This emoji re...,"[-0.0005932258, -0.0486032, -0.00471365, 0.028..."
3,😁,1f601,smileys-emotion,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,Emily Jäger,2018-04-18,,...,,0.6,4.0,😁,U+1F601,BEAMING FACE WITH SMILING EYES,"[happy, joy, delight, smiling, face, emotion]",This emoji represents a beaming face with smil...,beaming face with smiling eyes. This emoji rep...,"[-0.0002108886, -0.035330746, -0.0045009465, 0..."
4,😆,1f606,smileys-emotion,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,Emily Jäger,2018-04-18,,...,,0.6,5.0,😆,U+1F606,GRINNING SQUINTING FACE,"[smiling, happy, humor, amusement, laughter, f...",This emoji represents a smiling face with squi...,grinning squinting face. This emoji represents...,"[0.0041811396, -0.039693166, -0.010858571, 0.0..."


In [69]:
# Column overview of the merged frame after the description and embedding columns were added
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  2404 non-null   object 
 1   hexcode                2404 non-null   object 
 2   group                  2404 non-null   object 
 3   subgroups              2404 non-null   object 
 4   annotation             2404 non-null   object 
 5   tags_x                 529 non-null    object 
 6   openmoji_tags          0 non-null      object 
 7   openmoji_author        2404 non-null   object 
 8   openmoji_date          2404 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode_x              2404 non-null   object 
 14  order                  2404 non-null   float64
 15  char

In [70]:
output_file = '../data/test_openmoji_dataset.csv'
csv_columns = ['hexcode', 'group', 'subgroups', 'annotation', 'LLM description']

# Delete a previous export, if there is one, before writing
if os.path.exists(output_file):
    os.remove(output_file)

# Save the selected columns as CSV (the index is not written)
merged_df[csv_columns].to_csv(output_file, index=False)

In [71]:
# import os
# import pandas as pd
# from tqdm import tqdm
# 
# # Define base image path and brands
# image_base_path = "../data/tensor_images/OpenmojiEmoji"
# 
# # Function to find all available image paths for a given hexcode
# def get_image_paths(hexcode):
#     image_paths = {}
#             
#     expected_filename = f"{hexcode}.pt"  # Adjust based on actual format
#         
#     if expected_filename in os.listdir(image_base_path):
#         image_paths = os.path.join(image_base_path, expected_filename)
# 
#     return image_paths
# 
# # Expand dataframe with tqdm progress bar
# expanded_rows = []
# for _, row in tqdm(openmoji_df.iterrows(), total=len(openmoji_df), desc="Processing Hexcodes"):
#     hexcode = row["hexcode"]
#     embedding = row["combined_embedding"]
# 
#     image_paths = get_image_paths(hexcode)  # Get list of image paths
#     
#     if image_paths:
#         expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": image_paths})
#     else:
#         # If no images exist, optionally add a row with NaN for image_path
#         expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": None})
# 
# # Convert to DataFrame
# expanded_df = pd.DataFrame(expanded_rows)
# 
# # Optional: Drop rows where no image is found
# expanded_df = expanded_df.dropna(subset=["image_path"]).reset_index(drop=True)

In [72]:
import os
import pandas as pd
from tqdm import tqdm

# Define base image path and brands.
# Each brand is a sub-folder of image_base_path; get_image_paths() looks for
# "<hexcode>.pt" inside every brand folder.
image_base_path = "../data/tensor_images/"
# Alternative brand set (disabled):
# brands = ["GoogleEmoji", "JoyPixelsEmoji", "TwitterEmoji"]
brands = ["OpenMojiEmoji"]

# Function to find all available image paths for a given hexcode
def get_image_paths(hexcode, base_path=None, brand_names=None):
    """Return a ``{brand: path}`` dict of the tensor files found for ``hexcode``.

    For every brand the file ``<base_path>/<brand>/<hexcode>.pt`` is probed;
    brands whose folder or file is missing are simply left out of the result.

    Args:
        hexcode: Emoji hex code used as the file stem.
        base_path: Folder containing one sub-folder per brand. Defaults to
            the notebook-level ``image_base_path``.
        brand_names: Iterable of brand folder names. Defaults to the
            notebook-level ``brands``.

    Returns:
        dict mapping each brand that has the file to the file's path
        (empty when no brand has it).
    """
    # Fall back to the notebook-level configuration at call time
    if base_path is None:
        base_path = image_base_path
    if brand_names is None:
        brand_names = brands

    expected_filename = f"{hexcode}.pt"  # Adjust based on actual format

    image_paths = {}
    for brand in brand_names:
        candidate = os.path.join(base_path, brand, expected_filename)
        # Probe the file directly instead of listing the whole brand folder
        # for every hexcode; a missing brand folder just yields False here.
        if os.path.isfile(candidate):
            image_paths[brand] = candidate

    return image_paths

# Expand merged_df into one record per (hexcode, brand image), with a tqdm progress bar
expanded_rows = []
hexcode_embedding_pairs = zip(merged_df["hexcode"], merged_df["combined_embedding"])
for hexcode, embedding in tqdm(hexcode_embedding_pairs, total=len(merged_df), desc="Processing Hexcodes"):
    # get_image_paths returns a {brand: path} dict; only the paths are used here
    found_paths = list(get_image_paths(hexcode).values())
    # With no image at all, a single placeholder record with image_path=None is emitted
    for path in found_paths or [None]:
        expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": path})

# Convert to DataFrame, then drop the placeholder records (image_path is None)
expanded_df = (
    pd.DataFrame(expanded_rows)
    .dropna(subset=["image_path"])
    .reset_index(drop=True)
)

Processing Hexcodes: 100%|██████████| 2404/2404 [00:04<00:00, 517.46it/s]


In [73]:
# Column overview of the expanded frame (non-null counts and dtypes)
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   hexcode             2404 non-null   object
 1   combined_embedding  2404 non-null   object
 2   image_path          2404 non-null   object
dtypes: object(3)
memory usage: 56.5+ KB


In [74]:
# Preview the first rows of the expanded frame (hexcode, embedding, image path)
expanded_df.head()

Unnamed: 0,hexcode,combined_embedding,image_path
0,1f600,"[-0.009248982, -0.047417954, -0.0052406015, 0....",../data/tensor_images/OpenMojiEmoji/1f600.pt
1,1f603,"[0.015901517, -0.033154536, -0.010774338, 0.04...",../data/tensor_images/OpenMojiEmoji/1f603.pt
2,1f604,"[-0.0005932258, -0.0486032, -0.00471365, 0.028...",../data/tensor_images/OpenMojiEmoji/1f604.pt
3,1f601,"[-0.0002108886, -0.035330746, -0.0045009465, 0...",../data/tensor_images/OpenMojiEmoji/1f601.pt
4,1f606,"[0.0041811396, -0.039693166, -0.010858571, 0.0...",../data/tensor_images/OpenMojiEmoji/1f606.pt


In [75]:
output_file = '../data/processed_openmoji_dataset.parquet'
parquet_columns = ['combined_embedding', 'image_path']

# Delete a previous export, if there is one, before writing
if os.path.exists(output_file):
    os.remove(output_file)

# Save the embedding / image-path columns as Parquet (the index is not written)
expanded_df[parquet_columns].to_parquet(output_file, index=False)