In [30]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Fetch the NLTK resources used by this notebook (only needs to succeed once
# per machine; later runs just report that each package is up to date).
for nltk_package in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Register tqdm with pandas so .progress_apply() can be used in place of .apply()
tqdm.pandas()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bikinghimire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# Load the OpenMoji dataset from its CSV file
openmoji_csv_path = '../data/openmoji.csv'
openmoji_df = pd.read_csv(openmoji_csv_path)

In [32]:
# Normalise the hexcode column so that equivalent emoji share a single key
# (the original note says this is done to keep hexcodes uniform for merging).
#   -FE0F : U+FE0F VARIATION SELECTOR-16, which requests emoji (image-style)
#           presentation instead of text-style presentation
#   -200D : U+200D ZERO WIDTH JOINER, which joins several code points into one
#           emoji sequence (it is not a skin-tone marker)
# Both patterns are plain literals, so regex matching is switched off.
# NOTE(review): the match is case-sensitive and runs before the hexcodes are
# lower-cased, so it assumes the CSV stores upper-case hexcodes - confirm.
openmoji_df['hexcode'] = (
    openmoji_df['hexcode']
    .str.replace('-FE0F', '', regex=False)
    .str.replace('-200D', '', regex=False)
)

In [33]:
# Lower-case the hexcodes; the image lookup later builds file names as f"{hexcode}.pt"
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.lower()

In [34]:
# Report hexcodes that occur more than once after normalisation.
# NOTE(review): duplicates are only printed here; this cell does not remove them.
duplicate_counts = openmoji_df['hexcode'].value_counts()
is_repeated = duplicate_counts.gt(1)
duplicates = duplicate_counts.loc[is_repeated]
print(duplicates)

hexcode
1f3f3    2
Name: count, dtype: int64


In [35]:
# Select the compute device: NVIDIA CUDA first, then Apple MPS, otherwise CPU.
# Reference: https://pytorch.org/get-started/locally/

# torch.backends.mps is only present in PyTorch builds that ship the MPS
# backend, so look it up defensively instead of assuming the attribute exists.
_mps_backend = getattr(torch.backends, "mps", None)

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [36]:
# Instantiate the pretrained SBERT encoder and move it onto the selected device
sbert_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

def embed_text(text, embedding_dim=384):
    """Return the SBERT embedding of ``text`` as a float32 NumPy vector.

    Parameters
    ----------
    text : str or missing value
        Text to embed. Missing values (``None``, ``NaN``, ``pd.NA``) and
        blank / whitespace-only strings are not sent to the model. Any other
        non-string value is converted with ``str()`` before encoding instead
        of failing on the ``.strip()`` call.
    embedding_dim : int, default 384
        Length of the zero vector returned for missing or blank text. It must
        match the output size of ``sbert_model`` (384 for the model loaded in
        this notebook) so that all rows share one shape.

    Returns
    -------
    numpy.ndarray
        float32 vector: the model's encoding of ``text``, or all zeros when
        there is nothing to embed.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            # Missing value: return a placeholder of the right size
            return np.zeros(embedding_dim, dtype=np.float32)
        # e.g. a numeric cell - embed its textual form rather than crash
        text = str(text)

    if text.strip() == "":
        return np.zeros(embedding_dim, dtype=np.float32)

    return sbert_model.encode(text).astype(np.float32)

In [37]:
# Apply the SBERT embedding to every annotation in openmoji_df (shows a progress bar)
annotation_texts = openmoji_df["annotation"]
openmoji_df["combined_embedding"] = annotation_texts.progress_apply(embed_text)

100%|██████████| 4284/4284 [00:35<00:00, 119.33it/s]


In [38]:
# Preview the first rows to check that combined_embedding has been added
openmoji_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags,openmoji_tags,openmoji_author,openmoji_date,skintone,skintone_combination,skintone_base_emoji,skintone_base_hexcode,unicode,order,combined_embedding
0,😀,1f600,smileys-emotion,face-smiling,grinning face,"face, grin",,Emily Jäger,2018-04-18,,,,,1.0,1.0,"[-0.06182415, 0.05820662, 0.014787203, 0.04018..."
1,😃,1f603,smileys-emotion,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,Emily Jäger,2018-04-18,,,,,0.6,2.0,"[0.00064931595, 0.08000965, 0.020743383, 0.044..."
2,😄,1f604,smileys-emotion,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,Emily Jäger,2018-04-18,,,,,0.6,3.0,"[-0.032483187, 0.062074404, -0.002212019, 0.04..."
3,😁,1f601,smileys-emotion,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,Emily Jäger,2018-04-18,,,,,0.6,4.0,"[-0.032452203, 0.07676125, 0.019601094, 0.0397..."
4,😆,1f606,smileys-emotion,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,Emily Jäger,2018-04-18,,,,,0.6,5.0,"[-0.009676312, -0.0067937872, 0.035916783, 0.0..."


In [39]:
# Summarise the columns, non-null counts and dtypes of openmoji_df
openmoji_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4284 entries, 0 to 4283
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  4284 non-null   object 
 1   hexcode                4284 non-null   object 
 2   group                  4284 non-null   object 
 3   subgroups              4284 non-null   object 
 4   annotation             4284 non-null   object 
 5   tags                   1907 non-null   object 
 6   openmoji_tags          392 non-null    object 
 7   openmoji_author        4284 non-null   object 
 8   openmoji_date          4284 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode                3903 non-null   object 
 14  order                  3782 non-null   float64
 15  comb

In [40]:
import os
import pandas as pd
from tqdm import tqdm

# Directory that is searched for per-emoji "<hexcode>.pt" files
image_base_path = "../data/tensor_images/OpenmojiEmoji"


def get_image_paths(hexcode, base_path=None):
    """Return the path of the ``.pt`` file stored for ``hexcode``, if any.

    Parameters
    ----------
    hexcode : str
        Emoji hexcode; the file looked for is ``f"{hexcode}.pt"``.
    base_path : str, optional
        Directory to search. Defaults to the notebook-level
        ``image_base_path``.

    Returns
    -------
    str or None
        ``os.path.join(<directory>, f"{hexcode}.pt")`` when that file exists,
        otherwise ``None``. (Earlier revisions returned an empty dict on a
        miss; both are falsy, so ``if get_image_paths(...)`` checks behave
        the same.)

    Raises
    ------
    FileNotFoundError
        If the directory itself does not exist, so a wrong path fails loudly
        instead of silently matching nothing.
    """
    directory = image_base_path if base_path is None else base_path
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"Image directory not found: {directory}")

    # One existence check per hexcode instead of listing the whole directory
    # on every call.
    candidate = os.path.join(directory, f"{hexcode}.pt")  # Adjust based on actual format
    return candidate if os.path.isfile(candidate) else None

# Pair every hexcode/embedding with the image file found for it (if any),
# showing a tqdm progress bar. The two columns are iterated directly, which
# avoids the per-row Series that DataFrame.iterrows() would build.
hexcode_embedding_pairs = zip(openmoji_df["hexcode"], openmoji_df["combined_embedding"])
expanded_rows = [
    {
        "hexcode": hexcode,
        "combined_embedding": embedding,
        # A falsy lookup result (no file found) is stored as None so that
        # the dropna() below removes the row.
        "image_path": get_image_paths(hexcode) or None,
    }
    for hexcode, embedding in tqdm(
        hexcode_embedding_pairs, total=len(openmoji_df), desc="Processing Hexcodes"
    )
]

# Convert to DataFrame. Naming the columns explicitly keeps the schema (and
# the dropna below) valid even when there are no rows at all.
expanded_df = pd.DataFrame(
    expanded_rows, columns=["hexcode", "combined_embedding", "image_path"]
)

# Drop rows where no image is found
expanded_df = expanded_df.dropna(subset=["image_path"]).reset_index(drop=True)

Processing Hexcodes: 100%|██████████| 4284/4284 [00:07<00:00, 543.57it/s]


In [41]:
# Summarise the filtered frame: row count, columns, non-null counts and dtypes
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4284 entries, 0 to 4283
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   hexcode             4284 non-null   object
 1   combined_embedding  4284 non-null   object
 2   image_path          4284 non-null   object
dtypes: object(3)
memory usage: 100.5+ KB


In [42]:
# Destination of the processed dataset
output_file = '../data/processed_openmoji_dataset.parquet'

# Remove a previously written copy, if one is present
stale_copy_present = os.path.exists(output_file)
if stale_copy_present:
    os.remove(output_file)

# Write only the embedding and image-path columns to Parquet (no index column)
export_columns = ['combined_embedding', 'image_path']
expanded_df[export_columns].to_parquet(output_file, index=False)