In [42]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Download NLTK data files (only need to run once)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Apply tqdm to all .apply() functions by using progress_apply
tqdm.pandas()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
openmoji_df = pd.read_csv('../data/openmoji.csv')
emojipedia_df = pd.read_csv('../data/emojipedia.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')

In [44]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').

def unicode_to_hex(unicode_str):
    unicode_str = unicode_str.replace(",", " ")  # Replace commas with spaces
    # First splitting the input string to a list of substrings
    # Loops though each substring
    # Removes the U+ prefix from each substring
    hex_values = [u.replace("U+", "") for u in unicode_str.split()]
    # Join the values with hyphens
    return "-".join(hex_values)

# Convert 'unicode' column in emojipedia_df and llm_df to 'hexcode'
emojipedia_df['hexcode'] = emojipedia_df['Codepoints Hex'].progress_apply(unicode_to_hex)
llm_df['hexcode'] = llm_df['unicode'].progress_apply(unicode_to_hex)

100%|██████████| 1885/1885 [00:00<00:00, 453236.82it/s]
100%|██████████| 5034/5034 [00:00<00:00, 1037090.54it/s]


In [45]:
# Making the hexcode uniform in all 3 dataframes for merging
# removing -f30f (differentiation between image type emoji and textual type emoji)
# removing -200d (differentiation for emoji with skin-tone)

openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-FE0F', '', regex=True)
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-200D', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-FE0F', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-200D', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-FE0F', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-200D', '', regex=True)

In [46]:
# Checking for duplicates
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3F3    2
Name: count, dtype: int64


In [47]:
# Checking for duplicates
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3C3-1F3FD-2640-27A1           4
1F6B6-1F3FF-2642-27A1           4
1F6B6-1F3FF-2640-27A1           4
1F6B6-1F3FE-2640-27A1           4
1F6B6-1F3FC-2640-27A1           4
                               ..
1F9D1-1F3FC-2764-1F9D1-1F3FD    2
1F469-1F3FF-2764-1F468-1F3FD    2
1F9D1-1F3FD-2764-1F9D1-1F3FF    2
1F327                           2
2601                            2
Name: count, Length: 1160, dtype: int64


In [48]:
duplicate_counts = emojipedia_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [49]:
# Removing duplicates
openmoji_df = openmoji_df[~openmoji_df.duplicated(subset=['hexcode'], keep=False)]
llm_df = llm_df[~llm_df.duplicated(subset=['hexcode'], keep=False)]

In [50]:
# Merge the dataframes on 'hexcode'
merged_df = openmoji_df.merge(emojipedia_df, on='hexcode', how='outer')
merged_df = merged_df.merge(llm_df, on='hexcode', how='outer')
# Convert hexcode to lowercase
merged_df['hexcode'] = merged_df['hexcode'].str.lower()

In [51]:
merged_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,Title,DescribedBy,URL,Description,Codepoints Hex,character,unicode_y,short description,tags_y,LLM description
0,#️⃣,0023-20e3,symbols,keycap,keycap: #,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
1,*️⃣,002a-20e3,symbols,keycap,keycap: *,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
2,-,002d,extras-unicode,symbol-other,hyphen-minus,,"hyphen, minus, dash, line",Robert Winslow,2022-12-24,,...,,,,,,,,,,
3,0️⃣,0030-20e3,symbols,keycap,keycap: 0,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
4,1️⃣,0031-20e3,symbols,keycap,keycap: 1,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,


In [52]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  4282 non-null   object 
 1   hexcode                4298 non-null   object 
 2   group                  4282 non-null   object 
 3   subgroups              4282 non-null   object 
 4   annotation             4282 non-null   object 
 5   tags_x                 1906 non-null   object 
 6   openmoji_tags          391 non-null    object 
 7   openmoji_author        4282 non-null   object 
 8   openmoji_date          4282 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode_x              3902 non-null   object 
 14  order                  3781 non-null   float64
 15  Grou

In [53]:
# Select and rename required columns
final_df = merged_df.copy()
final_df = final_df[['emoji', 'hexcode', 'annotation', 'tags_x', 'openmoji_tags', 'Description', 'tags_y', 'LLM description']]
final_df.columns = [
    'emoji', 'hexcode', 'openmoji_annotation', 'openmoji_tags_1', 'openmoji_tags_2', 'emojipedia_description', 'llm_tags', 'llm_description'
]

In [54]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description
0,#️⃣,0023-20e3,keycap: #,keycap,,,,
1,*️⃣,002a-20e3,keycap: *,keycap,,,,
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,


## Handling Tags and Descriptions

In [55]:
def clean_text(text):
    if not isinstance(text, str) or pd.isna(text) or text.strip().lower() == "nan":  
        return ""  # Return empty string for NaN or "nan" strings
    text = text.lower().strip() # Convert to lowercase and remove unnecessary spaces
    # Keep only letters, numbers, spaces, * and #
    text = re.sub(r'[^a-z0-9\s*#]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize the tokens
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ', '.join(tokens)

def remove_duplicates(text):
    words = [word.strip() for word in text.split(",")]  # Split by commas and strip spaces
    unique_words = sorted(set(word.strip() for word in text.split(",")))
    return ', '.join(unique_words)  # Join back into a string

In [56]:
# Clean openmoji_annotation column
final_df["cleaned_annotations"] = final_df["openmoji_annotation"].progress_apply(clean_text)
# Clean llm_tags
final_df["cleaned_llm_tags"] = final_df["llm_tags"].progress_apply(clean_text)

# List of columns to merge
columns_to_merge = ["cleaned_annotations", "openmoji_tags_1", "openmoji_tags_2", "cleaned_llm_tags"]

# Fill NaN with empty strings, then merge columns
final_df["merged_tags"] = final_df[columns_to_merge].fillna("").agg(
    lambda x: ", ".join(filter(None, map(str, x))), axis=1
)

100%|██████████| 4298/4298 [00:01<00:00, 3355.59it/s]
100%|██████████| 4298/4298 [00:00<00:00, 1838188.91it/s]


In [57]:
# Remove duplicates
final_df["final_tags"] = final_df["merged_tags"].progress_apply(remove_duplicates)

100%|██████████| 4298/4298 [00:00<00:00, 191458.08it/s]


In [58]:
# Limiting only 2 sentences because the content is too long with too many references to other emoji, which might be more confusing later for the model to learn.

"""
Reference: Chat-GPT-4o
Prompt: My df_images has a emojipedia_description column. Only keep the first two sentences in the column and remove the rest.
"""
# Function to keep only the first two sentences
def keep_first_two_sentences(description):
    if pd.isna(description):  # Handle missing values (NaNs)
        return description
    sentences = re.split(r'(?<=[.!?]) +', description.strip())  # Split by sentence-ending punctuation (., !, ?)
    return ' '.join(sentences[:2])  # Return only the first two sentences

# Apply the function to the 'llm_description' column
final_df['emojipedia_description'] = final_df['emojipedia_description'].progress_apply(keep_first_two_sentences)

100%|██████████| 4298/4298 [00:00<00:00, 207913.25it/s]


In [59]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_annotations,cleaned_llm_tags,merged_tags,final_tags
0,#️⃣,0023-20e3,keycap: #,keycap,,,,,"keycap, #",,"keycap, #, keycap","#, keycap"
1,*️⃣,002a-20e3,keycap: *,keycap,,,,,"keycap, *",,"keycap, *, keycap","*, keycap"
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,,hyphenminus,,"hyphenminus, hyphen, minus, dash, line","dash, hyphen, hyphenminus, line, minus"
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,,"keycap, 0",,"keycap, 0, keycap","0, keycap"
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,,"keycap, 1",,"keycap, 1, keycap","1, keycap"


In [60]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   emoji                   4282 non-null   object
 1   hexcode                 4298 non-null   object
 2   openmoji_annotation     4282 non-null   object
 3   openmoji_tags_1         1906 non-null   object
 4   openmoji_tags_2         391 non-null    object
 5   emojipedia_description  1885 non-null   object
 6   llm_tags                2622 non-null   object
 7   llm_description         2622 non-null   object
 8   cleaned_annotations     4298 non-null   object
 9   cleaned_llm_tags        4298 non-null   object
 10  merged_tags             4298 non-null   object
 11  final_tags              4298 non-null   object
dtypes: object(12)
memory usage: 403.1+ KB


## Embedding using Sentence BERT

In [61]:
# Use GPU if available
"""
Reference: https://pytorch.org/get-started/locally/
"""

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [62]:
# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sbert_model = sbert_model.to(device)

# Ensure text columns are strings
# final_df["emojipedia_description"] = final_df["emojipedia_description"].fillna("").astype(str)
# final_df["llm_description"] = final_df["llm_description"].fillna("").astype(str)

# Return SBERT embedding for a given text.
def embed_text(text):
    if pd.isna(text) or text.strip() == "":
        return np.zeros(384)  # Return zero vector for missing values (SBERT output size = 384)
    return sbert_model.encode(text)

In [63]:
"""
Reference: https://sbert.net/
"""

# Apply SBERT embeddings to each columns
print("Embedding final_tags...")
final_df["tags_embedding"] = final_df["final_tags"].progress_apply(embed_text)

print("Embedding emojipedia_description...")
final_df["emojipedia_embedding"] = final_df["emojipedia_description"].progress_apply(embed_text)

print("Embedding llm_description...")
final_df["llm_embedding"] = final_df["llm_description"].progress_apply(embed_text)

Embedding final_tags...


  1%|          | 23/4298 [00:00<01:59, 35.75it/s]

100%|██████████| 4298/4298 [00:41<00:00, 104.70it/s]


Embedding emojipedia_description...


100%|██████████| 4298/4298 [00:15<00:00, 283.38it/s]


Embedding llm_description...


100%|██████████| 4298/4298 [00:20<00:00, 207.64it/s]


In [64]:
final_df["combined_embedding"] = final_df.apply(
    lambda row: np.concatenate([row["tags_embedding"], row["emojipedia_embedding"], row["llm_embedding"]]), axis=1
)

In [65]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_annotations,cleaned_llm_tags,merged_tags,final_tags,tags_embedding,emojipedia_embedding,llm_embedding,combined_embedding
0,#️⃣,0023-20e3,keycap: #,keycap,,,,,"keycap, #",,"keycap, #, keycap","#, keycap","[0.02519471, -0.04989285, 0.04940779, -0.01256...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02519471012055874, -0.0498928502202034, 0.0..."
1,*️⃣,002a-20e3,keycap: *,keycap,,,,,"keycap, *",,"keycap, *, keycap","*, keycap","[0.030324753, -0.047731012, 0.04585727, -0.008...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.030324753373861313, -0.04773101210594177, 0..."
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,,hyphenminus,,"hyphenminus, hyphen, minus, dash, line","dash, hyphen, hyphenminus, line, minus","[-0.024587138, 0.10079276, 0.011220697, -0.040...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.02458713762462139, 0.10079275816679001, 0...."
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,,"keycap, 0",,"keycap, 0, keycap","0, keycap","[0.043693572, -0.01632516, -0.030014936, 0.029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04369357228279114, -0.01632516086101532, -0..."
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,,"keycap, 1",,"keycap, 1, keycap","1, keycap","[0.022134297, -0.07334993, -0.0056481804, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.022134296596050262, -0.07334993034601212, -..."


## Linking Images

In [66]:
# Define paths
processed_image_path = "../data/tensor_images/"
brands = ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]

# Function to get image paths for a given hexcode
def get_image_paths(hexcode):
    image_paths = {}

    for brand in brands:
        brand_path = os.path.join(processed_image_path, brand)
        if not os.path.exists(brand_path):
            continue  # Skip if folder doesn't exist

        # List all files in the category
        matching_files = [f for f in os.listdir(brand_path) if hexcode in f]  # Match anywhere in the filename
        
        if matching_files:
            # Take the first match (if multiple exist)
            image_paths[brand] = os.path.join(brand_path, matching_files[0])
        else:
            image_paths[brand] = None  # No matching file found

    return image_paths

df_images = final_df.copy()  # Work on a copy to be safe

# Apply the function to get image paths for each row
df_images[['google_image_path', 'joypixels_image_path', 'openmoji_image_path', 'twitter_image_path']] = df_images['hexcode'].progress_apply(lambda x: pd.Series(get_image_paths(x)))

100%|██████████| 4298/4298 [00:23<00:00, 185.92it/s]


In [67]:
df_images

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_annotations,cleaned_llm_tags,merged_tags,final_tags,tags_embedding,emojipedia_embedding,llm_embedding,combined_embedding,google_image_path,joypixels_image_path,openmoji_image_path,twitter_image_path
0,#️⃣,0023-20e3,keycap: #,keycap,,,,,"keycap, #",,"keycap, #, keycap","#, keycap","[0.02519471, -0.04989285, 0.04940779, -0.01256...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02519471012055874, -0.0498928502202034, 0.0...",../data/tensor_images/GoogleEmoji/0023-20e3.pt,../data/tensor_images/JoyPixelsEmoji/0023-20e3.pt,../data/tensor_images/OpenMojiEmoji/0023-20e3.pt,
1,*️⃣,002a-20e3,keycap: *,keycap,,,,,"keycap, *",,"keycap, *, keycap","*, keycap","[0.030324753, -0.047731012, 0.04585727, -0.008...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.030324753373861313, -0.04773101210594177, 0...",../data/tensor_images/GoogleEmoji/002a-20e3.pt,../data/tensor_images/JoyPixelsEmoji/002a-20e3.pt,../data/tensor_images/OpenMojiEmoji/002a-20e3.pt,
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,,hyphenminus,,"hyphenminus, hyphen, minus, dash, line","dash, hyphen, hyphenminus, line, minus","[-0.024587138, 0.10079276, 0.011220697, -0.040...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.02458713762462139, 0.10079275816679001, 0....",,,../data/tensor_images/OpenMojiEmoji/002d.pt,
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,,"keycap, 0",,"keycap, 0, keycap","0, keycap","[0.043693572, -0.01632516, -0.030014936, 0.029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04369357228279114, -0.01632516086101532, -0...",../data/tensor_images/GoogleEmoji/0030-20e3.pt,../data/tensor_images/JoyPixelsEmoji/0030-20e3.pt,../data/tensor_images/OpenMojiEmoji/0030-20e3.pt,
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,,"keycap, 1",,"keycap, 1, keycap","1, keycap","[0.022134297, -0.07334993, -0.0056481804, -0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.022134296596050262, -0.07334993034601212, -...",../data/tensor_images/GoogleEmoji/0031-20e3.pt,../data/tensor_images/JoyPixelsEmoji/0031-20e3.pt,../data/tensor_images/OpenMojiEmoji/0031-20e3.pt,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4293,,e380,no handshaking,,"hygiene, agreement, virus, meeting, spread, germs",,,,handshaking,,"handshaking, hygiene, agreement, virus, meetin...","agreement, germs, handshaking, hygiene, meetin...","[0.02637571, 0.066247724, -0.0017016777, -0.02...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02637570910155773, 0.06624772399663925, -0....",,,../data/tensor_images/OpenMojiEmoji/e380.pt,
4294,,e381,web syndication,,"feed, RSS, atom feed, podcast, subscribe, web ...",,,,"web, syndication",,"web, syndication, feed, RSS, atom feed, podcas...","RSS, atom feed, feed, podcast, subscribe, synd...","[0.025086876, -0.12783332, -0.050925713, -0.03...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02508687600493431, -0.1278333216905594, -0....",,,../data/tensor_images/OpenMojiEmoji/e381.pt,
4295,,f000,windows,,Microsoft,,,,windows,,"windows, Microsoft","Microsoft, windows","[-0.0010178613, -0.015280813, -0.03865494, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0010178613010793924, -0.015280812978744507...",,,../data/tensor_images/OpenMojiEmoji/f000.pt,
4296,,f77a,artstation,,"art, brand",,,,artstation,,"artstation, art, brand","art, artstation, brand","[0.026021712, -0.005451154, 0.024167942, -0.07...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.02602171152830124, -0.005451153963804245, 0...",,,../data/tensor_images/OpenMojiEmoji/f77a.pt,


In [68]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   emoji                   4282 non-null   object
 1   hexcode                 4298 non-null   object
 2   openmoji_annotation     4282 non-null   object
 3   openmoji_tags_1         1906 non-null   object
 4   openmoji_tags_2         391 non-null    object
 5   emojipedia_description  1885 non-null   object
 6   llm_tags                2622 non-null   object
 7   llm_description         2622 non-null   object
 8   cleaned_annotations     4298 non-null   object
 9   cleaned_llm_tags        4298 non-null   object
 10  merged_tags             4298 non-null   object
 11  final_tags              4298 non-null   object
 12  tags_embedding          4298 non-null   object
 13  emojipedia_embedding    4298 non-null   object
 14  llm_embedding           4298 non-null   object
 15  comb

In [69]:
"""
Reference: Chat-GPT-4o
Prompt: 
 
This is my final_df.


#   Column

---  ------

 0   hexcode

 1  combined_embedding



I need to get image path of a specific emoji and add it to the dataframe in a new column "image_path". My images are in the path "../data/tensor_images/". I have emojis of 4 different brands in their respective folders inside the image path ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]. I want a new row for each image. Some brands have emoji specific to the hex code, and some do not."""
import os
import pandas as pd
from tqdm import tqdm

# Define base image path and brands
image_base_path = "../data/tensor_images/"
brands = ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]

# Function to find all available image paths for a given hexcode
def get_image_paths(hexcode):
    image_paths = {}
   
    for brand in brands:
        brand_path = os.path.join(image_base_path, brand)
        if not os.path.exists(brand_path): # Skip if folder doesn't exist
            continue
           
        expected_filename = f"{hexcode}.pt"  # Adjust based on actual format
       
        if expected_filename in os.listdir(brand_path):
            image_paths[brand] = os.path.join(brand_path, expected_filename)

    return image_paths

# Expand dataframe with tqdm progress bar
expanded_rows = []
for _, row in tqdm(final_df.iterrows(), total=len(final_df), desc="Processing Hexcodes"):
    hexcode = row["hexcode"]
    embedding = row["combined_embedding"]

    image_paths = get_image_paths(hexcode)  # Get list of image paths
   
    if image_paths:  # If images exist, create multiple rows
        for brand, path in image_paths.items():
            expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": path})
    else:
        # If no images exist, optionally add a row with NaN for image_path
        expanded_rows.append({"hexcode": hexcode, "combined_embedding": embedding, "image_path": None})

# Convert to DataFrame
expanded_df = pd.DataFrame(expanded_rows)

# Optional: Drop rows where no image is found
expanded_df = expanded_df.dropna(subset=["image_path"]).reset_index(drop=True)

Processing Hexcodes: 100%|██████████| 4298/4298 [00:22<00:00, 194.55it/s]


In [70]:
expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12510 entries, 0 to 12509
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   hexcode             12510 non-null  object
 1   combined_embedding  12510 non-null  object
 2   image_path          12510 non-null  object
dtypes: object(3)
memory usage: 293.3+ KB


Model Building

In [None]:
# 
# Reference: Chat-GPT-4o
# Prompt: Build a GAN model to generate emoji images based on their embeddings. The dataset contains the following columns: 'combined_embedding', 'image_path'. The 'combined_embedding' column contains the combined embeddings of tags, emojipedia description, and LLM description. The 'image_path' column contains the path to the image tensor. The model should have a generator and a discriminator. The generator should take the combined embedding as input and output an image tensor. The discriminator should take an image tensor and the combined embedding as input and output a probability score. Train the GAN model on the dataset for a few epochs."""
# 

import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np

import torch.nn as nn
import torch.optim as optim

# Define the dataset class
class EmojiDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = row['image_path']
        embedding = row['combined_embedding']
        
        # Load the image tensor
        image_tensor = torch.load(image_path)
        
        # Convert embedding to tensor
        embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
        
        return embedding_tensor, image_tensor

# Define the generator network
class Generator(nn.Module):
    def __init__(self, embedding_dim, image_channels, image_size):
        super(Generator, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(True),
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, 1024),
            nn.ReLU(True),
            nn.Linear(1024, image_channels * image_size * image_size),
            nn.Tanh()
        )
        self.image_channels = image_channels
        self.image_size = image_size

    def forward(self, x):
        x = self.fc(x)
        x = x.view(-1, self.image_channels, self.image_size, self.image_size)
        return x

# Define the discriminator network
class Discriminator(nn.Module):
    def __init__(self, image_channels, image_size, embedding_dim):
        super(Discriminator, self).__init__()
        self.image_size = image_size
        self.image_channels = image_channels
        self.fc = nn.Sequential(
            nn.Linear(image_channels * image_size * image_size + embedding_dim, 1024),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(1024, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, image, embedding):
        image_flat = image.view(-1, self.image_channels * self.image_size * self.image_size)
        x = torch.cat((image_flat, embedding), dim=1)
        x = self.fc(x)
        return x

# Hyperparameters
embedding_dim = 384 * 3  # Combined embedding dimension
image_channels = 3  # Assuming RGB images
image_size = 16  # Assuming 64x64 images
batch_size = 32
num_epochs = 4
learning_rate = 0.0002

# Create dataset and dataloader
dataset = EmojiDataset(expanded_df)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize models
generator = Generator(embedding_dim, image_channels, image_size).to(device)
discriminator = Discriminator(image_channels, image_size, embedding_dim).to(device)

# Loss and optimizers
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate, betas=(0.5, 0.999))
optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate, betas=(0.5, 0.999))

# Training loop
for epoch in range(num_epochs):
    for i, (embeddings, real_images) in enumerate(dataloader):
        batch_size = real_images.size(0)
        
        # Labels
        real_labels = torch.ones(batch_size, 1).to(device)
        fake_labels = torch.zeros(batch_size, 1).to(device)
        
        # Train Discriminator
        optimizer_D.zero_grad()
        
        outputs = discriminator(real_images.to(device), embeddings.to(device))
        d_loss_real = criterion(outputs, real_labels)
        d_loss_real.backward()
        
        z = torch.randn(batch_size, embedding_dim).to(device)
        fake_images = generator(z)
        outputs = discriminator(fake_images.detach(), embeddings.to(device))
        d_loss_fake = criterion(outputs, fake_labels)
        d_loss_fake.backward()
        
        optimizer_D.step()
        
        d_loss = d_loss_real + d_loss_fake
        
        # Train Generator
        optimizer_G.zero_grad()
        
        z = torch.randn(batch_size, embedding_dim).to(device)
        fake_images = generator(z)
        outputs = discriminator(fake_images, embeddings.to(device))
        g_loss = criterion(outputs, real_labels)
        g_loss.backward()
        
        optimizer_G.step()
        
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], D Loss: {d_loss.item()}, G Loss: {g_loss.item()}')

Epoch [1/4], Step [100/391], D Loss: 0.18671101331710815, G Loss: 4.123954772949219
Epoch [1/4], Step [200/391], D Loss: 0.3518197536468506, G Loss: 2.5185885429382324
Epoch [1/4], Step [300/391], D Loss: 0.6162877082824707, G Loss: 6.8570404052734375
Epoch [2/4], Step [100/391], D Loss: 0.4212099313735962, G Loss: 2.4823451042175293
Epoch [2/4], Step [200/391], D Loss: 1.1088223457336426, G Loss: 1.679426670074463
Epoch [2/4], Step [300/391], D Loss: 0.7740771770477295, G Loss: 2.6906867027282715
Epoch [3/4], Step [100/391], D Loss: 0.46282222867012024, G Loss: 2.1670422554016113
Epoch [3/4], Step [200/391], D Loss: 0.6897350549697876, G Loss: 1.7810333967208862
Epoch [3/4], Step [300/391], D Loss: 0.7655010223388672, G Loss: 2.8216466903686523
Epoch [4/4], Step [100/391], D Loss: 0.8151779174804688, G Loss: 1.8335931301116943
Epoch [4/4], Step [200/391], D Loss: 0.5846915245056152, G Loss: 1.9725338220596313
Epoch [4/4], Step [300/391], D Loss: 0.45163917541503906, G Loss: 2.60619449

In [99]:
# For the text "smiling face" do the embeddings and generate an emoji
text = "smiling face"

# Embed the text
text_embedding = embed_text(text)

# Generate a noise tensor
noise = torch.randn(1, noise_dim).to(device)

# Convert text embedding to tensor and add batch dimension
embedding_tensor = torch.tensor(text_embedding, dtype=torch.float32).to(device).unsqueeze(0)

# Generate an image
generated_image = generator(noise, embedding_tensor)
generated_image = generated_image.squeeze(0).cpu().detach().numpy()

# Display the generated image
plt.imshow(generated_image.transpose(1, 2, 0))
plt.axis('off')
plt.show()

TypeError: Generator.forward() takes 2 positional arguments but 3 were given