In [128]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Download NLTK data files (only need to run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Apply tqdm to all .apply() functions by using progress_apply
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prashansathapa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashansathapa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prashansathapa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [129]:
openmoji_df = pd.read_csv('../data/openmoji.csv')
emojipedia_df = pd.read_csv('../data/emojipedia.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')

In [130]:
flaticon_stickers_df = pd.read_csv('../data/flaticon/flaticon_stickers_desc.csv') 

In [131]:
print(flaticon_stickers_df.head().to_string())

                                                     image        title                                                               tags                              filename                                                    description
0  https://cdn-icons-png.flaticon.com/512/8356/8356693.png       Family                   people, love, family, gay, homosexual, relatives  2fbde21360a8ddf99949459a90004f33.png                                   a family hugging and smiling
1  https://cdn-icons-png.flaticon.com/512/8356/8356696.png       Family                                    people, love, family, relatives  617a3f0ef0d2ef5d246eb06b3ed5fbf4.png                                          a family with a child
2  https://cdn-icons-png.flaticon.com/512/8359/8359767.png      Holiday                   holidays, holiday, time and date, calendar, desk  478a953a9468c40b2ef99e81dd1b6c1b.png                                  a pile of papers and a folder
3  https://cdn-icons-png.flaticon.com/51

In [132]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').

def unicode_to_hex(unicode_str):
    unicode_str = unicode_str.replace(",", " ")  # Replace commas with spaces
    # First splitting the input string to a list of substrings
    # Loops though each substring
    # Removes the U+ prefix from each substring
    hex_values = [u.replace("U+", "") for u in unicode_str.split()]
    # Join the values with hyphens
    return "-".join(hex_values)

# Convert 'unicode' column in emojipedia_df and llm_df to 'hexcode'
emojipedia_df['hexcode'] = emojipedia_df['Codepoints Hex'].progress_apply(unicode_to_hex)
llm_df['hexcode'] = llm_df['unicode'].progress_apply(unicode_to_hex)

100%|██████████| 1885/1885 [00:00<00:00, 613459.27it/s]
100%|██████████| 5034/5034 [00:00<00:00, 666944.42it/s]


In [133]:
# Making the hexcode uniform in all 3 dataframes for merging
# removing -f30f (differentiation between image type emoji and textual type emoji)
# removing -200d (differentiation for emoji with skin-tone)

openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-FE0F', '', regex=True)
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-200D', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-FE0F', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-200D', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-FE0F', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-200D', '', regex=True)

In [134]:
# Checking for duplicates
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3F3    2
Name: count, dtype: int64


In [135]:
# Checking for duplicates
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3C3-1F3FD-2640-27A1           4
1F6B6-1F3FF-2642-27A1           4
1F6B6-1F3FF-2640-27A1           4
1F6B6-1F3FE-2640-27A1           4
1F6B6-1F3FC-2640-27A1           4
                               ..
1F9D1-1F3FC-2764-1F9D1-1F3FD    2
1F469-1F3FF-2764-1F468-1F3FD    2
1F9D1-1F3FD-2764-1F9D1-1F3FF    2
1F327                           2
2601                            2
Name: count, Length: 1160, dtype: int64


In [136]:
duplicate_counts = emojipedia_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [137]:
#checking duplicates on flaticon_sticker dataset
duplicate_counts = flaticon_stickers_df['description'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

description
a black background with a white and red flower                                                                       941
happy new year clipart                                                                                                29
hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello hello     26
a woman doing yoga                                                                                                    21
let let let let let let let let let let let let let let let let let let let                                           19
                                                                                                                    ... 
you are sunshine                                                                                                       2
a bottle with a heart inside                                                                                           2
a rabbit riding a sc

In [138]:
#checking for any missing values
missing_values = flaticon_stickers_df.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 image             0
title             0
tags              0
filename          0
description    6902
dtype: int64


In [139]:
#finding if any inconsistent data types are present 
data_types = flaticon_stickers_df.dtypes
print("Data types of each column:\n", data_types)


Data types of each column:
 image          object
title          object
tags           object
filename       object
description    object
dtype: object


In [140]:
# Removing duplicates in the 'description' from flaticon_sticker_df
flaticon_stickers_df = flaticon_stickers_df[~flaticon_stickers_df.duplicated(subset=['description'], keep=False)]

In [141]:
# Checking if there are any duplicate descriptions remaining in the dataframe
duplicate_descriptions = flaticon_stickers_df.duplicated(subset=['description'], keep=False).any()

print("Are there duplicate descriptions?", duplicate_descriptions)


Are there duplicate descriptions? False


In [142]:
flaticon_stickers_df.head()

Unnamed: 0,image,title,tags,filename,description
2,https://cdn-icons-png.flaticon.com/512/8359/83...,Holiday,"holidays, holiday, time and date, calendar, desk",478a953a9468c40b2ef99e81dd1b6c1b.png,a pile of papers and a folder
3,https://cdn-icons-png.flaticon.com/512/8359/83...,Lipstick,"birthday and party, fashion, grooming, female,...",ad92c2a592a552ecd70437795d3a0ac2.png,a lipstick with a red lipstick brush and a red...
4,https://cdn-icons-png.flaticon.com/512/8359/83...,Mental Care,"miscellaneous, thinking, mental care",ebba19973ad7a1bd35a02e53a1f57694.png,a brain with a rainbow colored circle in the m...
5,https://cdn-icons-png.flaticon.com/512/8359/83...,Tape Recorder,"professions and jobs, music and multimedia, el...",d12308b31301697ae3095170d2a69b34.png,a cartoon cassette cassette with a cassette ca...
6,https://cdn-icons-png.flaticon.com/512/8360/83...,Monster,"smileys, halloween, spooky, scary, monster, te...",a8b1af8dab6afcbf3412baca6347e1fb.png,a cartoon monster with big eyes and a big mouth


In [143]:
# Removing duplicates
openmoji_df = openmoji_df[~openmoji_df.duplicated(subset=['hexcode'], keep=False)]
llm_df = llm_df[~llm_df.duplicated(subset=['hexcode'], keep=False)]

In [144]:
# Merge the dataframes on 'hexcode'
merged_df = openmoji_df.merge(emojipedia_df, on='hexcode', how='outer')
merged_df = merged_df.merge(llm_df, on='hexcode', how='outer')
# Convert hexcode to lowercase
merged_df['hexcode'] = merged_df['hexcode'].str.lower()

In [145]:
merged_df.head()

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,Title,DescribedBy,URL,Description,Codepoints Hex,character,unicode_y,short description,tags_y,LLM description
0,#️⃣,0023-20e3,symbols,keycap,keycap: #,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
1,*️⃣,002a-20e3,symbols,keycap,keycap: *,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
2,-,002d,extras-unicode,symbol-other,hyphen-minus,,"hyphen, minus, dash, line",Robert Winslow,2022-12-24,,...,,,,,,,,,,
3,0️⃣,0030-20e3,symbols,keycap,keycap: 0,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
4,1️⃣,0031-20e3,symbols,keycap,keycap: 1,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,


In [146]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  4282 non-null   object 
 1   hexcode                4298 non-null   object 
 2   group                  4282 non-null   object 
 3   subgroups              4282 non-null   object 
 4   annotation             4282 non-null   object 
 5   tags_x                 1906 non-null   object 
 6   openmoji_tags          391 non-null    object 
 7   openmoji_author        4282 non-null   object 
 8   openmoji_date          4282 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode_x              3902 non-null   object 
 14  order                  3781 non-null   float64
 15  Grou

In [147]:
# Select and rename required columns
final_df = merged_df.copy()
final_df = final_df[['emoji', 'hexcode', 'annotation', 'tags_x', 'openmoji_tags', 'Description', 'tags_y', 'LLM description']]
final_df.columns = [
    'emoji', 'hexcode', 'openmoji_annotation', 'openmoji_tags_1', 'openmoji_tags_2', 'emojipedia_description', 'llm_tags', 'llm_description'
]

In [148]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description
0,#️⃣,0023-20e3,keycap: #,keycap,,,,
1,*️⃣,002a-20e3,keycap: *,keycap,,,,
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,


In [149]:
final_flaticon_stickers_df = flaticon_stickers_df.copy()
final_flaticon_stickers_df = flaticon_stickers_df[['image', 'title', 'tags', 'filename', 'description']]
final_flaticon_stickers_df.columns = [ 'flaticon_image', 'flaticon_title', 'flaticon_tags', 'flaticon_filename', 'flaticon_description']

In [150]:
final_flaticon_stickers_df.head()

Unnamed: 0,flaticon_image,flaticon_title,flaticon_tags,flaticon_filename,flaticon_description
2,https://cdn-icons-png.flaticon.com/512/8359/83...,Holiday,"holidays, holiday, time and date, calendar, desk",478a953a9468c40b2ef99e81dd1b6c1b.png,a pile of papers and a folder
3,https://cdn-icons-png.flaticon.com/512/8359/83...,Lipstick,"birthday and party, fashion, grooming, female,...",ad92c2a592a552ecd70437795d3a0ac2.png,a lipstick with a red lipstick brush and a red...
4,https://cdn-icons-png.flaticon.com/512/8359/83...,Mental Care,"miscellaneous, thinking, mental care",ebba19973ad7a1bd35a02e53a1f57694.png,a brain with a rainbow colored circle in the m...
5,https://cdn-icons-png.flaticon.com/512/8359/83...,Tape Recorder,"professions and jobs, music and multimedia, el...",d12308b31301697ae3095170d2a69b34.png,a cartoon cassette cassette with a cassette ca...
6,https://cdn-icons-png.flaticon.com/512/8360/83...,Monster,"smileys, halloween, spooky, scary, monster, te...",a8b1af8dab6afcbf3412baca6347e1fb.png,a cartoon monster with big eyes and a big mouth


## Handling Tags and Descriptions

In [151]:
def clean_text(text):
    if not isinstance(text, str) or pd.isna(text) or text.strip().lower() == "nan":  
        return ""  # Return empty string for NaN or "nan" strings
    text = text.lower().strip() # Convert to lowercase and remove unnecessary spaces
    # Keep only letters, numbers, spaces, * and #
    text = re.sub(r'[^a-z0-9\s*#]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize the tokens
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ', '.join(tokens)

def remove_duplicates(text):
    words = [word.strip() for word in text.split(",")]  # Split by commas and strip spaces
    unique_words = sorted(set(word.strip() for word in text.split(",")))
    return ', '.join(unique_words)  # Join back into a string

In [152]:
# Clean openmoji_annotation column
final_df["cleaned_annotations"] = final_df["openmoji_annotation"].progress_apply(clean_text)
# Clean llm_tags
final_df["cleaned_llm_tags"] = final_df["llm_tags"].progress_apply(clean_text)

# List of columns to merge
columns_to_merge = ["cleaned_annotations", "openmoji_tags_1", "openmoji_tags_2", "cleaned_llm_tags"]

# Fill NaN with empty strings, then merge columns
final_df["merged_tags"] = final_df[columns_to_merge].fillna("").agg(
    lambda x: ", ".join(filter(None, map(str, x))), axis=1
)

100%|██████████| 4298/4298 [00:01<00:00, 3765.47it/s]
100%|██████████| 4298/4298 [00:00<00:00, 1266749.95it/s]


In [153]:
# Remove duplicates
final_df["final_tags"] = final_df["merged_tags"].progress_apply(remove_duplicates)

100%|██████████| 4298/4298 [00:00<00:00, 491135.23it/s]


In [154]:
# Limiting only 2 sentences because the content is too long with too many references to other emoji, which might be more confusing later for the model to learn.

"""
Reference: Chat-GPT-4o
Prompt: My df_images has a emojipedia_description column. Only keep the first two sentences in the column and remove the rest.
"""
# Function to keep only the first two sentences
def keep_first_two_sentences(description):
    if pd.isna(description):  # Handle missing values (NaNs)
        return description
    sentences = re.split(r'(?<=[.!?]) +', description.strip())  # Split by sentence-ending punctuation (., !, ?)
    return ' '.join(sentences[:2])  # Return only the first two sentences

# Apply the function to the 'llm_description' column
final_df['emojipedia_description'] = final_df['emojipedia_description'].progress_apply(keep_first_two_sentences)

100%|██████████| 4298/4298 [00:00<00:00, 315357.89it/s]


In [155]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_annotations,cleaned_llm_tags,merged_tags,final_tags
0,#️⃣,0023-20e3,keycap: #,keycap,,,,,"keycap, #",,"keycap, #, keycap","#, keycap"
1,*️⃣,002a-20e3,keycap: *,keycap,,,,,"keycap, *",,"keycap, *, keycap","*, keycap"
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,,hyphenminus,,"hyphenminus, hyphen, minus, dash, line","dash, hyphen, hyphenminus, line, minus"
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,,"keycap, 0",,"keycap, 0, keycap","0, keycap"
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,,"keycap, 1",,"keycap, 1, keycap","1, keycap"


In [156]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   emoji                   4282 non-null   object
 1   hexcode                 4298 non-null   object
 2   openmoji_annotation     4282 non-null   object
 3   openmoji_tags_1         1906 non-null   object
 4   openmoji_tags_2         391 non-null    object
 5   emojipedia_description  1885 non-null   object
 6   llm_tags                2622 non-null   object
 7   llm_description         2622 non-null   object
 8   cleaned_annotations     4298 non-null   object
 9   cleaned_llm_tags        4298 non-null   object
 10  merged_tags             4298 non-null   object
 11  final_tags              4298 non-null   object
dtypes: object(12)
memory usage: 403.1+ KB


## Embedding using Sentence BERT

In [157]:
# Use GPU if available
"""
Reference: https://pytorch.org/get-started/locally/
"""

# Check for NVIDIA GPU
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use CUDA (NVIDIA GPU)
    print("Using NVIDIA GPU (CUDA)")

# Check for Mac Silicon GPU (MPS)
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (Mac Silicon GPU)
    print("Using Mac GPU (MPS)")

# Default to CPU if no GPU is available
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Mac GPU (MPS)


In [158]:
# Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sbert_model = sbert_model.to(device)

# Ensure text columns are strings
# final_df["emojipedia_description"] = final_df["emojipedia_description"].fillna("").astype(str)
# final_df["llm_description"] = final_df["llm_description"].fillna("").astype(str)

# Return SBERT embedding for a given text.
def embed_text(text):
    if pd.isna(text) or text.strip() == "":
        return np.zeros(384)  # Return zero vector for missing values (SBERT output size = 384)
    return sbert_model.encode(text)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [159]:
"""
Reference: https://sbert.net/
"""

# Apply SBERT embeddings to each columns
print("Embedding final_tags...")
final_df["tags_embedding"] = final_df["final_tags"].progress_apply(embed_text)

print("Embedding emojipedia_description...")
final_df["emojipedia_embedding"] = final_df["emojipedia_description"].progress_apply(embed_text)

print("Embedding llm_description...")
final_df["llm_embedding"] = final_df["llm_description"].progress_apply(embed_text)

Embedding final_tags...


100%|██████████| 4298/4298 [00:50<00:00, 85.81it/s] 


Embedding emojipedia_description...


100%|██████████| 4298/4298 [00:27<00:00, 155.28it/s]


Embedding llm_description...


100%|██████████| 4298/4298 [00:30<00:00, 139.54it/s]


In [160]:
#Applying SBERT embedding to tags and description from flaticon_stickers
print("Embedding flaticon_tags...")
final_flaticon_stickers_df["tags_embedding"] = final_flaticon_stickers_df["flaticon_tags"].progress_apply(embed_text)

print("Embedding flaticon_description...")
final_flaticon_stickers_df["desc_embedding"] = final_flaticon_stickers_df["flaticon_description"].progress_apply(embed_text)

Embedding flaticon_tags...


100%|██████████| 18166/18166 [03:42<00:00, 81.66it/s] 


Embedding flaticon_description...


100%|██████████| 18166/18166 [02:51<00:00, 106.09it/s]


In [161]:
final_df["combined_embedding"] = final_df.apply(
    lambda row: np.concatenate([row["tags_embedding"], row["emojipedia_embedding"], row["llm_embedding"]]), axis=1
)

In [162]:
final_df.head()

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags_1,openmoji_tags_2,emojipedia_description,llm_tags,llm_description,cleaned_annotations,cleaned_llm_tags,merged_tags,final_tags,tags_embedding,emojipedia_embedding,llm_embedding,combined_embedding
0,#️⃣,0023-20e3,keycap: #,keycap,,,,,"keycap, #",,"keycap, #, keycap","#, keycap","[0.025194714, -0.049892843, 0.049407776, -0.01...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.025194713845849037, -0.0498928427696228, 0...."
1,*️⃣,002a-20e3,keycap: *,keycap,,,,,"keycap, *",,"keycap, *, keycap","*, keycap","[0.030324765, -0.04773097, 0.045857277, -0.008...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.030324764549732208, -0.04773097112774849, 0..."
2,-,002d,hyphen-minus,,"hyphen, minus, dash, line",,,,hyphenminus,,"hyphenminus, hyphen, minus, dash, line","dash, hyphen, hyphenminus, line, minus","[-0.024587153, 0.10079275, 0.011220682, -0.040...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.024587152525782585, 0.10079275071620941, 0..."
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,,"keycap, 0",,"keycap, 0, keycap","0, keycap","[0.04369357, -0.016325157, -0.030014923, 0.029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04369356855750084, -0.01632515713572502, -0..."
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,,"keycap, 1",,"keycap, 1, keycap","1, keycap","[0.02213432, -0.07334993, -0.005648181, -0.005...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.022134320810437202, -0.07334993034601212, -..."


In [164]:
final_flaticon_stickers_df["embed_combined"] = final_flaticon_stickers_df.apply(
    lambda row: np.concatenate([row["tags_embedding"], row["desc_embedding"]]), axis=1
)

In [165]:
final_flaticon_stickers_df.head()

Unnamed: 0,flaticon_image,flaticon_title,flaticon_tags,flaticon_filename,flaticon_description,tags_embedding,desc_embedding,embed_combined
2,https://cdn-icons-png.flaticon.com/512/8359/83...,Holiday,"holidays, holiday, time and date, calendar, desk",478a953a9468c40b2ef99e81dd1b6c1b.png,a pile of papers and a folder,"[0.038581457, 0.053599395, 0.052598927, 0.0065...","[-0.095564686, 0.041848075, -0.052247435, 0.02...","[0.038581457, 0.053599395, 0.052598927, 0.0065..."
3,https://cdn-icons-png.flaticon.com/512/8359/83...,Lipstick,"birthday and party, fashion, grooming, female,...",ad92c2a592a552ecd70437795d3a0ac2.png,a lipstick with a red lipstick brush and a red...,"[-0.02237393, -0.00095466594, 0.07944109, 0.01...","[-0.08092986, -0.08813305, -0.036154073, 0.065...","[-0.02237393, -0.00095466594, 0.07944109, 0.01..."
4,https://cdn-icons-png.flaticon.com/512/8359/83...,Mental Care,"miscellaneous, thinking, mental care",ebba19973ad7a1bd35a02e53a1f57694.png,a brain with a rainbow colored circle in the m...,"[0.0572702, 0.023576371, 0.0009865896, 0.01325...","[0.0308583, -0.022989696, -0.067923576, -0.007...","[0.0572702, 0.023576371, 0.0009865896, 0.01325..."
5,https://cdn-icons-png.flaticon.com/512/8359/83...,Tape Recorder,"professions and jobs, music and multimedia, el...",d12308b31301697ae3095170d2a69b34.png,a cartoon cassette cassette with a cassette ca...,"[0.023974909, -0.025374288, 0.011085419, -0.02...","[-0.07901245, 0.08081437, -0.10110584, -0.0524...","[0.023974909, -0.025374288, 0.011085419, -0.02..."
6,https://cdn-icons-png.flaticon.com/512/8360/83...,Monster,"smileys, halloween, spooky, scary, monster, te...",a8b1af8dab6afcbf3412baca6347e1fb.png,a cartoon monster with big eyes and a big mouth,"[-0.026260154, -0.05141792, 0.025089074, 0.032...","[-0.016194591, 0.024804508, -0.047403067, 0.00...","[-0.026260154, -0.05141792, 0.025089074, 0.032..."


## Linking Images

In [163]:
# Define paths
processed_image_path = "../data/tensor_images/"
brands = ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]

# Function to get image paths for a given hexcode
def get_image_paths(hexcode):
    image_paths = {}

    for brand in brands:
        brand_path = os.path.join(processed_image_path, brand)
        if not os.path.exists(brand_path):
            continue  # Skip if folder doesn't exist

        # List all files in the category
        matching_files = [f for f in os.listdir(brand_path) if hexcode in f]  # Match anywhere in the filename
        
        if matching_files:
            # Take the first match (if multiple exist)
            image_paths[brand] = os.path.join(brand_path, matching_files[0])
        else:
            image_paths[brand] = None  # No matching file found

    return image_paths

df_images = final_df.copy()  # Work on a copy to be safe

# Apply the function to get image paths for each row
df_images[['google_image_path', 'joypixels_image_path', 'openmoji_image_path', 'twitter_image_path']] = df_images['hexcode'].progress_apply(lambda x: pd.Series(get_image_paths(x)))

100%|██████████| 4298/4298 [00:00<00:00, 11572.02it/s]


ValueError: Columns must be same length as key

In [None]:
df_images

In [None]:
df_images.info()