In [2]:
import pandas as pd
import numpy as np
import os
import re

In [3]:
openmoji_df = pd.read_csv('../data/openmoji.csv')
emojipedia_df = pd.read_csv('../data/emojipedia.csv')
llm_df = pd.read_parquet('../data/llmemoji.parquet')

In [4]:
# Convert Unicode string (e.g., 'U+1F600', 'U+263A,FE0F') to hex code ('1F600', '263A-FE0F').
def unicode_to_hex(unicode_str):
    unicode_str = unicode_str.replace(",", " ")  # Replace commas with spaces
    return "-".join([f"{int(u.replace('U+', ''), 16):X}" for u in unicode_str.split()])

# Convert 'unicode' column in emojipedia_df and llm_df to 'hexcode'
emojipedia_df['hexcode'] = emojipedia_df['Codepoints Hex'].apply(unicode_to_hex)
llm_df['hexcode'] = llm_df['unicode'].apply(unicode_to_hex)

In [5]:
# Making the hexcode uniform in all 3 dataframes for merging
# removing -f30f (differentiation between button and text)
# removing -200d (differentiation for emoji with skin-tone)

openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-FE0F', '', regex=True)
openmoji_df['hexcode'] = openmoji_df['hexcode'].str.replace('-200D', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-FE0F', '', regex=True)
llm_df['hexcode'] = llm_df['hexcode'].str.replace('-200D', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-FE0F', '', regex=True)
emojipedia_df['hexcode'] = emojipedia_df['hexcode'].str.replace('-200D', '', regex=True)

In [6]:
# Checking for duplicates
duplicate_counts = openmoji_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3F3    2
Name: count, dtype: int64


In [7]:
# Checking for duplicates
duplicate_counts = llm_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

hexcode
1F3C3-1F3FD-2640-27A1           4
1F6B6-1F3FF-2642-27A1           4
1F6B6-1F3FF-2640-27A1           4
1F6B6-1F3FE-2640-27A1           4
1F6B6-1F3FC-2640-27A1           4
                               ..
1F9D1-1F3FC-2764-1F9D1-1F3FD    2
1F469-1F3FF-2764-1F468-1F3FD    2
1F9D1-1F3FD-2764-1F9D1-1F3FF    2
1F327                           2
2601                            2
Name: count, Length: 1160, dtype: int64


In [8]:
duplicate_counts = emojipedia_df['hexcode'].value_counts()
duplicates = duplicate_counts[duplicate_counts > 1]
print(duplicates)

Series([], Name: count, dtype: int64)


In [9]:
# Removing duplicates
openmoji_df = openmoji_df[~openmoji_df.duplicated(subset=['hexcode'], keep=False)]
llm_df = llm_df[~llm_df.duplicated(subset=['hexcode'], keep=False)]

In [10]:
# Merge the dataframes on 'hexcode'
merged_df = openmoji_df.merge(emojipedia_df, on='hexcode', how='outer')
merged_df = merged_df.merge(llm_df, on='hexcode', how='outer')
# Convert hexcode to lowercase
merged_df['hexcode'] = merged_df['hexcode'].str.lower()

In [11]:
merged_df

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags_x,openmoji_tags,openmoji_author,openmoji_date,skintone,...,Title,DescribedBy,URL,Description,Codepoints Hex,character,unicode_y,short description,tags_y,LLM description
0,#️⃣,0023-20e3,symbols,keycap,keycap: #,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
1,*️⃣,002a-20e3,symbols,keycap,keycap: *,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
2,-,002d,extras-unicode,symbol-other,hyphen-minus,,"hyphen, minus, dash, line",Robert Winslow,2022-12-24,,...,,,,,,,,,,
3,0️⃣,0030-20e3,symbols,keycap,keycap: 0,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
4,1️⃣,0031-20e3,symbols,keycap,keycap: 1,keycap,,Selina Lange,2019-05-06,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4293,,e380,extras-openmoji,symbols,no handshaking,,"hygiene, agreement, virus, meeting, spread, germs",Julian Grüneberg,2020-04-05,,...,,,,,,,,,,
4294,,e381,extras-openmoji,symbols,web syndication,,"feed, RSS, atom feed, podcast, subscribe, web ...",Alexander Müller,2020-04-19,,...,,,,,,,,,,
4295,,f000,extras-openmoji,brand,windows,,Microsoft,Alexander Müller,2020-04-26,,...,,,,,,,,,,
4296,,f77a,extras-openmoji,brand,artstation,,"art, brand",Seth Falco,2023-09-09,,...,,,,,,,,,,


In [12]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  4282 non-null   object 
 1   hexcode                4298 non-null   object 
 2   group                  4282 non-null   object 
 3   subgroups              4282 non-null   object 
 4   annotation             4282 non-null   object 
 5   tags_x                 1906 non-null   object 
 6   openmoji_tags          391 non-null    object 
 7   openmoji_author        4282 non-null   object 
 8   openmoji_date          4282 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode_x              3902 non-null   object 
 14  order                  3781 non-null   float64
 15  Grou

In [13]:
# Select and rename required columns
final_df = merged_df[['emoji', 'hexcode', 'annotation', 'tags_x', 'Description', 'tags_y', 'LLM description']]
final_df.columns = [
    'emoji', 'hexcode', 'openmoji_annotation', 'openmoji_tags', 'emojipedia_description', 'llm_tags', 'llm_description'
]

In [14]:
final_df

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags,emojipedia_description,llm_tags,llm_description
0,#️⃣,0023-20e3,keycap: #,keycap,,,
1,*️⃣,002a-20e3,keycap: *,keycap,,,
2,-,002d,hyphen-minus,,,,
3,0️⃣,0030-20e3,keycap: 0,keycap,,,
4,1️⃣,0031-20e3,keycap: 1,keycap,,,
...,...,...,...,...,...,...,...
4293,,e380,no handshaking,,,,
4294,,e381,web syndication,,,,
4295,,f000,windows,,,,
4296,,f77a,artstation,,,,


In [15]:
# Define paths
processed_image_path = "../data/processed_images/"
brands = ["GoogleEmoji", "JoyPixelsEmoji", "OpenMojiEmoji", "TwitterEmoji"]

# Function to get image paths for a given hexcode
def get_image_paths(hexcode):
    image_paths = {}

    for brand in brands:
        brand_path = os.path.join(processed_image_path, brand)
        if not os.path.exists(brand_path):
            continue  # Skip if folder doesn't exist

        # List all files in the category
        matching_files = [f for f in os.listdir(brand_path) if hexcode in f]  # Match anywhere in the filename
        
        if matching_files:
            # Take the first match (if multiple exist)
            image_paths[brand] = os.path.join(brand_path, matching_files[0])
        else:
            image_paths[brand] = None  # No matching file found

    return image_paths

df_images = final_df.copy()  # Work on a copy to be safe

# Apply the function to get image paths for each row
df_images[['google_image_path', 'joypixels_image_path', 'openmoji_image_path', 'twitter_image_path']] = df_images['hexcode'].apply(lambda x: pd.Series(get_image_paths(x)))

# Display a sample
print(df_images[['hexcode', 'google_image_path', 'joypixels_image_path', 'openmoji_image_path', 'twitter_image_path']].head())

     hexcode                                  google_image_path  \
0  0023-20e3  ../data/processed_images/GoogleEmoji/0023-20e3...   
1  002a-20e3  ../data/processed_images/GoogleEmoji/002a-20e3...   
2       002d                                               None   
3  0030-20e3  ../data/processed_images/GoogleEmoji/0030-20e3...   
4  0031-20e3  ../data/processed_images/GoogleEmoji/0031-20e3...   

                                joypixels_image_path  \
0  ../data/processed_images/JoyPixelsEmoji/0023-2...   
1  ../data/processed_images/JoyPixelsEmoji/002a-2...   
2                                               None   
3  ../data/processed_images/JoyPixelsEmoji/0030-2...   
4  ../data/processed_images/JoyPixelsEmoji/0031-2...   

                                 openmoji_image_path twitter_image_path  
0  ../data/processed_images/OpenMojiEmoji/0023-20...               None  
1  ../data/processed_images/OpenMojiEmoji/002a-20...               None  
2    ../data/processed_images/OpenMoji

In [16]:
df_images

Unnamed: 0,emoji,hexcode,openmoji_annotation,openmoji_tags,emojipedia_description,llm_tags,llm_description,google_image_path,joypixels_image_path,openmoji_image_path,twitter_image_path
0,#️⃣,0023-20e3,keycap: #,keycap,,,,../data/processed_images/GoogleEmoji/0023-20e3...,../data/processed_images/JoyPixelsEmoji/0023-2...,../data/processed_images/OpenMojiEmoji/0023-20...,
1,*️⃣,002a-20e3,keycap: *,keycap,,,,../data/processed_images/GoogleEmoji/002a-20e3...,../data/processed_images/JoyPixelsEmoji/002a-2...,../data/processed_images/OpenMojiEmoji/002a-20...,
2,-,002d,hyphen-minus,,,,,,,../data/processed_images/OpenMojiEmoji/002d.png,
3,0️⃣,0030-20e3,keycap: 0,keycap,,,,../data/processed_images/GoogleEmoji/0030-20e3...,../data/processed_images/JoyPixelsEmoji/0030-2...,../data/processed_images/OpenMojiEmoji/0030-20...,
4,1️⃣,0031-20e3,keycap: 1,keycap,,,,../data/processed_images/GoogleEmoji/0031-20e3...,../data/processed_images/JoyPixelsEmoji/0031-2...,../data/processed_images/OpenMojiEmoji/0031-20...,
...,...,...,...,...,...,...,...,...,...,...,...
4293,,e380,no handshaking,,,,,,,../data/processed_images/OpenMojiEmoji/e380.png,
4294,,e381,web syndication,,,,,,,../data/processed_images/OpenMojiEmoji/e381.png,
4295,,f000,windows,,,,,,,../data/processed_images/OpenMojiEmoji/f000.png,
4296,,f77a,artstation,,,,,,,../data/processed_images/OpenMojiEmoji/f77a.png,


In [17]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   emoji                   4282 non-null   object
 1   hexcode                 4298 non-null   object
 2   openmoji_annotation     4282 non-null   object
 3   openmoji_tags           1906 non-null   object
 4   emojipedia_description  1885 non-null   object
 5   llm_tags                2622 non-null   object
 6   llm_description         2622 non-null   object
 7   google_image_path       3562 non-null   object
 8   joypixels_image_path    3824 non-null   object
 9   openmoji_image_path     4297 non-null   object
 10  twitter_image_path      871 non-null    object
dtypes: object(11)
memory usage: 369.5+ KB


In [18]:
# Limiting only 2 sentences because the content is too long with too many references to other emoji, which might be more confusing later for the model to learn.

"""
My df_images has a emojipedia_description column. Only keep the first two sentences in the column and remove the rest.
"""
# Function to keep only the first two sentences
def keep_first_two_sentences(description):
    if pd.isna(description):  # Handle missing values (NaNs)
        return description
    sentences = re.split(r'(?<=[.!?]) +', description.strip())  # Split by sentence-ending punctuation (., !, ?)
    return ' '.join(sentences[:2])  # Return only the first two sentences

# Apply the function to the 'llm_description' column
df_images['emojipedia_description'] = df_images['emojipedia_description'].apply(keep_first_two_sentences)

In [19]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK data files (only need to run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    else:
        text = ''
    return text

# Function to preprocess text
def preprocess_text(text):
    # Clean the text
    text = clean_text(text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


# Apply the preprocessing function to the 'Description' column in final_df
df_images['cleaned_description'] = df_images['emojipedia_description'].apply(preprocess_text)

#Apply the preprocessing function to the 'llm_description' column in final_df
df_images['cleaned_llm_description'] = df_images['llm_description'].apply(preprocess_text)


#Write this to csv file df_images selecing emoji, hexcode, cleaned_description, llm_description,openmoji_tags
df_images[['emoji', 'hexcode', 'cleaned_description', 'cleaned_llm_description', 'openmoji_tags']].to_csv('../data/df_images.csv', index=False)

 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
df_processed = pd.read_csv('../data/df_images.csv')
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298 entries, 0 to 4297
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   emoji                    4282 non-null   object
 1   hexcode                  4298 non-null   object
 2   cleaned_description      1885 non-null   object
 3   cleaned_llm_description  2622 non-null   object
 4   openmoji_tags            1906 non-null   object
dtypes: object(5)
memory usage: 168.0+ KB


In [25]:
df_processed.head(20)

Unnamed: 0,emoji,hexcode,cleaned_description,cleaned_llm_description,openmoji_tags
0,#️⃣,0023-20e3,,,keycap
1,*️⃣,002a-20e3,,,keycap
2,-,002d,,,
3,0️⃣,0030-20e3,,,keycap
4,1️⃣,0031-20e3,,,keycap
5,2️⃣,0032-20e3,,,keycap
6,3️⃣,0033-20e3,,,keycap
7,4️⃣,0034-20e3,,,keycap
8,5️⃣,0035-20e3,,,keycap
9,6️⃣,0036-20e3,,,keycap
