# EmojiGen

#### Data Ingestion

In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# OpenMoji emoji metadata (Unicode-based emoji), read from the CSV under ../data.
# Upstream file: https://github.com/hfg-gmuend/openmoji/blob/master/data/openmoji.csv
open_moji_df = pd.read_csv(
    filepath_or_buffer='../data/openmoji.csv',
)

# Bare trailing expression so the notebook renders the loaded frame.
open_moji_df

Unnamed: 0,emoji,hexcode,group,subgroups,annotation,tags,openmoji_tags,openmoji_author,openmoji_date,skintone,skintone_combination,skintone_base_emoji,skintone_base_hexcode,unicode,order
0,😀,1F600,smileys-emotion,face-smiling,grinning face,"face, grin",,Emily Jäger,2018-04-18,,,,,1,1.0
1,😃,1F603,smileys-emotion,face-smiling,grinning face with big eyes,"face, mouth, open, smile",,Emily Jäger,2018-04-18,,,,,0.6,2.0
2,😄,1F604,smileys-emotion,face-smiling,grinning face with smiling eyes,"eye, face, mouth, open, smile",,Emily Jäger,2018-04-18,,,,,0.6,3.0
3,😁,1F601,smileys-emotion,face-smiling,beaming face with smiling eyes,"eye, face, grin, smile",,Emily Jäger,2018-04-18,,,,,0.6,4.0
4,😆,1F606,smileys-emotion,face-smiling,grinning squinting face,"face, laugh, mouth, satisfied, smile",,Emily Jäger,2018-04-18,,,,,0.6,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4279,⮼,2BBC,extras-unicode,symbol-other,overlapping black squares,,,loominade,2020-04-22,,,,,1.1,
4280,⯃,2BC3,extras-unicode,symbol-other,horizontal black octagon,,equilateral polygon,loominade,2020-04-17,,,,,5,
4281,⯄,2BC4,extras-unicode,symbol-other,black octagon,,equilateral polygon,loominade,2020-04-17,,,,,5,
4282,⯪,2BEA,extras-unicode,symbol-other,star with left half black,,half star,Alexander Müller,2020-11-09,,,,,11,


In [3]:
# Emojipedia emoji details (Unicode-based emoji), read from the CSV under
# ../data/emojipedia.
emojipedia_df = pd.read_csv(
    filepath_or_buffer='../data/emojipedia/emojis_details.csv',
)

# Bare trailing expression so the notebook renders the loaded frame.
emojipedia_df

Unnamed: 0,Group,Subgroup,Emoji,Title,DescribedBy,URL,Description,Codepoints Hex
0,smiley,Smiling & Affectionate,😀,Grinning Face,grinning-face,/grinning-face,"A yellow face with simple, open eyes and a bro...",U+1F600
1,smiley,Smiling & Affectionate,😃,Grinning Face with Big Eyes,grinning-face-with-big-eyes,/grinning-face-with-big-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F603
2,smiley,Smiling & Affectionate,😄,Grinning Face with Smiling Eyes,grinning-face-with-smiling-eyes,/grinning-face-with-smiling-eyes,"A yellow face with smiling eyes and a broad, o...",U+1F604
3,smiley,Smiling & Affectionate,😁,Beaming Face with Smiling Eyes,beaming-face-with-smiling-eyes,/beaming-face-with-smiling-eyes,A yellow face with smiling eyes and full-tooth...,U+1F601
4,smiley,Smiling & Affectionate,😆,Grinning Squinting Face,grinning-squinting-face,/grinning-squinting-face,"A yellow face with a broad, open smile and scr...",U+1F606
...,...,...,...,...,...,...,...,...
1888,flags,"Oceania, Island Nations & Territories",🇫🇲,Flag: Micronesia,flag-micronesia,/flag-micronesia,"The flag for Micronesia , which may show as th...","U+1F1EB,U+1F1F2"
1889,objects,Clothing & Appearance,👒,Woman’s Hat,womans-hat,/womans-hat,"A hat worn by women for sun protection, or for...",U+1F452
1890,objects,Office & Stationery,📕,Closed Book,closed-book,/closed-book,"A closed, hardcover book depicted with a red c...",U+1F4D5
1891,objects,Tools & Household Items,📡,Satellite Antenna,satellite-antenna,/satellite-antenna,"A dish antenna, as used to send or receive inf...",U+1F4E1


In [4]:
# Noto emoji image/caption pairs from the Hugging Face dataset
# arattinger/noto-emoji-captions (not Unicode-based: rows hold an image and a
# text caption rather than a codepoint).
# Dataset page: https://huggingface.co/datasets/arattinger/noto-emoji-captions
noto_emoji_df = pd.read_parquet(
    path="hf://datasets/arattinger/noto-emoji-captions/data/train-00000-of-00001-42f7c9308efe5fbe.parquet",
)

# Bare trailing expression so the notebook renders the loaded frame.
noto_emoji_df

Unnamed: 0,image,text
0,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a person with a surprised look on their face
1,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a blue square with a white suitcase and a key
2,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a yellow crescent with a smiley face
3,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a little girl dressed in a fairy costume holdi...
4,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a woman with a pair of scissors in her hand
...,...,...
3463,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a half of the moon with a half of the moon beh...
3464,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a woman in a purple shirt and gray pants
3465,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a woman in a wheelchair with a ponytail
3466,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,a couple of men holding hands


In [5]:
# LLM-described emoji (Unicode-based) from the Hugging Face dataset
# badrex/llm-emoji-dataset.
# Dataset page: https://huggingface.co/datasets/badrex/llm-emoji-dataset?row=4
llm_emoji_df = pd.read_parquet(
    path="hf://datasets/badrex/llm-emoji-dataset/data/train-00000-of-00001.parquet",
)

# Bare trailing expression so the notebook renders the loaded frame.
llm_emoji_df

Unnamed: 0,character,unicode,short description,tags,LLM description
0,🥇,U+1F947,1ST PLACE MEDAL,"[first place, victory, achievement, success, c...","This emoji represents a first place medal, oft..."
1,🥈,U+1F948,2ND PLACE MEDAL,"[medal, silver, second place, achievement, suc...","This emoji represents a silver medal, often us..."
2,🥉,U+1F949,3RD PLACE MEDAL,"[medal, bronze, third place, achievement, spor...","This emoji represents a bronze medal, symboliz..."
3,🆎,U+1F18E,AB BUTTON (BLOOD TYPE),"[blood type, AB, medical, compatibility, trans...",This emoji represents the AB blood type symbol...
4,🏧,U+1F3E7,ATM SIGN,"[ATM, banking, finance, money, transaction, lo...","This emoji represents an ATM sign, often used ..."
...,...,...,...,...,...
5029,🤪,U+1F92A,ZANY FACE,"[zany, playful, silly, humor, lightheartedness...","This emoji represents a zany face, expressing ..."
5030,🦓,U+1F993,ZEBRA,"[zebra, animal, stripes, mammal, nature, fast,...","This emoji represents a zebra, a fast and agil..."
5031,🤐,U+1F910,ZIPPER-MOUTH FACE,"[zipper-mouth, silence, secrecy, mute, speechl...",This emoji represents a face with a zipper ove...
5032,🧟,U+1F9DF,ZOMBIE,"[zombie, undead, horror, fiction, scary, reani...","This emoji represents a zombie, a fictional un..."


In [25]:
# Goal of the next cells: combine llm_emoji_df, open_moji_df and emojipedia_df
# on their unicode/hexcode information.
# The final df should have: emoji, unicode, title (short description / annotation),
# tags, group, subgroup, description.
new_columns = ['emoji', 'unicode', 'title', 'tags', 'group', 'subgroup', 'description']

llm_emoji_df.info()

# Columns kept from the LLM dataset, in the order they should appear.
new_order = ['character', 'unicode', 'short description', 'tags', 'LLM description']

# Select and rename in one chained expression. rename() without inplace=True
# returns a new frame, so nothing is mutated on a selection of llm_emoji_df
# (the pattern pandas may flag with SettingWithCopyWarning) and re-running
# this cell always yields the same result.
# 'unicode' and 'tags' already carry their target names and need no mapping.
llm_emoji_sm_df = llm_emoji_df[new_order].rename(
    columns={
        'character': 'emoji',
        'short description': 'title',
        'LLM description': 'description',
    }
)
llm_emoji_sm_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5034 entries, 0 to 5033
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   character          5034 non-null   object
 1   unicode            5034 non-null   object
 2   short description  5034 non-null   object
 3   tags               5034 non-null   object
 4   LLM description    5034 non-null   object
dtypes: object(5)
memory usage: 196.8+ KB


Unnamed: 0,emoji,unicode,title,tags,description
0,🥇,U+1F947,1ST PLACE MEDAL,"[first place, victory, achievement, success, c...","This emoji represents a first place medal, oft..."
1,🥈,U+1F948,2ND PLACE MEDAL,"[medal, silver, second place, achievement, suc...","This emoji represents a silver medal, often us..."
2,🥉,U+1F949,3RD PLACE MEDAL,"[medal, bronze, third place, achievement, spor...","This emoji represents a bronze medal, symboliz..."
3,🆎,U+1F18E,AB BUTTON (BLOOD TYPE),"[blood type, AB, medical, compatibility, trans...",This emoji represents the AB blood type symbol...
4,🏧,U+1F3E7,ATM SIGN,"[ATM, banking, finance, money, transaction, lo...","This emoji represents an ATM sign, often used ..."


In [26]:
open_moji_df.info()

# Columns kept from OpenMoji, in the order they should appear.
new_order = ['emoji', 'hexcode', 'annotation', 'tags', 'group', 'subgroups']

# Select and rename in one chained expression. rename() without inplace=True
# returns a new frame, so nothing is mutated on a selection of open_moji_df
# (the pattern pandas may flag with SettingWithCopyWarning) and re-running
# this cell always yields the same result.
# 'emoji', 'tags' and 'group' already carry their target names.
# NOTE(review): the resulting `unicode` values come from `hexcode` and have no
# 'U+' prefix (e.g. '1F600'), whereas the other two sources use 'U+1F600'.
# They will not match on `unicode` until normalised - confirm the intended format.
open_moji_sm_df = open_moji_df[new_order].rename(
    columns={
        'hexcode': 'unicode',
        'annotation': 'title',
        'subgroups': 'subgroup',
    }
)
open_moji_sm_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4284 entries, 0 to 4283
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   emoji                  4284 non-null   object 
 1   hexcode                4284 non-null   object 
 2   group                  4284 non-null   object 
 3   subgroups              4284 non-null   object 
 4   annotation             4284 non-null   object 
 5   tags                   1907 non-null   object 
 6   openmoji_tags          392 non-null    object 
 7   openmoji_author        4284 non-null   object 
 8   openmoji_date          4284 non-null   object 
 9   skintone               1875 non-null   object 
 10  skintone_combination   2198 non-null   object 
 11  skintone_base_emoji    2198 non-null   object 
 12  skintone_base_hexcode  2198 non-null   object 
 13  unicode                3903 non-null   object 
 14  order                  3782 non-null   float64
dtypes: f

Unnamed: 0,emoji,unicode,title,tags,group,subgroup
0,😀,1F600,grinning face,"face, grin",smileys-emotion,face-smiling
1,😃,1F603,grinning face with big eyes,"face, mouth, open, smile",smileys-emotion,face-smiling
2,😄,1F604,grinning face with smiling eyes,"eye, face, mouth, open, smile",smileys-emotion,face-smiling
3,😁,1F601,beaming face with smiling eyes,"eye, face, grin, smile",smileys-emotion,face-smiling
4,😆,1F606,grinning squinting face,"face, laugh, mouth, satisfied, smile",smileys-emotion,face-smiling


In [27]:
emojipedia_df.info()

# Columns kept from Emojipedia, in the order they should appear.
new_order = ['Emoji', 'Codepoints Hex', 'Title', 'DescribedBy', 'Group', 'Subgroup', 'Description']

# Select and rename in one chained expression. rename() without inplace=True
# returns a new frame, so nothing is mutated on a selection of emojipedia_df
# (the pattern pandas may flag with SettingWithCopyWarning) and re-running
# this cell always yields the same result.
# 'DescribedBy' (a slug such as 'grinning-face') is used as the `tags` column.
emojipedia_sm_df = emojipedia_df[new_order].rename(
    columns={
        'Emoji': 'emoji',
        'Codepoints Hex': 'unicode',
        'Title': 'title',
        'DescribedBy': 'tags',
        'Group': 'group',
        'Subgroup': 'subgroup',
        'Description': 'description',
    }
)
emojipedia_sm_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Group           1893 non-null   object
 1   Subgroup        1893 non-null   object
 2   Emoji           1893 non-null   object
 3   Title           1893 non-null   object
 4   DescribedBy     1893 non-null   object
 5   URL             1893 non-null   object
 6   Description     1893 non-null   object
 7   Codepoints Hex  1893 non-null   object
dtypes: object(8)
memory usage: 118.4+ KB


Unnamed: 0,emoji,unicode,title,tags,group,subgroup,description
0,😀,U+1F600,Grinning Face,grinning-face,smiley,Smiling & Affectionate,"A yellow face with simple, open eyes and a bro..."
1,😃,U+1F603,Grinning Face with Big Eyes,grinning-face-with-big-eyes,smiley,Smiling & Affectionate,"A yellow face with smiling eyes and a broad, o..."
2,😄,U+1F604,Grinning Face with Smiling Eyes,grinning-face-with-smiling-eyes,smiley,Smiling & Affectionate,"A yellow face with smiling eyes and a broad, o..."
3,😁,U+1F601,Beaming Face with Smiling Eyes,beaming-face-with-smiling-eyes,smiley,Smiling & Affectionate,A yellow face with smiling eyes and full-tooth...
4,😆,U+1F606,Grinning Squinting Face,grinning-squinting-face,smiley,Smiling & Affectionate,"A yellow face with a broad, open smile and scr..."


In [28]:
# Give every source frame the full target schema before stacking them.
# Absent columns are filled with a real missing value (np.nan) instead of the
# literal string 'NaN': pandas does not treat that string as missing, so
# isna()/dropna()/fillna() would have seen those cells as populated.
for source_df in (llm_emoji_sm_df, open_moji_sm_df, emojipedia_sm_df):
    for col in new_columns:
        if col not in source_df.columns:
            source_df[col] = np.nan

# NOTE(review): pd.concat stacks the three sources row-wise; it does not join
# rows that share the same `unicode` value, so an emoji present in several
# sources appears once per source in mixed_df.
mixed_df = pd.concat([llm_emoji_sm_df, open_moji_sm_df, emojipedia_sm_df], ignore_index=True)

# Put the columns into the agreed target order.
mixed_df = mixed_df[new_columns]
mixed_df

Unnamed: 0,emoji,unicode,title,tags,group,subgroup,description
0,🥇,U+1F947,1ST PLACE MEDAL,"[first place, victory, achievement, success, c...",,,"This emoji represents a first place medal, oft..."
1,🥈,U+1F948,2ND PLACE MEDAL,"[medal, silver, second place, achievement, suc...",,,"This emoji represents a silver medal, often us..."
2,🥉,U+1F949,3RD PLACE MEDAL,"[medal, bronze, third place, achievement, spor...",,,"This emoji represents a bronze medal, symboliz..."
3,🆎,U+1F18E,AB BUTTON (BLOOD TYPE),"[blood type, AB, medical, compatibility, trans...",,,This emoji represents the AB blood type symbol...
4,🏧,U+1F3E7,ATM SIGN,"[ATM, banking, finance, money, transaction, lo...",,,"This emoji represents an ATM sign, often used ..."
...,...,...,...,...,...,...,...
11206,🇫🇲,"U+1F1EB,U+1F1F2",Flag: Micronesia,flag-micronesia,flags,"Oceania, Island Nations & Territories","The flag for Micronesia , which may show as th..."
11207,👒,U+1F452,Woman’s Hat,womans-hat,objects,Clothing & Appearance,"A hat worn by women for sun protection, or for..."
11208,📕,U+1F4D5,Closed Book,closed-book,objects,Office & Stationery,"A closed, hardcover book depicted with a red c..."
11209,📡,U+1F4E1,Satellite Antenna,satellite-antenna,objects,Tools & Household Items,"A dish antenna, as used to send or receive inf..."
