In [2]:
# %%
#!/usr/bin/env python3
import sys
import unicodedata

# %%
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm

# %%
from typing import List, Dict, Any, Tuple, Union

import requests
from bs4 import BeautifulSoup

In [3]:
from fastembed import TextEmbedding
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer


In [4]:
encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')



In [5]:
def fetch_flag_data():
    url = 'https://apps.timwhitlock.info/emoji/tables/iso3166'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Assuming the page structure, modify selectors accordingly
    rows = soup.find_all('tr')
    iso2country = {}

    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 2:  # Check if row has sufficient cells
            code = cells[0].text.strip()
            country_emoji = cells[1].text.strip()
            unicode = cells[2].text.strip().replace('U+', '0x')
            country_name = cells[3].text.strip()

            if all((code, country_emoji, unicode)):
                iso2country[code] = {
                    'emoji': country_emoji,
                    'unicode': unicode, 
                    'name': country_name
                }

    return iso2country

# Example usage
iso2country = fetch_flag_data()
print(iso2country['ZA'])


{'emoji': '🇿🇦', 'unicode': '0x1F1FF 0x1F1E6', 'name': 'South Africa'}


In [6]:
def return_flag(country_code):
    assert len(country_code) == 2 and country_code.isalpha(), "Country code must be two alphabetical characters"
    base = 0x1F1E6
    flag = chr(base + ord(country_code[0]) - ord('A')) + chr(base + ord(country_code[1]) - ord('A'))
    
    #print(flag)
    return flag

In [7]:
def get_country_flags():
    flags = []
    base = 0x1F1E6  # Start of regional indicator symbols

    for code1 in range(base, base + 26):  # Loop over A to Z
        for code2 in range(base, base + 26):  # Loop over A to Z

            flag_char = chr(code1) + chr(code2)
            first, second = chr(code1 - base + 65), chr(code2 - base + 65)
            country_iso = first + second
            if country_iso in iso2country:
                country_name = f"Flag of {iso2country[country_iso]['name']}" 
                country_flag = iso2country[country_iso]['emoji']
                country_code = ord(flag_char[0]) * 0x10000 + ord(flag_char[1])
                

                flags.append(
                    {
                        'code': country_code, 
                        'hex_code': hex(country_code),
                        'char': country_flag, 
                        'name': country_name.lower() # 'This is an emoji that represents a ' + '.'                 
                    }
                )

    return flags

In [8]:

def get_standard_emojis():
    emojis = []
    # Ranges of emojis (excluding country flags)
    emoji_ranges = [
        (0x1F600, 0x1F64F),  # Emoticons
        (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
        (0x1F680, 0x1F6FF),  # Transport and Map Symbols
        #(0x1F700, 0x1F77F),  # Alchemical Symbols
        (0x2600, 0x26FF),    # Miscellaneous Symbols
        (0x2700, 0x27BF),    # Dingbats
        #(0x2B50, 0x2BFF),    # Additional symbols
    ]

    for start, end in emoji_ranges:
        for code in range(start, end + 1):
            try:
                char = chr(code)
                name = unicodedata.name(char)
                emojis.append(
                    {
                        'code': ord(char),
                        'hex_code': hex(ord(char)),
                        'char': char,
                        'name': name.lower() # + '.' 'This is an emoji that represents a ' + 
                    }
                )
            except ValueError:
                continue
    return emojis

In [10]:
#emojis[-100:]

In [11]:
#Combine all emojis and flags into one list
emojis = get_standard_emojis() + get_country_flags()

#print(emojis)

# Print the first few for demonstration
print(emojis[-100:])  # Print only the first 10 for brevity

[{'code': 8354263542, 'hex_code': '0x1f1f3f1f6', 'char': '🇲🇶', 'name': 'flag of martinique'}, {'code': 8354263543, 'hex_code': '0x1f1f3f1f7', 'char': '🇲🇷', 'name': 'flag of mauritania'}, {'code': 8354263544, 'hex_code': '0x1f1f3f1f8', 'char': '🇲🇸', 'name': 'flag of montserrat'}, {'code': 8354263545, 'hex_code': '0x1f1f3f1f9', 'char': '🇲🇹', 'name': 'flag of malta'}, {'code': 8354263546, 'hex_code': '0x1f1f3f1fa', 'char': '🇲🇺', 'name': 'flag of mauritius'}, {'code': 8354263547, 'hex_code': '0x1f1f3f1fb', 'char': '🇲🇻', 'name': 'flag of maldives'}, {'code': 8354263548, 'hex_code': '0x1f1f3f1fc', 'char': '🇲🇼', 'name': 'flag of malawi'}, {'code': 8354263549, 'hex_code': '0x1f1f3f1fd', 'char': '🇲🇽', 'name': 'flag of mexico'}, {'code': 8354263550, 'hex_code': '0x1f1f3f1fe', 'char': '🇲🇾', 'name': 'flag of malaysia'}, {'code': 8354263551, 'hex_code': '0x1f1f3f1ff', 'char': '🇲🇿', 'name': 'flag of mozambique'}, {'code': 8354329062, 'hex_code': '0x1f1f4f1e6', 'char': '🇳🇦', 'name': 'flag of namibia'

In [12]:
client = QdrantClient(":memory:")

# %%
client.recreate_collection(
    collection_name="my_books",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

  client.recreate_collection(


True

In [13]:
client.upload_points(
    collection_name="my_books",
    points=[
        models.PointStruct(
            id=idx, vector=encoder.encode(emoji["name"]).tolist(), payload=emoji
        )
        for idx, emoji in tqdm(enumerate(emojis), ncols=70, total=len(emojis))
    ],
)

100%|█████████████████████████████| 1662/1662 [00:28<00:00, 58.45it/s]


In [14]:
def show_hits(query):
    """
    """
    #query = 'This is an emoji that represents a ' + query + '.' #.lower()
    #str_len = len('This is an emoji that represents a ')

    if query:
        hits = client.search(
            collection_name="my_books",
            query_vector=encoder.encode(query).tolist(),
            limit=15,
        )

        for hit in hits:
            print(f"{hit.payload['char']:<10} {hex(hit.payload['code'])} {hit.payload['name']:>30} {hit.score:>5.2f}") #, "score:", hit.score) [str_len:-1][str_len:][str_len:-1]


In [28]:
show_hits("medal")

🏅          0x1f3c5                   sports medal  0.88
🎖          0x1f396                 military medal  0.86
🏆          0x1f3c6                         trophy  0.82
✌          0x270c                   victory hand  0.64
🔱          0x1f531                 trident emblem  0.53
🇲🇶         0x1f1f3f1f6             flag of martinique  0.52
📛          0x1f4db                     name badge  0.51
🇯🇪         0x1f1f0f1ea                 flag of jersey  0.50
🖔          0x1f594          reversed victory hand  0.50
🇰🇲         0x1f1f1f1f2                flag of comoros  0.48
⚙          0x2699                           gear  0.47
👑          0x1f451                          crown  0.47
🏐          0x1f3d0                     volleyball  0.47
⚳          0x26b3                          ceres  0.46
🏷          0x1f3f7                          label  0.46


In [43]:
import emoji
import emoji.unicode_codes

# Get all unique emojis
all_emojis = emoji.unicode_codes.EMOJI_DATA
# Loop over all emojis and print them
for em in all_emojis:
    desc = ' '.join(all_emojis[em]['en'].split('_')).upper()
    if len(em) > 1:
        print(f"{em}, {desc} {hex(ord(em[0]))} {hex(ord(em[1]))}")
    else:
        print(f"{em}, {desc} {hex(ord(em))}")

🥇, :1ST PLACE MEDAL: 0x1f947
🥈, :2ND PLACE MEDAL: 0x1f948
🥉, :3RD PLACE MEDAL: 0x1f949
🆎, :AB BUTTON (BLOOD TYPE): 0x1f18e
🏧, :ATM SIGN: 0x1f3e7
🅰️, :A BUTTON (BLOOD TYPE): 0x1f170 0xfe0f
🅰, :A BUTTON (BLOOD TYPE): 0x1f170
🇦🇫, :AFGHANISTAN: 0x1f1e6 0x1f1eb
🇦🇱, :ALBANIA: 0x1f1e6 0x1f1f1
🇩🇿, :ALGERIA: 0x1f1e9 0x1f1ff
🇦🇸, :AMERICAN SAMOA: 0x1f1e6 0x1f1f8
🇦🇩, :ANDORRA: 0x1f1e6 0x1f1e9
🇦🇴, :ANGOLA: 0x1f1e6 0x1f1f4
🇦🇮, :ANGUILLA: 0x1f1e6 0x1f1ee
🇦🇶, :ANTARCTICA: 0x1f1e6 0x1f1f6
🇦🇬, :ANTIGUA & BARBUDA: 0x1f1e6 0x1f1ec
♒, :AQUARIUS: 0x2652
🇦🇷, :ARGENTINA: 0x1f1e6 0x1f1f7
♈, :ARIES: 0x2648
🇦🇲, :ARMENIA: 0x1f1e6 0x1f1f2
🇦🇼, :ARUBA: 0x1f1e6 0x1f1fc
🇦🇨, :ASCENSION ISLAND: 0x1f1e6 0x1f1e8
🇦🇺, :AUSTRALIA: 0x1f1e6 0x1f1fa
🇦🇹, :AUSTRIA: 0x1f1e6 0x1f1f9
🇦🇿, :AZERBAIJAN: 0x1f1e6 0x1f1ff
🔙, :BACK ARROW: 0x1f519
🅱️, :B BUTTON (BLOOD TYPE): 0x1f171 0xfe0f
🅱, :B BUTTON (BLOOD TYPE): 0x1f171
🇧🇸, :BAHAMAS: 0x1f1e7 0x1f1f8
🇧🇭, :BAHRAIN: 0x1f1e7 0x1f1ed
🇧🇩, :BANGLADESH: 0x1f1e7 0x1f1e9
🇧🇧, :BARBADOS: 0x1f1e7 0x

In [56]:
emojis = [
    {
        #'code': ord(em),
        'char': em,
        'desc': all_emojis[em]['en'],
        'hex_code': hex(ord(em[0])) + ' ' + hex(ord(em[1]) if len(em) > 1 else 0)
    }
    for em in all_emojis
]

In [57]:
emojis

[{'char': '🥇', 'desc': ':1st_place_medal:', 'hex_code': '0x1f947 0x0'},
 {'char': '🥈', 'desc': ':2nd_place_medal:', 'hex_code': '0x1f948 0x0'},
 {'char': '🥉', 'desc': ':3rd_place_medal:', 'hex_code': '0x1f949 0x0'},
 {'char': '🆎', 'desc': ':AB_button_(blood_type):', 'hex_code': '0x1f18e 0x0'},
 {'char': '🏧', 'desc': ':ATM_sign:', 'hex_code': '0x1f3e7 0x0'},
 {'char': '🅰️',
  'desc': ':A_button_(blood_type):',
  'hex_code': '0x1f170 0xfe0f'},
 {'char': '🅰', 'desc': ':A_button_(blood_type):', 'hex_code': '0x1f170 0x0'},
 {'char': '🇦🇫', 'desc': ':Afghanistan:', 'hex_code': '0x1f1e6 0x1f1eb'},
 {'char': '🇦🇱', 'desc': ':Albania:', 'hex_code': '0x1f1e6 0x1f1f1'},
 {'char': '🇩🇿', 'desc': ':Algeria:', 'hex_code': '0x1f1e9 0x1f1ff'},
 {'char': '🇦🇸', 'desc': ':American_Samoa:', 'hex_code': '0x1f1e6 0x1f1f8'},
 {'char': '🇦🇩', 'desc': ':Andorra:', 'hex_code': '0x1f1e6 0x1f1e9'},
 {'char': '🇦🇴', 'desc': ':Angola:', 'hex_code': '0x1f1e6 0x1f1f4'},
 {'char': '🇦🇮', 'desc': ':Anguilla:', 'hex_code': '0

In [58]:
from datasets import Dataset

emoji_dataset = Dataset.from_list(emojis)

# print the dataset
print(emoji_dataset)

Dataset({
    features: ['char', 'desc', 'hex_code'],
    num_rows: 5034
})


In [59]:
emoji_dataset[0]

{'char': '🥇', 'desc': ':1st_place_medal:', 'hex_code': '0x1f947 0x0'}

In [62]:
# push the dataset to the Hugging Face Model Hub
emoji_dataset.push_to_hub("badrabdullah/emoji-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/341 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/badrabdullah/emoji-dataset/commit/0f2cca0832de2ed78be8eb86f1cc7be7c0194178', commit_message='Upload dataset', commit_description='', oid='0f2cca0832de2ed78be8eb86f1cc7be7c0194178', pr_url=None, pr_revision=None, pr_num=None)