In [51]:
#!/usr/bin/env python3
import sys
import unicodedata

import pandas as pd

from typing import List, Dict, Any, Tuple, Union

import requests
from bs4 import BeautifulSoup

from datasets import Dataset

In [43]:
def fetch_flag_data(url: str) -> Dict[str, Dict[str, str]]:
    """
    Extract all the flag data from the given URL using BeautifulSoup
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Assuming the page structure, modify selectors accordingly
    rows = soup.find_all('tr')
    iso2country = {}

    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 2:  # Check if row has sufficient cells
            code = cells[0].text.strip()
            country_emoji = cells[1].text.strip()
            unicode = cells[2].text.strip().replace('U+', '0x')
            country_name = cells[3].text.strip()

            if all((code, country_emoji, unicode)):
                iso2country[code] = {
                    'emoji': country_emoji,
                    'unicode': unicode, 
                    'name': country_name
                }

    return iso2country

# extract flag emojis
emoji_url = 'https://apps.timwhitlock.info/emoji/tables/iso3166'
iso2country = fetch_flag_data(emoji_url)

print(iso2country['YE'])


{'emoji': '🇾🇪', 'unicode': '0x1F1FE 0x1F1EA', 'name': 'Yemen'}


In [44]:
def get_country_flags() -> List[Dict[str, Union[int, str]]]:
    """
    Generate a list of country flags with their respective Unicode code points
    """
    flags = []
    base = 0x1F1E6  # Start of regional indicator symbols

    for code1 in range(base, base + 26):  # Loop over A to Z
        for code2 in range(base, base + 26):  # Loop over A to Z

            flag_char = chr(code1) + chr(code2)
            first, second = chr(code1 - base + 65), chr(code2 - base + 65)
            country_iso = first + second
            if country_iso in iso2country:
                country_name = f"Flag of {iso2country[country_iso]['name']}" 
                country_flag = iso2country[country_iso]['emoji']
                country_code = ord(flag_char[0]) * 0x10000 + ord(flag_char[1])
                

                flags.append(
                    {
                        'code': country_code, 
                        'hex_code': hex(country_code),
                        'char': country_flag, 
                        'name': country_name.upper()                 
                    }
                )

    return flags

In [45]:

def get_standard_emojis():
    emojis = []
    # Ranges of emojis (excluding country flags)
    emoji_ranges = [
        (0x1F600, 0x1F64F),  # Emoticons
        (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
        (0x1F680, 0x1F6FF),  # Transport and Map Symbols
        #(0x1F700, 0x1F77F),  # Alchemical Symbols
        (0x2600, 0x26FF),    # Miscellaneous Symbols
        (0x2700, 0x27BF),    # Dingbats
        #(0x2B50, 0x2BFF),    # Additional symbols
    ]

    for start, end in emoji_ranges:
        for code in range(start, end + 1):
            try:
                char = chr(code)
                name = unicodedata.name(char)
                emojis.append(
                    {
                        'code': ord(char),
                        'hex_code': hex(ord(char)),
                        'char': char,
                        'name': name.upper() # + '.' 'This is an emoji that represents a ' + 
                    }
                )
            except ValueError:
                continue
    return emojis

In [46]:
# combine all emojis and flags into one list
emojis = get_standard_emojis() + get_country_flags()

# print the first few for demonstration
print(emojis[:5]) 

[{'code': 128512, 'hex_code': '0x1f600', 'char': '😀', 'name': 'GRINNING FACE'}, {'code': 128513, 'hex_code': '0x1f601', 'char': '😁', 'name': 'GRINNING FACE WITH SMILING EYES'}, {'code': 128514, 'hex_code': '0x1f602', 'char': '😂', 'name': 'FACE WITH TEARS OF JOY'}, {'code': 128515, 'hex_code': '0x1f603', 'char': '😃', 'name': 'SMILING FACE WITH OPEN MOUTH'}, {'code': 128516, 'hex_code': '0x1f604', 'char': '😄', 'name': 'SMILING FACE WITH OPEN MOUTH AND SMILING EYES'}]


In [47]:
# print the first few for demonstration
print(emojis[-5:]) 

[{'code': 8355049962, 'hex_code': '0x1f1fff1ea', 'char': '🇾🇪', 'name': 'FLAG OF YEMEN'}, {'code': 8355049977, 'hex_code': '0x1f1fff1f9', 'char': '🇾🇹', 'name': 'FLAG OF MAYOTTE'}, {'code': 8355115494, 'hex_code': '0x1f200f1e6', 'char': '🇿🇦', 'name': 'FLAG OF SOUTH AFRICA'}, {'code': 8355115506, 'hex_code': '0x1f200f1f2', 'char': '🇿🇲', 'name': 'FLAG OF ZAMBIA'}, {'code': 8355115516, 'hex_code': '0x1f200f1fc', 'char': '🇿🇼', 'name': 'FLAG OF ZIMBABWE'}]


In [48]:
emojis[0]

{'code': 128512, 'hex_code': '0x1f600', 'char': '😀', 'name': 'GRINNING FACE'}

In [49]:
emojis[-1]

{'code': 8355115516,
 'hex_code': '0x1f200f1fc',
 'char': '🇿🇼',
 'name': 'FLAG OF ZIMBABWE'}

In [41]:
type(emojis[0]['hex_code'])

str

In [54]:

# make a Dataset object from the list of emojis

# convert the list of dictionaries to a Hugging Face dataset
# emoji_dict = {
#     index: emoji for index, emoji in enumerate(emojis) 
# }

emoji_dataset = Dataset.from_list(emojis)

# print the dataset
print(emoji_dataset)


Dataset({
    features: ['code', 'hex_code', 'char', 'name'],
    num_rows: 1662
})


In [56]:
emoji_dataset[0]

{'code': 128512, 'hex_code': '0x1f600', 'char': '😀', 'name': 'GRINNING FACE'}

In [57]:
# push the dataset to the Hugging Face Model Hub
emoji_dataset.push_to_hub("badrabdullah/emoji-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/badrabdullah/emoji-dataset/commit/5f2510b8670db8cb8699ac23ea7fee8ee54d042e', commit_message='Upload dataset', commit_description='', oid='5f2510b8670db8cb8699ac23ea7fee8ee54d042e', pr_url=None, pr_revision=None, pr_num=None)