In [42]:
#!/usr/bin/env python3
import sys
import unicodedata

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

In [19]:
from typing import List, Dict, Any, Tuple, Union

In [45]:
# Inclusive (first, last) code-point bounds of the Unicode blocks that this
# notebook treats as "emoji".
emoji_ranges = [
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
    (0x1F680, 0x1F6FF),  # Transport and Map Symbols
    (0x1F700, 0x1F77F),  # Alchemical Symbols
    (0x2600, 0x26FF),    # Miscellaneous Symbols
    (0x2700, 0x27BF),    # Dingbats
    (0x2B50, 0x2BFF),    # Additional symbols
]


def is_emoji(character):
    """Return True when ``character`` lies inside one of ``emoji_ranges``.

    ``character`` must be a single-character string because it is handed
    straight to ``ord``.
    """
    for low, high in emoji_ranges:
        if low <= ord(character) <= high:
            return True
    return False

In [56]:

# Scan every code point from the space character up to the last valid one and
# keep those that fall in an emoji range *and* have an official Unicode name.
START, END = ord(' '), sys.maxunicode + 1

emojis = []

for code in range(START, END):
    char = chr(code)

    # Range test first: the name lookup below then runs only for the fewer than
    # 2,000 candidate code points instead of for all ~1.1 million scanned.
    if not is_emoji(char):
        continue

    # Unassigned code points have no name; the None default avoids a ValueError.
    name = unicodedata.name(char, None)
    if name:
        emojis.append(
            {
                'code': code,
                'char': char,
                'name': name.capitalize()
            }
        )

len(emojis)

1702

In [60]:
# Spot-check a single record of `emojis` (a dict with 'code', 'char' and 'name').
emojis[1000]

{'code': 128123, 'char': '👻', 'name': 'Ghost'}

In [2]:
from fastembed import TextEmbedding

In [3]:
# Table of the embedding models fastembed reports as supported, ordered by
# ascending `size_in_GB`, without the `sources` column and with a fresh 0..n-1 index.
supported_models = (
    pd.DataFrame(TextEmbedding.list_supported_models())
    .drop(columns="sources")
    .sort_values("size_in_GB", ignore_index=True)
)
supported_models

	

Unnamed: 0,model,dim,description,size_in_GB,model_file,additional_files
0,BAAI/bge-small-en-v1.5,384,Fast and Default English model,0.067,model_optimized.onnx,
1,BAAI/bge-small-zh-v1.5,512,Fast and recommended Chinese model,0.09,model_optimized.onnx,
2,sentence-transformers/all-MiniLM-L6-v2,384,"Sentence Transformer model, MiniLM-L6-v2",0.09,model.onnx,
3,snowflake/snowflake-arctic-embed-xs,384,Based on all-MiniLM-L6-v2 model with only 22m ...,0.09,onnx/model.onnx,
4,jinaai/jina-embeddings-v2-small-en,512,English embedding model supporting 8192 sequen...,0.12,onnx/model.onnx,
5,snowflake/snowflake-arctic-embed-s,384,"Based on infloat/e5-small-unsupervised, does n...",0.13,onnx/model.onnx,
6,BAAI/bge-small-en,384,Fast English model,0.13,model_optimized.onnx,
7,nomic-ai/nomic-embed-text-v1.5-Q,768,Quantized 8192 context length english model,0.13,onnx/model_quantized.onnx,
8,BAAI/bge-base-en-v1.5,768,"Base English model, v1.5",0.21,model_optimized.onnx,
9,sentence-transformers/paraphrase-multilingual-...,384,"Sentence Transformer model, paraphrase-multili...",0.22,model_optimized.onnx,


In [17]:
# Row 9 is the multilingual MiniLM model; the table display truncates its name,
# so read the full identifier with a single label-based lookup.
supported_models.loc[9, 'model']

'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

In [61]:
# documents: List[str] = [
#     "passage: Hello, World!",
#     "query: Hello, World!",
#     "passage: This is an example passage.",
#     "fastembed is supported by and maintained by Qdrant."
# ]
# embedding_model = TextEmbedding('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# embeddings: List[np.ndarray] = list(embedding_model.embed(documents))


In [62]:
# np.array(embeddings).shape 

In [34]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer


In [35]:
# Sentence-embedding model used for both the emoji names and the search query.
# The progress bars recorded below suggest the model files are fetched when this
# cell first runs.
encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [29]:
# Qdrant client in local ":memory:" mode.
# NOTE(review): presumably nothing is persisted between kernel restarts, so the
# collection has to be rebuilt on every run - confirm against the qdrant-client docs.
client = QdrantClient(":memory:")

In [32]:
# Sample book records (name / description / author / year).
# NOTE(review): no other cell visible in this notebook reads `documents`; the
# "my_books" collection is filled from `emojis` instead - confirm whether this
# sample data is still needed.
documents = [
    {
        "name": "The Time Machine",
        "description": "A man travels through time and witnesses the evolution of humanity.",
        "author": "H.G. Wells",
        "year": 1895,
    },
    {
        "name": "Ender's Game",
        "description": "A young boy is trained to become a military leader in a war against an alien race.",
        "author": "Orson Scott Card",
        "year": 1985,
    },
    {
        "name": "Brave New World",
        "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",
        "author": "Aldous Huxley",
        "year": 1932,
    },
    {
        "name": "The Hitchhiker's Guide to the Galaxy",
        "description": "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",
        "author": "Douglas Adams",
        "year": 1979,
    },
    {
        "name": "Dune",
        "description": "A desert planet is the site of political intrigue and power struggles.",
        "author": "Frank Herbert",
        "year": 1965,
    },
    {
        "name": "Foundation",
        "description": "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",
        "author": "Isaac Asimov",
        "year": 1951,
    },
    {
        "name": "Snow Crash",
        "description": "A futuristic world where the internet has evolved into a virtual reality metaverse.",
        "author": "Neal Stephenson",
        "year": 1992,
    },
    {
        "name": "Neuromancer",
        "description": "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",
        "author": "William Gibson",
        "year": 1984,
    },
    {
        "name": "The War of the Worlds",
        "description": "A Martian invasion of Earth throws humanity into chaos.",
        "author": "H.G. Wells",
        "year": 1898,
    },
    {
        "name": "The Hunger Games",
        "description": "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",
        "author": "Suzanne Collins",
        "year": 2008,
    },
    {
        "name": "The Andromeda Strain",
        "description": "A deadly virus from outer space threatens to wipe out humanity.",
        "author": "Michael Crichton",
        "year": 1969,
    },
    {
        "name": "The Left Hand of Darkness",
        "description": "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",
        "author": "Ursula K. Le Guin",
        "year": 1969,
    },
    {
        "name": "The Three-Body Problem",
        "description": "Humans encounter an alien civilization that lives in a dying system.",
        "author": "Liu Cixin",
        "year": 2008,
    },
]


In [79]:
import unicodedata

def get_country_flags():
    """Build one record for every pair of regional-indicator symbols.

    All 26 x 26 letter combinations are produced, whether or not the pair is
    an assigned country code.

    Returns:
        list[dict]: 676 dicts ordered AA, AB, ..., ZZ with the keys
            ``'code'`` (first code point * 0x10000 + second code point),
            ``'char'`` (the two-symbol sequence) and ``'name'`` ("Flag of XY").
    """
    first_indicator = 0x1F1E6  # regional indicator symbol for the letter A
    letter_a = 65  # ord('A')
    return [
        {
            'code': (first_indicator + row) * 0x10000 + (first_indicator + col),
            'char': chr(first_indicator + row) + chr(first_indicator + col),
            'name': f"Flag of {chr(letter_a + row)}{chr(letter_a + col)}",
        }
        for row in range(26)
        for col in range(26)
    ]

def get_standard_emojis():
    """Collect every named character in the emoji blocks (flags excluded).

    Returns:
        list[dict]: one dict with the keys ``'code'``, ``'char'`` and ``'name'``
            per code point that has a Unicode name, in the order the blocks
            are listed below. ``'name'`` is the name exactly as returned by
            ``unicodedata.name``.
    """
    # Inclusive (first, last) code-point bounds; country flags are not covered.
    blocks = (
        (0x1F600, 0x1F64F),  # Emoticons
        (0x1F300, 0x1F5FF),  # Miscellaneous Symbols and Pictographs
        (0x1F680, 0x1F6FF),  # Transport and Map Symbols
        (0x1F700, 0x1F77F),  # Alchemical Symbols
        (0x2600, 0x26FF),    # Miscellaneous Symbols
        (0x2700, 0x27BF),    # Dingbats
        (0x2B50, 0x2BFF),    # Additional symbols
    )
    records = []
    for first, last in blocks:
        for code_point in range(first, last + 1):
            symbol = chr(code_point)
            # Code points without a name yield the None default and are skipped.
            label = unicodedata.name(symbol, None)
            if label is not None:
                records.append({'code': code_point, 'char': symbol, 'name': label})
    return records

# Single-code-point emoji first, followed by the two-symbol flag sequences.
emojis = [*get_standard_emojis(), *get_country_flags()]

# Show only a ten-record sample instead of dumping the whole list.
print(emojis[:10])

[{'code': 128512, 'char': '😀', 'name': 'GRINNING FACE'}, {'code': 128513, 'char': '😁', 'name': 'GRINNING FACE WITH SMILING EYES'}, {'code': 128514, 'char': '😂', 'name': 'FACE WITH TEARS OF JOY'}, {'code': 128515, 'char': '😃', 'name': 'SMILING FACE WITH OPEN MOUTH'}, {'code': 128516, 'char': '😄', 'name': 'SMILING FACE WITH OPEN MOUTH AND SMILING EYES'}, {'code': 128517, 'char': '😅', 'name': 'SMILING FACE WITH OPEN MOUTH AND COLD SWEAT'}, {'code': 128518, 'char': '😆', 'name': 'SMILING FACE WITH OPEN MOUTH AND TIGHTLY-CLOSED EYES'}, {'code': 128519, 'char': '😇', 'name': 'SMILING FACE WITH HALO'}, {'code': 128520, 'char': '😈', 'name': 'SMILING FACE WITH HORNS'}, {'code': 128521, 'char': '😉', 'name': 'WINKING FACE'}]


In [83]:
def print_flag(country_code):
    """Return the flag emoji for a two-letter country code.

    Despite its name the function prints nothing; it returns the string.

    Args:
        country_code: Two ASCII letters in either case, e.g. ``'US'`` or ``'de'``.

    Returns:
        str: The two regional-indicator symbols that make up the flag sequence.

    Raises:
        AssertionError: If ``country_code`` is not exactly two ASCII letters.
    """
    is_valid = len(country_code) == 2 and all(
        'A' <= ch <= 'Z' or 'a' <= ch <= 'z' for ch in country_code
    )
    if not is_valid:
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``;
        # the exception type matches the original assert-based validation.
        raise AssertionError("Country code must be two alphabetical characters")
    base = 0x1F1E6  # regional indicator symbol for the letter A
    # Upper-case first: the offset is measured from 'A', so lower-case letters
    # would otherwise land outside the regional-indicator block.
    return ''.join(chr(base + ord(ch) - ord('A')) for ch in country_code.upper())

# Example usage (print_flag returns the string; a notebook cell only displays
# its last expression, so just the 'DE' result appears in the output):
print_flag('US')  # Returns: 🇺🇸
print_flag('DE')  # Returns: 🇩🇪


'🇩🇪'

In [74]:
# Spot-check a single record of `emojis`.
# NOTE(review): the recorded output has a capitalized name ('Wind chime'), which
# matches the scan cell that applies name.capitalize(), not the names produced
# by get_standard_emojis() - presumably this cell ran before `emojis` was
# rebuilt; confirm with Restart & Run All.
emojis[765]

{'code': 127888, 'char': '🎐', 'name': 'Wind chime'}

In [39]:
# Start from an empty collection. `recreate_collection` is deprecated in recent
# qdrant-client releases (it triggered the warning recorded in this cell's
# output), so drop any existing collection explicitly and create it again.
if client.collection_exists(collection_name="my_books"):
    client.delete_collection(collection_name="my_books")

client.create_collection(
    collection_name="my_books",
    vectors_config=models.VectorParams(
        # The stored vector size is taken from the encoder's embedding dimension.
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)


  client.recreate_collection(


True

In [63]:
# Embed all emoji names in one batched call instead of one `encode` call per
# emoji; `encode` on a list yields one vector per input, in input order.
name_vectors = encoder.encode([emoji["name"] for emoji in emojis])

# One point per emoji: the list position is the id, the full record the payload.
client.upload_points(
    collection_name="my_books",
    points=[
        models.PointStruct(id=idx, vector=vector.tolist(), payload=emoji)
        for idx, (emoji, vector) in enumerate(zip(emojis, name_vectors))
    ],
)


In [72]:
# Embed the free-text query with the same encoder that produced the stored
# vectors and ask the collection for at most 10 matching points.
hits = client.search(
    collection_name="my_books",
    query_vector=encoder.encode("beach").tolist(),
    limit=10,
)

# Show only the emoji glyph stored in each hit's payload.
for hit in hits:
    print(hit.payload['char']) #, "score:", hit.score


🏖
👙
🏝
🏄
🏜
🐬
🐋
📟
♁
♆
