In [1]:
#!/usr/bin/env python3
import sys
import unicodedata

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import pickle

In [167]:
import emoji

In [24]:
from tqdm.notebook import tqdm

In [4]:
from fastembed import TextEmbedding

In [5]:
supported_models = (
    pd.DataFrame(TextEmbedding.list_supported_models())
    .sort_values("size_in_GB")
    .drop(columns="sources")
    .reset_index(drop=True)
)
supported_models

	

Unnamed: 0,model,dim,description,size_in_GB,model_file,additional_files
0,BAAI/bge-small-en-v1.5,384,Fast and Default English model,0.067,model_optimized.onnx,
1,BAAI/bge-small-zh-v1.5,512,Fast and recommended Chinese model,0.09,model_optimized.onnx,
2,sentence-transformers/all-MiniLM-L6-v2,384,"Sentence Transformer model, MiniLM-L6-v2",0.09,model.onnx,
3,snowflake/snowflake-arctic-embed-xs,384,Based on all-MiniLM-L6-v2 model with only 22m ...,0.09,onnx/model.onnx,
4,jinaai/jina-embeddings-v2-small-en,512,English embedding model supporting 8192 sequen...,0.12,onnx/model.onnx,
5,snowflake/snowflake-arctic-embed-s,384,"Based on infloat/e5-small-unsupervised, does n...",0.13,onnx/model.onnx,
6,BAAI/bge-small-en,384,Fast English model,0.13,model_optimized.onnx,
7,nomic-ai/nomic-embed-text-v1.5-Q,768,Quantized 8192 context length english model,0.13,onnx/model_quantized.onnx,
8,BAAI/bge-base-en-v1.5,768,"Base English model, v1.5",0.21,model_optimized.onnx,
9,sentence-transformers/paraphrase-multilingual-...,384,"Sentence Transformer model, paraphrase-multili...",0.22,model_optimized.onnx,


In [6]:
supported_models.loc[9]['model']

'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

In [7]:
with open('emoji2dict.pkl', 'rb') as file:
    emoji_dict = pickle.load(file)

In [8]:
for emoji in emoji_dict:
    emoji_dict[emoji]['emoji_cahr'] = emoji

In [9]:
emoji_dict[emoji]['emoji_cahr']

'🇦🇽'

In [10]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer


In [11]:
sentence_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')



In [12]:
client = QdrantClient(":memory:")

In [13]:
client.recreate_collection(
    collection_name="EMOJIS",
    vectors_config=models.VectorParams(
        size=sentence_encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)


  client.recreate_collection(


True

In [409]:
client.upload_points(
    collection_name="EMOJIS",
    points=[
        models.PointStruct(
            id=idx, 
            vector=sentence_encoder.encode(
                info["Description"] + 
                ' '.join(str(s) for s in info["Semantic_Tags"]) 
            ).tolist(),
            payload=emoji_dict[emoji]
        )
        for idx, (emoji, info) in tqdm(
            enumerate(emoji_dict.items()), 
            total=len(emoji_dict),
            desc="Embedding emoji descriptions",
            #ncols=100       
        )
    ],
)

Embedding emoji descriptions:   0%|          | 0/5034 [00:00<?, ?it/s]

In [414]:
def return_simialr_emojis(query):
    hits = client.search(
        collection_name="EMOJIS",
        query_vector=sentence_encoder.encode(query).tolist(),
        limit=40,
    )

    hit_emojis = set()

    for hit in hits:
        emoji_char = hit.payload['emoji_cahr']
        score = hit.score

        _ord = ''
        for c in emoji_char:
            _ord += str(ord(c)) + ' '

        _spec = len(emoji_char) + 3

        if emoji_char not in hit_emojis: 
            print(f"{emoji_char:<{_spec}}{score:<7.3f}{emoji.demojize(emoji_char):<55}")
                #emoji_char + '\t' +  , end= '\n') #, "score:", hit.score
            #display(Markdown('***{}***'.format(emoji_char)))
            
        hit_emojis.add(emoji_char)

    #     for i in today_links:
    # display(Markdown('*{}*'.format(i[0], i[1])))

        


In [415]:
return_simialr_emojis(
    "DNA Biologie Labor Forschung"
)

🧬   0.484  :dna:                                                  
👩🏾‍🔬   0.314  :woman_scientist_medium-dark_skin_tone:                
🅾   0.310  :O_button_(blood_type):                                
🅾️   0.310  :O_button_(blood_type):                                
👩‍🔬   0.306  :woman_scientist:                                      
🧪   0.299  :test_tube:                                            
👩🏿‍🔬   0.298  :woman_scientist_dark_skin_tone:                       
👨‍🔬   0.292  :man_scientist:                                        
👩🏽‍🔬   0.289  :woman_scientist_medium_skin_tone:                     
🅱   0.289  :B_button_(blood_type):                                
👨🏾‍🔬   0.289  :man_scientist_medium-dark_skin_tone:                  
🅰   0.287  :A_button_(blood_type):                                
👩🏻‍🔬   0.285  :woman_scientist_light_skin_tone:                      
🧑🏽‍🔬   0.284  :scientist_medium_skin_tone:                           
🆎   0.279  :AB_button_(blood_type):    

In [416]:
return_simialr_emojis(
    "جينات بيولوجيا بحث"
)

🧬   0.580  :dna:                                                  
🦠   0.413  :microbe:                                              
👨‍🔬   0.338  :man_scientist:                                        
👩🏾‍🔬   0.332  :woman_scientist_medium-dark_skin_tone:                
🧑‍🔬   0.325  :scientist:                                            
👩‍🔬   0.322  :woman_scientist:                                      
🧑🏽‍🔬   0.311  :scientist_medium_skin_tone:                           
🅾   0.304  :O_button_(blood_type):                                
🅾️   0.304  :O_button_(blood_type):                                
👩🏿‍🔬   0.303  :woman_scientist_dark_skin_tone:                       
🧑🏾‍🔬   0.298  :scientist_medium-dark_skin_tone:                      
🧑🏿‍🔬   0.297  :scientist_dark_skin_tone:                             
🅱   0.295  :B_button_(blood_type):                                
🧪   0.291  :test_tube:                                            
👨🏿‍🔬   0.290  :man_scientist_dark_skin_t

In [331]:
return_simialr_emojis("علوم الفلك  و الفضاء")

🪐   0.710     :ringed_planet:                                        
🔭   0.662     :telescope:                                            
🌌   0.652     :milky_way:                                            
🧑‍🚀   0.638     :astronaut:                                            
👨‍🚀   0.612     :man_astronaut:                                        
☄   0.593     :comet:                                                
☄️   0.566     :comet:                                                
🧑🏾‍🚀   0.563     :astronaut_medium-dark_skin_tone:                      
👩‍🚀   0.553     :woman_astronaut:                                      
👨🏻‍🚀   0.552     :man_astronaut_light_skin_tone:                        
🌒   0.542     :waxing_crescent_moon:                                 
👽   0.541     :alien:                                                
🧑🏿‍🚀   0.527     :astronaut_dark_skin_tone:                             
🧑🏻‍🚀   0.521     :astronaut_light_skin_tone:                            
👨

In [332]:
return_simialr_emojis("science astronomy space")

🔭   0.711     :telescope:                                            
🪐   0.683     :ringed_planet:                                        
🌌   0.622     :milky_way:                                            
☄️   0.565     :comet:                                                
☄   0.549     :comet:                                                
🧑‍🚀   0.544     :astronaut:                                            
👨‍🚀   0.520     :man_astronaut:                                        
🌒   0.489     :waxing_crescent_moon:                                 
🧑🏾‍🚀   0.480     :astronaut_medium-dark_skin_tone:                      
👨🏻‍🚀   0.473     :man_astronaut_light_skin_tone:                        
🛰️   0.471     :satellite:                                            
🛰   0.471     :satellite:                                            
🌘   0.471     :waning_crescent_moon:                                 
👩🏿‍🚀   0.471     :woman_astronaut_dark_skin_tone:                       
🧑🏻‍🚀 

In [417]:
return_simialr_emojis("winter holiday season")

⛄   0.691  :snowman_without_snow:                                 
☃   0.578  :snowman:                                              
☃️   0.578  :snowman:                                              
🎄   0.547  :Christmas_tree:                                       
🎅🏽   0.523  :Santa_Claus_medium_skin_tone:                         
🎅   0.510  :Santa_Claus:                                          
🤶🏻   0.507  :Mrs._Claus_light_skin_tone:                           
🤶🏾   0.506  :Mrs._Claus_medium-dark_skin_tone:                     
🧑🏻‍🎄   0.502  :mx_claus_light_skin_tone:                             
🤶🏽   0.499  :Mrs._Claus_medium_skin_tone:                          
🎅🏾   0.499  :Santa_Claus_medium-dark_skin_tone:                    
🤶🏼   0.496  :Mrs._Claus_medium-light_skin_tone:                    
🎅🏻   0.495  :Santa_Claus_light_skin_tone:                          
🤶   0.492  :Mrs._Claus:                                           
🎅🏿   0.492  :Santa_Claus_dark_skin_tone:           

In [388]:
return_simialr_emojis("virus health pandemic")
# mexican asian french italina food

🦠   0.559     :microbe:                                              
😷   0.544     :face_with_medical_mask:                               
🤒   0.456     :face_with_thermometer:                                
💉   0.374     :syringe:                                              
🤮   0.364     :face_vomiting:                                        
🌩️   0.355     :cloud_with_lightning:                                 
🌩   0.355     :cloud_with_lightning:                                 
🤢   0.347     :nauseated_face:                                       
🧑🏼‍⚕   0.340     :health_worker_medium-light_skin_tone:                 
🧑🏼‍⚕️   0.340     :health_worker_medium-light_skin_tone:                 
🥼   0.334     :lab_coat:                                             
🌧️   0.329     :cloud_with_rain:                                      
🌧   0.329     :cloud_with_rain:                                      
⛈️   0.328     :cloud_with_lightning_and_rain:                        
🦹   0.319 

In [428]:
return_simialr_emojis("new beginnings")
# ambition

🌅   0.443  :sunrise:                                              
🥚   0.429  :egg:                                                  
🌄   0.416  :sunrise_over_mountains:                               
👶   0.415  :baby:                                                 
🌚   0.415  :new_moon_face:                                        
🌱   0.415  :seedling:                                             
🌑   0.388  :new_moon:                                             
🔰   0.384  :Japanese_symbol_for_beginner:                         
🐣   0.377  :hatching_chick:                                       
🚪   0.359  :door:                                                 
🆕   0.332  :NEW_button:                                           
🌼   0.332  :blossom:                                              
👶🏽   0.329  :baby_medium_skin_tone:                                
👶🏼   0.325  :baby_medium-light_skin_tone:                          
👶🏻   0.324  :baby_light_skin_tone:                          

In [427]:
emoji_dict['🌱']

{'Emoji': '🌱',
 'Description': 'This emoji represents a seedling, symbolizing new beginnings, growth, and the cycle of life, often used to express hope, optimism, and environmental awareness.',
 'Semantic_Tags': ['seedling',
  'growth',
  'new beginnings',
  'life cycle',
  'nature',
  'environment',
  'hope',
  'optimism'],
 'emoji_cahr': '🌱'}