In [1]:
LONG_TEXT = """Text literals and metacharacters make up this string. The compile function is used to create the pattern."""

In [2]:
from icecream import ic

In [3]:
import os, sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
from StringSpans import StringSpans
from SemanticMasking import MaskGen, SemanticPositions

In [4]:
#@title random_bit_stream
import random

def random_bit_stream(length=None):
    """Return a random string of zeros and ones of the given length (default: random integer between 0 and 100)."""
    if length is None:
        length = random.randint(0, 100)
    return ''.join(str(random.randint(0, 1)) for _ in range(length))
def int_to_binary_string(n: int, length: int):
    binary_str = bin(n)[2:]  # convert to binary string, remove '0b' prefix
    padded_str = binary_str.rjust(length, '0')  # pad with zeros to length
    return padded_str

## Rule Based


In [None]:
#@title https://github.com/farkmarnum/emojify
import json
import random
from math import log2,floor,ceil 
import itertools
import re
from typing import Dict, Generator, List, Tuple

with open('./emoji-data.json', 'r') as f:
    emoji_data: Dict[str,Dict[str,List[str]]] = json.load(f)

In [None]:
#@title Constants
regex = re.compile(r'[a-z0-9]+')
ALL_EMOJIS = set()
for k,v in emoji_data.items():
  if regex.match(k) is None:
    ALL_EMOJIS.add(k)
    # print('k',k)
  if isinstance(v,str) and regex.match(v) is None:
    ALL_EMOJIS.add(v)
    # print('v',v)
  else:
    for kk,vv in v.items():
      if regex.match(kk) is None:
        ALL_EMOJIS.add(kk)
        # print('kk',kk)
      if isinstance(vv,str) and regex.match(vv) is None:
        ALL_EMOJIS.add(v)
        # print('vv',vv)
EMOJIER_COMMON_WORDS = {
    'a',
    'an',
    'as',
    'is',
    'if',
    'of',
    'the',
    'it',
    'its',
    'or',
    'are',
    'this',
    'with',
    'so',
    'to',
    'at',
    'was',
    'and',
  }

In [None]:
#@title encode decode
class Emojier:
  @staticmethod
  def gaussian_order(lst):
    length = len(lst)
    max_odd_ind = length - 1 if length % 2 == 0 else length - 2
    max_even_ind = length - 1 if length % 2 != 0 else length - 2
    dist = itertools.chain(range(max_odd_ind,0,-2),range(0,max_even_ind + 1 , 2))
    return [lst[i] for i in dist]

  @staticmethod
  def encode(
        input_str: str,
        bytes_str: str,
        verbose=False,
        mask=True,
        maskStep: int =6,
        topX=False,
        X: float=0.15
    ) -> Tuple[str,str]:
    
    if verbose:
      print('encode:')
    input_str_spans = StringSpans(input_str)
    word_span_n_words = zip(input_str_spans.words, input_str_spans.get_words())
    result = input_str
    acc_offset = 0
    
    word_span_n_words_options: List[Tuple[int,str,List[str]]] = []
    for (_,we), word_raw in word_span_n_words:
      word = word_raw.lower()
      is_too_common = word in EMOJIER_COMMON_WORDS

      emoji_options = \
        Emojier.gaussian_order( ['']+
          [x[0] for x in
            sorted(
              emoji_data.get(word, {}).items(),
              key=lambda x:x[1],
              reverse=True
            )
          ]
        )
      if not is_too_common and len(emoji_options)>=2:
        word_span_n_words_options.append((we,word_raw, emoji_options))
    
    if mask:
      word_span_n_words_options = word_span_n_words_options[::maskStep]
    if topX:
      word_span_n_words_options.sort(key=lambda tup : len(tup[2]),reverse=True)
      taken_elements = ceil(len(word_span_n_words_options) * X) 
      word_span_n_words_options = word_span_n_words_options[:taken_elements]
      
    for we, word_raw, emoji_options in word_span_n_words_options:
      word = word_raw.lower()

      if verbose:
        print(f"word: {word} \nlen: {len(emoji_options)} \temoji_options[:10]: {emoji_options[:10]}")

      bits = floor(log2(len(emoji_options)))
      taken_bits = bytes_str[:bits]
      ind = int(taken_bits, 2)
      bytes_str = bytes_str[bits:]
      emojis = emoji_options[ind]
      if len(emojis) > 0:
        we = we + acc_offset
        acc_offset += len(emojis) + 1
        if verbose:
          print(f'>>>encoding {taken_bits} = {ind} as {emojis}\nwe={we}\tacc_offset={acc_offset}')
          print(f'result[:we]="{result[:we]}" result[we:]="{result[we:]}"')  
        result = f'{result[:we]} {emojis}{result[we:]}'  

    return result, bytes_str

  @staticmethod
  def eat_back(s:str) -> Generator[str,None,None]:
    for i in range(len(s),-1,-1):
      yield s[0:i]
  @staticmethod
  def decode(
            input_str: str,
            verbose=False,
            mask: bool =True,
            maskStep: int =6,
            topX: bool =False,
            X: float=0.15
      ) -> Tuple[str,str]:
    
    if verbose:
      print('decoding!')
    wordish = re.compile(r'^[a-z]*$')
    input_str_ss = StringSpans(input_str)
    words = [input_str[s:e] for s,e in input_str_ss.non_spaces]
    result = input_str
    bytes_str = ''
    
    emoticons_used = []
    word_span_n_words_options: List[Tuple[int,str,List[str]]] = []
    for i, word_raw in enumerate(words[:-1]):
      word = word_raw.lower()
      
      if wordish.match(word) is None:
        continue 

      is_too_common = word in EMOJIER_COMMON_WORDS

      emoji_options = \
        Emojier.gaussian_order( ['']+
          [x[0] for x in
            sorted(
              emoji_data.get(word, {}).items(),
              key=lambda x:x[1],
              reverse=True
            )
          ]
        )
      if not is_too_common and len(emoji_options) >= 2:
        word_span_n_words_options.append((i,word_raw,emoji_options))

    if mask:
      word_span_n_words_options = word_span_n_words_options[::maskStep]
    if topX:
      word_span_n_words_options.sort(key=lambda tup : len(tup[2]),reverse=True)
      taken_elements = ceil(len(word_span_n_words_options) * X) 
      word_span_n_words_options = word_span_n_words_options[:taken_elements]
        
    for i, word_raw, emoji_options in word_span_n_words_options:
      word = word_raw.lower()

      if verbose:
        print(f"word: {word} \nlen: {len(emoji_options)} \temoji_options[:10]: {emoji_options[:10]}")

      bits = floor(log2(len(emoji_options)))
      index = 0
      for w in Emojier.eat_back(words[i+1]):
        if w in emoji_options:
          index = emoji_options.index(w)
          emoticons_used.append((w,i+1))
          break
        
      data_extracted = int_to_binary_string(index,bits)
      if verbose:
        print(f'>>>decoding word:"{words[i]}" next word:"{words[i+1]}" length:"{len(emoji_options)}"')
        print(f'bits:"{bits}" data extracted:"{data_extracted}" index:"{index}"')
      bytes_str += data_extracted

    for emo,idx in reversed(emoticons_used):
      s,e = input_str_ss.non_spaces[idx]
      if emo:
        result = result[:s-1] + result[s:e].replace(emo,'') + result[e:]
  
    return result, bytes_str


tests = 100
acc = 0
onlyRatio = False
print(f"Running {tests} tests")
for i in range(tests):
  data = random_bit_stream(60)
  # text = 'hi, how are you?'
  LONG_TEXT = '''Metaphysical solipsism is a variety of solipsism. Based on a philosophy of subjective idealism, metaphysical solipsists maintain that the self is the only existing reality and that all other realities, including the external world and other persons, are representations of that self, and have no independent existence.[citation needed] There are several versions of metaphysical solipsism, such as Caspar Hare's egocentric presentism (or perspectival realism), in which other people are conscious, but their experiences are simply not present.'''
  text = LONG_TEXT
  verbose = False
  encoded_text,rem = Emojier.encode(text,data,verbose=verbose)
  if not onlyRatio:
    print('rem=',rem)
    print('encoded_text=',encoded_text)
  original_txt, deData = Emojier.decode(encoded_text,verbose=verbose)
  if not onlyRatio:
    print('original_txt=',original_txt)
  deData += rem
  if not onlyRatio:
    print(f'text="{text}"\n->\nencoded_text="{encoded_text}" \ndata="{data}"\ndeData="{deData}"\ndata==deData="{data==deData}"')
  ratio =(len(data)-len(rem)) / len(text)
  acc += ratio
  if not onlyRatio:
    print(f'ratio={len(data)-len(rem)} / {len(text)}={ratio}')
  assert data==deData
  assert text==original_txt
  if not onlyRatio:
    print('\n')
    print("#"*100)
    print('\n')

print(f'avg ratio = {acc/tests}')
print(f'old ratio = 0.8235294117648159')

# 0000

In [None]:
# https://huggingface.co/spaces/adorkin/BilingualEmojiPredictor/blob/main/app.py

%pip install transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch

BASE_MODEL = "amazon-sagemaker-community/xlm-roberta-en-ru-emoji-v2"
TOP_N = 5

model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
    
def get_top_emojis(text):
    preprocessed = preprocess(text)
    inputs = tokenizer(preprocessed, return_tensors="pt")
    preds = model(**inputs).logits
    scores = torch.nn.functional.softmax(preds, dim=-1).detach().numpy()
    sorted_scores = [float(value) for value in np.sort(scores.squeeze())[::-1]]
    ranking = np.argsort(scores)
    ranking = ranking.squeeze()[::-1]
    emojis = [model.config.id2label[i] for i in ranking]
    return dict(zip(emojis, sorted_scores))

get_top_emojis(preprocess('I’ve toyed with tghe idea of usuing GPT-3’s API to add much more intelligent capabilirties to RC'))

## NN Based

In [5]:
from icecream import ic

In [6]:
labels = ['❤', '😍', '📷', '🇺🇸', '☀', '💜', '😉', '💯', '😁', '🎄', '📸', '😜', '😂', '☹️', '😭', '😔', '😡', '💢', '😤', '😳', '🙃', '😩', '😠', '💕', '🙈', '🙄', '🔥', '😊', '😎', '✨', '💙', '😘']


In [20]:
import csv
import itertools
import re
import urllib.request
from math import floor, log2
from typing import Any, Generator, List

import numpy as np
import torch
from icecream import ic
from scipy.special import softmax  # type: ignore
from transformers import AutoModelForSequenceClassification  # type: ignore
from transformers import AutoTokenizer  # type: ignore
from transformers import TFAutoModelForSequenceClassification  # type: ignore


def pre_texts(string:str)->Generator[str, Any, None]:
  spans = [x.span() for x in re.finditer(r'(\s)+', string)]
  for span in spans:
    yield string[0:span[0]]
  if spans[-1][1] != len(string):
    yield string



def gaussian_order(lst):
    length = len(lst)
    max_odd_ind = length - 1 if length % 2 == 0 else length - 2
    max_even_ind = length - 1 if length % 2 != 0 else length - 2
    dist = itertools.chain(range(max_odd_ind, 0, -2), range(0, max_even_ind + 1, 2))
    return [lst[i] for i in dist]


class Emojier:
  BASE_MODEL = "amazon-sagemaker-community/xlm-roberta-en-ru-emoji-v2"
  model: Any = None
  tokenizer: Any = None

  def setVerbose(self, v: bool):
    self.verbose = v
    return self

  def __init__(self, tokenizer, model, interval: int):
    self.model = model
    self.tokenizer = tokenizer
    self.interval = interval
    self.verbose = False

  def predict(self, text: str):
    inputs = self.tokenizer(text, return_tensors="pt")
    outputs = self.model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    predicted_class = logits.argmax()
    return predicted_class
    
  def preprocess(self,text:str):
      new_text = []
      for t in text.split(" "):
          t = '@user' if t.startswith('@') and len(t) > 1 else t
          t = 'http' if t.startswith('http') else t
          new_text.append(t)
      return " ".join(new_text)
    
  def _predict(self,text:str) -> List[str]:
    # Preprocess text (username and link placeholders)
    preprocessed = self.preprocess(text)
    inputs = self.tokenizer(preprocessed, return_tensors="pt")
    preds = self.model(**inputs).logits
    scores = torch.nn.functional.softmax(preds, dim=-1).detach().numpy()
    # sorted_scores = [float(value) for value in np.sort(scores.squeeze())[::-1]]
    ranking = np.argsort(scores)
    ranking = ranking.squeeze()[::-1]
    emojis = [self.model.config.id2label[i] for i in ranking]
    # return dict(zip(emojis, sorted_scores))
    return list(filter(lambda x : x != '🇺🇸',emojis))
  
  def encode(self,text:str,bytes_str:str):
    mask = MaskGen(text)
    ticks = [text[:v] for u,v in mask.NVA_words]
    original_length = len(text)
    new_ending = lambda x : (len(text) - original_length) + len(x)
    for pre_text in ticks:
      breakPoint = new_ending(pre_text)
      emoji_options = gaussian_order(self._predict(text[:breakPoint]))

      if bytes_str[0] == "0":
        bytes_str = bytes_str[1:]
        continue
      if self.verbose:
        print(f"word: {pre_text} \nlen: {len(emoji_options)} \temoji_options: {emoji_options}")

      bits = floor(log2(len(emoji_options)))
      taken_bits = bytes_str[:bits]
      ind = int(taken_bits, 2)
      bytes_str = bytes_str[bits:]
      emojis = emoji_options[ind]

      # Mutliplicity
      taken_bits = bytes_str[:2]
      mult = int(taken_bits, 2)+1
      bytes_str = bytes_str[2:]
      
      if len(emojis) > 0:
        text = f'{text[0:breakPoint]} {mult * emojis}{text[breakPoint:]}'
      if self.verbose:
        print(f'>>>encoding {taken_bits} = {ind} as {emojis}\nencoded text={text}')
    return text, bytes_str
  @staticmethod
  def int_to_binary_string(n: int, length: int) -> str:
    binary_str = bin(n)[2:]  # convert to binary string, remove '0b' prefix
    padded_str = binary_str.rjust(length, '0')  # pad with zeros to length
    return padded_str
  @staticmethod
  def cntPrefix(string:str, prefix:str):
    count = 0
    prefix_len = len(prefix)
    i = 0
    while i < len(string):
        if string[i:i+prefix_len] == prefix:
            count += 1
            i += prefix_len
        else:
            i += 1
    return count
  def decode(self,text:str):
    bytes_str = ''
    mask = MaskGen(text)
    ticks = [text[:v] for u,v in mask.NVA_words]
    original_length = len(text)
    new_ending = lambda x : (len(text) - original_length) + len(x)
    emoji_locations = []
    for pre_text in ticks:
      breakPoint = new_ending(pre_text)

      emoji = \
          [label for label in labels if text[breakPoint:].startswith(' '+label)][0] \
            if any((text[breakPoint:].startswith(' '+label) for label in labels)) \
              else None
      if emoji is not None:
        emoji_options = gaussian_order(self._predict(text[:breakPoint]))
        bits = floor(log2(len(emoji_options)))
        idx = emoji_options.index(emoji)
        bytes_str += Emojier.int_to_binary_string(idx,bits)
        # Multiplicity
        multi = Emojier.cntPrefix(text[breakPoint+1:],emoji) 
        bytes_str += Emojier.int_to_binary_string(multi-1,2)
        emoji_locations.append((breakPoint, breakPoint + 1 + len(emoji)*multi))
        if self.verbose:
          print(f"word: {pre_text} \nlen: {len(emoji_options)} \temoji_options: {emoji_options}")
          print(f"emoji: {emoji} \nlen: {len(emoji)} \tmulti: {multi}")
      else:
        bytes_str += '0'
    # remove emojies
    original = text
    for s,e in reversed(emoji_locations):
      original = original[:s] + original[e:]
    return original, bytes_str

In [21]:
emo = Emojier(tokenizer,model,4).setVerbose(True)

In [22]:
emo.verbose

True

In [23]:
emo.encode("hi, how are you?","1000001001")

('hi, how are you?', '1000001001')

In [24]:
encoded, rem = emo.encode("tokenization is the pain of my existence","1000001001010101001101010101")

word: tokenization 
len: 31 	emoji_options: ['🙈', '😳', '☀', '🎄', '😔', '☹️', '📷', '😘', '❤', '💕', '😍', '🔥', '💯', '😎', '😊', '✨', '😂', '😁', '😜', '😉', '💜', '💙', '😤', '📸', '💢', '😡', '😭', '🙄', '😠', '🙃', '😩']
>>>encoding 00 = 8 as ❤
encoded text=tokenization ❤ is the pain of my existence
word: tokenization is the pain 
len: 31 	emoji_options: ['💕', '📷', '😎', '💙', '📸', '❤', '✨', '☀', '😠', '🙈', '😳', '😂', '😡', '💢', '☹️', '😔', '😭', '😤', '🙄', '😩', '🙃', '💯', '🔥', '😊', '💜', '😜', '😉', '😁', '😘', '🎄', '😍']
>>>encoding 01 = 9 as 🙈
encoded text=tokenization ❤ is the pain 🙈🙈 of my existence


In [25]:
emo.decode(encoded)

word: tokenization 
len: 31 	emoji_options: ['🙈', '😳', '☀', '🎄', '😔', '☹️', '📷', '😘', '❤', '💕', '😍', '🔥', '💯', '😎', '😊', '✨', '😂', '😁', '😜', '😉', '💜', '💙', '😤', '📸', '💢', '😡', '😭', '🙄', '😠', '🙃', '😩']
emoji: ❤ 
len: 1 	multi: 1
word: tokenization ❤ is the pain 
len: 31 	emoji_options: ['💕', '📷', '😎', '💙', '📸', '❤', '✨', '☀', '😠', '🙈', '😳', '😂', '😡', '💢', '☹️', '😔', '😭', '😤', '🙄', '😩', '🙃', '💯', '🔥', '😊', '💜', '😜', '😉', '😁', '😘', '🎄', '😍']
emoji: 🙈 
len: 1 	multi: 2


('tokenization is the pain of my existence', '1000001001010')