In [None]:
import json
import os
import numpy as np
import itertools
import random

from transformers import PreTrainedTokenizerFast, AutoTokenizer, AutoConfig
from transformers import XLMRobertaTokenizerFast
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
import notebook_utils as nbu

In [None]:
# Notebook-wide configuration: tokenizer location/type, languages, alphas, vocab sizes.
tokenizer_dir = "../../../tokenizers"
tok_type = "sp-unigram"

# Monolingual language codes; the multilingual entry is named by joining them with '-'.
languages = ('ar', 'tr', 'zh', 'el', 'es', 'en')
multil_string = '-'.join(languages)
all_languages = (*languages, multil_string)

# Alpha values are kept as strings (they are passed on verbatim when loading
# token statistics) and additionally as a float array for numeric use.
alphas = ('0.0', '0.25', '0.5', '0.75', '1.0')
alphas_num = np.fromiter((float(alpha) for alpha in alphas), dtype=float)

# Vocabulary sizes: per-language tokenizers vs. the joint multilingual one.
NV_mono = 20_000
NV_multi = 120_000

# Explore the tokenizer vocabularies

In [None]:
# Load token occurrences for every (alpha, language) pair.
# token_stats[alpha][lang] holds whatever nbu.get_token_frequencies returns;
# later cells iterate it with .items() as (token, freq) pairs.
token_stats = {alpha_key: {} for alpha_key in alphas}
for alpha, lang in itertools.product(alphas, all_languages):
    # Only the joint multilingual entry is not listed in `languages`;
    # it uses the larger vocabulary size.
    NV = NV_multi if lang not in languages else NV_mono
    token_stats[alpha][lang] = nbu.get_token_frequencies(tokenizer_dir, tok_type, lang, alpha, NV)

In [None]:
import csv

# Load the Unicode block table. Each row is read as
#   <first code point, hex> TAB <last code point, hex> TAB <block name>
# and stored as (int, int, str).
# newline='' is the mode the csv module asks for, and the explicit encoding
# keeps the result independent of the platform's default locale.
with open('unicode_blocks.tsv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    # convert hex strings to int; csv yields [] for blank lines, which are
    # skipped instead of failing with an IndexError
    unicode_blocks = [(int(row[0], 16), int(row[1], 16), row[2]) for row in reader if row]
len(unicode_blocks)

In [None]:
# compute character occurrences from the token occurrences
from collections import Counter

def count_stats(token_stats, process_token_fn, skip_tokens):
    """Aggregate token frequencies into one Counter per (alpha, language).

    Args:
        token_stats: nested mapping ``alpha -> lang -> {token: freq}``.
        process_token_fn: callable ``(token, freq)`` returning a mapping
            ``key -> count``; it is merged into the counter of the current
            (alpha, language) with ``Counter.update``, i.e. counts are added.
        skip_tokens: container of tokens that are ignored entirely.

    Returns:
        Nested dict ``alpha -> lang -> Counter`` with the same alpha and
        language keys as ``token_stats``.
    """
    stats = {}
    for alpha, per_lang in token_stats.items():
        stats[alpha] = {}
        for lang, token_freqs in per_lang.items():
            counter = Counter()
            for token, freq in token_freqs.items():
                if token in skip_tokens:
                    continue
                # The frequency is passed in explicitly: the callback cannot
                # see this function's local variables.
                counter.update(process_token_fn(token, freq))
            stats[alpha][lang] = counter
    return stats


def _char_counts(token, freq):
    """Return ``{char: freq * n}`` where n is how often char occurs in token."""
    counts = Counter()
    for char in token:
        # Every occurrence counts, so a repeated character inside one token
        # contributes the token frequency once per repetition.
        counts[char] += freq
    return counts


def _group_by_unicode_block(char_counts):
    """Sum character counts per Unicode block (as named by nbu.get_unicode_block)."""
    per_block = Counter()
    for char, freq in char_counts.items():
        per_block[nbu.get_unicode_block(char)] += freq
    return per_block


# Sentence-boundary markers are not text and are excluded from the statistics.
special_tokens = ["<s>", "</s>"]

char_stats = count_stats(token_stats, _char_counts, skip_tokens=special_tokens)

# Block-level statistics are derived from the character-level ones, so the
# block lookup runs once per distinct character rather than once per token.
char_stats_unicode_blocks = {
    alpha: {lang: _group_by_unicode_block(char_counts) for lang, char_counts in per_lang.items()}
    for alpha, per_lang in char_stats.items()
}


# for lang in languages: # languages = ('ar', 'tr', 'zh','el', 'es', 'en')
#     vocab = mono_tokenizers[f"alpha1.0"][lang].get_vocab()
#     tokens = list(vocab.keys())
#     lang_stats = Counter()
#     for token in tokens:
#         for char in token:
#             # print(ord(char))
#             lang_stats[ord(char)] += 1
#             all_stats[ord(char)] += 1

#     # plot histplot of lang_stats using seaborn
#     print(f"Language: {lang}")
#     # print(max(lang_stats.keys()))
#     # print(max(lang_stats.values()))
#     # sns.set_theme(style="whitegrid")
#     # ax = sns.histplot(x=np.arange(num_utf8_chars), y=lang_stats, bins=100, log_scale=(False, False))
#     # ax.set(xlabel='unicode code', ylabel='Number of characters', title=f"Character frequency in {lang} vocabulary")
#     # plt.show()

In [None]:
# count number of characters in each block
from collections import Counter

# `all_stats` maps a Unicode code point to the number of times it appears in
# the monolingual vocabularies. It is rebuilt here so the cell runs on a fresh
# kernel: every character of every token counts once, for each monolingual
# language at alpha 1.0 (the setting named in the older, commented-out cell).
# NOTE(review): assumes the keys of token_stats[alpha][lang] are the vocabulary
# tokens - confirm against nbu.get_token_frequencies.
stats_alpha = '1.0'
all_stats = Counter()
for lang in languages:
    for token in token_stats[stats_alpha][lang]:
        all_stats.update(ord(char) for char in token)

# Single pass over the code points that actually occur: add each one to the
# first block whose [start, end] range contains it, and remember the character
# so examples can be drawn without scanning every code point of every block.
block_counts = [0] * len(unicode_blocks)
block_chars = [[] for block in unicode_blocks]
for code_point, count in all_stats.items():
    for i, block in enumerate(unicode_blocks):
        if block[0] <= code_point <= block[1]:
            block_counts[i] += count
            block_chars[i].append(chr(code_point))
            break

# create examples of characters in each block: up to 10, in random order
block_examples = []
for all_used in block_chars:
    random.shuffle(all_used)
    block_examples.append(all_used[:10])

for block, count, example in zip(unicode_blocks, block_counts, block_examples):
    print(f"{block[2]}, occurence {count} {example}")

In [None]:
import matplotlib
import warnings

# Horizontal bar plot of per-block character counts (seaborn), log-scaled.
# Blocks whose count does not exceed `filter_n` are left out.
filter_n = 20
frequent_blocks = [
    (block, count, ex)
    for block, count, ex in zip(unicode_blocks, block_counts, block_examples)
    if count > filter_n
]
# Label = block name followed by its example characters in parentheses.
filtered_block_names = [f"{block[2]} ({''.join(ex)})" for block, count, ex in frequent_blocks]
filtered_block_counts = [count for block, count, ex in frequent_blocks]

# Font fallback chain so that the example characters of many scripts can be drawn.
# sns.set_theme(style="whitegrid", )
fallback_fonts = [
    'DejaVu Sans', 'Droid Sans', 'Droid Sans Fallback', 'DejaVu Sans Mono',
    'Noto Sans Arabic', 'Noto Sans Devanagari', 'STIXNonUnicode', 'EB Garamond', 'Noto Sans Symbols2',
]
matplotlib.rcParams['font.family'] = fallback_fonts
# sns.set(font='Comic Sans MS')

# NOTE(review): the second call takes precedence over the first, so warnings
# are effectively left at "default" while plotting - confirm this is intended.
warnings.filterwarnings("ignore")
warnings.filterwarnings("default")

plt.figure(figsize=(10, 20))
ax = sns.barplot(y=filtered_block_names, x=filtered_block_counts, log=True)
ax.set_ylabel('unicode block')
ax.set_xlabel('Number of characters')
ax.set_title("Character frequency in all vocabulary")
plt.xticks(rotation=90)
plt.show()



In [None]:
# in this cell we can search for a font that contains a specific character

from fontTools.ttLib import TTFont
import matplotlib.font_manager as mfm

def char_in_font(unicode_char, font):
    """Tell whether `font` maps the single character `unicode_char` to a glyph.

    Looks through the sub-tables of the font's 'cmap' table and considers only
    those that report themselves as Unicode. Returns True as soon as one of
    them contains the character's code point, otherwise False.
    """
    return any(
        table.isUnicode() and ord(unicode_char) in table.cmap
        for table in font['cmap'].tables
    )

# Character to look for; swap in any other single character, e.g.
# uni_char =  u"诋"
uni_char = u"\N{SYRIAC LETTER SHIN}"
print(uni_char)

# (file path, family name) of every font matplotlib's font manager knows about
font_info = [(f.fname, f.name) for f in mfm.fontManager.ttflist]

for font_path, font_name in font_info:
    # fontNumber=0 picks the first face of a font collection (.ttc/.otc),
    # which fontTools otherwise refuses to open; it is ignored for
    # single-font files. The context manager closes the font file again, so
    # scanning all installed fonts does not accumulate open file handles.
    with TTFont(font_path, fontNumber=0) as font:
        if char_in_font(uni_char, font):
            print(font_path, font_name)