In [2]:
import string
from typing import List

import networkx as nx
from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString, normalizers
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFKC, Lowercase, Replace
from tokenizers.pre_tokenizers import PreTokenizer

from semiolog.util import subsequences

In [3]:
# Extra punctuation characters; appended to string.punctuation when the
# normalizer's punctuation-stripping pattern is built.
punctuation = "...—•…–’"

In [4]:
import semiolog as slg

# Load the "en_bnc_old_segments" semiolog model. Later cells read
# `semiotic.vocab.freq` (segmentation weights) and `semiotic.vocab.encode`
# (token -> id mapping). The recorded output below shows a TFBertForMaskedLM
# checkpoint being initialised during this call.
semiotic = slg.Cenematic("en_bnc_old_segments")

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at models/en_bnc_old_segments/paradigms/tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Using custom data configuration corpus-48c43d9b917d9e96
Reusing dataset text (/Users/Gianni/.cache/huggingface/datasets/text/corpus-48c43d9b917d9e96/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
100%|██████████| 3/3 [00:00<00:00, 26.93it/s]
All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at models/en_bnc_old_segments/paradigms/tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further trainin

In [5]:
class SequenceSLG:

    """
    Segment an unspaced character chain into vocabulary units.

    Every substring of the input that is a known vocabulary entry becomes an
    edge ``start -> end`` in a directed graph over character offsets. Each edge
    is weighted by ``rank ** zipf_factor``, where ``rank`` is the 1-based
    position of the entry in the vocabulary's iteration order (presumably
    decreasing frequency -- TODO confirm against ``semiotic.vocab.freq``).
    The segmentation is the minimum-weight path from offset 0 to ``len(string)``.

    The class exposes ``pre_tokenize`` so an instance can be used as a custom
    pre-tokenizer component of a ``tokenizers.Tokenizer``.
    """

    def __init__(self, semiotic, zipf_factor: float = .135) -> None:
        """
        Args:
            semiotic: object exposing ``vocab.freq``, a mapping whose keys are
                the known segments.
            zipf_factor: exponent applied to a segment's rank to obtain its
                edge weight. Defaults to the previously hard-coded 0.135.
        """
        self.zipf_factor = zipf_factor
        self.semiotic = semiotic
        # Private copy: chain2seq registers unseen characters in self.voc, and
        # that must not leak into the vocabulary shared through `semiotic`.
        self.voc = dict(self.semiotic.vocab.freq)

        # TODO: Zipf factor should (in principle) be computable following Mandelbrot (or not?)
        self.voc_rank = {
            segment: (position + 1) ** self.zipf_factor
            for position, segment in enumerate(self.voc)
        }

    def build_graph_data(
        self,
        string: str,
        voc: dict
        )-> List[tuple]:
        """
        List every known proper substring of ``string`` as a graph edge.

        Args:
            string: the character chain to segment.
            voc: container of known segments (only membership is tested).

        Returns:
            ``(start, end, {"label": substring})`` tuples, ordered by start
            then end offset. The span covering the whole string is never
            emitted, even when it is in ``voc``.
        """
        length = len(string)
        return [
            (start, end, {"label": string[start:end]})
            for start in range(length)
            for end in range(start + 1, length + 1)
            # `end - start < length` excludes the full-string span.
            if end - start < length and string[start:end] in voc
        ]

    def chain2seq(
        self, string:str
    ) -> List[tuple]:
        """
        Compute the best segmentation of ``string`` as consecutive offset pairs.

        Characters missing from the vocabulary are registered on the fly (in
        this instance's private copy) with the next free rank, so a path
        through the graph always exists.

        Args:
            string: the character chain to segment.

        Returns:
            Consecutive ``(start, end)`` offsets covering the whole string;
            an empty list for an empty string.
        """
        lSt = len(string)

        # In case of a character in the string not in vocab, add it
        for c in string:
            if c not in self.voc:
                self.voc[c] = 1
                # len() already counts the new entry, so this is the next rank.
                self.voc_rank[c] = len(self.voc) ** self.zipf_factor

        # Strings of length 0 or 1 yield no edges (the full-string span is
        # excluded), so the graph would lack the nodes shortest_path needs.
        if lSt == 0:
            return []
        if lSt == 1:
            return [(0, 1)]

        seg_graph_full = nx.DiGraph()
        seg_graph_full.add_edges_from(self.build_graph_data(string, self.voc))

        # Construct weights
        for _, _, attributes in seg_graph_full.edges(data=True):
            attributes["weight"] = self.voc_rank[attributes["label"]]

        # Find best segmentation out of shortest path
        shortest_path = nx.shortest_path(seg_graph_full, 0, lSt, weight="weight")

        # Pair up consecutive path nodes into (start, end) offsets.
        return subsequences(shortest_path, 2)

    def SequenceSLG_split(self, i: int, normalized_string: "NormalizedString") -> List["NormalizedString"]:
        """
        Split callback handed to ``PreTokenizedString.split``.

        Args:
            i: index of the current split (unused).
            normalized_string: the piece to segment.

        Returns:
            One slice of ``normalized_string`` per computed segment.
        """
        seg_offsets = self.chain2seq(str(normalized_string))
        return [normalized_string[start:end] for start, end in seg_offsets]

    def pre_tokenize(self, pretok: "PreTokenizedString"):
        """Segment ``pretok`` in place using ``SequenceSLG_split``."""
        pretok.split(self.SequenceSLG_split)

In [8]:
class CustomDecoder:
    """Decoder that glues tokens back together with no separator."""

    def decode(self, tokens: List[str]) -> str:
        """Concatenate ``tokens``, in order, into a single string."""
        separator = ""
        return separator.join(tokens)

In [19]:
# This section shows how to attach these custom components to the Tokenizer
def char_class(chars: str) -> str:
    """
    Return a regex character class matching exactly the characters in `chars`.

    Each character is written as a `\\x{HEX}` code-point escape, so characters
    that are special inside a class (`-`, `]`, `[`, `^`, `\\`) need no
    hand-written escaping and cannot form accidental ranges.
    """
    return "[" + "".join(f"\\x{{{ord(c):X}}}" for c in chars) + "]"


tok = Tokenizer(WordLevel(vocab=semiotic.vocab.encode, unk_token="[UNK]"))
tok.normalizer = normalizers.Sequence([
    NFKC(),
    Lowercase(),
    # Strip all whitespace, then all punctuation (ASCII plus the extra set).
    Replace(Regex(char_class(string.whitespace)), ""),
    Replace(Regex(char_class(string.punctuation + punctuation)), ""),
])
tok.pre_tokenizer = PreTokenizer.custom(SequenceSLG(semiotic))
# tok.decoder = Decoder.custom(CustomDecoder())


In [20]:
# Walk one sentence through normalisation and pre-tokenisation.
# To try a held-out corpus sentence instead, use:
#   sample_text = semiotic.corpus.test["text"][2]
sample_text = "I have made my plans: I must stick to them!"
print(sample_text)

# Normaliser output: lower-cased, with whitespace and punctuation removed.
sample_norm = tok.normalizer.normalize_str(sample_text)
print(sample_norm)

# List of (segment, (start, end)) pairs produced by the custom pre-tokenizer.
segments = tok.pre_tokenizer.pre_tokenize_str(sample_norm)
segments

I have made my plans: I must stick to them!
ihavemademyplansimuststicktothem


[('ihave', (0, 5)),
 ('made', (5, 9)),
 ('my', (9, 11)),
 ('plans', (11, 16)),
 ('imust', (16, 21)),
 ('stick', (21, 26)),
 ('tothem', (26, 32))]

In [24]:
# Full round trip: encode a sentence to vocabulary ids, then decode the ids back.
output = tok.encode("Welcome to the 🤗 Tokenizers library.")
print(output.ids)
# Recorded run: [4484, 34, 29996, 22026, 920, 54, 2313]

tok.decode(output.ids)
# Recorded run: 'welcome tothe [UNK] token iz ers library'
# (the emoji has no vocabulary entry and comes back as [UNK])

[4484, 34, 29996, 22026, 920, 54, 2313]


'welcome tothe [UNK] token iz ers library'