In [None]:
import os
import random
import re
from pathlib import Path
from typing import List

import clang
import numpy as np
from clang import *
from clang import cindex
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers import Tokenizer
from tokenizers import normalizers, decoders
from tokenizers import processors, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.normalizers import StripAccents, unicode_normalizer_from_str, Replace
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

import custom

## Prerequisites

In [None]:
## Reproducibility: pin every random source this cell touches to one seed.

# Pool of candidate seeds; only the first entry is selected below.
seedlist = [
    42, 834, 692, 489, 901,
    408, 819, 808, 531, 166,
]
seed = seedlist[0]

# Seed the global generators of the stdlib `random` module and of NumPy.
random.seed(seed)
np.random.seed(seed)

# NOTE(review): the hash seed of the already-running interpreter is fixed at
# start-up, so this presumably only matters for child processes — confirm.
os.environ['PYTHONHASHSEED'] = f"{seed}"

## Load/initialise custom tokenizer

In [None]:
## Tokenizer



class MyTokenizer:
    """Custom pre-tokenizer that splits text into tokens produced by libclang.

    An instance is meant to be handed to ``PreTokenizer.custom(...)``; the
    ``tokenizers`` library then calls :meth:`pre_tokenize` on it.
    """

    # libclang index, created once when the class body is executed and reached
    # through ``self.cidx`` by every instance.
    cidx = cindex.Index.create()

    def clang_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Tokenize one split with clang and return its non-empty token spellings.

        Args:
            i: Index passed in by ``PreTokenizedString.split``; not used here.
            normalized_string: The split to tokenize. Its ``original``
                attribute (not the normalized text) is what gets parsed.

        Returns:
            One freshly constructed ``NormalizedString`` per clang token whose
            stripped spelling is non-empty, in token order.
        """
        # The text is supplied as an in-memory "unsaved file" named tmp.c, so
        # nothing is read from disk for it.
        translation_unit = self.cidx.parse(
            'tmp.c',
            args=[''],
            unsaved_files=[('tmp.c', str(normalized_string.original))],
            options=0,
        )

        # Keywords, punctuation and literals are all passed through unchanged;
        # only tokens that are empty after stripping are dropped.
        spellings = (
            token.spelling.strip()
            for token in translation_unit.get_tokens(extent=translation_unit.cursor.extent)
        )
        return [NormalizedString(text) for text in spellings if text != '']

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place, using :meth:`clang_split` on each piece."""
        pretok.split(self.clang_split)
        

## Init new tokenizers
# Build exactly one BPE-backed tokenizer and keep the unknown token configured.
# A second `Tokenizer(BPE())` assignment used to follow and silently replaced
# this object with one that had no `unk_token`.
my_tokenizer = Tokenizer(BPE(unk_token="<unk>"))


## Load pre-trained tokenizers
#vocab, merges = BPE.read_file(vocab="./tokenizer5/v5_drapgh/drapgh-vocab.json", merges="./tokenizer5/v5_drapgh/drapgh-merges.txt")
#my_tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))

# Normalizer: strip accents, then replace every space with a marker string.
# NOTE(review): the marker below looks like a mis-encoded "Ä" — confirm the
# intended character before changing it; it is kept byte-for-byte here.
my_tokenizer.normalizer = normalizers.Sequence([StripAccents(), Replace(" ", "Ã„")])

# Pre-tokenizer: delegate splitting to the clang-based MyTokenizer.
my_tokenizer.pre_tokenizer = PreTokenizer.custom(MyTokenizer())

# Post-processor: wrap every single sequence as "<s> ... </s>".
# `post_processor` holds a single object, so the `processors.ByteLevel`
# assignment that used to precede this one was overwritten immediately and
# never took effect; it has been removed.
# The ids listed here follow the order of the first five special tokens given
# to the trainer.
my_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[
        ("<s>", 0),
        ("<pad>", 1),
        ("</s>", 2),
        ("<unk>", 3),
        ("<mask>", 4),
    ],
)


## Train tokenizers

In [None]:
# Special tokens handed to the BPE trainer. Order matters: the first five
# entries must keep positions 0-4 to match the ids used by the
# TemplateProcessing post-processor.
# NOTE(review): 'sca' and 'wcslne' look like typos (perhaps 'scanf' / 'wcslen');
# they are kept unchanged so the resulting vocabulary is not altered — confirm.
st = [
    # Control tokens
    '<s>','<pad>','</s>','<unk>','<mask>',

    # Keywords and compiler extensions
    'char','int','switch','case','if','break','for','const','unsigned','struct',
    'default','return','long','goto','this','enum','bool','static','false','true',
    'new','delete','while','double','else','private','do','sizeof','void','continue',
    '__attribute__','short','throw','float','register','__FUNCTION__','static_cast',
    '__func__','class','try','dynamic_cast','template','union','reinterpret_cast',
    'catch','operator','const_cast','using','namespace','typename','wchar_t','not',
    'typeof','__label__','__PRETTY_FUNCTION__','auto','__extension__','volatile',
    '__asm__','__volatile__','extern','asm','signed','typedef','typeid','and','or',
    'public','virtual','nullptr','__restrict','__asm','__typeof__','xor',
    '__complex__','__real__','__imag__','not_eq','export','compl','__alignof__',
    '__restrict__','__cdecl','bitor','protected','explicit','friend','decltype',
    'mutable','inline','__const','__stdcall','char16_t','char32_t','_Decimal64',
    'constexpr','bitand','alignof','static_assert','__attribute','thread_local',
    '__alignof','__builtin_va_arg','_Decimal32',

    # Punctuation and operators
    '\"','(','*',',',')','{',';','->',':','.','-','=','+','<','++','+=','==','||',
    '!=','}','/','!','>=','[',']','&','::','&&','>','#','--','<=','-=','|','%','?',
    '<<','>>','|=','&=','^','~','^=','...','/=','*=','>>=','<<=','%=','##','->*',
    '\\','.*','@',

    # Standard-library identifiers
    '_Exit','abs','acos','acosh','asctime','asin','asinh','assert','at_quick_exit',
    'atan','atan2','atanh','atexit','atof','atol','bsearch','btowc','c16rtomb',
    'c32rtomb','cbrt','ceil','cerr','cin','clearerr','clock','clog','copysign',
    'cos','cosh','cout','ctime','difftime','div','errno','exp','exp2','expm1',
    'fabs','fclose','fdim','feclearexcept','fegetenv','fegetexceptflag',
    'fegetround','feholdexcept','feof','feraiseexcept','ferror','fesetenv',
    'fesetexceptflag','fesetround','fetestexcept','feupdateenv','fflush','fgetc',
    'fgetpos','fgets','fgetwc','fgetws','floor','fma','fmax','fmod','fopen',
    'fprintf','fputc','fputs','fputwc','fputws','fread','free','freopen',
    # 'frexp' was split across two physical lines inside the literal; rejoined.
    'frexp',
    'fscanf','fseek','fsetpos','ftell','fwide','fwprintf','fwrite','fwscanf',
    'getc','getchar','getenv','gets','getwc','getwchar','gmtime','hypot','ilogb',
    'imaxabs','imaxdiv','isblank','iscntrl','isdigit','isgraph','islower',
    'isprint','ispunct','isspace','isupper','iswalnum','iswalpha','iswblank',
    'iswcntrl','iswctype','iswdigit','iswgraph','iswlower','iswprint','iswpunct',
    'iswspace','iswupper','iswxdigit','isxdigit','labs','ldexp','ldiv','llabs',
    'lldiv','llrint','llround','localeconv','localtime','log','log10','log1p',
    'log2','logb','longjmp','lrint','lround','malloc','mblen','mbrlen','mbrtoc16',
    'mbrtoc32','mbrtowc','mbsinit','mbsrtowcs','mbstowcs','mbtowc','memchr',
    'memcmp','memcpy','memmove','memset','mktime','modf','nan','nearbyint',
    'nextafter','nexttoward','perror','pow','printf','putc','putchar','puts',
    'putwchar','qsort','quick_exit','raise','realloc','remainder','remove',
    'remquo','rename','rewind','rint','round','sca','scalbln','scalbn','setbuf',
    'setjmp','setlocale','setvbuf','signal','sin','sinh','snprintf','sprintf',
    'sqrt','srand','sscanf','strcat','strchr','strcmp','strcoll','strcpy',
    'strcspn','strerror','strftime','strlen','strncat','strncmp','strncpy',
    'strpbrk','strrchr','strspn','strstr','strtod','strtoimax','strtok','strtol',
    'strtoll','strtoull','strtoumax','strxfrm','swprintf','swscanf','tan','tanh',
    'time','tmpfile','tmpnam','tolower','toupper','towctrans','towlower',
    'towupper','trunc','ungetc','ungetwc','vfprintf','vfscanf','vfwprintf',
    'vfwscanf','vprintf','vscanf','vsfscanf','vsnprintf','vsprintf','vsscanf',
    'vswprintf','vwprintf','vwscanf','wcerr','wcin','wclog','wcout','wcrtomb',
    'wcscat','wcschr','wcscmp','wcscoll','wcscpy','wcscspn','wcsftime','wcslne',
    'wcsncat','wcsncmp','wcsncpy','wcspbrk','wcsrchr','wcsrtombs','wcsspn',
    'wcsstr','wcstod','wcstof','wcstoimax','wcstok','wcstol','wcstold','wcstoll',
    'wcstombs','wcstoul','wcstoull','wcstoumax','wcsxfrm','wctob','wctomb',
    'wctrans','wctype','wmemchr','wmemcmp','wmemcpy','wmemmove','wmemset',
    'wprintf',
    # 'wscanf' was split across two physical lines inside the literal; rejoined.
    'wscanf',
]

# Train a BPE vocabulary of up to 50000 entries; pairs seen fewer than 2 times
# are not merged. Every entry of `st` is registered as a special token.
trainer = BpeTrainer(vocab_size=50000, min_frequency=2, show_progress=True, special_tokens=st)
my_tokenizer.train(['data/tokenizer/drapgh.txt'], trainer)

# Saving fails if the target folder is missing, so create it first.
os.makedirs("./tokenizer/", exist_ok=True)
my_tokenizer.model.save("./tokenizer/", "drapgh")