# Encode sample and generate pkl files (token dictionaries)

Encode a sample file with all tokenizers at all vocabulary sizes. For each tokenizer, dictionaries are created (one per sequence type) in which the keys are the tokens and the values are the number of occurrences of each token in the encoded sequences of the given file. Each dictionary is saved as a .pkl file in the corresponding tokenizer folder.

In [1]:
import os
import pickle
print(os.getcwd())
%config Completer.use_jedi = False

/home/lieberze/DP/Thesis/tokenizery_2_attempt


In [2]:
import IPython
# print(IPython.sys_info())

## Paths

In [3]:
import os
# Project root; the empty string makes the path below relative to the
# current working directory.
RootFolder = ""
# Build the relative location first, then turn it into an absolute path.
DataFolder = os.path.join(RootFolder, 'data/sample')
DataFolder = os.path.abspath(DataFolder)

## Tokenizers

In [4]:
from tokenizers import CharBPETokenizer
from tokenizers import ByteLevelBPETokenizer

## Functions

In [5]:
def LoadTokenizer(TokenizerPath, Tokenizer):
    """Build a tokenizer from the files stored in ``TokenizerPath``.

    Args:
        TokenizerPath: Folder containing ``vocab.json`` and ``merges.txt``.
        Tokenizer: Callable (in this notebook a tokenizer class) that is
            invoked with the vocabulary path and the merges path, in that
            order, as positional arguments.

    Returns:
        Whatever ``Tokenizer(vocab_path, merges_path)`` returns.
    """
    return Tokenizer(
        f"{TokenizerPath}/vocab.json",
        f"{TokenizerPath}/merges.txt",
    )

In [6]:
def TestTokenizer(Paths, VocabSizes, TestSequence, Tokenizer):
    """Print how every tokenizer encodes ``TestSequence``.

    One tokenizer is loaded from ``<path>/<size>/`` for each combination of
    an entry in ``Paths`` and an entry in ``VocabSizes``. For each of them
    the tokenizer folder, the encoded ids and the encoded tokens are printed.

    Args:
        Paths: Iterable of base folders.
        VocabSizes: Iterable of sub-folder names (vocabulary sizes).
        TestSequence: Sequence passed to the tokenizer's ``encode``.
        Tokenizer: Tokenizer class forwarded to ``LoadTokenizer``.
    """
    for base_folder in Paths:
        for vocab_size in VocabSizes:
            tokenizer_folder = f"{base_folder}/{vocab_size}/"
            loaded = LoadTokenizer(tokenizer_folder, Tokenizer)
            result = loaded.encode(TestSequence)
            print(tokenizer_folder)
            print(result.ids)
            print(result.tokens)

In [7]:
from collections import Counter
def CountTokensAndUpdateDictionary(Dictionary, Encoded):
    """Add the token frequencies of ``Encoded`` to the totals in ``Dictionary``.

    Args:
        Dictionary: Mapping token -> running count; modified in place.
            An existing count is passed through ``int()`` before adding.
        Encoded: Object exposing the encoded tokens as ``.tokens``.

    Returns:
        None. The result is the updated ``Dictionary``.
    """
    for token, occurrences in Counter(Encoded.tokens).items():
        if token not in Dictionary:
            # First time this token is seen: start its total.
            Dictionary[token] = occurrences
            continue
        Dictionary[token] = int(Dictionary[token]) + occurrences

In [8]:
def SortAndPickle(Dictionary, Name, FolderPath):
    """Sort ``Dictionary`` by value (descending) and pickle it to disk.

    The sorted copy is written to ``<FolderPath>/<Name>.pkl``; the
    dictionary passed in is left untouched. Entries with equal values keep
    their original relative order.

    Args:
        Dictionary: Mapping token -> count.
        Name: File name without the ``.pkl`` extension.
        FolderPath: Existing folder the file is written into.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    ordered = dict(
        sorted(Dictionary.items(), key=lambda item: item[1], reverse=True)
    )
    # Use a context manager so the file is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous file object.
    with open(f"{FolderPath}/{Name}.pkl", 'wb') as file_out:
        pickle.dump(ordered, file_out)

In [9]:
def EncodeAndCountTokenOccurences(FolderPath, FilePath, Tokenizer):
    """Encode every sequence in ``FilePath`` and pickle per-type token counts.

    Each non-blank line of the input file is split on whitespace; the first
    field is the sequence type (``exon``, ``intron`` or ``intergenic``) and
    the last field is the sequence itself. Token counts are accumulated
    separately for each type and written to ``exons.pkl``, ``introns.pkl``
    and ``intergenic.pkl`` inside ``FolderPath``.

    Args:
        FolderPath: Folder holding the tokenizer files; also the output folder.
        FilePath: Text file with one labelled sequence per line.
        Tokenizer: Tokenizer class forwarded to ``LoadTokenizer``.
    """
    tokenizer = LoadTokenizer(FolderPath, Tokenizer)
    # Label used in the input file -> name of the pickle file it ends up in.
    output_names = {"exon": "exons", "intron": "introns", "intergenic": "intergenic"}
    counts = {label: {} for label in output_names}

    with open(FilePath, "r") as file_in:
        for Line in file_in:
            fields = Line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) used to raise IndexError.
                continue
            seq_type, seq = fields[0], fields[-1]
            if seq_type not in counts:
                # Unknown labels are ignored, as before; skip the pointless encode.
                continue
            CountTokensAndUpdateDictionary(counts[seq_type], tokenizer.encode(seq))

    # The input file is already closed here; only the results are written.
    for label, token_counts in counts.items():
        SortAndPickle(token_counts, output_names[label], FolderPath)

## Apply

#### Paths

In [10]:
import os
# Project root; the empty string makes all paths relative to the current
# working directory.
RootFolder = ""
FolderName = "All_genomes_sample"

# Sample file whose sequences are encoded.
DataFolder = os.path.join(RootFolder, 'data/sample/Encoding')
DataFolder = os.path.abspath(DataFolder)
FileToEncode = os.path.abspath(os.path.join(DataFolder, 'All_equal_shuffled_100k.txt'))

# CharBPE tokenizer folders.
name = "01_CharBPE"
CharBPE = os.path.abspath(os.path.join(RootFolder, f'{name}/'))
All_1000_BPE, All_512_BPE = (
    os.path.abspath(os.path.join(CharBPE, f'{FolderName}/{Variant}/'))
    for Variant in ("All_1000", "All_512")
)

# ByteLevelBPE tokenizer folders.
name = "02_ByteLevelBPE"
ByteLevelBPE = os.path.abspath(os.path.join(RootFolder, f'{name}/'))
All_1000_BLBPE, All_512_BLBPE = (
    os.path.abspath(os.path.join(ByteLevelBPE, f'{FolderName}/{Variant}/'))
    for Variant in ("All_1000", "All_512")
)

### All genomes 41.6 MB sample (all equal)

#### Encoding test

BPE

In [11]:
# Paths = [All_1000_BPE, All_512_BPE]
# VocabSizes = [5000, 15000, 50000]
# Tokenizer = CharBPETokenizer
# TestSequence = "GCGTGATTACGAGTCGTGGCAAATTTGGTCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTCTCGGGTGCCTCTA"

# TestTokenizer(Paths, VocabSizes, TestSequence, Tokenizer)

BLBPE

In [12]:
# Paths = [All_1000_BLBPE, All_512_BLBPE]
# VocabSizes = [5000, 15000, 50000]
# Tokenizer = ByteLevelBPETokenizer
# TestSequence = "GCGTGATTACGAGTCGTGGCAAATTTGGTCTGGCTGTGGTCTAGACATTCCAGGCGGTGCGTCTGCTCTCGGGTGCCTCTA"

# TestTokenizer(Paths, VocabSizes, TestSequence, Tokenizer)

#### Encode

##### CharBPE

In [13]:
# Count token occurrences in the sample file for every CharBPE tokenizer:
# one run per (base folder, vocabulary size) combination.
Paths = [All_1000_BPE, All_512_BPE]
VocabSizes = [5000, 15000, 50000]
Tokenizer = CharBPETokenizer
TestFile = FileToEncode

for Path in Paths:
    for Size in VocabSizes:
        # The tokenizer files are read from this folder and the resulting
        # .pkl files are written into it.
        TokenizerPath = f"{Path}/{Size}/"
        EncodeAndCountTokenOccurences(TokenizerPath, TestFile, Tokenizer)

##### ByteLevelBPE

In [14]:
# Count token occurrences in the sample file for every ByteLevelBPE tokenizer:
# one run per (base folder, vocabulary size) combination.
Paths = [All_1000_BLBPE, All_512_BLBPE]
VocabSizes = [5000, 15000, 50000]
Tokenizer = ByteLevelBPETokenizer
TestFile = FileToEncode

for Path in Paths:
    for Size in VocabSizes:
        # The tokenizer files are read from this folder and the resulting
        # .pkl files are written into it.
        TokenizerPath = f"{Path}/{Size}/"
        # Progress output: shows which tokenizer folder is being processed.
        print(TokenizerPath)
        EncodeAndCountTokenOccurences(TokenizerPath, TestFile, Tokenizer)

/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_1000/5000/
/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_1000/15000/
/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_1000/50000/
/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_512/5000/
/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_512/15000/
/home/lieberze/DP/Thesis/tokenizery_2_attempt/02_ByteLevelBPE/All_genomes_sample/All_512/50000/


### \<unk> token detection

In [53]:
def CheckPattern(path, pattern):
    """Report whether ``pattern`` is a key of the pickled dictionary at ``path``.

    Args:
        path: Path to a ``.pkl`` file containing a token -> count dictionary.
        pattern: Token to look for (exact key match, not a substring search).

    Returns:
        1 if ``pattern`` is one of the keys, otherwise 0.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on pickle files produced by a trusted source (e.g. this notebook).
    # The context manager closes the file handle, which was previously leaked.
    with open(path, 'rb') as file_in:
        token_counts = pickle.load(file_in)
    return int(pattern in token_counts)

In [58]:
# Check every pickled token dictionary (all tokenizer folders, vocabulary
# sizes and sequence types) for the presence of one specific token.
Paths = [All_1000_BPE, All_512_BPE, All_1000_BLBPE, All_512_BLBPE]
VocabSizes = [5000, 15000, 50000]
SeqTypes = ["exons", "introns", "intergenic"]

token_to_check_the_presence_of = "<unk>"

# One 0/1 flag per checked file, in iteration order.
UNK = []
for Path in Paths:
    for Size in VocabSizes:
        for Seq in SeqTypes:
            Pth = f"{Path}/{Size}/{Seq}.pkl"
            PatternPresent = CheckPattern(Pth, token_to_check_the_presence_of)
            UNK.append(PatternPresent)
            
# A single 1 in the list means at least one file contains the token.
if 1 in UNK:
    print(f"there is some file which has the {token_to_check_the_presence_of} token")
else:
    print(f"there is NO file which has the {token_to_check_the_presence_of} token")

there is NO file which has the <unk> token
