In [1]:
import tokenizers
from tokenizers import Tokenizer
from tokenizers.normalizers import StripAccents
from tokenizers.pre_tokenizers import Sequence, WhitespaceSplit, Punctuation
from tokenizers.processors import TemplateProcessing, ByteLevel
from tokenizers.trainers import WordLevelTrainer, BpeTrainer, WordPieceTrainer
from tokenizers.models import WordLevel, BPE, WordPiece
import time
import pandas as pd
import numpy as np

In [2]:
# restrict to a certain size, use nRows = None to read all data
nrows = 1000

# read train data set
df_train = pd.read_csv('data/train_data.txt', sep=':::',\
                       engine='python', header=None, nrows=nrows)
# rename columns
df_train.rename(columns={0:'id', 1:'title', 2:'genre', 3:'description'}, inplace=True)
# make everything lower case and remove trailing whitespaces
df_train['description'] = df_train['description'].apply(lambda x: x.lower().strip())
df_train['genre'] = df_train['genre'].apply(lambda x: x.lower().strip())

# read test data set
df_test = pd.read_csv('data/test_data_solution.txt', sep=':::',\
                       engine='python', header=None, nrows=nrows)
# rename columns
df_test.rename(columns={0:'id', 1:'title', 2:'genre', 3:'description'}, inplace=True)
# make everything lower case and remove trailing whitespaces
df_test['description'] = df_test['description'].apply(lambda x: x.lower().strip())
df_test['genre'] = df_test['genre'].apply(lambda x: x.lower().strip())

In [3]:
tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
# train tokenizer
trainer = WordLevelTrainer(special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]'],\
                          vocab_size=10000)
tokenizer.normalizer = StripAccents() # add method for stripping accents
tokenizer.pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation(behavior='removed')]) 
#tokenizer.post_processor = TemplateProcessing(single='[CLS] $0 [SEP]',\
#                                              special_tokens=[('[CLS]',1),('[SEP]',2)])
tokenizer.train_from_iterator(df_train['description'], trainer=trainer)

In [4]:
vocabSize = tokenizer.get_vocab_size()
print('size of vocabulary: {}'.format(vocabSize))
for i in range(10):
    print('vocabulary id: {0}, word: {1}'.format(i, tokenizer.id_to_token(i)))
    j = vocabSize-i-1
    print('vocabulary id: {0}, word: {1}'.format(j, tokenizer.id_to_token(j)))
    
display(df_test.loc[0, 'description'])
out=tokenizer.encode(df_test.loc[0,'description'])
print(out.ids)
print(out.tokens)
display(tokenizer.decode(out.ids))

outFull=tokenizer.encode_batch(df_test['description'])
ntokens = 0
nunk = 0
for encoded in outFull:
    ntokens += len(encoded.ids) 
    nunk += len(encoded.ids) - np.count_nonzero(encoded.ids)
print('ratio of unknown tokens: {0:.4f}'.format(nunk/ntokens))

print('total number of tokens: {}'.format(ntokens))

size of vocabulary: 10000
vocabulary id: 0, word: [UNK]
vocabulary id: 9999, word: gigs
vocabulary id: 1, word: [CLS]
vocabulary id: 9998, word: gibbs
vocabulary id: 2, word: [SEP]
vocabulary id: 9997, word: gibbons
vocabulary id: 3, word: [PAD]
vocabulary id: 9996, word: giants
vocabulary id: 4, word: the
vocabulary id: 9995, word: giantlands
vocabulary id: 5, word: and
vocabulary id: 9994, word: gianni
vocabulary id: 6, word: a
vocabulary id: 9993, word: giaguara
vocabulary id: 7, word: of
vocabulary id: 9992, word: ghraib
vocabulary id: 8, word: to
vocabulary id: 9991, word: ghoulish
vocabulary id: 9, word: in
vocabulary id: 9990, word: ghettos


"l.r. brane loves his life - his car, his apartment, his job, but especially his girlfriend, vespa. one day while showering, vespa runs out of shampoo. l.r. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. when he returns, vespa is gone and every trace of her existence has been wiped out. l.r.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding vespa."

[857, 1151, 0, 488, 11, 34, 11, 353, 11, 517, 11, 233, 26, 784, 11, 357, 0, 36, 70, 97, 0, 0, 1450, 45, 7, 0, 857, 1151, 1450, 267, 4, 646, 8, 6, 5255, 1947, 8, 2970, 82, 60, 6, 1906, 435, 7, 103, 60, 109, 6, 312, 1278, 35, 14, 451, 0, 10, 1392, 5, 212, 0, 7, 15, 786, 31, 92, 0, 45, 857, 1151, 12, 34, 158, 6, 6968, 786, 19, 36, 645, 621, 51, 198, 3315, 8, 0, 9, 11, 505, 17, 6, 3836, 10, 341, 182, 11, 2549, 0]
['l', 'r', '[UNK]', 'loves', 'his', 'life', 'his', 'car', 'his', 'apartment', 'his', 'job', 'but', 'especially', 'his', 'girlfriend', '[UNK]', 'one', 'day', 'while', '[UNK]', '[UNK]', 'runs', 'out', 'of', '[UNK]', 'l', 'r', 'runs', 'across', 'the', 'street', 'to', 'a', 'convenience', 'store', 'to', 'buy', 'some', 'more', 'a', 'quick', 'trip', 'of', 'no', 'more', 'than', 'a', 'few', 'minutes', 'when', 'he', 'returns', '[UNK]', 'is', 'gone', 'and', 'every', '[UNK]', 'of', 'her', 'existence', 'has', 'been', '[UNK]', 'out', 'l', 'r', 's', 'life', 'becomes', 'a', 'tortured', 'existence

'l r loves his life his car his apartment his job but especially his girlfriend one day while runs out of l r runs across the street to a convenience store to buy some more a quick trip of no more than a few minutes when he returns is gone and every of her existence has been out l r s life becomes a tortured existence as one strange event after another occurs to in his mind that a conspiracy is working against his finding'

ratio of unknown tokens: 0.1245
total number of tokens: 102719


In [5]:
tokenizerBPE = Tokenizer(BPE(unk_token='[UNK]',dropout=None))
# train tokenizer
trainer = BpeTrainer(special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]'],\
                     continuing_subword_prefix='##',\
                     vocab_size=10000)
tokenizerBPE.normalizer = StripAccents() # add method for stripping accents
tokenizerBPE.pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation(behavior='removed')]) 
tokenizerBPE.post_processor = ByteLevel(trim_offsets=True)
tokenizerBPE.train_from_iterator(df_train['description'], trainer=trainer)

In [6]:
vocabSize = tokenizerBPE.get_vocab_size()
print('size of vocabulary: {}'.format(vocabSize))
for i in range(10):
    print('vocabulary id: {0}, word: {1}'.format(i, tokenizerBPE.id_to_token(i)))
    j = vocabSize-i-1
    print('vocabulary id: {0}, word: {1}'.format(j, tokenizerBPE.id_to_token(j)))
    
display(df_test.loc[0, 'description'])
out=tokenizerBPE.encode(df_test.loc[0,'description'])
print(out.ids)
print(out.tokens)
display(tokenizerBPE.decode(out.ids))

outFull=tokenizerBPE.encode_batch(df_test['description'])
ntokens = 0
nunk = 0
for encoded in outFull:
    ntokens += len(encoded.ids) 
    nunk += len(encoded.ids) - np.count_nonzero(encoded.ids)
print('ratio of unknown tokens: {0:.4f}'.format(nunk/ntokens))
print('total number of tokens: {}'.format(ntokens))

size of vocabulary: 10000
vocabulary id: 0, word: [UNK]
vocabulary id: 9999, word: marivela
vocabulary id: 1, word: [CLS]
vocabulary id: 9998, word: marijuana
vocabulary id: 2, word: [SEP]
vocabulary id: 9997, word: marginal
vocabulary id: 3, word: [PAD]
vocabulary id: 9996, word: marisa
vocabulary id: 4, word: 0
vocabulary id: 9995, word: mario
vocabulary id: 5, word: 1
vocabulary id: 9994, word: marks
vocabulary id: 6, word: 2
vocabulary id: 9993, word: marine
vocabulary id: 7, word: 3
vocabulary id: 9992, word: mari
vocabulary id: 8, word: 4
vocabulary id: 9991, word: whenever
vocabulary id: 9, word: 5
vocabulary id: 9990, word: allog


"l.r. brane loves his life - his car, his apartment, his job, but especially his girlfriend, vespa. one day while showering, vespa runs out of shampoo. l.r. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. when he returns, vespa is gone and every trace of her existence has been wiped out. l.r.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding vespa."

[25, 31, 268, 1112, 1829, 156, 271, 156, 385, 156, 1964, 156, 980, 237, 2555, 156, 1456, 35, 127, 7396, 298, 395, 587, 419, 1246, 35, 127, 7396, 3638, 299, 138, 231, 543, 2593, 25, 31, 3638, 1209, 123, 2112, 136, 14, 8021, 740, 4227, 136, 4890, 466, 437, 14, 1831, 1405, 138, 583, 437, 603, 14, 1286, 3348, 294, 169, 1730, 35, 127, 7396, 151, 3460, 135, 496, 203, 361, 138, 184, 2538, 255, 574, 36, 283, 140, 299, 25, 31, 32, 271, 806, 14, 3058, 1511, 2538, 189, 298, 2195, 1345, 373, 989, 6727, 136, 6625, 82, 139, 156, 1432, 195, 14, 8719, 151, 1410, 910, 156, 5478, 35, 127, 7396]
['l', 'r', 'br', '##ane', 'loves', 'his', 'life', 'his', 'car', 'his', 'apartment', 'his', 'job', 'but', 'especially', 'his', 'girlfriend', 'v', '##es', '##pa', 'one', 'day', 'while', 'show', '##ering', 'v', '##es', '##pa', 'runs', 'out', 'of', 'sh', '##amp', '##oo', 'l', 'r', 'runs', 'across', 'the', 'street', 'to', 'a', 'conven', '##ience', 'store', 'to', 'buy', 'some', 'more', 'a', 'quick', 'trip', 'of', 'no',

'l r br ##ane loves his life his car his apartment his job but especially his girlfriend v ##es ##pa one day while show ##ering v ##es ##pa runs out of sh ##amp ##oo l r runs across the street to a conven ##ience store to buy some more a quick trip of no more than a few minutes when he returns v ##es ##pa is gone and every tr ##ace of her existence has been w ##ip ##ed out l r s life becomes a tort ##ured existence as one strange event after another occurs to confir ##m in his mind that a conspiracy is working against his finding v ##es ##pa'

ratio of unknown tokens: 0.0002
total number of tokens: 127002


In [7]:
tokenizerWP = Tokenizer(WordPiece(unk_token='[UNK]'))
# train tokenizer
trainer = WordPieceTrainer(special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]'],\
                           continuing_subword_prefix='##',\
                           vocab_size=10000)
tokenizerWP.normalizer = StripAccents() # add method for stripping accents
tokenizerWP.pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation(behavior='removed')]) 
tokenizerWP.post_processor = TemplateProcessing(single='[CLS] $0 [SEP]',\
                                              special_tokens=[('[CLS]',1),('[SEP]',2)])
tokenizerWP.train_from_iterator(df_train['description'], trainer=trainer,)

In [8]:
vocabSize = tokenizerWP.get_vocab_size()
print('size of vocabulary: {}'.format(vocabSize))
for i in range(10):
    print('vocabulary id: {0}, word: {1}'.format(i, tokenizerWP.id_to_token(i)))
    j = vocabSize-i-1
    print('vocabulary id: {0}, word: {1}'.format(j, tokenizerWP.id_to_token(j)))
    
display(df_test.loc[0, 'description'])
out=tokenizerWP.encode(df_test.loc[0,'description'])
print(out.ids)
print(out.tokens)
display(tokenizerWP.decode(out.ids))

outFull=tokenizerWP.encode_batch(df_test['description'])
ntokens = 0
nunk = 0
for encoded in outFull:
    ntokens += len(encoded.ids) 
    nunk += len(encoded.ids) - np.count_nonzero(encoded.ids)
print('ratio of unknown tokens: {0:.4f}'.format(nunk/ntokens))
print('total number of tokens: {}'.format(ntokens))

size of vocabulary: 10000
vocabulary id: 0, word: [UNK]
vocabulary id: 9999, word: marivela
vocabulary id: 1, word: [CLS]
vocabulary id: 9998, word: marijuana
vocabulary id: 2, word: [SEP]
vocabulary id: 9997, word: marginal
vocabulary id: 3, word: [PAD]
vocabulary id: 9996, word: marisa
vocabulary id: 4, word: 0
vocabulary id: 9995, word: mario
vocabulary id: 5, word: 1
vocabulary id: 9994, word: marks
vocabulary id: 6, word: 2
vocabulary id: 9993, word: marine
vocabulary id: 7, word: 3
vocabulary id: 9992, word: mari
vocabulary id: 8, word: 4
vocabulary id: 9991, word: whenever
vocabulary id: 9, word: 5
vocabulary id: 9990, word: allog


"l.r. brane loves his life - his car, his apartment, his job, but especially his girlfriend, vespa. one day while showering, vespa runs out of shampoo. l.r. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. when he returns, vespa is gone and every trace of her existence has been wiped out. l.r.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding vespa."

[1, 25, 31, 268, 1112, 1829, 156, 271, 156, 385, 156, 1964, 156, 980, 237, 2555, 156, 1456, 1656, 76, 7316, 298, 395, 587, 419, 1246, 1656, 76, 7316, 3638, 299, 138, 231, 543, 2589, 25, 31, 3638, 1209, 123, 2112, 136, 14, 8021, 740, 4227, 136, 4889, 466, 437, 14, 1831, 1405, 138, 583, 437, 603, 14, 1286, 3349, 294, 169, 1729, 1656, 76, 7316, 151, 3461, 135, 496, 1762, 207, 138, 184, 2538, 255, 574, 7307, 68, 299, 25, 31, 32, 271, 806, 14, 8495, 68, 2538, 189, 298, 2195, 1345, 373, 989, 6727, 136, 6625, 70, 139, 156, 1432, 195, 14, 8720, 151, 1410, 910, 156, 5478, 1656, 76, 7316, 2]
['[CLS]', 'l', 'r', 'br', '##ane', 'loves', 'his', 'life', 'his', 'car', 'his', 'apartment', 'his', 'job', 'but', 'especially', 'his', 'girlfriend', 've', '##s', '##pa', 'one', 'day', 'while', 'show', '##ering', 've', '##s', '##pa', 'runs', 'out', 'of', 'sh', '##amp', '##oo', 'l', 'r', 'runs', 'across', 'the', 'street', 'to', 'a', 'conven', '##ience', 'store', 'to', 'buy', 'some', 'more', 'a', 'quick', 'trip

'l r br ##ane loves his life his car his apartment his job but especially his girlfriend ve ##s ##pa one day while show ##ering ve ##s ##pa runs out of sh ##amp ##oo l r runs across the street to a conven ##ience store to buy some more a quick trip of no more than a few minutes when he returns ve ##s ##pa is gone and every tra ##ce of her existence has been wipe ##d out l r s life becomes a torture ##d existence as one strange event after another occurs to confir ##m in his mind that a conspiracy is working against his finding ve ##s ##pa'

ratio of unknown tokens: 0.0002
total number of tokens: 128626
