# Tokenization, Lemmatization, Stemming

In [1]:
ru_text = """
Привет, меня зовут Тушин Кирилл,
я родился 26.09.1997, закончил МФТИ.
Я пытаюсь освоить курс по анализу текста!
Тестовый телефон: 89876543210
Тестовая почта: myemail@email.com
Тестовая сумма денег: 123456$
😎 🔥 ✌️
"""

en_text = """
Hi, my name is Kirill Tushin,
I was born on 26.09.1997, graduated from MIPT.
I'm trying to master a course on text analysis!
Test phone: 89876543210
Test mail: myemail@email.com
Test amount of money: 123456$
😎  🔥  ✌️
"""

In [2]:
def print_preprocess(tokens, prepared_tokens):
    print(*[
        f'Token: {token:10} Prepared: {prepare:10}'
        for token, prepare in zip(tokens, prepared_tokens)
    ], sep='\n')

## Tokenization, Lemmatization

### Stanza

In [3]:
import stanza

In [4]:
stanza.download('ru')
nlp = stanza.Pipeline('ru')

doc = nlp(ru_text)

tokens = [
    token.text
    for sent in doc.sentences
    for token in sent.words
]

lemmas = [
    token.lemma
    for sent in doc.sentences
    for token in sent.words
]
print_preprocess(tokens, lemmas)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-09-08 15:59:27 INFO: Downloading default packages for language: ru (Russian)...
2021-09-08 15:59:28 INFO: File exists: /Users/kirilltusin/stanza_resources/ru/default.zip.
2021-09-08 15:59:34 INFO: Finished downloading models and saved to /Users/kirilltusin/stanza_resources.
2021-09-08 15:59:34 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2021-09-08 15:59:34 INFO: Use device: cpu
2021-09-08 15:59:34 INFO: Loading: tokenize
2021-09-08 15:59:34 INFO: Loading: pos
2021-09-08 15:59:34 INFO: Loading: lemma
2021-09-08 15:59:34 INFO: Loading: depparse
2021-09-08 15:59:35 INFO: Loading: ner
2021-09-08 15:59:36 INFO: Done loading processors!


Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: я         
Token: зовут      Prepared: звать     
Token: Тушин      Prepared: Тушин     
Token: Кирилл     Prepared: Кирилл    
Token: ,          Prepared: ,         
Token: я          Prepared: я         
Token: родился    Prepared: рождаться 
Token: 26.09.1997 Prepared: 26.09.1997
Token: ,          Prepared: ,         
Token: закончил   Prepared: закончить 
Token: МФТИ       Prepared: МФТИ      
Token: .          Prepared: .         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пытаться  
Token: освоить    Prepared: освоить   
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepared: !         
Token: Тестовый   Prepared: тестовый  
Token: телефон    Prepared: телефон   
Token: :          Prepared: :         
Token: 89876543210 Prepar

In [5]:
stanza.download('en')
nlp = stanza.Pipeline('en')

doc = nlp(en_text)

tokens = [
    token.text
    for sent in doc.sentences
    for token in sent.words
]

lemmas = [
    token.lemma
    for sent in doc.sentences
    for token in sent.words
]

print_preprocess(tokens, lemmas)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-09-08 15:59:37 INFO: Downloading default packages for language: en (English)...
2021-09-08 15:59:39 INFO: File exists: /Users/kirilltusin/stanza_resources/en/default.zip.
2021-09-08 15:59:43 INFO: Finished downloading models and saved to /Users/kirilltusin/stanza_resources.
2021-09-08 15:59:43 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-09-08 15:59:43 INFO: Use device: cpu
2021-09-08 15:59:43 INFO: Loading: tokenize
2021-09-08 15:59:43 INFO: Loading: pos
2021-09-08 15:59:43 INFO: Loading: lemma
2021-09-08 15:59:43 INFO: Loading: depparse
2021-09-08 15:59:44 INFO: Loading: sentiment
2021-09-08 15:59:44 INFO: Loading: ner
2021-09-08 15:59:45 INFO: Done loading processors!
  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


Token: Hi         Prepared: hi        
Token: ,          Prepared: ,         
Token: my         Prepared: my        
Token: name       Prepared: name      
Token: is         Prepared: be        
Token: Kirill     Prepared: Kirill    
Token: Tushin     Prepared: Tushin    
Token: ,          Prepared: ,         
Token: I          Prepared: I         
Token: was        Prepared: be        
Token: born       Prepared: bear      
Token: on         Prepared: on        
Token: 26.09.1997 Prepared: 26.09.1997
Token: ,          Prepared: ,         
Token: graduated  Prepared: graduate  
Token: from       Prepared: from      
Token: MIPT       Prepared: MIPT      
Token: .          Prepared: .         
Token: I          Prepared: I         
Token: 'm         Prepared: be        
Token: trying     Prepared: try       
Token: to         Prepared: to        
Token: master     Prepared: master    
Token: a          Prepared: a         
Token: course     Prepared: course    
Token: on         Prepare

### Spacy

In [6]:
import spacy

In [7]:
# !python -m spacy download en_core_web_sm

nlp = spacy.load('en_core_web_sm')
doc = nlp(en_text)
tokens = [str(token) for token in doc]
lemmas = [token.lemma_ for token in doc]

print_preprocess(tokens, lemmas)

Token: 
          Prepared: 
         
Token: Hi         Prepared: hi        
Token: ,          Prepared: ,         
Token: my         Prepared: my        
Token: name       Prepared: name      
Token: is         Prepared: be        
Token: Kirill     Prepared: Kirill    
Token: Tushin     Prepared: Tushin    
Token: ,          Prepared: ,         
Token: 
          Prepared: 
         
Token: I          Prepared: I         
Token: was        Prepared: be        
Token: born       Prepared: bear      
Token: on         Prepared: on        
Token: 26.09.1997 Prepared: 26.09.1997
Token: ,          Prepared: ,         
Token: graduated  Prepared: graduate  
Token: from       Prepared: from      
Token: MIPT       Prepared: MIPT      
Token: .          Prepared: .         
Token: 
          Prepared: 
         
Token: I          Prepared: I         
Token: 'm         Prepared: be        
Token: trying     Prepared: try       
Token: to         Prepared: to        
Token: master     Prepare

In [8]:
# !python -m spacy download ru_core_news_sm

nlp = spacy.load('ru_core_news_sm')
doc = nlp(ru_text)
tokens = [str(token) for token in doc]
lemmas = [token.lemma_ for token in doc]

print_preprocess(tokens, lemmas)

Token: 
          Prepared: 
         
Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: меня      
Token: зовут      Prepared: звать     
Token: Тушин      Prepared: тушин     
Token: Кирилл     Prepared: кирилл    
Token: ,          Prepared: ,         
Token: 
          Prepared: 
         
Token: я          Prepared: я         
Token: родился    Prepared: родиться  
Token: 26.09.1997 Prepared: 26.09.1997
Token: ,          Prepared: ,         
Token: закончил   Prepared: закончить 
Token: МФТИ       Prepared: мфти      
Token: .          Prepared: .         
Token: 
          Prepared: 
         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пытаться  
Token: освоить    Prepared: освоить   
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepared: !         
Token: 
          Prepare

### TextBlob

In [9]:
from textblob import TextBlob

In [10]:
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {
        'J': 'a', 
        'N': 'n', 
        'V': 'v', 
        'R': 'r',
    }
    tokens_and_tags = [
        (token, tag_dict.get(pos[0], 'n'))
        for token, pos in sent.tags
    ]
    tokens = [token for token, pos in sent.tags]
    lemmas = [token.lemmatize(tag) for token, tag in tokens_and_tags]
    return tokens, lemmas

tokens, lemmas = lemmatize_with_postag(en_text)

print_preprocess(tokens, lemmas)

Token: Hi         Prepared: Hi        
Token: my         Prepared: my        
Token: name       Prepared: name      
Token: is         Prepared: be        
Token: Kirill     Prepared: Kirill    
Token: Tushin     Prepared: Tushin    
Token: I          Prepared: I         
Token: was        Prepared: be        
Token: born       Prepared: bear      
Token: on         Prepared: on        
Token: 26.09.1997 Prepared: 26.09.1997
Token: graduated  Prepared: graduate  
Token: from       Prepared: from      
Token: MIPT       Prepared: MIPT      
Token: I          Prepared: I         
Token: 'm         Prepared: 'm        
Token: trying     Prepared: try       
Token: to         Prepared: to        
Token: master     Prepared: master    
Token: a          Prepared: a         
Token: course     Prepared: course    
Token: on         Prepared: on        
Token: text       Prepared: text      
Token: analysis   Prepared: analysis  
Token: Test       Prepared: Test      
Token: phone      Prepare

### Pymorphy2

In [11]:
import pymorphy2
from pymorphy2.tokenizers import simple_word_tokenize

In [12]:
morph = pymorphy2.MorphAnalyzer(lang='ru')
tokens = simple_word_tokenize(ru_text)
lemmas = [
    morph.parse(token)[0].normal_form
    for token in tokens
]

print_preprocess(tokens, lemmas)

Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: я         
Token: зовут      Prepared: звать     
Token: Тушин      Prepared: тушин     
Token: Кирилл     Prepared: кирилл    
Token: ,          Prepared: ,         
Token: я          Prepared: я         
Token: родился    Prepared: родиться  
Token: 26         Prepared: 26        
Token: .          Prepared: .         
Token: 09         Prepared: 09        
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: закончил   Prepared: закончить 
Token: МФТИ       Prepared: мфть      
Token: .          Prepared: .         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пытаться  
Token: освоить    Prepared: освоить   
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepare

### Pymystem3

In [13]:
from pymystem3 import Mystem

In [14]:
m = Mystem()
tokens = [x['text'] for x in m.analyze(ru_text)][:-1]
lemmas = m.lemmatize(ru_text)[:-1]

print_preprocess(tokens, lemmas)

Token: 
          Prepared: 
         
Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: я         
Token:            Prepared:           
Token: зовут      Prepared: звать     
Token:            Prepared:           
Token: Тушин      Prepared: тушин     
Token:            Prepared:           
Token: Кирилл     Prepared: кирилл    
Token: ,
         Prepared: ,
        
Token: я          Prepared: я         
Token:            Prepared:           
Token: родился    Prepared: рождаться 
Token:            Prepared:           
Token: 26.09      Prepared: 26.09     
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: закончил   Prepared: заканчивать
Token:            Prepared:           
Token: МФТИ       Prepared: мфти      
Token: .          Prepared: .         
Token: 
          Prepared: 
         
Token: Я          Prepared: я         
Token:            Prepar

In [15]:
m = Mystem()
tokens = [x['text'] for x in m.analyze(en_text)][:-1]
lemmas = m.lemmatize(en_text)[:-1]

print_preprocess(tokens, lemmas)

Token: 
          Prepared: 
         
Token: Hi         Prepared: Hi        
Token: ,          Prepared: ,         
Token: my         Prepared: my        
Token:            Prepared:           
Token: name       Prepared: name      
Token:            Prepared:           
Token: is         Prepared: is        
Token:            Prepared:           
Token: Kirill     Prepared: Kirill    
Token:            Prepared:           
Token: Tushin     Prepared: Tushin    
Token: ,
         Prepared: ,
        
Token: I          Prepared: I         
Token:            Prepared:           
Token: was        Prepared: was       
Token:            Prepared:           
Token: born       Prepared: born      
Token:            Prepared:           
Token: on         Prepared: on        
Token:            Prepared:           
Token: 26.09      Prepared: 26.09     
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: graduated  Prepare

### Natasha

In [16]:
from natasha import (
    Doc,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Segmenter,
)

morph_vocab = MorphVocab()
segmenter = Segmenter()
emb = NewsEmbedding()

morph_tagger = NewsMorphTagger(emb)

doc = Doc(ru_text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)

for token in doc.tokens:
    token.lemmatize(morph_vocab)
    
tokens = [token.text for token in doc.tokens]
lemmas = [token.lemma for token in doc.tokens]

print_preprocess(tokens, lemmas)

Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: я         
Token: зовут      Prepared: звать     
Token: Тушин      Prepared: тушин     
Token: Кирилл     Prepared: кирилл    
Token: ,          Prepared: ,         
Token: я          Prepared: я         
Token: родился    Prepared: родиться  
Token: 26.09.1997 Prepared: 26.09.1997
Token: ,          Prepared: ,         
Token: закончил   Prepared: закончить 
Token: МФТИ       Prepared: мфть      
Token: .          Prepared: .         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пытаться  
Token: освоить    Prepared: освоить   
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepared: !         
Token: Тестовый   Prepared: тестовый  
Token: телефон    Prepared: телефон   
Token: :          Prepared: :         
Token: 89876543210 Prepar

## Tokenization, Stemming

### NLTK

In [17]:
import nltk
from nltk import WordPunctTokenizer

In [18]:
stemmer = nltk.stem.SnowballStemmer('english')
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(en_text)
stemms = [stemmer.stem(token) for token in tokens]

print_preprocess(tokens, stemms)

Token: Hi         Prepared: hi        
Token: ,          Prepared: ,         
Token: my         Prepared: my        
Token: name       Prepared: name      
Token: is         Prepared: is        
Token: Kirill     Prepared: kiril     
Token: Tushin     Prepared: tushin    
Token: ,          Prepared: ,         
Token: I          Prepared: i         
Token: was        Prepared: was       
Token: born       Prepared: born      
Token: on         Prepared: on        
Token: 26         Prepared: 26        
Token: .          Prepared: .         
Token: 09         Prepared: 09        
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: graduated  Prepared: graduat   
Token: from       Prepared: from      
Token: MIPT       Prepared: mipt      
Token: .          Prepared: .         
Token: I          Prepared: i         
Token: '          Prepared: '         
Token: m          Prepared: m         
Token: trying     Prepare

In [19]:
stemmer = nltk.stem.SnowballStemmer('russian')
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(ru_text)
stemms = [stemmer.stem(token) for token in tokens]

print_preprocess(tokens, stemms)

Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: мен       
Token: зовут      Prepared: зовут     
Token: Тушин      Prepared: тушин     
Token: Кирилл     Prepared: кирилл    
Token: ,          Prepared: ,         
Token: я          Prepared: я         
Token: родился    Prepared: род       
Token: 26         Prepared: 26        
Token: .          Prepared: .         
Token: 09         Prepared: 09        
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: закончил   Prepared: законч    
Token: МФТИ       Prepared: мфти      
Token: .          Prepared: .         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пыта      
Token: освоить    Prepared: осво      
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepare

In [20]:
stemmer = nltk.stem.snowball.RussianStemmer()
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(ru_text)
stemms = [stemmer.stem(token) for token in tokens]

print_preprocess(tokens, stemms)

Token: Привет     Prepared: привет    
Token: ,          Prepared: ,         
Token: меня       Prepared: мен       
Token: зовут      Prepared: зовут     
Token: Тушин      Prepared: тушин     
Token: Кирилл     Prepared: кирилл    
Token: ,          Prepared: ,         
Token: я          Prepared: я         
Token: родился    Prepared: род       
Token: 26         Prepared: 26        
Token: .          Prepared: .         
Token: 09         Prepared: 09        
Token: .          Prepared: .         
Token: 1997       Prepared: 1997      
Token: ,          Prepared: ,         
Token: закончил   Prepared: законч    
Token: МФТИ       Prepared: мфти      
Token: .          Prepared: .         
Token: Я          Prepared: я         
Token: пытаюсь    Prepared: пыта      
Token: освоить    Prepared: осво      
Token: курс       Prepared: курс      
Token: по         Prepared: по        
Token: анализу    Prepared: анализ    
Token: текста     Prepared: текст     
Token: !          Prepare

# Subword tokenization

In [21]:
def clean_dataset(dataset):
    dataset = [string.strip() for string in dataset]
    dataset = [string for string in dataset if string]
    return dataset


with open('../../data/war_and_peace_ru.txt', 'r') as ru_dataset_file:
    ru_dataset = ru_dataset_file.readlines()
    ru_dataset = clean_dataset(ru_dataset)
    
with open('../../data/the_picture_of_dorian_gray.txt', 'r') as en_dataset_file:
    en_dataset = en_dataset_file.readlines()
    en_dataset = clean_dataset(en_dataset)


full_dataset = ru_dataset + en_dataset

## Custom BPE

### Train

In [22]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer


vocab_size = 50000
dropout = 0.1
max_length = 64


tokenizer = Tokenizer(BPE(dropout=dropout))

tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=['<sos>', '<pad>', '<eos>'],
)

### Train on full dataset
tokenizer.train_from_iterator(full_dataset, trainer)


### Add special tokens like in BERT model
tokenizer.post_processor = BertProcessing(
    ('<eos>', tokenizer.token_to_id('<eos>')),
    ('<sos>', tokenizer.token_to_id('<sos>')),
)

### Enable padding
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id('<pad>'),
    length=max_length,
)

### Enable truncation
tokenizer.enable_truncation(
    max_length=max_length,
)






### Usage

In [23]:
encoding = tokenizer.encode(ru_text)
encoding

Encoding(num_tokens=64, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [24]:
print_preprocess(encoding.tokens, encoding.ids)

Token: <sos>      Prepared:          0
Token: При        Prepared:       1634
Token: вет        Prepared:       2118
Token: ,          Prepared:         10
Token: меня       Prepared:        565
Token: зовут      Prepared:      13097
Token: Тушин      Prepared:       2958
Token: Кирил      Prepared:       6176
Token: л          Prepared:        133
Token: ,          Prepared:         10
Token: я          Prepared:        153
Token: родился    Prepared:      19205
Token: 26         Prepared:      29993
Token: .          Prepared:         12
Token: 0          Prepared:         13
Token: 9          Prepared:         22
Token: .          Prepared:         12
Token: 19         Prepared:       9981
Token: 9          Prepared:         22
Token: 7          Prepared:         20
Token: ,          Prepared:         10
Token: закончил   Prepared:      45128
Token: М          Prepared:        104
Token: Ф          Prepared:        112
Token: Т          Prepared:        110
Token: И          Prepare

In [25]:
encoding = tokenizer.encode(en_text)
encoding

Encoding(num_tokens=64, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [26]:
print_preprocess(encoding.tokens, encoding.ids)

Token: <sos>      Prepared:          0
Token: Hi         Prepared:      21290
Token: ,          Prepared:         10
Token: my         Prepared:        640
Token: name       Prepared:       2598
Token: i          Prepared:         63
Token: s          Prepared:         73
Token: K          Prepared:         36
Token: i          Prepared:         63
Token: rill       Prepared:       5114
Token: T          Prepared:         45
Token: ush        Prepared:       3358
Token: in         Prepared:        178
Token: ,          Prepared:         10
Token: I          Prepared:         34
Token: was        Prepared:        320
Token: bor        Prepared:       2263
Token: n          Prepared:         68
Token: on         Prepared:        204
Token: 26         Prepared:      29993
Token: .          Prepared:         12
Token: 0          Prepared:         13
Token: 9          Prepared:         22
Token: .          Prepared:         12
Token: 1          Prepared:         14
Token: 9          Prepare

## BPE

In [27]:
from tokenizers import CharBPETokenizer

### Train

In [28]:
tokenizer = CharBPETokenizer()
tokenizer.train_from_iterator(full_dataset)






### Usage

In [29]:
encoding = tokenizer.encode(ru_text)
encoding

Encoding(num_tokens=90, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [30]:
print_preprocess(encoding.tokens, encoding.ids)

Token: При        Prepared:       2157
Token: вет</w>    Prepared:       4308
Token: ,</w>      Prepared:        275
Token: меня</w>   Prepared:        768
Token: зовут</w>  Prepared:      14019
Token: Тушин</w>  Prepared:       3356
Token: Кирилл</w> Prepared:      17622
Token: ,</w>      Prepared:        275
Token: я</w>      Prepared:        181
Token: родился</w> Prepared:      20447
Token: 2          Prepared:         13
Token: 6</w>      Prepared:        226
Token: .</w>      Prepared:        259
Token: 0          Prepared:         11
Token: 9</w>      Prepared:        214
Token: .</w>      Prepared:        259
Token: 1          Prepared:         12
Token: 9          Prepared:         20
Token: 9          Prepared:         20
Token: 7</w>      Prepared:        218
Token: ,</w>      Prepared:        275
Token: закон      Prepared:       5056
Token: чил</w>    Prepared:       1854
Token: М          Prepared:        102
Token: Ф          Prepared:        110
Token: Т          Prepar

In [31]:
encoding = tokenizer.encode(en_text)
encoding

Encoding(num_tokens=95, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [32]:
print_preprocess(encoding.tokens, encoding.ids)

Token: H          Prepared:         31
Token: i</w>      Prepared:        204
Token: ,</w>      Prepared:        275
Token: my</w>     Prepared:        975
Token: name</w>   Prepared:       3054
Token: is</w>     Prepared:        361
Token: K          Prepared:         34
Token: i          Prepared:         61
Token: rill</w>   Prepared:       7282
Token: Tu         Prepared:      15354
Token: sh         Prepared:        694
Token: in</w>     Prepared:        401
Token: ,</w>      Prepared:        275
Token: I</w>      Prepared:        220
Token: was</w>    Prepared:        457
Token: born</w>   Prepared:      11489
Token: on</w>     Prepared:        394
Token: 2          Prepared:         13
Token: 6</w>      Prepared:        226
Token: .</w>      Prepared:        259
Token: 0          Prepared:         11
Token: 9</w>      Prepared:        214
Token: .</w>      Prepared:        259
Token: 1          Prepared:         12
Token: 9          Prepared:         20
Token: 9          Prepare

## WordPiece

In [33]:
from tokenizers import BertWordPieceTokenizer

### Train

In [34]:
tokenizer = BertWordPieceTokenizer()
tokenizer.train_from_iterator(full_dataset)






### Usage

In [35]:
encoding = tokenizer.encode(ru_text)
encoding

Encoding(num_tokens=87, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [36]:
print_preprocess(encoding.tokens, encoding.ids)

Token: привет     Prepared:       5011
Token: ,          Prepared:         12
Token: меня       Prepared:        643
Token: зовут      Prepared:      14317
Token: тушин      Prepared:       3168
Token: кирилл     Prepared:      16771
Token: ,          Prepared:         12
Token: я          Prepared:         90
Token: родился    Prepared:      19495
Token: 2          Prepared:         17
Token: ##6        Prepared:        162
Token: .          Prepared:         14
Token: 0          Prepared:         15
Token: ##9        Prepared:        154
Token: .          Prepared:         14
Token: 19         Prepared:      10355
Token: ##9        Prepared:        154
Token: ##7        Prepared:        144
Token: ,          Prepared:         12
Token: закон      Prepared:       6972
Token: ##чил      Prepared:       1727
Token: м          Prepared:         71
Token: ##ф        Prepared:        131
Token: ##ти       Prepared:        259
Token: .          Prepared:         14
Token: я          Prepare

In [37]:
encoding = tokenizer.encode(en_text)
encoding

Encoding(num_tokens=93, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [38]:
print_preprocess(encoding.tokens, encoding.ids)

Token: h          Prepared:         38
Token: ##i        Prepared:        127
Token: ,          Prepared:         12
Token: my         Prepared:        623
Token: name       Prepared:       2863
Token: is         Prepared:        377
Token: k          Prepared:         41
Token: ##ir       Prepared:        412
Token: ##ill      Prepared:        559
Token: tu         Prepared:      21740
Token: ##sh       Prepared:       2317
Token: ##in       Prepared:        191
Token: ,          Prepared:         12
Token: i          Prepared:         39
Token: was        Prepared:        324
Token: born       Prepared:      12439
Token: on         Prepared:        345
Token: 2          Prepared:         17
Token: ##6        Prepared:        162
Token: .          Prepared:         14
Token: 0          Prepared:         15
Token: ##9        Prepared:        154
Token: .          Prepared:         14
Token: 19         Prepared:      10355
Token: ##9        Prepared:        154
Token: ##7        Prepare

## SentencePiece (it used Unigram)

In [39]:
from tokenizers import SentencePieceBPETokenizer

### Train

In [40]:
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(full_dataset)






### Usage

In [41]:
encoding = tokenizer.encode(ru_text)
encoding

Encoding(num_tokens=92, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [42]:
print_preprocess(encoding.tokens, encoding.ids)

Token: ▁При       Prepared:       2048
Token: вет,       Prepared:      10971
Token: ▁меня      Prepared:        863
Token: ▁зовут     Prepared:      26341
Token: ▁Тушин     Prepared:       5909
Token: ▁Кирил     Prepared:       7430
Token: л,         Prepared:       1004
Token: я          Prepared:        151
Token: ▁роди      Prepared:       4325
Token: лся        Prepared:        389
Token: ▁2         Prepared:       2880
Token: 6          Prepared:         17
Token: .          Prepared:         10
Token: 0          Prepared:         11
Token: 9          Prepared:         20
Token: .          Prepared:         10
Token: 1          Prepared:         12
Token: 9          Prepared:         20
Token: 9          Prepared:         20
Token: 7          Prepared:         18
Token: ,          Prepared:          8
Token: ▁закон     Prepared:       9525
Token: чил        Prepared:       1947
Token: ▁М         Prepared:        342
Token: Ф          Prepared:        110
Token: Т          Prepare

In [43]:
encoding = tokenizer.encode(en_text)
encoding

Encoding(num_tokens=97, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [44]:
print_preprocess(encoding.tokens, encoding.ids)

Token: ▁H         Prepared:        354
Token: i,         Prepared:       2652
Token: ▁my        Prepared:        792
Token: ▁name      Prepared:       4297
Token: ▁is        Prepared:        427
Token: ▁K         Prepared:       4068
Token: i          Prepared:         61
Token: rill       Prepared:       8024
Token: ▁T         Prepared:        418
Token: us         Prepared:       1081
Token: h          Prepared:         60
Token: in,        Prepared:       3775
Token: I          Prepared:         32
Token: ▁was       Prepared:        385
Token: ▁born      Prepared:      12169
Token: ▁on        Prepared:        425
Token: ▁2         Prepared:       2880
Token: 6          Prepared:         17
Token: .          Prepared:         10
Token: 0          Prepared:         11
Token: 9          Prepared:         20
Token: .          Prepared:         10
Token: 1          Prepared:         12
Token: 9          Prepared:         20
Token: 9          Prepared:         20
Token: 7          Prepare

## Load some tokenizer

In [45]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [46]:
indexes = tokenizer.encode(ru_text)
tokens = [tokenizer._convert_id_to_token(index) for index in indexes]

print_preprocess(tokens, indexes)

Token: [CLS]      Prepared:        101
Token: привет     Prepared:      26856
Token: ,          Prepared:        128
Token: меня       Prepared:      14198
Token: зовут      Prepared:      39327
Token: туши       Prepared:     115235
Token: ##н        Prepared:        858
Token: кирилл     Prepared:      75305
Token: ,          Prepared:        128
Token: я          Prepared:        877
Token: родился    Prepared:       9551
Token: 26         Prepared:       7085
Token: .          Prepared:        132
Token: 09         Prepared:      11547
Token: .          Prepared:        132
Token: 1997       Prepared:      10900
Token: ,          Prepared:        128
Token: закончил   Prepared:      19305
Token: м          Prepared:        865
Token: ##фт       Prepared:      25037
Token: ##и        Prepared:        852
Token: .          Prepared:        132
Token: я          Prepared:        877
Token: пыта       Prepared:      21666
Token: ##юсь      Prepared:      29627
Token: освоить    Prepare

# Basic CleanUp

## Clean text

In [47]:
import string
from bs4 import BeautifulSoup as BS

def clean_text(
    text,
    strip=True,
    punctuation=True,
    lower=True,
):
    if strip:
        text = text.strip()

    if punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.replace('\n', ' ')
        text = text.replace('\t', ' ')
    
    if lower:
        text = text.lower()
    
    return text


def extract_text_from_xml(xml_text):
    return BS(xml_text, "xml").get_text(separator='\n')


In [48]:
clean_text(ru_text)

'привет меня зовут тушин кирилл я родился 26091997 закончил мфти я пытаюсь освоить курс по анализу текста тестовый телефон 89876543210 тестовая почта myemailemailcom тестовая сумма денег 123456 😎 🔥 ✌️'

In [49]:
clean_text(en_text)

'hi my name is kirill tushin i was born on 26091997 graduated from mipt im trying to master a course on text analysis test phone 89876543210 test mail myemailemailcom test amount of money 123456 😎  🔥  ✌️'

## Replace text with tags

In [50]:
from cleantext import clean

In [51]:
clean(
    text=ru_text,
    to_ascii=False,
    no_urls=True,
    no_emails=True,
    no_phone_numbers=True,
    no_digits=True,
    no_currency_symbols=True,
    no_punct=True,
    no_emoji=True,
)

'привет меня зовут тушин кирилл\nя родился 00000000 закончил мфти\nя пытаюсь освоить курс по анализу текста\nтестовый телефон <phone>\nтестовая почта <email>\nтестовая сумма денег 000000<cur>\n️'

In [52]:
clean(
    text=en_text,
    to_ascii=False,
    no_urls=True,
    no_emails=True,
    no_phone_numbers=True,
    no_digits=True,
    no_currency_symbols=True,
    no_punct=True,
    no_emoji=True,
)

'hi my name is kirill tushin\ni was born on 00000000 graduated from mipt\nim trying to master a course on text analysis\ntest phone <phone>\ntest mail <email>\ntest amount of money 000000<cur>\n️'