## AutoReload `.py` files

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project `.py` modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Определим необходимые константы

In [3]:
import os
import re
import sentencepiece as spm
from common.utils import (
    replace_numbers,
    create_sentencepiece_dataset
)


# Directory layout. Everything is resolved relative to the working directory
# of the kernel, so the notebook must be started from the project root.
CODE_DIR = os.getcwd()
DATA_DIR = os.path.join(CODE_DIR, 'data')
CONLL2003_DIR, WNUT17_DIR = (
    os.path.join(DATA_DIR, dataset_name)
    for dataset_name in ('conll2003', 'wnut17')
)

# Vocabulary sizes substituted into the `--vocab_size` trainer option.
TOKENS_CONLL2003, TOKENS_WNUT17 = 30000, 20000

# Argument template for the SentencePiece trainer; the placeholders
# `input_data`, `prefix` and `vocab_size` are filled in with `str.format`.
TRAIN_SENTENCEPIECE_STRING = ' '.join([
    '--input={input_data}',
    '--model_prefix={prefix}',
    '--vocab_size={vocab_size}',
    '--model_type=bpe',
])

## Заменим все цифры на слово `[NUM]`
*В загруженных на GitHub данных это уже сделано, поэтому запускать этот код необязательно.*

In [3]:
# Dataset splits to process, as (directory, file name) pairs.
number_targets = [
    (CONLL2003_DIR, 'conll2003.train'),
    (CONLL2003_DIR, 'conll2003.test'),
    (CONLL2003_DIR, 'conll2003.valid'),
    (WNUT17_DIR, 'wnut17conll.train'),
    (WNUT17_DIR, 'wnut17conll.test'),
]

# NOTE: every file is overwritten in place with whatever `replace_numbers`
# returns for it (presumably the file text with numbers replaced — see
# common/utils.py), so the original contents are not kept.
for directory, filename in number_targets:
    dataset_path = os.path.join(directory, filename)
    replaced_text = replace_numbers(dataset_path)
    with open(dataset_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(replaced_text)

## Составим датасет для SentencePiece

In [4]:
# Source dataset files, listed per dataset directory (CoNLL-2003 first, then WNUT17).
paths = [
    os.path.join(directory, filename)
    for directory, filenames in (
        (CONLL2003_DIR, ('conll2003.train', 'conll2003.test', 'conll2003.valid')),
        (WNUT17_DIR, ('wnut17conll.train', 'wnut17conll.test')),
    )
    for filename in filenames
]
# Each output file sits next to its source, with an `_sp` suffix appended.
save_paths = [source + '_sp' for source in paths]

# Produce one SentencePiece input file per dataset split.
for source_path, target_path in zip(paths, save_paths):
    create_sentencepiece_dataset(source_path, target_path)

### Теперь объединим `conll2003.train_sp`, `conll2003.test_sp` и `conll2003.valid_sp`

In [5]:
!cp ./data/conll2003/conll2003.train_sp ./data/conll2003/conll2003_sp_dataset.txt

!cat ./data/conll2003/conll2003.test_sp >> ./data/conll2003/conll2003_sp_dataset.txt
!cat ./data/conll2003/conll2003.valid_sp >> ./data/conll2003/conll2003_sp_dataset.txt

!rm ./data/conll2003/conll2003.train_sp ./data/conll2003/conll2003.test_sp ./data/conll2003/conll2003.valid_sp

In [6]:
# Fill in the trainer argument template for the merged CoNLL-2003 corpus.
# The `conll2003_sp` prefix names the model files written by the trainer.
conll2003_train_args = TRAIN_SENTENCEPIECE_STRING.format(
    vocab_size=TOKENS_CONLL2003,
    prefix='conll2003_sp',
    input_data=os.path.join(CONLL2003_DIR, 'conll2003_sp_dataset.txt'),
)
spm.SentencePieceTrainer.Train(conll2003_train_args)

True

In [7]:
# Run the project's apply_sentencepiece.py script on the three CoNLL-2003
# splits with the `conll2003_sp.model` file and the IOB tagging format.
# Judging by the script's log output, each result is saved next to its
# input with a `.spm` suffix.
!python common/apply_sentencepiece.py \
    --dataset_paths ./data/conll2003/conll2003.train ./data/conll2003/conll2003.test ./data/conll2003/conll2003.valid \
    --sentencepiece_model_path conll2003_sp.model \
    --tagging_format IOB

2020-04-25 21:55:47.917 | INFO     | __main__:main:18 - Processing dataset at path: ./data/conll2003/conll2003.train
Processing lines: 28082it [00:02, 13817.41it/s]
2020-04-25 21:55:49.960 | SUCCESS  | __main__:main:27 - Successfully processed dataset and started saving at path: ./data/conll2003/conll2003.train.spm
2020-04-25 21:55:49.976 | SUCCESS  | __main__:main:31 - Saved dataset
2020-04-25 21:55:49.976 | INFO     | __main__:main:18 - Processing dataset at path: ./data/conll2003/conll2003.test
Processing lines: 6500it [00:00, 12878.53it/s]
2020-04-25 21:55:50.484 | SUCCESS  | __main__:main:27 - Successfully processed dataset and started saving at path: ./data/conll2003/conll2003.test.spm
[32m2020

### Теперь объединим `wnut17conll.train_sp` и `wnut17conll.test_sp`

In [8]:
!cp ./data/wnut17/wnut17conll.train_sp ./data/wnut17/wnut17conll_sp_dataset.txt

!cat ./data/wnut17/wnut17conll.test_sp >> ./data/wnut17/wnut17conll_sp_dataset.txt

!rm ./data/wnut17/wnut17conll.train_sp ./data/wnut17/wnut17conll.test_sp

In [9]:
# Fill in the trainer argument template for the merged WNUT17 corpus.
# The `wnut17conll_sp` prefix names the model files written by the trainer.
wnut17_train_args = TRAIN_SENTENCEPIECE_STRING.format(
    vocab_size=TOKENS_WNUT17,
    prefix='wnut17conll_sp',
    input_data=os.path.join(WNUT17_DIR, 'wnut17conll_sp_dataset.txt'),
)
spm.SentencePieceTrainer.Train(wnut17_train_args)

True

In [10]:
# Run the project's apply_sentencepiece.py script on the two WNUT17 splits
# with the `wnut17conll_sp.model` file and the BIO tagging format.
# Judging by the script's log output, each result is saved next to its
# input with a `.spm` suffix.
!python common/apply_sentencepiece.py \
    --dataset_paths ./data/wnut17/wnut17conll.train ./data/wnut17/wnut17conll.test \
    --sentencepiece_model_path wnut17conll_sp.model \
    --tagging_format BIO

2020-04-25 21:55:54.279 | INFO     | __main__:main:18 - Processing dataset at path: ./data/wnut17/wnut17conll.train
Processing lines: 6788it [00:00, 12237.62it/s]
2020-04-25 21:55:54.838 | SUCCESS  | __main__:main:27 - Successfully processed dataset and started saving at path: ./data/wnut17/wnut17conll.train.spm
2020-04-25 21:55:54.840 | SUCCESS  | __main__:main:31 - Saved dataset
2020-04-25 21:55:54.840 | INFO     | __main__:main:18 - Processing dataset at path: ./data/wnut17/wnut17conll.test
Processing lines: 2574it [00:00, 10764.92it/s]
2020-04-25 21:55:55.080 | SUCCESS  | __main__:main:27 - Successfully processed dataset and started saving at path: ./data/wnut17/wnut17conll.test.spm
[32m2020-04-2