# Тестирование Spell Checker с использованием SVM

В этом ноутбуке будет произведено тестировние модели, в которой в candidate scorer используется ranking SVM.

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import gc
import sys
import os
import json
import pickle
import re
from string import punctuation
sys.path.append('..')

import dotenv
import numpy as np
import pandas as pd
from transformers import BertForMaskedLM, BertTokenizer, BertConfig

from deeppavlov.core.data.simple_vocab import SimpleVocabulary

import kenlm
from sacremoses import MosesTokenizer, MosesDetokenizer

from src.models.SpellChecker import *
from src.models.BertScorer.bert_scorer_correction import (
    BertScorerCorrection
)

from sklearn.svm import LinearSVC

from IPython.display import display
from tqdm.notebook import tqdm

In [8]:
PROJECT_PATH = os.path.join(os.path.abspath(''), os.pardir)
DATA_PATH = os.path.join(PROJECT_PATH, 'data')
MODEL_PATH = os.path.join(PROJECT_PATH, 'models')

## Инициализация

Начнем с того, что инициализируем все необходимые компоненты модели. Параллельно так же будет описана роль каждого компонента в системе.

In [9]:
raw_tokenizer = MosesTokenizer(lang='ru')
raw_detokenizer = MosesDetokenizer(lang='ru')
tokenizer = lambda x: raw_tokenizer.tokenize(x, escape=False)
detokenizer = lambda x: raw_detokenizer.detokenize(x)

In [10]:
vocab_path = os.path.join(DATA_PATH, 'external', 'russian_words', 
                          'russian_words_vocab.dict')
vocab = SimpleVocabulary(load_path=vocab_path, save_path=vocab_path)
handcode_table_path = os.path.join(DATA_PATH, 'processed', 'handcode_table', 
                                   'table.json')
with open(handcode_table_path, 'r') as inf:
    handcode_table = json.load(inf)
candidate_generator = CandidateGenerator(
    words=vocab.keys(), handcode_table=handcode_table, max_distance=1
)

2021-01-28 15:47:03.857 INFO in 'deeppavlov.core.data.simple_vocab'['simple_vocab'] at line 115: [loading vocabulary from /home/mrgeekman/Documents/MIPT/НИР/Repo/data/external/russian_words/russian_words_vocab.dict]


In [11]:
model_left_right = kenlm.LanguageModel(
    os.path.join(MODEL_PATH, 'kenlm', 'left_right_3_100.arpa.binary')
)
model_right_left = kenlm.LanguageModel(
    os.path.join(MODEL_PATH, 'kenlm', 'right_left_3_100.arpa.binary')
)
position_selector = KenlmPositionSelector(model_left_right, model_right_left)

In [12]:
BERT_PATH = os.path.join(MODEL_PATH, 'conversational_rubert')
config = BertConfig.from_json_file(
    os.path.join(BERT_PATH, 'bert_config.json')
)
model = BertForMaskedLM.from_pretrained(
    os.path.join(BERT_PATH, 'pytorch_model.bin'),
    config=config
)
bert_tokenizer = BertTokenizer(os.path.join(BERT_PATH, 'vocab.txt'))
bert_scorer_correction = BertScorerCorrection(model, bert_tokenizer)
agg_subtoken_func = np.mean
bert_scorer = BertScorer(
    bert_scorer_correction, agg_subtoken_func
)

with open(os.path.join(DATA_PATH, 'processed', 'scorer_learning', 'svm.bin'), 'rb') as inf:
    svm_model = pickle.load(inf)

svm_scorer = SVMScorer(svm_model, bert_scorer=bert_scorer)
candidate_scorer = CandidateScorer(svm_scorer)

Some weights of the model checkpoint at /home/mrgeekman/Documents/MIPT/НИР/Repo/notebooks/../models/conversational_rubert/pytorch_model.bin were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at /home/mrgeekman/Documents/MIPT/НИР/Repo/notebooks/../models/conversational_rubert/pytorch_model.bin and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a d

In [13]:
stopping_criteria = MarginStoppingCriteria(np.log(2.5))

In [14]:
# максимальное количество итераций
max_it = 5

spellchecker = IterativeSpellChecker(
    candidate_generator,
    position_selector,
    candidate_scorer,
    stopping_criteria,
    tokenizer,
    detokenizer,
    num_selected_candidates=None,
    max_it=max_it
)

## Тестирование

### Валидация

In [15]:
with open(
    os.path.join(DATA_PATH, 'external', 'spell_ru_eval', 'train_source.txt'), 
    'r'
) as inf:
    sentences = inf.readlines()
    
with open(
    os.path.join(DATA_PATH, 'external', 'spell_ru_eval', 
                 'train_corrected.txt'), 
    'r'
) as inf:
    true_sentences = inf.readlines()

Запустим наш spell checker, подавая ему предложения батчами размера `batch_size`.

In [20]:
batch_size = 5
sentences_corrected = []
num_batches = int(np.ceil(len(sentences) // batch_size))

for i in tqdm(range(num_batches)):
    cur_sentences = sentences[i*batch_size:(i+1)*batch_size]
    sentences_corrected += spellchecker(cur_sentences)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




Запишем результаты файл.

In [21]:
!mkdir -p ../data/processed/results_svm/

In [22]:
with open(os.path.join(DATA_PATH, 'processed', 'results_svm', 'validation.txt'), 'w') as ouf:
    ouf.writelines([sentence + '\n' for sentence in sentences_corrected])

Выполним скрит для измерения качества.

In [23]:
!python ../src/evaluation/spell_ru_eval/evaluate.py -d ../data/processed/results_svm/diffs_validation.txt ../data/external/spell_ru_eval/train_source.txt ../data/external/spell_ru_eval/train_corrected.txt ../data/processed/results_svm/validation.txt | tail -n 2

Precision=90.36 Recall=75.45 FMeasure=82.23
1303 1442 1727


Как видим, имеем
* True Positive: $1303$
* Внесенных исправлений: $1442$ 
* Требуемых исправления: $1727$ 
* Precision: $90.36$
* Recall: $75.45$
* FMeasure: $82.23$

### Тест

In [24]:
with open(
    os.path.join(DATA_PATH, 'external', 'spell_ru_eval', 'test_source.txt'), 
    'r'
) as inf:
    sentences = inf.readlines()
    
with open(
    os.path.join(DATA_PATH, 'external', 'spell_ru_eval', 
                 'test_corrected.txt'), 
    'r'
) as inf:
    true_sentences = inf.readlines()

Запустим наш spell checker, подавая ему предложения батчами размера `batch_size`.

In [25]:
batch_size = 5
sentences_corrected = []
num_batches = int(np.ceil(len(sentences) // batch_size))

for i in tqdm(range(num_batches)):
    cur_sentences = sentences[i*batch_size:(i+1)*batch_size]
    sentences_corrected += spellchecker(cur_sentences)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=401.0), HTML(value='')))




Запишем результаты файл.

In [27]:
with open(os.path.join(DATA_PATH, 'processed', 'results_base', 'test.txt'), 'w') as ouf:
    ouf.writelines([sentence + '\n' for sentence in sentences_corrected])

Выполним скрит для измерения качества.

In [28]:
!python ../src/evaluation/spell_ru_eval/evaluate.py -d ../data/processed/results_base/diffs_test.txt ../data/external/spell_ru_eval/test_source.txt ../data/external/spell_ru_eval/test_corrected.txt ../data/processed/results_base/test.txt | tail -n 2

Precision=85.78 Recall=66.38 FMeasure=74.84
1309 1526 1972


Как видим, имеем
* True Positive: $1309$
* Внесенных исправлений: $1526$ 
* Требуемых исправления: $1972$ 
* Precision: $85.78$
* Recall: $66.38$
* FMeasure: $74.84$

Видим, что на тесте качество падает, но это ожидаемо по результатам участников соревнования.

## Выводы

1. Результаты удалось заметно улучшить и достичь практически SOTA.
2. Теперь precision выглядит особенно хорошо и имеет смысл поработать над recall.