In [61]:
import pandas as pd
# from gensim.models import Word2Vec
import tqdm
import ast
import numpy as np
from collections import Counter
from navec import Navec
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, top_k_accuracy_score,\
                            f1_score, precision_score, recall_score, average_precision_score
from joblib import dump
import re

In [2]:
df_train = pd.read_csv('final_markup/train.csv').sample(n=20000, random_state=999)
df_val = pd.read_csv('final_markup/val.csv').sample(n=5000, random_state=999)
df_test = pd.read_csv('final_markup/test.csv').sample(n=5000, random_state=999)

## Логистическая регрессия на эмбеддингах Navec

Эмбеддинги Navec не учитывают никакой контекст, поэтому от решения не ожидается ничего впечатляющего и мы будем его использовать для проверки работы сервиса (tg-бота) и подсчета метрик

In [3]:
# !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

In [4]:
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [6]:
def get_embeds(data):
    '''
    Функция для получения эмбеддингов слов предложений из data.
    Также возвращает таргет для каждого слова
    '''
    data_lst = []

    for i in tqdm.tqdm(data):
        list_of_words = ast.literal_eval(i[1])
        list_of_targets = ast.literal_eval(i[2])
    
        for i in range(len(list_of_targets)):
            try:
                data_lst.append([*navec[list_of_words[i]], list_of_targets[i]])
            except:
                data_lst.append([*navec['<unk>'], list_of_targets[i]])
                
    return pd.DataFrame(data_lst, 
                        columns=[f'embed_{i}' for i in range(300)] + ['target'])
    

In [9]:
%%time

train_embed = get_embeds(df_train.values)
val_embed = get_embeds(df_val.values)
test_embed = get_embeds(df_test.values)

100%|████████████████████████████████████| 20000/20000 [01:06<00:00, 298.83it/s]
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 476.84it/s]
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 457.28it/s]


CPU times: user 3min 50s, sys: 1min 5s, total: 4min 55s
Wall time: 5min 11s


In [10]:
len(train_embed), len(val_embed), len(test_embed)

(838808, 215133, 211519)

In [21]:
target_counts_df = pd.concat([train_embed.target.value_counts(),
                              val_embed.target.value_counts(),
                              test_embed.target.value_counts()], axis=1)

target_counts_df.columns = ['train', 'val', 'test']
target_counts_df

Unnamed: 0,train,val,test
o,711878,182407,179729
",",64023,16800,15944
.,58385,14863,14654
:,2293,531,567
;,1792,411,512
?,196,52,54
!,176,47,48
...,65,22,11


Как видно, очень мало предложений со знаками `! ? ...`. Причиной этого является рандомное сэмплирование из разметки, в которой преобладали текста из Википедии

In [11]:
le = LabelEncoder().fit(train_embed['target'])

model = LogisticRegression(max_iter=10000).fit(train_embed.drop('target', axis=1),
                                 le.transform(train_embed['target']))

In [12]:
y_pred = model.predict(test_embed.drop('target', axis=1))
y_pred_proba = model.predict_proba(test_embed.drop('target', axis=1))
y_true = le.transform(test_embed['target'])

In [88]:
def calc_metrics(y_true, y_pred, y_pred_proba):
    print('Доля пробелов:', (y_true == 7).mean())
    print('Accuracy:', top_k_accuracy_score(y_true, y_pred_proba, k=1))
    print('Top-2 Accuracy:', top_k_accuracy_score(y_true, y_pred_proba, k=2))
    print('ROC-AUC (OVR):',roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))
    print('AUC-PR:',average_precision_score(y_true, y_pred_proba, average='weighted'))
    
    metrics = []
    metrics.append(list(dict(sorted(Counter(y_true).items())).values()))
    metrics.append(f1_score(y_true, y_pred, average=None))
    metrics.append(precision_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(recall_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average=None))
    metrics.append(average_precision_score(y_true, y_pred_proba, average=None))
    metrics_index = ['Count', 'F1-Score', 'Precision', 'Recall', 'ROC-AUC', 'AUC-PR']
    df_metrics = pd.DataFrame(metrics, columns=le.classes_, index=metrics_index)
    
    return df_metrics

In [89]:
calc_metrics(y_true, y_pred, y_pred_proba)

Доля пробелов: 0.8497061729679131
Accuracy: 0.8544527914749975
Top-2 Accuracy: 0.9338877358535167
ROC-AUC (OVR): 0.7555137348366741
AUC-PR: 0.8163815439633352


Unnamed: 0,!,",",.,...,:,;,?,o
Count,48.0,15944.0,14654.0,11.0,567.0,512.0,54.0,179729.0
F1-Score,0.0,0.000125,0.144237,0.0,0.0,0.0,0.0,0.921261
Precision,0.0,0.076923,0.832973,0.0,0.0,0.0,0.0,0.854643
Recall,0.0,6.3e-05,0.078955,0.0,0.0,0.0,0.0,0.999143
ROC-AUC,0.810487,0.679878,0.71423,0.890871,0.709458,0.710344,0.821617,0.707224
AUC-PR,0.005221,0.125714,0.229541,0.000949,0.009757,0.006743,0.00829,0.930859


In [92]:
dump(le, 'le.joblib')
dump(model, 'log_reg.joblib');

## Готовая опенсурс-модель


А почему бы не взять готовую модель и не постараться ее улучшить/превзойти в будущем. Для тестирования была взята [XLM Roberta](https://huggingface.co/1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase). Модель весьма хороша и может быть использована в качестве финальной: умеет в 47 языков, также умеет разделять текст на предложения и делать заглавными нужные буквы.

<img src=https://cdn-uploads.huggingface.co/production/uploads/62d34c813eebd640a4f97587/WJ8aWIM4A--xzYu8FR4ht.png alt="drawing" width="700"/>

Конечно, такую модель потенциально будет нелегко побить. Однако, возможно, это удастся сделать, так как мы будем упираться лишь в один язык, а не в 47. В любом случае посмотрим на метрики, которые показывает модель:

In [95]:
# pip install punctuators

In [140]:
from punctuators.models import PunctCapSegModelONNX

m = PunctCapSegModelONNX.from_pretrained(
    "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
)

input_texts = [
    'привет как дела это новый кадиллак'
]

results = m.infer(
    texts=input_texts, apply_sbd=True,
)

' '.join(results[0])

'Привет, как дела? Это новый кадиллак.'

In [142]:
results

['Привет, как дела? Это новый кадиллак.']

In [150]:
results = m.infer(
    texts=input_texts, apply_sbd=False,
)

results[0]

'Привет, как дела? Это новый кадиллак.'

In [233]:
punctuation_signs = ['!', ',', '.', '...', ':', ';', '?']

def roberta_prediction(text):
    text = re.sub('– ', '', text)
    text = re.sub('— ', '', text)
    text = re.sub('"', '', text)
    text = text.lower()
    text = re.sub('\s+', ' ', text)
    
    for sign in punctuation_signs:
        text = text.replace(sign + ' ', ' ')
        
    if text[-1] in punctuation_signs:
        text = text[:-1]
        
    preds = m.infer(
    texts=[text], apply_sbd=False,
    )
    prediction = preds[0]
    tokens = [token for token in prediction.split(' ') if token != '']
    labels = []
    
    for token in tokens:
        if (len(token) > 3) & (token[-3:] == '...'):
            labels.append('...')
        elif token[-1] in punctuation_signs:
            labels.append(token[-1])
        else:
            labels.append('o')
            
    
    return labels

In [240]:
preds = []
true_labels = []

for id_text in tqdm.tqdm(range(len(df_test.text.values))):
    prediction= roberta_prediction(df_test.text.values[id_text])
    needed_labels = ast.literal_eval(df_test.labels.values[id_text])
    
    # little bug in markup
    if len(prediction) != len(needed_labels):        
        not_empty_token_idxs = ~(np.array(ast.literal_eval(df_test.tokens.values[id_text])) == '')
        needed_labels = np.array(needed_labels)[not_empty_token_idxs].tolist()
        
    if len(needed_labels) == len(prediction):
        true_labels += needed_labels
        preds += prediction

100%|█████████████████████████████████████| 5000/5000 [1:07:30<00:00,  1.23it/s]


In [242]:
len(preds), len(true_labels)

(211107, 211107)

In [244]:
y_pred = le.transform(preds)
y_true = le.transform(true_labels)

In [245]:
def calc_metrics_no_proba(y_true, y_pred):
    print('Доля пробелов:', (y_true == 7).mean())
#     print('Accuracy:', top_k_accuracy_score(y_true, y_pred_proba, k=1))
#     print('Top-2 Accuracy:', top_k_accuracy_score(y_true, y_pred_proba, k=2))
#     rint('ROC-AUC (OVR):',roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))
#     print('AUC-PR:',average_precision_score(y_true, y_pred_proba, average='weighted'))
    
    metrics = []
    metrics.append(list(dict(sorted(Counter(y_true).items())).values()))
    metrics.append(f1_score(y_true, y_pred, average=None))
    metrics.append(precision_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(recall_score(y_true, y_pred, average=None, zero_division=0))
#     metrics.append(roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average=None))
#     metrics.append(average_precision_score(y_true, y_pred_proba, average=None))
    metrics_index = ['Count', 'F1-Score', 'Precision', 'Recall']
#                      'ROC-AUC', 'AUC-PR']
    df_metrics = pd.DataFrame(metrics, columns=le.classes_, index=metrics_index)
    
    return df_metrics

In [246]:
calc_metrics_no_proba(y_true, y_pred)

Доля пробелов: 0.8498486549474911


Unnamed: 0,!,",",.,...,:,;,?,o
Count,48.0,15906.0,14607.0,11.0,560.0,512.0,54.0,179409.0
F1-Score,0.0,0.78524,0.81223,0.392857,0.0,0.0,0.451613,0.982528
Precision,0.0,0.742795,0.833501,0.244444,0.0,0.0,0.4,0.982887
Recall,0.0,0.83283,0.792018,1.0,0.0,0.0,0.518519,0.982169
