In [2]:
import pandas as pd
import tqdm
import ast
from collections import Counter
from navec import Navec
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer

from sklearn.metrics import confusion_matrix, roc_auc_score, top_k_accuracy_score,\
                            f1_score, precision_score, recall_score, average_precision_score
from joblib import dump
import re
import numpy as np

In [3]:
path = '/home/niknikiforov/RuPunctNet/new_books_prepared.csv'

df = pd.read_csv(path, index_col=0)
df['tokens'] = df.tokens.apply(ast.literal_eval)
df['labels'] = df.labels.apply(ast.literal_eval)
df = df[df.tokens.apply(len) < 200]
df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=999)
df_val, df_test  = train_test_split(df_val_test, test_size=0.5, random_state=999)

df_test

Unnamed: 0,text,tokens,clear_punct_lower,labels
1341,Все это смог я различить лишь смутно и с трудо...,"[все, это, смог, я, различить, лишь, смутно, и...",все это смог я различить лишь смутно и с трудо...,"[o, o, o, o, o, o, o, o, o, ., o, o, o, o, o, ..."
25027,"Она поехала в игрушечную лавку, накупила игруш...","[она, поехала, в, игрушечную, лавку, накупила,...",она поехала в игрушечную лавку накупила игруше...,"[o, o, o, o, ,, o, o, o, o, o, ., o, o, o, ,, ..."
2585,Наконец настало утро четырнадцатого числа. пог...,"[наконец, настало, утро, четырнадцатого, числа...",наконец настало утро четырнадцатого числа пого...,"[o, o, o, o, ., o, o, o, o, o, o, o, ,, o, o, ..."
16829,"Хорошо. А почему прежде, бывало, с восьми часо...","[хорошо, а, почему, прежде, бывало, с, восьми,...",хорошо а почему прежде бывало с восьми часов в...,"[., o, o, ,, ,, o, o, o, o, o, o, o, ,, o, o, ..."
7937,"Говоря это, графиня оглянулась на дочь. Наташа...","[говоря, это, графиня, оглянулась, на, дочь, н...",говоря это графиня оглянулась на дочь наташа л...,"[o, ,, o, o, o, ., o, ,, o, o, o, o, o, o, o, ..."
...,...,...,...,...
13908,Разве на одну секунду... Я пришел за советом. ...,"[разве, на, одну, секунду, я, пришел, за, сове...",разве на одну секунду я пришел за советом я ко...,"[o, o, o, ..., o, o, o, ., ,, ,, o, o, o, ,, ,..."
21490,"План был очень хорош, но дело заключалось в то...","[план, был, очень, хорош, но, дело, заключалос...",план был очень хорош но дело заключалось в том...,"[o, o, o, ,, o, o, o, o, ,, o, o, o, o, o, o, ..."
2567,"Сохраняя, поелику возможно, равновесие, чтобы ...","[сохраняя, поелику, возможно, равновесие, чтоб...",сохраняя поелику возможно равновесие чтобы хор...,"[,, o, ,, ,, o, o, o, ,, o, o, ,, o, o, o, o, ..."
25405,"Было ли в лице Левина что-нибудь особенное, ил...","[было, ли, в, лице, левина, чтонибудь, особенн...",было ли в лице левина чтонибудь особенное или ...,"[o, o, o, o, o, o, ,, o, o, o, ,, o, o, o, o, ..."


## Простые правила

In [13]:
import requests
from bs4 import BeautifulSoup

target_url = 'https://russkiiyazyk.ru/sintaksis/spisok-vvodnyh-slov.html'
resp = requests.get(target_url)
print(resp)

soup = BeautifulSoup(resp.text, 'html.parser')
print(soup.title)

<Response [200]>
<title>Список вводных слов русского языка</title>


In [14]:
all_comma_words_comb = soup.find_all("p")
commas_words_dict = {'smth':0}

for i in range(5,len(all_comma_words_comb)):
    if str(all_comma_words_comb[i])[:11] != '<p><strong>':
        for word_comb in all_comma_words_comb[i].text.replace(u'\xa0', u' ').split('\n'):
            try: 
                if word_comb[0] != '*' or word_comb[-1] != '*' or word_comb[0] != '\t' or word_comb[0] != "Δ" or word_comb != 'Поиск' or  word_comb[0] != '2':
                    commas_words_dict[word_comb]  = 1
            except:
                continue

check = list(commas_words_dict.keys())[-1]

while check != 'ясное дело':
    commas_words_dict.pop(check)
    check = list(commas_words_dict.keys())[-1]

commas_words_dict.pop('smth')

before_commas_words_dict = {}
before_commas_words_dict['а'] = 1
before_commas_words_dict['что'] = 1
before_commas_words_dict['когда'] = 1
before_commas_words_dict['который'] = 1
before_commas_words_dict['которая'] = 1
before_commas_words_dict['которое'] = 1
before_commas_words_dict['которые'] = 1
before_commas_words_dict['но'] = 1
before_commas_words_dict['ущ'] = 1
before_commas_words_dict['их'] = 1
before_commas_words_dict['ющ'] = 1
before_commas_words_dict['ую'] = 1
before_commas_words_dict['ащ'] = 1
before_commas_words_dict['ом'] = 1
before_commas_words_dict['ем'] = 1
before_commas_words_dict['вш'] = 1


In [36]:
import ast 
from scipy.stats import bernoulli
import random

class dumb_model():
    def __init__(self, data, type) -> list:
        self.data = data.text.apply(lambda x: x.split())
        self.type = type

        if type == 'vera': 
            punct_dict = {
                'total_words' : 0,
                ',' : 0,
                ':' : 0,
                ';' : 0,
                '.' : 0,
                '!' : 0,
                '?' : 0,
                '...' : 0
            }
            enctryption = {1:':',2:';',3:".",4:"!",5:"?",6:'...'}

            for lst in self.data:
                for ind in range(len(lst)):
                    if lst[ind][-1] in punct_dict:
                        punct_dict[lst[ind][-1]] += 1
                    punct_dict['total_words'] += 1 
            self.punct_dict = punct_dict   
            self.enctryption = enctryption
        
    def transform(self, string):
        if self.type == 'dict':
            res = ['']
            for i in range(len(string)):
                if commas_words_dict.get(string[i], 0) == 1 and i == 0:
                    res.append(string[i] + ",")
                elif commas_words_dict.get(string[i], 0) == 1 and i != 0:
                    previous = res.pop()
                    res.append(previous+",")
                    res.append(string[i]+",")
                else:
                    if before_commas_words_dict.get(string[i], 0) == 1 and i != 0:
                        previous = res.pop()
                        res.append(previous+",")
                        res.append(string[i])
                    elif len(string[i]) > 2:
                        if before_commas_words_dict.get(string[i][-2:], 0) == 1 and string[i][-2:] != string[i]:
                            previous = res.pop()
                            res.append(previous+",")
                            res.append(string[i])
                        elif string[i] == 'г':
                            res.append(string[i]+".")
                        else:
                            res.append(string[i])
                    else:
                        res.append(string[i])
            res = " ".join(res)
            if res[-1] in ',;:':
                return res[1:-1]+'.'
            else:
                return res[1:] +'.'
        
        elif self.type == 'vera':
            res = ''
            for i in range(len(string)):
                flag_dict = {',':0, '.':0, ':':0, ';':0, '?':0, '!':0, 'before_,':0}
                if commas_words_dict.get(string[i], 0) == 1 and i == 0:
                    flag_dict[','] += 1
                elif commas_words_dict.get(string[i], 0) == 1 and i != 0:
                    flag_dict['before_,'] += 1
                    flag_dict[','] += 1
                else:
                    if before_commas_words_dict.get(string[i], 0) == 1 and i != 0:
                        flag_dict['before_,'] += 1
                    elif len(string[i]) > 2:
                        if before_commas_words_dict.get(string[i][-2:], 0) == 1 and string[i][-2:] != string[i]:
                            flag_dict['before_,'] += 1
                        if string[i] == 'г':
                            flag_dict['.'] += 1
                right = ''
                left = ' '

                if bernoulli.rvs(size=1,p=(sum(self.punct_dict.values()) - self.punct_dict['total_words'])/self.punct_dict['total_words'])[0] == 1 or sum(flag_dict.values()) > 1:
                    if  flag_dict['before_,'] > 0:
                        left = ', '
                    if bernoulli.rvs(size=1,p=self.punct_dict[',']/self.punct_dict['total_words'])[0] == 1 or flag_dict[','] > 0:
                        right = ','
                    else:
                        mark = self.enctryption.get(random.choices(list(self.enctryption.keys()), weights=list(self.punct_dict.values())[2:], k=1)[0], '')
                        if mark == '.' or flag_dict['.'] > 0:
                            right = '.'
                        else:
                            right = mark
                res += left + string[i] + right
            if res[0] == ' ':
                res = res[1:]
            if res[-1] in ',:;':
                return res[:-1] + self.enctryption.get(random.choices(list(self.enctryption.keys())[2:], weights=list(self.punct_dict.values())[4:], k=1)[0], '.')
            else:
                return res + '.'

In [37]:
def get_data_for_metrics(df_train, df_test):
    model_pr = dumb_model(df_train, type = 'vera')
    model_dc = dumb_model(df_train, type = 'dict')

    df_test['model_probs_pred'] = df_test.tokens.apply(lambda x: model_pr.transform(x))
    df_test['model_dict_pred'] = df_test.tokens.apply(lambda x: model_dc.transform(x))

    enctryption = {0:",",1:':',2:';',3:".",4:"!",5:"?",6:'...'}
    back_enctryption = {",":0,':':1,';':2,".":3,"!":4,"?":5,'...':6,}
    
    def transform_to_labels(data, back_enctryption):
        output = []
        for sentence in data:
            for word in sentence.split():
                if word != "":
                    if word[-1] in back_enctryption:
                        output.append(back_enctryption[word[-1]])
                    else:
                        output.append(7)
        return np.array(output)
    
    
    return transform_to_labels(df_test.text, back_enctryption), transform_to_labels(df_test['model_probs_pred'], back_enctryption), transform_to_labels(df_test['model_dict_pred'], back_enctryption)
            
    

In [43]:
y_true, y_pred_prob, y_pred = get_data_for_metrics(df_train, df_test)

In [68]:
def calc_metrics(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    print('Доля пробелов:', (y_true == 7).mean())
    metrics = []
    enctryption = {0:",",1:':',2:';',3:".",4:"!",5:"?",6:'...', 7:'o'}
    sorted_dict = {enctryption[i[0]] : i[1] for i in sorted(Counter(y_true).items())}
    
    metrics.append(list(dict(sorted(Counter(y_true).items())).values()))
    metrics.append(f1_score(y_true, y_pred, average=None))
    metrics.append(precision_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(recall_score(y_true, y_pred, average=None, zero_division=0))
    metrics_index = ['Count', 'F1-Score', 'Precision', 'Recall']
    df_metrics = pd.DataFrame(metrics, columns=sorted_dict.keys(), index=metrics_index)
    
    return df_metrics

In [69]:
# Via dictionary aproach

calc_metrics(y_true, y_pred)

Доля пробелов: 0.7982334371883065


Unnamed: 0,",",:,.,!,?,o
Count,13004.0,214.0,5814.0,213.0,310.0,77515.0
F1-Score,0.12452,0.0,0.037944,0.0,0.0,0.825903
Precision,0.132125,0.0,0.062645,0.0,0.0,0.79854
Recall,0.117743,0.0,0.027213,0.0,0.0,0.855209


In [70]:
# Via probability aproach

calc_metrics(y_true, y_pred_prob)

Доля пробелов: 0.7982334371883065


Unnamed: 0,",",:,.,!,?,o
Count,13004.0,214.0,5814.0,213.0,310.0,77515.0
F1-Score,0.088669,0.009434,0.089153,0.002469,0.00197,0.766169
Precision,0.125424,0.007109,0.060398,0.001678,0.00141,0.797963
Recall,0.068574,0.014019,0.170166,0.004673,0.003268,0.736812


## Xlm-roberta_punctuation


In [5]:
from punctuators.models import PunctCapSegModelONNX

m = PunctCapSegModelONNX.from_pretrained(
    "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
)

input_texts = [
    'привет как дела это новый кадиллак'
]

results = m.infer(
    texts=input_texts, apply_sbd=True,
)

' '.join(results[0])

sp.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/531 [00:00<?, ?B/s]

'Привет, как дела? Это новый кадиллак.'

In [6]:
punctuation_signs = ['!', ',', '.', '...', ':', ';', '?']

def roberta_prediction(text):
    text = re.sub('– ', '', text)
    text = re.sub('— ', '', text)
    text = re.sub('"', '', text)
    text = text.lower()
    text = re.sub('\s+', ' ', text)

    for sign in punctuation_signs:
        text = text.replace(sign + ' ', ' ')
        
    if text[-1] in punctuation_signs:
        text = text[:-1]
        
    preds = m.infer(
    texts=[text], apply_sbd=False,
    )
    prediction = preds[0]
    tokens = [token for token in prediction.split(' ') if token != '']
    labels = []
    
    for token in tokens:
        if (len(token) > 3) & (token[-3:] == '...'):
            labels.append('...')
        elif token[-1] in punctuation_signs:
            labels.append(token[-1])
        else:
            labels.append('o')
            
    
    return labels

In [25]:
preds = []
true_labels = []

for id_text in tqdm.tqdm(range(len(df_test.text.values))):
    if len(df_test.text.values[id_text]) < 10:
        continue
        
    prediction= roberta_prediction(df_test.text.values[id_text])
    needed_labels = df_test.labels.values[id_text]
    
    if len(prediction) != len(needed_labels):        
        not_empty_token_idxs = ~(np.array(df_test.tokens.values[id_text]) == '')
        needed_labels = np.array(needed_labels)[not_empty_token_idxs].tolist()
        
    if len(needed_labels) == len(prediction):
        true_labels += needed_labels
        preds += prediction

100%|███████████████████████████████████████████████████████████████████████████████| 2586/2586 [47:50<00:00,  1.11s/it]


In [32]:
le = LabelEncoder().fit(true_labels)

In [37]:
y_pred = le.transform(preds)
y_true = le.transform(true_labels)

In [45]:
def calc_metrics_no_proba(y_true, y_pred):
    print('Доля пробелов:', (y_true == 6).mean())
    
    metrics = []
    metrics.append(list(dict(sorted(Counter(y_true).items())).values()))
    metrics.append(f1_score(y_true, y_pred, average=None))
    metrics.append(precision_score(y_true, y_pred, average=None, zero_division=0))
    metrics.append(recall_score(y_true, y_pred, average=None, zero_division=0))
    metrics_index = ['Count', 'F1-Score', 'Precision', 'Recall']
    df_metrics = pd.DataFrame(metrics, columns=le.classes_, index=metrics_index)
    
    return df_metrics

In [46]:
calc_metrics_no_proba(y_true, y_pred)

Доля пробелов: 0.7975428559669088


Unnamed: 0,!,",",.,...,:,?,o
Count,205.0,13002.0,5794.0,164.0,214.0,297.0,77510.0
F1-Score,0.0,0.812895,0.727008,0.783883,0.0,0.616372,0.976453
Precision,0.0,0.791148,0.72078,0.981651,0.0,0.588957,0.978929
Recall,0.0,0.835871,0.733345,0.652439,0.0,0.646465,0.97399
