In [1]:
import pandas as pd
# from gensim.models import Word2Vec
import tqdm
import ast
from navec import Navec
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load the pre-split train / test markup tables in one pass.
# presumably column 1 holds a stringified token list and column 2 the matching
# punctuation labels (that is how get_embeds reads the rows) — verify.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('final_markup/train.csv', 'final_markup/test.csv')
)

In [3]:
# !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

In [4]:
# Local path of the Navec embedding archive (the commented-out wget cell above
# shows where it can be downloaded from).
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# Load the embeddings; later cells look words up with `navec[word]` and fall
# back to `navec['<unk>']` for unknown words.
navec = Navec.load(path)

In [5]:
def get_embeds(data):
    """Turn rows of tokenised text into a per-token embedding table.

    Parameters
    ----------
    data : iterable of row-like sequences
        For every row, ``row[1]`` must be the string repr of a list of words
        and ``row[2]`` the string repr of a list of per-word targets (both are
        parsed with ``ast.literal_eval``).

    Returns
    -------
    pandas.DataFrame
        One row per target, with columns ``embed_0 .. embed_{d-1}`` followed by
        ``target``, where ``d`` is the length of the ``<unk>`` vector.

    Notes
    -----
    Reads the module-level ``navec`` embeddings. A word that is missing from
    the vocabulary, or a target that has no word at the same position, gets the
    ``<unk>`` vector. Any other error is propagated instead of being silently
    swallowed.
    """
    # Look the fallback vector up once; it also defines the embedding width.
    unk_vector = navec['<unk>']
    rows = []

    for record in tqdm.tqdm(data):
        words = ast.literal_eval(record[1])
        targets = ast.literal_eval(record[2])

        for position, target in enumerate(targets):
            try:
                vector = navec[words[position]]
            except (KeyError, IndexError):
                # KeyError: out-of-vocabulary word.
                # IndexError: fewer words than targets in this row.
                vector = unk_vector
            rows.append([*vector, target])

    columns = [f'embed_{k}' for k in range(len(unk_vector))] + ['target']
    return pd.DataFrame(rows, columns=columns)
    

In [6]:
%%time

# Embed a fixed-size random subsample of each split; random_state is pinned so
# the same rows are drawn on every run. Each sampled row expands into one
# embedding row per token (see get_embeds).
train_embed = get_embeds(df_train.sample(n=20000, random_state=999).values)
test_embed = get_embeds(df_test.sample(n=5000, random_state=999).values)

100%|████████████████████████████████████| 20000/20000 [00:41<00:00, 483.93it/s]
100%|██████████████████████████████████████| 5000/5000 [00:07<00:00, 640.93it/s]


CPU times: user 2min 47s, sys: 53.9 s, total: 3min 41s
Wall time: 4min 32s


In [7]:
# Number of token-level rows obtained from the sampled train / test texts.
len(train_embed), len(test_embed)

(838808, 211519)

In [8]:
# Encode the punctuation labels as integers; the encoder is fitted on the
# training targets only.
le = LabelEncoder().fit(train_embed['target'])

# Multinomial classifier over single-token embeddings (no neighbouring-word
# context is part of the features).
# NOTE(review): the confusion matrix further down shows almost every token
# being predicted as 'o'; class weighting (e.g. class_weight='balanced') may be
# worth trying — confirm against the metrics before changing.
model = LogisticRegression(max_iter=10000).fit(train_embed.drop('target', axis=1),
                                 le.transform(train_embed['target']))

In [9]:
# Feature matrix for the test sample: every column except the label.
test_features = test_embed.drop('target', axis=1)

# Hard predictions, per-class probabilities and encoded ground truth.
y_pred = model.predict(test_features)
y_pred_proba = model.predict_proba(test_features)
y_true = le.transform(test_embed['target'])

In [10]:
from sklearn.metrics import confusion_matrix, roc_auc_score, top_k_accuracy_score

# One-vs-rest ROC AUC computed from the predicted class probabilities.
print(roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))
# Top-1 accuracy, i.e. the share of tokens whose most probable class is correct.
print(top_k_accuracy_score(y_true, y_pred_proba, k=1))
# Rows are true classes and columns are predicted classes, in encoded-label order.
confusion_matrix(y_true, y_pred)

0.7555137348366741
0.8544527914749975


array([[     0,      0,      1,      0,      0,      0,      0,     47],
       [     0,      1,     74,      0,      0,      0,      0,  15869],
       [     0,      1,   1157,      0,      0,      0,      0,  13496],
       [     0,      0,      0,      0,      0,      0,      0,     11],
       [     0,      0,      4,      0,      0,      0,      0,    563],
       [     0,      1,      9,      0,      0,      0,      0,    502],
       [     0,      0,      0,      0,      0,      0,      0,     54],
       [     0,     10,    144,      0,      0,      0,      0, 179575]])

In [11]:
# Label order used by the encoder: position i is the label encoded as integer i.
le.classes_

array(['!', ',', '.', '...', ':', ';', '?', 'o'], dtype=object)

In [12]:
# Relative frequency of each label in the test sample (majority-class baseline).
test_embed['target'].value_counts(normalize=True)

target
o      0.849706
,      0.075379
.      0.069280
:      0.002681
;      0.002421
?      0.000255
!      0.000227
...    0.000052
Name: proportion, dtype: float64

In [83]:
from joblib import dump

# Persist the fitted label encoder, the classifier and the loaded embeddings
# to the working directory so they can be reloaded without retraining.
dump(le, 'le.joblib')
dump(model, 'log_reg.joblib')
# Trailing ';' suppresses the returned file list in the cell output.
dump(navec, 'navec.joblib');

In [71]:
import re
def preprocess_input(line):
    """Restore punctuation and sentence capitalisation in a raw text line.

    The line is normalised (dialogue dashes and double quotes removed,
    lower-cased, all remaining punctuation stripped), split into tokens, each
    token is embedded and classified, and the predicted punctuation mark is
    appended to the token. The first token of every sentence is capitalised
    and the result always ends with a sentence-final mark.

    Parameters
    ----------
    line : str
        Raw input text.

    Returns
    -------
    str
        The re-punctuated text, or ``''`` when the input contains no tokens.

    Notes
    -----
    Reads the module-level ``navec`` (embeddings), ``model`` (classifier) and
    ``le`` (label encoder). Tokens missing from the vocabulary are embedded
    with the ``<unk>`` vector.
    """
    sentence_end_marks = ('?', '...', '.', '!')

    # Message normalisation. The dash followed by a space is removed as a unit
    # before the generic punctuation strip.
    line = line.replace('– ', '').replace('— ', '').replace('"', '')
    line = line.lower()
    # Raw string: '\w' and '\s' are regex classes, not string escapes.
    line = re.sub(r'[^\w\s]', '', line)

    # Tokenisation; str.split() already collapses runs of whitespace.
    tokens = line.split()
    if not tokens:
        # Nothing to classify, and the model cannot predict on zero rows.
        return ''

    # Token -> embedding, with a fallback for out-of-vocabulary words only.
    vectors = []
    for token in tokens:
        try:
            vectors.append(navec[token])
        except KeyError:
            vectors.append(navec['<unk>'])

    # Column names follow the same embed_<k> pattern as the training frame.
    features = pd.DataFrame(
        vectors,
        columns=[f'embed_{k}' for k in range(len(vectors[0]))],
    )

    # Model predictions, decoded back to punctuation labels ('o' = no mark).
    marks = le.inverse_transform(model.predict(features))

    pieces = []
    capitalize_next = True
    last_index = len(tokens) - 1

    for index, (token, mark) in enumerate(zip(tokens, marks)):
        word = token[:1].upper() + token[1:] if capitalize_next else token

        ends_sentence = mark in sentence_end_marks
        if index == last_index and not ends_sentence:
            # The text must end with a closing mark; this replaces 'o' as well
            # as a dangling ',', ':' or ';' (which used to yield e.g. ',.').
            mark = '.'

        if mark != 'o':
            word += mark

        pieces.append(word)
        capitalize_next = ends_sentence

    return ' '.join(pieces)


In [84]:
pip install punctuators

Collecting punctuators
  Downloading punctuators-0.0.5-py3-none-any.whl.metadata (322 bytes)
Collecting onnxruntime (from punctuators)
  Downloading onnxruntime-1.16.3-cp39-cp39-macosx_10_15_x86_64.whl.metadata (4.3 kB)
Collecting omegaconf (from punctuators)
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting antlr4-python3-runtime==4.9.* (from omegaconf->punctuators)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting coloredlogs (from onnxruntime->punctuators)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m701.6 kB/s[0m et

In [85]:
from punctuators.models import PunctCapSegModelONNX

# Pretrained punctuation / capitalisation / segmentation model; the weights
# are downloaded on first use (see the download progress in the output).
m = PunctCapSegModelONNX.from_pretrained(
    "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
)

# Unpunctuated, lower-cased sample input.
input_texts = [
    'привет как дела это новый кадиллак'
]

# With apply_sbd=True each input text comes back as a list of sentences
# (see the displayed `results` further down).
results = m.infer(
    texts=input_texts, apply_sbd=True,
)

# Re-join the sentences of the first input into a single string.
' '.join(results[0])

Downloading sp.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading config.yaml:   0%|          | 0.00/531 [00:00<?, ?B/s]

Input: привет как дела это новый кадиллак
Outputs:
	Привет, как дела?
	Это новый кадиллак.



In [91]:
# Sentences of the first input text joined back into one string.
' '.join(results[0])

'Привет, как дела? Это новый кадиллак.'

In [89]:
# Raw model output: one list of sentences per input text.
results

[['Привет, как дела?', 'Это новый кадиллак.']]