In [1]:
from sklearn.metrics import mean_absolute_error

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! pip install ruts

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ruts
  Downloading ruts-0.8.1-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.2/51.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy<2.0.0,>=1.23.0 (from ruts)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy2<0.10.0,>=0.9.1 (from ruts)
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting razdel<0.6.0,>=0.5.0 (from ruts)
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2<0.10.0,>=0.9.1->ruts)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)


In [4]:
import pandas as pd
import numpy as np
import json
from ruts import BasicStats
import pickle

In [5]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [6]:
! mkdir accents
! unzip drive/MyDrive/all_accents.zip -d accents

Archive:  drive/MyDrive/all_accents.zip
  inflating: accents/all_accents.tsv  


In [7]:
df = pd.read_csv("accents/all_accents.tsv", sep="\t", header=None, names=["value", "acc"])

In [8]:
df = df[:1680442]
df

Unnamed: 0,value,acc
0,-де,-д^е
1,-ка,-к^а
2,-либо,-л^ибо
3,-нибудь,-ниб^удь
4,-с,-с
...,...,...
1680437,ящурок,^ящурок
1680438,ящуром,^ящуром
1680439,ящуру,^ящуру
1680440,яэль,я^эль


In [9]:
def unknown_symbs(word):
    symbols = [chr(i) for i in range (ord('А'), ord('А') + 64)]
    for letter in word:
        if letter not in symbols:
            return False
    return True

In [10]:
df['is_symbs'] = df['value'].apply(unknown_symbs)

In [11]:
df = df[df['is_symbs'] == True]
df

Unnamed: 0,value,acc,is_symbs
8,а,^а,True
14,аав,^аав,True
15,аава,^аава,True
16,аавам,^аавам,True
17,аавами,^аавами,True
...,...,...,...
1680437,ящурок,^ящурок,True
1680438,ящуром,^ящуром,True
1680439,ящуру,^ящуру,True
1680440,яэль,я^эль,True


In [12]:
def acc_to_num(word):
    i = word.find('^')
    if i == -1:
        return 0
    word = word[:i]
    if word == "":
        return 1
    try:
        bs = BasicStats(word)
    except:
        return -1
    return bs.get_stats()['n_syllables'] + 1

In [13]:
df['target'] = df['acc'].apply(acc_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['acc'].apply(acc_to_num)


In [14]:
df = df[df['target'] != 0]

In [15]:
df.groupby('target').count()

Unnamed: 0_level_0,value,acc,is_symbs
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,232598,232598,232598
2,561349,561349,561349
3,583207,583207,583207
4,207344,207344,207344
5,62698,62698,62698
6,16395,16395,16395
7,3764,3764,3764
8,756,756,756
9,111,111,111
10,18,18,18


In [25]:
df[df['target'] > 10]

Unnamed: 0,value,acc,is_symbs,target,goodValue,syl
625000,лланвайрпуллгуингиллгогерихуирндробуллллантиси...,лланвайрпуллгуингиллгогерихуирндробуллллантиси...,True,21,лланвайрпуллгуингиллгогерихуирндробуллллантиси...,21
637135,магнэзиоалюминокатофорит,магнэзиоалюминокатоф^орит,True,11,магнэзиоалюминокатоф[о]рит,12
790569,никотинамидадениндинуклеотид,никотинамидадениндинуклеот^ид,True,13,никотинамидадениндинуклеот[и]д,13
790570,никотинамидадениндинуклеотида,никотинамидадениндинуклеот^ида,True,13,никотинамидадениндинуклеот[и]да,14
790571,никотинамидадениндинуклеотидфосфат,никотинамидадениндинуклеотидфосф^ат,True,15,никотинамидадениндинуклеотидфосф[а]т,15
1540989,ультравысокотемпературнообработанный,ультравысокотемпературнообраб^отанный,True,13,ультравысокотемпературнообраб[о]танный,15


# Обучение модели на датасете русских ударений

In [None]:
import string

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

def process_text(text):
    if text == "":
        text = "empty"
    return [word for word in word_tokenize(text.lower()) if word not in string.punctuation]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
#model.cuda()  # uncomment it if you have a GPU

def embed_bert_cls(text):
    global model
    global tokenizer
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

print(embed_bert_cls('привет мир').shape)
# (312,)


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(312,)


In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, random_state=1412) # <- для локального тестирования

In [None]:
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [None]:
df_trainX = df_train['value']
df_trainy = df_train['target']
df_testX = df_test['value']
df_testy = df_test['target']

In [None]:
len(df_trainX), len(df_testX)

(1251184, 417062)

In [None]:
from tqdm import tqdm
mod_X = []
for i in tqdm(range(49781, len(df_trainX)//15)):
    mod_X.append(embed_bert_cls(df_trainX[i]))
    with open("drive/MyDrive/train_acc2.pkl", "wb") as f:
        pickle.dump(mod_X, f)

100%|██████████| 33631/33631 [2:43:30<00:00,  3.43it/s]


In [None]:
with open('drive/MyDrive/train_acc.pkl', 'rb') as f:
    modX1 = pickle.load(f)
with open('drive/MyDrive/train_acc2.pkl', 'rb') as f:
    modX2 = pickle.load(f)
with open('drive/MyDrive/test_acc.pkl', 'rb') as f:
    mod_test_X = pickle.load(f)
modX = modX1[:49781] + modX2
len(modX), len(df_trainX)//15

(83412, 83412)

In [None]:
! pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [None]:
from tqdm import tqdm
mod_test_X = []
for i in tqdm(range(len(df_testX)//50)):
    mod_test_X.append(embed_bert_cls(df_testX[i]))
    with open("drive/MyDrive/test_acc.pkl", "wb") as f:
        pickle.dump(mod_test_X, f)

100%|██████████| 8341/8341 [08:36<00:00, 16.15it/s]


In [None]:
from catboost import CatBoostRegressor
regressor = CatBoostRegressor(task_type="GPU",
                           devices='0:1',
                           learning_rate=0.05,
                           depth=8,
                           num_trees=6000)
regressor.fit(modX, df_trainy[:83412])
train_pred = (regressor.predict(modX)).round()
print(train_pred)
print(mean_absolute_error(df_trainy[:83412], train_pred))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1002:	learn: 0.6066529	total: 18.3s	remaining: 1m 31s
1003:	learn: 0.6065131	total: 18.4s	remaining: 1m 31s
1004:	learn: 0.6063574	total: 18.4s	remaining: 1m 31s
1005:	learn: 0.6062406	total: 18.4s	remaining: 1m 31s
1006:	learn: 0.6061027	total: 18.4s	remaining: 1m 31s
1007:	learn: 0.6059167	total: 18.4s	remaining: 1m 31s
1008:	learn: 0.6057841	total: 18.4s	remaining: 1m 31s
1009:	learn: 0.6056246	total: 18.5s	remaining: 1m 31s
1010:	learn: 0.6055116	total: 18.5s	remaining: 1m 31s
1011:	learn: 0.6053699	total: 18.5s	remaining: 1m 31s
1012:	learn: 0.6052527	total: 18.5s	remaining: 1m 31s
1013:	learn: 0.6051447	total: 18.5s	remaining: 1m 31s
1014:	learn: 0.6049935	total: 18.5s	remaining: 1m 30s
1015:	learn: 0.6048879	total: 18.5s	remaining: 1m 30s
1016:	learn: 0.6047660	total: 18.6s	remaining: 1m 30s
1017:	learn: 0.6045788	total: 18.6s	remaining: 1m 30s
1018:	learn: 0.6044301	total: 18.6s	remaining: 1m 30s
1019:	learn: 0.60

In [None]:
0.002541600728911907
0.004795473073418693
0.041468853402388144
0.0
0.041468853402388144
0.009674866925622213
0.08791301011844818
0.3131324030115571
0.08570709250467559
1.1988682683546732e-05
0.001342732460557234
0.0
0.207152448089004
0.002697453603798015
0.05896034143768283
0.17216947201841462

In [None]:
test_pred = (regressor.predict(mod_test_X)).round()
print(test_pred)
print(mean_absolute_error(df_testy[:8341], test_pred))

[1. 2. 3. ... 4. 2. 2.]
0.4660112696319386


In [None]:
0.6005275146864885
0.5285936938017024
0.5387843184270471
0.511569356192303
0.595612036926028
0.524037885145666
0.5285936938017024
0.5746313391679655
0.5116892458937777
0.5086920033569117
0.5026975182831794
0.5182831794748831
0.48735163649442514
0.4845941733605083
0.48387483515166047
0.48351516604723654
0.48411461455460975
0.4839947248531351
0.48387483515166047
0.48351516604723654
0.5295528114134996
0.4703272988850258
0.5101306797746074
0.47440354873516366
0.4703272988850258
0.4747632178395876
0.46804939455700756
0.4660112696319386
0.477640570674979

In [None]:
test_pred, df_testy[:8341]

(array([2., 2., 3., ..., 5., 2., 2.]),
 0       1
 1       1
 2       3
 3       6
 4       3
        ..
 8336    3
 8337    2
 8338    5
 8339    2
 8340    1
 Name: target, Length: 8341, dtype: int64)

In [None]:
dtest = pd.DataFrame(df_testX[:8341])
dtest['target'] = df_testy[:8341]
dtest['prediction'] = test_pred
dtest[dtest['target'] != dtest['prediction']]

Unnamed: 0,value,target,prediction
1,зрительное,1,2.0
7,старцах,1,2.0
8,ударениями,3,2.0
9,нагнутого,1,2.0
10,постаявший,2,3.0
...,...,...,...
8335,схрумкаете,1,3.0
8336,увалявшийся,3,2.0
8337,развалят,2,3.0
8338,биопрепаратами,5,4.0


In [None]:
train_pred

array([4., 2., 2., ..., 1., 3., 5.])

# Датасет ударений

In [17]:
def value_to_good(word):
    i = word.find('^')
    word = word[:i] + "[" + word[i + 1] + "]" + word[i+2:]
    return word

In [18]:
df['goodValue'] = df['acc'].apply(value_to_good)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['goodValue'] = df['acc'].apply(value_to_good)


In [19]:
df

Unnamed: 0,value,acc,is_symbs,target,goodValue
8,а,^а,True,1,[а]
14,аав,^аав,True,1,[а]ав
15,аава,^аава,True,1,[а]ава
16,аавам,^аавам,True,1,[а]авам
17,аавами,^аавами,True,1,[а]авами
...,...,...,...,...,...
1680436,ящуров,^ящуров,True,1,[я]щуров
1680437,ящурок,^ящурок,True,1,[я]щурок
1680438,ящуром,^ящуром,True,1,[я]щуром
1680439,ящуру,^ящуру,True,1,[я]щуру


In [20]:
def syl(word):
    bs = BasicStats(word)
    return bs.get_stats()['n_syllables']

In [21]:
df['syl'] = df['value'].apply(syl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['syl'] = df['value'].apply(syl)


In [27]:
df = df[df['syl'] >= 4]

In [61]:
vov = ['ё', 'у', 'е', 'ы', 'а', 'о', 'э', 'я', 'и', 'ю']
def wrong_acc(word):
    num = 3
    i = word.find('[')
    anword = word[:i]
    if anword == "":
        s = 1
    else:
        bs = BasicStats(anword)
        s = bs.get_stats()['n_syllables'] + 1
    clean_word = anword + word[i + 1] + word[i + 3:]
    if "бал" in word and num == 3:
        i = clean_word.index("бал")
        ss = 0
        for j in range(i):
            if clean_word[j] in vov:
                ss += 1
        if ss > 2 and s > 4 or ss > 3:
            print(word)
            print(clean_word[:i + 1] + '[' + clean_word[i + 1] + ']' + clean_word[i + 2:])
            return clean_word[:i + 1] + '[' + clean_word[i + 1] + ']' + clean_word[i + 2:]
    if "графия" in word or "кратия" in word and num == 3:
        i = clean_word.index("ия")
        ss = 0
        for j in range(i):
            if clean_word[j] in vov:
                ss += 1
        if ss > 2 and s > 4 or ss > 3:
            return clean_word[:i] + '[' + clean_word[i] + ']' + clean_word[i + 1:]
    if num >= s:
        num += 1
    counter = 0
    for i in range(len(clean_word)):
        if clean_word[i] in vov:
            counter += 1
        if counter == num:
            return clean_word[:i] + '[' + clean_word[i] + ']' + clean_word[i + 1:]
    return None
    

In [62]:
df['third'] = df['goodValue'].apply(wrong_acc)

альтерглобал[и]зм
альтерглоб[а]лизм
альтерглобал[и]зма
альтерглоб[а]лизма
альтерглобал[и]змам
альтерглоб[а]лизмам
альтерглобал[и]змами
альтерглоб[а]лизмами
альтерглобал[и]змах
альтерглоб[а]лизмах
альтерглобал[и]зме
альтерглоб[а]лизме
альтерглобал[и]змов
альтерглоб[а]лизмов
альтерглобал[и]змом
альтерглоб[а]лизмом
альтерглобал[и]зму
альтерглоб[а]лизму
альтерглобал[и]змы
альтерглоб[а]лизмы
альтэрглобал[и]зм
альтэрглоб[а]лизм
альтэрглобал[и]сский
альтэрглоб[а]лисский
альтэрглобал[и]ст
альтэрглоб[а]лист
альтэрглобал[и]стский
альтэрглоб[а]листский
антиглобализаци[о]нная
антиглоб[а]лизационная
антиглобализаци[о]нного
антиглоб[а]лизационного
антиглобализаци[о]нное
антиглоб[а]лизационное
антиглобализаци[о]нной
антиглоб[а]лизационной
антиглобализаци[о]нном
антиглоб[а]лизационном
антиглобализаци[о]нному
антиглоб[а]лизационному
антиглобализаци[о]нною
антиглоб[а]лизационною
антиглобализаци[о]нную
антиглоб[а]лизационную
антиглобализаци[о]нные
антиглоб[а]лизационные
антиглобализаци[о]нный
антиглоб[а]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['third'] = df['goodValue'].apply(wrong_acc)


In [63]:
df

Unnamed: 0,value,acc,is_symbs,target,goodValue,syl,first,second,third
17,аавами,^аавами,True,1,[а]авами,4,а[а]вами,аав[а]ми,аавам[и]
18,аавасакса,^аавасакса,True,1,[а]авасакса,5,а[а]васакса,аав[а]сакса,аавас[а]кса
28,ааиша,аа^иша,True,3,аа[и]ша,4,[а]аиша,а[а]иша,ааиш[а]
32,ааками,а^аками,True,2,а[а]ками,4,[а]аками,аак[а]ми,аакам[и]
40,аалбоот,аалб^оот,True,3,аалб[о]от,4,[а]албоот,а[а]лбоот,аалбо[о]т
...,...,...,...,...,...,...,...,...,...
1680428,ящурному,^ящурному,True,1,[я]щурному,4,ящ[у]рному,ящурн[о]му,ящурном[у]
1680429,ящурною,^ящурною,True,1,[я]щурною,4,ящ[у]рною,ящурн[о]ю,ящурно[ю]
1680430,ящурную,^ящурную,True,1,[я]щурную,4,ящ[у]рную,ящурн[у]ю,ящурну[ю]
1680431,ящурные,^ящурные,True,1,[я]щурные,4,ящ[у]рные,ящурн[ы]е,ящурны[е]


In [64]:
final_df = df[['goodValue', 'first', 'second', 'third']]
final_df.set_index('goodValue', inplace=True)

In [67]:
final_dict = final_df.T.to_dict('list')

In [71]:
key_to_key = dict(zip(list(df['value']), list(df['goodValue'])))

In [73]:
final_dict

{'[а]авами': ['а[а]вами', 'аав[а]ми', 'аавам[и]'],
 '[а]авасакса': ['а[а]васакса', 'аав[а]сакса', 'аавас[а]кса'],
 'аа[и]ша': ['[а]аиша', 'а[а]иша', 'ааиш[а]'],
 'а[а]ками': ['[а]аками', 'аак[а]ми', 'аакам[и]'],
 'аалб[о]от': ['[а]албоот', 'а[а]лбоот', 'аалбо[о]т'],
 'аал[е]нец': ['[а]аленец', 'а[а]ленец', 'аален[е]ц'],
 '[а]алтонен': ['а[а]лтонен', 'аалт[о]нен', 'аалтон[е]н'],
 'аальб[о]от': ['[а]альбоот', 'а[а]льбоот', 'аальбо[о]т'],
 '[а]арау': ['а[а]рау', 'аар[а]у', 'аара[у]'],
 '[а]аргау': ['а[а]ргау', 'аарг[а]у', 'аарга[у]'],
 '[а]арона': ['а[а]рона', 'аар[о]на', 'аарон[а]'],
 '[а]ароне': ['а[а]роне', 'аар[о]не', 'аарон[е]'],
 'аар[о]нов': ['[а]аронов', 'а[а]ронов', 'аарон[о]в'],
 'аар[о]новец': ['[а]ароновец', 'а[а]роновец', 'аарон[о]вец'],
 'аар[о]нович': ['[а]аронович', 'а[а]ронович', 'аарон[о]вич'],
 'аар[о]ново': ['[а]ароново', 'а[а]роново', 'аарон[о]во'],
 'аар[о]новца': ['[а]ароновца', 'а[а]роновца', 'аарон[о]вца'],
 'аар[о]новцам': ['[а]ароновцам', 'а[а]роновцам', 'аарон[

In [72]:
key_to_key

{'аавами': '[а]авами',
 'аавасакса': '[а]авасакса',
 'ааиша': 'аа[и]ша',
 'ааками': 'а[а]ками',
 'аалбоот': 'аалб[о]от',
 'ааленец': 'аал[е]нец',
 'аалтонен': '[а]алтонен',
 'аальбоот': 'аальб[о]от',
 'аарау': '[а]арау',
 'ааргау': '[а]аргау',
 'аарона': '[а]арона',
 'аароне': '[а]ароне',
 'ааронов': 'аар[о]нов',
 'аароновец': 'аар[о]новец',
 'ааронович': 'аар[о]нович',
 'аароново': 'аар[о]ново',
 'аароновца': 'аар[о]новца',
 'аароновцам': 'аар[о]новцам',
 'аароновцами': 'аар[о]новцами',
 'аароновцах': 'аар[о]новцах',
 'аароновце': 'аар[о]новце',
 'аароновцев': 'аар[о]новцев',
 'аароновцем': 'аар[о]новцем',
 'аароновцу': 'аар[о]новцу',
 'аароновцы': 'аар[о]новцы',
 'аароновщина': 'аар[о]новщина',
 'аароном': '[а]ароном',
 'аарону': '[а]арону',
 'аахена': '[а]ахена',
 'аахенам': '[а]ахенам',
 'аахенами': '[а]ахенами',
 'аахенах': '[а]ахенах',
 'аахене': '[а]ахене',
 'аахенец': '[а]ахенец',
 'аахенов': '[а]ахенов',
 'аахеном': '[а]ахеном',
 'аахенская': '[а]ахенская',
 'аахенские': '[а]а

In [74]:
wow = ["баловень", "аналог", "провод", "электропровод", "электропривод", "биолог", "филолог", "теолог", "проктолог", "уфолог", "травматолог", "офтальмолог", "психолог", "стоматолог"]
for key in key_to_key.keys():
    if key in wow:
        final_dict[key_to_key[key]] = {"dif": 1, "distortions": final_dict[key_to_key[key]]}
    elif "лог" in key or "метр" in key or "вод" in key or "кратия" in key or "графия" in key or "бал" in key:
        final_dict[key_to_key[key]] = {"dif": 0.75, "distortions": final_dict[key_to_key[key]]}
    elif "ё" in key:
        final_dict[key_to_key[key]] = {"dif": 0.6, "distortions": final_dict[key_to_key[key]]}
    else:
        final_dict[key_to_key[key]] = {"dif": 0.5, "distortions": final_dict[key_to_key[key]]}

In [75]:
final_dict

{'[а]авами': {'dif': 0.5, 'distortions': ['а[а]вами', 'аав[а]ми', 'аавам[и]']},
 '[а]авасакса': {'dif': 0.5,
  'distortions': ['а[а]васакса', 'аав[а]сакса', 'аавас[а]кса']},
 'аа[и]ша': {'dif': 0.5, 'distortions': ['[а]аиша', 'а[а]иша', 'ааиш[а]']},
 'а[а]ками': {'dif': 0.5, 'distortions': ['[а]аками', 'аак[а]ми', 'аакам[и]']},
 'аалб[о]от': {'dif': 0.5,
  'distortions': ['[а]албоот', 'а[а]лбоот', 'аалбо[о]т']},
 'аал[е]нец': {'dif': 0.5,
  'distortions': ['[а]аленец', 'а[а]ленец', 'аален[е]ц']},
 '[а]алтонен': {'dif': 0.5,
  'distortions': ['а[а]лтонен', 'аалт[о]нен', 'аалтон[е]н']},
 'аальб[о]от': {'dif': 0.5,
  'distortions': ['[а]альбоот', 'а[а]льбоот', 'аальбо[о]т']},
 '[а]арау': {'dif': 0.5, 'distortions': ['а[а]рау', 'аар[а]у', 'аара[у]']},
 '[а]аргау': {'dif': 0.5, 'distortions': ['а[а]ргау', 'аарг[а]у', 'аарга[у]']},
 '[а]арона': {'dif': 0.5, 'distortions': ['а[а]рона', 'аар[о]на', 'аарон[а]']},
 '[а]ароне': {'dif': 0.5, 'distortions': ['а[а]роне', 'аар[о]не', 'аарон[е]']},
 '

In [76]:
with open("drive/MyDrive/accents.json", "w", encoding='utf8') as f:
    json.dump(final_dict, f, ensure_ascii=False)

In [77]:
with open("drive/MyDrive/accent_keys.json", "w", encoding='utf8') as f:
    json.dump(key_to_key, f, ensure_ascii=False)

# Датасет предложений

In [78]:
!pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0


In [79]:
import json
import jsonlines
import pandas
import tqdm

In [80]:
with jsonlines.open('drive/MyDrive/alisa_selezneva.jsonl') as f:
    sentences = pandas.DataFrame(f)

In [81]:
with open("drive/MyDrive/accents.json", "r", encoding='utf8') as f:
    accents = json.load(f)
with open("drive/MyDrive/accent_keys.json", "r", encoding='utf8') as f:
    keys = json.load(f)

In [82]:
accents

{'[а]авами': {'dif': 0.5, 'distortions': ['а[а]вами', 'аав[а]ми', 'аавам[и]']},
 '[а]авасакса': {'dif': 0.5,
  'distortions': ['а[а]васакса', 'аав[а]сакса', 'аавас[а]кса']},
 'аа[и]ша': {'dif': 0.5, 'distortions': ['[а]аиша', 'а[а]иша', 'ааиш[а]']},
 'а[а]ками': {'dif': 0.5, 'distortions': ['[а]аками', 'аак[а]ми', 'аакам[и]']},
 'аалб[о]от': {'dif': 0.5,
  'distortions': ['[а]албоот', 'а[а]лбоот', 'аалбо[о]т']},
 'аал[е]нец': {'dif': 0.5,
  'distortions': ['[а]аленец', 'а[а]ленец', 'аален[е]ц']},
 '[а]алтонен': {'dif': 0.5,
  'distortions': ['а[а]лтонен', 'аалт[о]нен', 'аалтон[е]н']},
 'аальб[о]от': {'dif': 0.5,
  'distortions': ['[а]альбоот', 'а[а]льбоот', 'аальбо[о]т']},
 '[а]арау': {'dif': 0.5, 'distortions': ['а[а]рау', 'аар[а]у', 'аара[у]']},
 '[а]аргау': {'dif': 0.5, 'distortions': ['а[а]ргау', 'аарг[а]у', 'аарга[у]']},
 '[а]арона': {'dif': 0.5, 'distortions': ['а[а]рона', 'аар[о]на', 'аарон[а]']},
 '[а]ароне': {'dif': 0.5, 'distortions': ['а[а]роне', 'аар[о]не', 'аарон[е]']},
 '

In [84]:
import string

import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

def process_text(text):
    if text == "":
        text = "empty"
    return [word for word in word_tokenize(text.lower()) if word not in string.punctuation]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [86]:
f = jsonlines.open('drive/MyDrive/accents_alisa_new.jsonl', mode='w')
all = sentences['sentence'].to_list()[:-1]
acclist = list(keys.keys())
new_sentences = {}
for j in tqdm.tqdm(range(len(all))):
    now_sent = {'sentence': all[j], 'using_word_id': 0, 'complex_words': []}
    difs = []
    for word in process_text(all[j]):
        if word in acclist:
            now_sent['complex_words'].append({'word': keys[word], 'dif': accents[keys[word]]["dif"], 'distortions': accents[keys[word]]["distortions"]})
            difs.append(accents[keys[word]]['dif'])
    if len(now_sent['complex_words']) > 0:
        now_sent['using_word_id'] = difs.index(max(difs))
        f.write(now_sent)
f.close()

100%|██████████| 10000/10000 [1:13:25<00:00,  2.27it/s]


In [87]:
now_sent

{'sentence': 'Читайте об этих приключениях Коры Орват в романе Средство от замухраков, пятом томе знаменитого сериала Галактическая полиция.',
 'using_word_id': 0,
 'complex_words': [{'word': 'приключ[е]ниях',
   'dif': 0.5,
   'distortions': ['пр[и]ключениях', 'прикл[ю]чениях', 'приключен[и]ях']},
  {'word': 'знамен[и]того',
   'dif': 0.5,
   'distortions': ['зн[а]менитого', 'знам[е]нитого', 'знаменит[о]го']},
  {'word': 'сери[а]ла',
   'dif': 0.5,
   'distortions': ['с[е]риала', 'сер[и]ала', 'сериал[а]']},
  {'word': 'галакт[и]ческая',
   'dif': 0.5,
   'distortions': ['г[а]лактическая', 'гал[а]ктическая', 'галактич[е]ская']},
  {'word': 'пол[и]ция',
   'dif': 0.5,
   'distortions': ['п[о]лиция', 'полиц[и]я', 'полици[я]']}]}