In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13705 sha256=c494c02242eaccdbe497d3016b766e7ceca114c423585e76567b9083d06017d5
  Stored in directory: /root/.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
TRAIN_DATA_PATH = '/content/drive/MyDrive/Think-BERT/data/experiment1/ru_taiga-ud-train.conllu'
TEST_DATA_PATH = '/content/drive/MyDrive/Think-BERT/data/experiment1/ru_taiga-ud-test.conllu'
GRAMMAR_PATH = '/content/drive/MyDrive/Think-BERT/data/experiment2/generate/grammar.cfg'

In [None]:
import conllu
import pandas as pd
import numpy as np
from typing import Set, List
from conllu import parse_incr
from sklearn.preprocessing import LabelEncoder

# Embeddings

## Data

In [None]:
class CoNLLUSentenceHandler:
    """
    Build a sentence-level dataset from CoNLL-U markup.

    A sentence is kept only when it has exactly one token with ``head == 0``
    and ``upos == 'VERB'``, that token carries every requested morphological
    feature, and the sentence's ``genre`` metadata is not in ``avoid_genres``.
    The result is stored in ``self.dataframe``: a ``sentence`` column followed
    by one column per requested feature.
    """

    def __init__(self,
                 markup : 'conllu.models.SentenceGenerator',
                 avoid_genres : Set[str] = (),
                 features : List[str] = ('Tense', 'Aspect')):
        """
        Initialisation.

        :param markup: iterable of parsed sentences; it is consumed once, so a
            generator cannot be reused afterwards.
        :param avoid_genres: genres whose sentences are skipped.
        :param features: names of the root-verb features to extract.
        """
        # The annotation above is quoted so that it is not evaluated when the
        # class is defined. The defaults are immutable tuples, so no state is
        # shared between instances; each instance gets its own copies below.
        self.markup = markup
        self.avoid_genres = set(avoid_genres)
        self.features = list(features)
        self.dataframe = self._sentence_data(self.features)

    def _sentence_data(self,
                       features : List[str]) -> pd.DataFrame:
        """
        Create a sentence dataset out of the stored markup.

        :param features: feature names to read from the root verb.
        :return: DataFrame with columns ``['sentence', *features]``.
        :raises KeyError: if a sentence that passes the root/feature checks has
            no ``genre`` or ``text`` metadata.
        """
        columns = ['sentence', *features]
        # Collect rows in a list and build the frame once: appending with
        # ``.loc`` on every hit re-allocates the frame and is quadratic.
        rows = []
        for sent in self.markup:
            root = sent.filter(head=0, upos='VERB')
            if len(root) != 1:
                continue
            feats = root[0]['feats']
            if not feats or not all(feat in feats for feat in features):
                continue
            if sent.metadata['genre'] in self.avoid_genres:
                continue
            rows.append([sent.metadata['text'], *(feats[feat] for feat in features)])
        return pd.DataFrame(rows, columns=columns)

In [None]:
# Build the train and test sentence tables from the CoNLL-U files.
# The handler is constructed inside each `with` block because it iterates over
# the markup in its constructor; presumably parse_incr reads lazily from the
# open file handle, so the file must still be open at that point — verify
# against the conllu documentation.
with open(TRAIN_DATA_PATH, 'r', encoding='utf-8') as f:
    train_data = parse_incr(f)
    # Second positional argument is `avoid_genres`: sentences of these genres are dropped.
    train_conllu_handler = CoNLLUSentenceHandler(train_data, ['wiki', 'news'])

with open(TEST_DATA_PATH, 'r', encoding='utf-8') as f:
    test_data = parse_incr(f)
    # Same genre filter as for the training split.
    test_conllu_handler = CoNLLUSentenceHandler(test_data, ['wiki', 'news'])

In [None]:
train_conllu_handler.dataframe.head()

Unnamed: 0,sentence,Tense,Aspect
0,"Снова приобрел дозу,",Past,Perf
1,Уже не та на лоб спадает челка...,Pres,Imp
2,"Но ты не живешь по-евангельски, и это — причин...",Pres,Imp
3,Ведь этот цветок цветёт для меня!,Pres,Imp
4,Как свет добра струился с глаз!,Past,Imp


In [None]:
train_conllu_handler.dataframe.groupby(['Tense', 'Aspect']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence
Tense,Aspect,Unnamed: 2_level_1
Fut,Perf,376
Past,Imp,1003
Past,Perf,2013
Pres,Imp,3442


In [None]:
le_tense = LabelEncoder()
y_tense_train = le_tense.fit_transform(train_conllu_handler.dataframe.Tense)

In [None]:
le_tense.classes_

array(['Fut', 'Past', 'Pres'], dtype=object)

In [None]:
y_tense_test = le_tense.transform(test_conllu_handler.dataframe.Tense)

In [None]:
le_aspect = LabelEncoder()
y_aspect_train = le_aspect.fit_transform(train_conllu_handler.dataframe.Aspect)

In [None]:
le_aspect.classes_

array(['Imp', 'Perf'], dtype=object)

In [None]:
y_aspect_test = le_aspect.transform(test_conllu_handler.dataframe.Aspect)

In [None]:
le_aspect

## Experiment

In [None]:
DEVICE = "cuda"

In [None]:
from transformers import AutoTokenizer, BertForMaskedLM, BertModel
from transformers.tokenization_utils_base import BatchEncoding
from tqdm import tqdm
import torch

In [None]:
sent_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
sent_model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence").to(DEVICE)

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [None]:
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, ignoring masked-out positions.

    :param model_output: indexable model output; element 0 holds the token embeddings.
    :param attention_mask: mask with one entry per token position; it is broadcast
        over the embedding dimension and used as a 0/1 weight.
    :return: one pooled vector per sequence (sum of weighted embeddings over
        dimension 1, divided by the per-dimension weight sum).
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    # Clamp the denominator so a fully masked sequence yields zeros instead of NaN.
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total

In [None]:
def sentence_embed(sentences, model, batch_size=10, max_length=None, tokenizer=None):
    """
    Embed sentences with ``model`` and mean-pool the token embeddings.

    All sentences are tokenized and padded together, moved to ``DEVICE``, and
    then fed to the model in slices of ``batch_size`` rows under
    ``torch.no_grad()``.

    :param sentences: list of sentence strings.
    :param model: encoder called as ``model(**batch)``; its output is passed to
        ``mean_pooling``.
    :param batch_size: number of sentences per forward pass (default 10, the
        value that used to be hard-coded).
    :param max_length: truncation length in tokens. When ``None`` it is taken
        from ``model.config.max_position_embeddings`` if the model exposes it.
        Without an explicit length ``truncation=True`` has no effect for this
        tokenizer ("Default to no truncation" warning), so over-long sentences
        would reach the model untruncated.
    :param tokenizer: tokenizer to use; defaults to the notebook-level
        ``sent_tokenizer``.
    :return: tensor with one pooled embedding per sentence, in input order.
    """
    if tokenizer is None:
        tokenizer = sent_tokenizer
    if max_length is None:
        max_length = getattr(getattr(model, 'config', None), 'max_position_embeddings', None)

    encoded_input = tokenizer(sentences,
                              padding=True,
                              truncation=True,
                              max_length=max_length,
                              return_tensors="pt").to(DEVICE)
    n_sentences = encoded_input['input_ids'].shape[0]

    outputs = []
    # Iterating over a range (rather than a zip of splits) gives tqdm a total,
    # and slicing every returned tensor avoids hard-coding the set of keys.
    for start in tqdm(range(0, n_sentences, batch_size)):
        batch = {name: tensor[start:start + batch_size]
                 for name, tensor in encoded_input.items()}
        with torch.no_grad():
            model_output = model(**batch)
        outputs.append(mean_pooling(model_output, batch['attention_mask']))
    return torch.cat(outputs, 0)

In [None]:
train_sentences = list(train_conllu_handler.dataframe.sentence)

In [None]:
X_train = sentence_embed(train_sentences, sent_model)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
684it [01:55,  5.94it/s]


In [None]:
X_train = X_train.to('cpu').numpy()

In [None]:
test_sentences = list(test_conllu_handler.dataframe.sentence)

In [None]:
X_test = sentence_embed(test_sentences, sent_model)

34it [00:02, 16.51it/s]


In [None]:
X_test = X_test.to('cpu').numpy()

In [None]:
X_test

array([[-0.948893  , -0.33011666,  0.13756968, ..., -0.09647324,
        -0.6287118 ,  0.8583425 ],
       [-0.24631986, -0.80171496, -0.24494423, ...,  0.13394766,
        -0.7702907 ,  1.3716329 ],
       [-0.31510836, -1.2277154 , -0.21950327, ..., -0.23889814,
        -0.5170102 ,  1.4287614 ],
       ...,
       [-0.98610306,  0.53517973,  0.1859416 , ...,  0.9492559 ,
         0.41824195,  0.90690064],
       [-0.8599921 , -0.49004853,  0.22836645, ..., -0.20890138,
        -0.00267607,  0.8887618 ],
       [ 0.34976372, -0.39239755, -0.12239909, ..., -0.07524933,
        -0.13357325,  1.4212575 ]], dtype=float32)

## Evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
from sklearn import svm, tree, neighbors

In [None]:
np.unique(y_tense_train, return_counts=True)

(array([0, 1, 2]), array([ 376, 3016, 3442]))

In [None]:
3442/(3442+3016+376)

0.5036581796897863

In [None]:
np.unique(y_aspect_train, return_counts=True)

(array([0, 1]), array([4445, 2389]))

In [None]:
4445/(4445+2389)

0.6504243488440152

### SVM

In [None]:
tense_svm_clf = svm.SVC()

In [None]:
tense_svm_clf.fit(X_train, y_tense_train)

In [None]:
tense_svm_predictions = tense_svm_clf.predict(X_test)

In [None]:
(tense_svm_predictions == y_tense_test).mean()

0.6355421686746988

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_svm_predictions, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.7019618449988249, 0.6355421686746988, 0.6670252468162645, None)

In [None]:
aspect_svm_clf = svm.SVC()

In [None]:
aspect_svm_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_svm_predictions = aspect_svm_clf.predict(X_test)

In [None]:
aspect_svm_predictions

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_svm_predictions, average='weighted')

(0.79639690838827, 0.7018072289156626, 0.7305787000631955, None)

### SVM balanced

In [None]:
tense_bal_clf = svm.SVC(class_weight="balanced")

In [None]:
tense_bal_clf.fit(X_train, y_tense_train)

In [None]:
tense_bal_predictions = tense_bal_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_bal_predictions, average='weighted')

(0.6467996753253572, 0.6325301204819277, 0.6177304206638372, None)

In [None]:
aspect_bal_clf = svm.SVC(class_weight="balanced")

In [None]:
aspect_bal_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_bal_predictions = aspect_bal_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_bal_predictions, average='weighted')

(0.6799670379631735, 0.6807228915662651, 0.6753318111886715, None)

### KNN 50

In [None]:
tense_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=50)

In [None]:
tense_knn_clf.fit(X_train, y_tense_train)

In [None]:
tense_knn_predictions = tense_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_knn_predictions, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(1.0, 0.5391566265060241, 0.7005870841487281, None)

In [None]:
aspect_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=50)

In [None]:
aspect_knn_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_knn_predictions = aspect_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_knn_predictions, average='weighted')

(0.9893375388345835, 0.6385542168674698, 0.7740061935480109, None)

In [None]:
aspect_knn_predictions

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
np.unique(aspect_knn_predictions, return_counts=True)

(array([0, 1]), array([330,   2]))

In [None]:
y_aspect_test

array([1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,

### KNN 100

In [None]:
tense_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=100)

In [None]:
tense_knn_clf.fit(X_train, y_tense_train)

In [None]:
tense_knn_predictions = tense_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_knn_predictions, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.7858954822863209, 0.4999999999999999, 0.5993440517566603, None)

In [None]:
aspect_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=100)

In [None]:
aspect_knn_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_knn_predictions = aspect_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_knn_predictions, average='weighted')

(0.8900460331893612, 0.6144578313253012, 0.7195593416814887, None)

### KNN 500

In [None]:
tense_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=500)

In [None]:
tense_knn_clf.fit(X_train, y_tense_train)

In [None]:
tense_knn_predictions = tense_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_knn_predictions, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.718691829319292, 0.49096385542168675, 0.571269041988512, None)

In [None]:
aspect_knn_clf = neighbors.KNeighborsClassifier(n_neighbors=500)

In [None]:
aspect_knn_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_knn_predictions = aspect_knn_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_knn_predictions, average='weighted')

(0.5087017882852164, 0.5180722891566265, 0.5102607884972467, None)

### Decision Tree

In [None]:
tense_tree_clf = tree.DecisionTreeClassifier()

In [None]:
tense_tree_clf.fit(X_train, y_tense_train)

In [None]:
tense_tree_predictions = tense_tree_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_tense_test, tense_tree_predictions, average='weighted')

(0.4936282921688794, 0.4819277108433735, 0.48684211106248887, None)

In [None]:
aspect_tree_clf = tree.DecisionTreeClassifier()

In [None]:
aspect_tree_clf.fit(X_train, y_aspect_train)

In [None]:
aspect_tree_predictions = aspect_tree_clf.predict(X_test)

In [None]:
# sklearn's signature is (y_true, y_pred): gold labels first, predictions second.
precision_recall_fscore_support(y_aspect_test, aspect_tree_predictions, average='weighted')

(0.5209413124194893, 0.5301204819277109, 0.5232501156500218, None)

# Prediction

## Data

In [None]:
from nltk import CFG
from nltk.parse.generate import generate
from bs4 import BeautifulSoup
import re
import random
from collections import defaultdict

In [None]:
with open(GRAMMAR_PATH) as f:
    grammar_string = f.read()

In [None]:
grammar = CFG.fromstring(grammar_string)

In [None]:
generated_sentences = []
for sentence in generate(grammar):
     generated_sentences.append(sentence)

In [None]:
len(generated_sentences)

2121808

In [None]:
def tags_in_sentence(tags, sentence):
    """Return True when every tag in ``tags`` occurs in ``sentence`` (True for no tags)."""
    for tag in tags:
        if tag not in sentence:
            return False
    return True

def is_tag(string):
    """Return True when ``string`` begins with '<' and ends with '>'."""
    opens = string[:1] == '<'
    closes = string[-1:] == '>'
    return opens and closes

def prettify_sentence(sentence, full_stop='.'):
    """
    Turn a token sequence into a display string.

    Tag tokens (see ``is_tag``) are dropped, the first character of the first
    remaining token is upper-cased, the tokens are joined with single spaces and
    ``full_stop`` is appended.
    """
    words = [token for token in sentence if not is_tag(token)]
    head = words[0]
    # Only the first character changes; the rest of the token keeps its case.
    words[0] = head[0].upper() + head[1:]
    return ' '.join(words) + full_stop

In [None]:
def random_sentences(sentences, tags, k=100, seed=None):
    """
    Draw ``k`` prettified sentences, with replacement, from those containing all ``tags``.

    :param sentences: iterable of token sequences.
    :param tags: tags that must all be present in a sentence (``tags_in_sentence``).
    :param k: number of sentences to draw; duplicates are possible because the
        draw uses ``choices`` (sampling with replacement).
    :param seed: seed for the draw; the same seed and inputs give the same result.
    :return: list of ``k`` strings produced by ``prettify_sentence``.
    :raises IndexError: if no sentence contains all tags.
    """
    # A private generator yields the same draws as seeding the module-level RNG,
    # but leaves the global random state untouched.
    rng = random.Random(seed)
    matching = [sent for sent in sentences if tags_in_sentence(tags, sent)]
    # Sample first, prettify afterwards: only the k selected sentences are
    # formatted instead of every match.
    return [prettify_sentence(sent) for sent in rng.choices(matching, k=k)]

In [None]:
aspect_tags = ['<PERF>', '<IMPF>']

cat_tags = {
    '<PERF>': ['<SUDDENLY>', '<ATSOMEPOINT>', '<JUST>', '<INTIME>', '<K>', '<IMMEDIATELY>'],
    '<IMPF>': ['<ONCE_A>', '<FOR>', '<LONG>']
}

trans_tags = ['<TRANS>', '<INTRANS>']

pron_tags = {
    '<TRANS>': [('<PRON_nomn>', '<NOUN_accs>'),
    ('<NOUN_nomn>', '<PRON_accs>'),
    ('<NOUN_nomn>', '<NOUN_accs>')],
    '<INTRANS>':
    [('<NOUN_nomn>',),
     ('<PRON_nomn>',)]
}

In [None]:
# Sample sentences for every (aspect, category, transitivity, argument-tags) cell.
# The result is stored as sentence_samples[aspect][cat][trans][tag_pair] -> list of strings.
sentence_samples = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
gridsearch_counter = 1

for aspect in aspect_tags:
    categories = cat_tags[aspect]
    for cat in categories:
        for trans in trans_tags:
            tag_pairs = pron_tags[trans]
            for tag_pair in tag_pairs:
                print(f"{gridsearch_counter}.")
                print([aspect, cat, trans, tag_pair])
                # Every cell uses the same seed; random_sentences draws with
                # random.choices, so the 1000 samples may contain duplicates.
                # Each call filters the full generated_sentences list again.
                rand_sents = random_sentences(
                    generated_sentences,
                    [aspect, cat, trans, *tag_pair],
                    k=1000,
                    seed=22
                )
                gridsearch_counter += 1
                sentence_samples[aspect][cat][trans][tag_pair] = rand_sents

1.
['<PERF>', '<SUDDENLY>', '<TRANS>', ('<PRON_nomn>', '<NOUN_accs>')]
2.
['<PERF>', '<SUDDENLY>', '<TRANS>', ('<NOUN_nomn>', '<PRON_accs>')]
3.
['<PERF>', '<SUDDENLY>', '<TRANS>', ('<NOUN_nomn>', '<NOUN_accs>')]
4.
['<PERF>', '<SUDDENLY>', '<INTRANS>', ('<NOUN_nomn>',)]
5.
['<PERF>', '<SUDDENLY>', '<INTRANS>', ('<PRON_nomn>',)]
6.
['<PERF>', '<ATSOMEPOINT>', '<TRANS>', ('<PRON_nomn>', '<NOUN_accs>')]
7.
['<PERF>', '<ATSOMEPOINT>', '<TRANS>', ('<NOUN_nomn>', '<PRON_accs>')]
8.
['<PERF>', '<ATSOMEPOINT>', '<TRANS>', ('<NOUN_nomn>', '<NOUN_accs>')]
9.
['<PERF>', '<ATSOMEPOINT>', '<INTRANS>', ('<NOUN_nomn>',)]
10.
['<PERF>', '<ATSOMEPOINT>', '<INTRANS>', ('<PRON_nomn>',)]
11.
['<PERF>', '<JUST>', '<TRANS>', ('<PRON_nomn>', '<NOUN_accs>')]
12.
['<PERF>', '<JUST>', '<TRANS>', ('<NOUN_nomn>', '<PRON_accs>')]
13.
['<PERF>', '<JUST>', '<TRANS>', ('<NOUN_nomn>', '<NOUN_accs>')]
14.
['<PERF>', '<JUST>', '<INTRANS>', ('<NOUN_nomn>',)]
15.
['<PERF>', '<JUST>', '<INTRANS>', ('<PRON_nomn>',)]
16.
['

## Model

In [None]:
from transformers import AutoTokenizer, BertForMaskedLM
from tqdm import tqdm
import pymorphy2
import torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = BertForMaskedLM.from_pretrained("DeepPavlov/rubert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
def topn_predictions(sentence, model, tokenizer, n=10):
    """
    Return the ids of the ``n`` highest-scoring tokens for the first mask in ``sentence``.

    :param sentence: text containing the tokenizer's mask token.
    :param model: masked-LM model; called as ``model(**inputs)`` and read via ``.logits``.
    :param tokenizer: tokenizer providing ``mask_token_id``.
    :param n: number of token ids to return (capped at the vocabulary size).
    :return: 1-D tensor of token ids ordered from most to least likely.
    :raises ValueError: if the tokenized sentence contains no mask token.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    mask_positions = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    if mask_positions.numel() == 0:
        # Previously this surfaced as an opaque IndexError from indexing an empty tensor.
        raise ValueError(f"no mask token found in sentence: {sentence!r}")
    # Only the first mask position is scored, as before.
    mask_logits = logits[0, mask_positions[0]]
    # topk returns indices sorted by descending score without sorting the whole vocabulary.
    top_count = min(n, mask_logits.shape[-1])
    return torch.topk(mask_logits, top_count).indices

def predicted_grid(sentences, tokenizer, model, n=10):
    """
    Stack the top-``n`` predictions of every sentence into one tensor.

    Note the argument order: ``tokenizer`` comes before ``model`` here, the
    reverse of ``topn_predictions``.

    :param sentences: sentences to score, each passed to ``topn_predictions``.
    :param tokenizer: tokenizer forwarded to ``topn_predictions``.
    :param model: model forwarded to ``topn_predictions``.
    :param n: number of predictions per sentence.
    :return: tensor with one row of predicted token ids per sentence; an empty
        ``(0, n)`` integer tensor when ``sentences`` is empty (this used to fail
        on an unbound local).
    """
    # Collect the rows and stack once; concatenating inside the loop copied the
    # whole grid on every iteration.
    rows = [topn_predictions(sent, model, tokenizer, n) for sent in tqdm(sentences)]
    if not rows:
        return torch.empty((0, n), dtype=torch.long)
    return torch.stack(rows, 0)

In [None]:
# Run the masked-LM over the sampled sentences and save the top predictions per cell.
# NOTE(review): as written, only aspect '<IMPF>' and trans '<INTRANS>' are processed
# (the loops over aspect_tags / trans_tags are commented out), while the loading cell
# further down reads a .pt file for every aspect/category/trans/tag_pair combination.
# Presumably this cell was re-run by hand with other values — confirm before a fresh run.
gridsearch_counter = 1

aspect = '<IMPF>'
# for aspect in aspect_tags:
categories = cat_tags[aspect]
for cat in categories:
    # for trans in trans_tags:
        trans = '<INTRANS>'
        tag_pairs = pron_tags[trans]
        for tag_pair in tag_pairs:
          print(f"{gridsearch_counter}.")
          sentences = sentence_samples[aspect][cat][trans][tag_pair]
          predicted_tensor = predicted_grid(sentences, tokenizer, model)
          # The file name embeds the raw tags and the tuple repr of tag_pair,
          # e.g. "<IMPF>_<FOR>_<INTRANS>_('<NOUN_nomn>',).pt".
          pred_save_path = f'/content/drive/MyDrive/Think-BERT/data/experiment2/results/{aspect}_{cat}_{trans}_{tag_pair}.pt'
          torch.save(predicted_tensor, pred_save_path)
          gridsearch_counter += 1

### Loading

In [None]:
import pymorphy2

In [None]:
morph = pymorphy2.MorphAnalyzer()

In [None]:
def _decoded_tags(preds, tokenizer, morph):
    """Lazily yield ``(rank, tag)`` for each prediction: decode it, take the first parse's tag."""
    for rank, pred in enumerate(preds):
        word = tokenizer.decode(pred)
        yield rank, morph.parse(word)[0].tag


def _first_rank_with(preds, tokenizer, morph, attr, value):
    """Return the rank of the first prediction whose tag has ``attr == value``, else None."""
    for rank, tag in _decoded_tags(preds, tokenizer, morph):
        if getattr(tag, attr) == value:
            return rank
    return None


def _which_comes_first(preds, tokenizer, morph, attr, wanted, other):
    """Return True if ``wanted`` is seen before ``other`` in tag ``attr``, False if after, None if neither."""
    for _, tag in _decoded_tags(preds, tokenizer, morph):
        value = getattr(tag, attr)
        if value == wanted:
            return True
        if value == other:
            return False
    return None


def perf_at_1(preds, tokenizer, morph):
    """Return True when the top-ranked prediction is tagged as perfective."""
    return perf_at_n(preds, tokenizer, morph) == 0

def perf_at_n(preds, tokenizer, morph):
    """Return the rank (0-based) of the first prediction tagged 'perf', or None."""
    return _first_rank_with(preds, tokenizer, morph, 'aspect', 'perf')

def impf_at_n(preds, tokenizer, morph):
    """Return the rank (0-based) of the first prediction tagged 'impf', or None."""
    return _first_rank_with(preds, tokenizer, morph, 'aspect', 'impf')

def impf_before_perf(preds, tokenizer, morph):
    """Return True if an 'impf' prediction precedes any 'perf' one, False if the reverse, None if neither occurs."""
    return _which_comes_first(preds, tokenizer, morph, 'aspect', 'impf', 'perf')

def pres_before_past(preds, tokenizer, morph):
    """Return True if a 'pres' prediction precedes any 'past' one, False if the reverse, None if neither occurs."""
    return _which_comes_first(preds, tokenizer, morph, 'tense', 'pres', 'past')

def verb_at_n(preds, tokenizer, morph):
    """Return the rank (0-based) of the first prediction whose POS is 'VERB', or None."""
    return _first_rank_with(preds, tokenizer, morph, 'POS', 'VERB')

In [None]:
sentence_samples['<IMPF>']['<FOR>']['<INTRANS>'][('<NOUN_nomn>',)][5]

'Председатели [MASK] на протяжении четырёх лет.'

In [None]:
x = torch.load("/content/drive/MyDrive/Think-BERT/data/experiment2/results/<PERF>_<K>_<INTRANS>_('<NOUN_nomn>',).pt")

In [None]:
tokenizer.decode(x[5])

'избирались избираются менялись работали назначались действуют существуют работают меняются назначаются'

In [None]:
perf_at_1(x[1], tokenizer, morph)

False

In [None]:
perf_at_n(x[1], tokenizer, morph)

8

In [None]:
impf_before_perf(x[1], tokenizer, morph)

True

In [None]:
pres_before_past(x[1], tokenizer, morph)

False

In [None]:
verb_at_n(x[1], tokenizer, morph)

0

In [None]:
def map_array(arr, func, dtype):
    """
    Apply ``func(row, tokenizer, morph)`` to every row of ``arr`` and collect the results.

    ``tokenizer`` and ``morph`` are read from the notebook's global namespace.

    :param arr: iterable of rows (e.g. a 2-D tensor of predicted token ids).
    :param func: callable taking ``(row, tokenizer, morph)``.
    :param dtype: dtype of the (initially empty) result array. As before, the
        final dtype follows numpy promotion: if any result is ``None`` the
        array becomes ``object``.
    :return: 1-D numpy array with one entry per row.
    """
    values = [func(row, tokenizer, morph) for row in arr]
    if not values:
        return np.array([], dtype=dtype)
    # A single append keeps the same dtype promotion as appending one value at a
    # time, without copying the whole array on every row.
    return np.append(np.array([], dtype=dtype), values)

In [None]:
torch.load("/content/drive/MyDrive/Think-BERT/data/experiment2/results/<IMPF>_<FOR>_<INTRANS>_('<NOUN_nomn>',).pt", map_location=torch.device('cpu'))

tensor([[ 32820,  10845,  88098,  ..., 115271,  11727,  58734],
        [ 91836,  60354,  54346,  ..., 100028,  30070,  19666],
        [ 33678,  22330,  17854,  ...,  11841,  22845,  26735],
        ...,
        [ 63583,  13563,  18426,  ...,  14265,  26534,  19415],
        [ 38621,  22330,  38897,  ...,  73982,  68942,  22845],
        [ 68312,  21937,  69038,  ...,  26579,  16397,  30910]])

In [None]:
# Load the saved predictions of every grid cell, compute per-sentence statistics
# and keep them in sentence_results[aspect][cat][trans][tag_pair][<metric name>].
sentence_results = defaultdict(
    lambda: defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(dict)
        )
      )
    )
# Which "first rank of this aspect" function to use depends on the expected aspect.
aspect_tag_to_func = {'<PERF>': perf_at_n, '<IMPF>': impf_at_n}
gridsearch_counter = 1

for aspect in aspect_tags:
    categories = cat_tags[aspect]
    for cat in categories:
        for trans in trans_tags:
            tag_pairs = pron_tags[trans]
            for tag_pair in tag_pairs:
              print(f"\n{gridsearch_counter}.")
              print(f"{aspect}_{cat}_{trans}_{tag_pair}")
              pred_save_path = f'/content/drive/MyDrive/Think-BERT/data/experiment2/results/{aspect}_{cat}_{trans}_{tag_pair}.pt'
              preds = torch.load(pred_save_path, map_location=torch.device('cpu'))

              impf_before_perf_arr = map_array(preds, impf_before_perf, bool)
              pres_before_past_arr = map_array(preds, pres_before_past, bool)
              asp_n_arr = map_array(preds, aspect_tag_to_func[aspect], int)
              verb_n_arr = map_array(preds, verb_at_n, int)

              # Entries that are None (neither value found) are excluded from the means.
              print("ImpfBeforePerf", impf_before_perf_arr[impf_before_perf_arr != np.array(None)].mean())
              print("PresBeforePast", pres_before_past_arr[pres_before_past_arr != np.array(None)].mean())

              (perf_ix, perf_counts) = np.unique(asp_n_arr[asp_n_arr != np.array(None)], return_counts=True)
              (verb_ix, verb_counts) = np.unique(verb_n_arr[verb_n_arr != np.array(None)], return_counts=True)
              print("Asp1", (asp_n_arr == 0).mean())
              print("Verb1", (verb_n_arr == 0).mean())
              # NOTE(review): the two values below are 1 - share(non-None), i.e. the
              # share of sentences where NO such prediction was found, although the
              # labels read "AtAll" — confirm which one is intended.
              print("AspAtAll", 1 - (asp_n_arr != np.array(None)).mean())
              print("VerbAtAll", 1 - (verb_n_arr != np.array(None)).mean())

              sentence_results[aspect][cat][trans][tag_pair]['impf_before_perf'] = impf_before_perf_arr
              sentence_results[aspect][cat][trans][tag_pair]['pres_before_past'] = pres_before_past_arr
              # The key stays 'perf_n' for both aspects; for '<IMPF>' it holds impf_at_n ranks.
              sentence_results[aspect][cat][trans][tag_pair]['perf_n'] = asp_n_arr
              # Store the computed ranks; the function object verb_at_n was stored here by mistake.
              sentence_results[aspect][cat][trans][tag_pair]['verb_at_n'] = verb_n_arr

              gridsearch_counter += 1


1.
<PERF>_<SUDDENLY>_<TRANS>_('<PRON_nomn>', '<NOUN_accs>')
ImpfBeforePerf 0.437
PresBeforePast 0.43229689067201604
Asp1 0.523
Verb1 0.941
AspAtAll 0.029000000000000026
VerbAtAll 0.0

2.
<PERF>_<SUDDENLY>_<TRANS>_('<NOUN_nomn>', '<PRON_accs>')
ImpfBeforePerf 0.703
PresBeforePast 0.742
Asp1 0.297
Verb1 1.0
AspAtAll 0.14
VerbAtAll 0.0

3.
<PERF>_<SUDDENLY>_<TRANS>_('<NOUN_nomn>', '<NOUN_accs>')
ImpfBeforePerf 0.4678714859437751
PresBeforePast 0.464321608040201
Asp1 0.507
Verb1 0.951
AspAtAll 0.05700000000000005
VerbAtAll 0.007000000000000006

4.
<PERF>_<SUDDENLY>_<INTRANS>_('<NOUN_nomn>',)
ImpfBeforePerf 0.6201005025125628
PresBeforePast 0.6673366834170854
Asp1 0.375
Verb1 0.989
AspAtAll 0.015000000000000013
VerbAtAll 0.0050000000000000044

5.
<PERF>_<SUDDENLY>_<INTRANS>_('<PRON_nomn>',)
ImpfBeforePerf 0.56
PresBeforePast 0.56
Asp1 0.44
Verb1 1.0
AspAtAll 0.0
VerbAtAll 0.0

6.
<PERF>_<ATSOMEPOINT>_<TRANS>_('<PRON_nomn>', '<NOUN_accs>')
ImpfBeforePerf 0.021
PresBeforePast 0.002
Asp1 0.95

In [None]:
x = sentence_results[aspect][cat][trans][tag_pair]['perf_n']

In [None]:
hm = torch.load("/content/drive/MyDrive/Think-BERT/data/experiment2/results/<PERF>_<K>_<INTRANS>_('<NOUN_nomn>',).pt",
                map_location=torch.device('cpu'))

In [None]:
for i, row in enumerate(hm):
  if verb_at_n(row, tokenizer, morph) != 0:
    print(row)
    print(i)

tensor([21853, 10753, 24790, 28657, 21899, 36799, 40559, 30855, 58868, 37037])
1
tensor([  326, 16170,   866,   108,  2077, 37293, 51163,  1565,   122,  3422])
2
tensor([21853, 28657, 10753, 24790, 36085, 54992, 24930, 11537, 10862, 54774])
5
tensor([46644, 14872, 18067, 36222,  8959, 14072, 36836, 32855, 26727, 95240])
6
tensor([16170, 80281, 59182, 87419, 31096, 89389,   326, 68644, 51163, 40625])
8
tensor([  326,   866, 21853,   108, 16170, 89389,  6300,  5746,   132, 51163])
11
tensor([46644, 14872,  8959, 27968, 18067, 26727, 62344, 36222, 32855, 40592])
12
tensor([16170, 51163,   866,   326, 26251,   108,  3422,  3521, 89389, 21853])
13
tensor([ 16170,  51163,  89389,  51624, 111134,  47233, 110530,  50867,  26019,
         80420])
14
tensor([16170,   326,   108,   122, 51163,  3422,   866, 26251, 37293,  4414])
15
tensor([16170, 51163,  3422, 89389, 26251, 26019, 74366, 96717,   866,  6496])
16
tensor([51163,   866, 16170, 26251, 31096, 89389, 81931,   326,  9699, 38213])
18
ten