In [6]:
!pip install tensorboard==2.2.0 --use-feature=2020-resolver
!pip install tensorflow_text>=2.0.0rc0 --use-feature=2020-resolver
!pip install -U sentence-transformers --use-feature=2020-resolver
!pip install razdel
!pip install navec
!pip install slovnet

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
[31mERROR: pytorch-lightning 0.9.0 requires tensorboard==2.2.0, but you'll have tensorboard 2.3.0 which is incompatible.[0m
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
Collecting sentence-transformers
  Downloading sentence-transformers-0.3.8.tar.gz (66 kB)
[K     |████████████████████████████████| 66 kB 171 kB/s eta 0:00:011
Collecting transformers<3.4.0,>=3.1.0
  Downloading transformers-3.3.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 467 kB/s eta 0:00:01
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 6.0 MB/s eta 0:00:01


Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-0.3.8-py3-none-any.whl size=101994 sha256=e95d477cce4ef4cff5e3aa6b378de947892c5eaee06d76fd699176f706adb70b
  Stored in directory: /root/.cache/pip/wheels/1c/43/65/fe0f3ea9327623e749a79eb5dfad85a809c84064b1cc4682c1
Successfully built sentence-transformers
Installing collected packages: tokenizers, transformers, sentence-transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.8.1rc1
    Uninstalling tokenizers-0.8.1rc1:
      Successfully uninstalled tokenizers-0.8.1rc1
  Attempting uninstall: transformers
    Found existing installation: transformers 3.0.2
    Uninstalling transformers-3.0.2:
      Successfully uninstalled transformers-3.0.2
[31mERROR: allennlp 1.1.0 requires transformers<3.1,>=3.0, but you'll have transformers 3.3.1 which i

In [7]:
import json
import numpy as np
import pandas as pd
import os
import re
import tensorflow_hub as hub
import tensorflow_text
import torch
from navec import Navec
from razdel import sentenize, tokenize
from sentence_transformers import SentenceTransformer, util, models
from slovnet import Morph



In [None]:
# Configuration for the run.
# threshold: minimum normalised similarity a handbook candidate must reach to be reported as a match.
threshold = 0.92
queries_path = '../input/queries/queries.json' # path to the queries file
handbook_path = '../input/handbook/handbook.xls' # path to the handbook spreadsheet
model_path_directory = '../input/pytorch-rubert/' # path to the model directory

In [None]:
if not os.path.isfile('navec_news_v1_1B_250K_300d_100q.tar'):
    !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar
if not os.path.isfile('slovnet_morph_news_v1.tar'):
    !conda install -y gdown 
    !gdown https://drive.google.com/uc?id=1U4TT9cGtdYL8I0G5NSd2gCvia9wpFpDr

In [None]:
# Read the queries file. The encoding is given explicitly so Cyrillic text is
# decoded the same way regardless of the platform's default encoding.
with open(queries_path, encoding='utf-8') as json_file:
    data = json.load(json_file)
data

In [None]:
# Flatten the nested structure: take the 'Item' value of every entry listed
# under 'Items' of every element of data['Names'].
queries = [
    entry['Item']
    for group in data['Names']
    for entry in group['Items']
]

In [None]:
# Load the handbook spreadsheet into a DataFrame and preview its first rows.
handbook = pd.read_excel(handbook_path)
handbook.head()

In [None]:
# Replace missing values in the third column (position 2) with empty strings;
# this is the column that is turned into the list of handbook names below.
handbook.iloc[:, 2] = handbook.iloc[:, 2].fillna('')

In [None]:
# Candidate texts the queries are matched against: the third handbook column as a plain list.
handbook_names = handbook.iloc[:, 2].tolist()

In [None]:
# Load the Navec embeddings and the Slovnet morphology tagger from the local
# archives, then hand the embeddings to the tagger via morph.navec(...).
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
morph.navec(navec)

## RuBERT

In [None]:
# Assemble a SentenceTransformer from two modules: the transformer loaded from
# model_path_directory (created with max_seq_length=256) and a pooling layer
# sized to that transformer's word-embedding dimension.
word_embedding_model = models.Transformer(model_path_directory, max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

rubert_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Encode all handbook names once (as a tensor) so every query can be compared against them.
handbook_rubert_embeddings = rubert_model.encode(handbook_names, convert_to_tensor=True)

In [None]:
# Output table, accumulated column by column: every key is a column header of
# the final report and starts with its own empty list.
results = {
    column: []
    for column in (
        'Исходный текст',
        'Количество',
        'Единицы измерения',
        'Предлагаемый вариант',
        'Коэффициент сходства, %',
    )
}

In [None]:
def element_extraction(text):
    """Split a text into numeric terms, nouns and adjectives/adverbs.

    Args:
        text: raw string (a query or a handbook name).

    Returns:
        A tuple ``(digit_terms, nouns, adjectives)`` of lists of strings:
        ``digit_terms`` are the substrings of the original text matched by the
        digit regex; ``nouns`` are lower-cased tokens tagged as nouns or proper
        nouns that contain no digits; ``adjectives`` are lower-cased tokens
        tagged ADJ or ADV. For an empty text all three lists are empty.
    """
    digit_terms = re.findall(r'\w*\d*[\.,]?\d+\w*', text)

    # One token list per sentence of the lower-cased text; sentences without
    # tokens are skipped so the tagger never receives an empty sequence.
    chunk = []
    for sentence in sentenize(text.lower()):
        tokens = [token.text for token in tokenize(sentence.text)]
        if tokens:
            chunk.append(tokens)

    # 'PROPN' is the Universal Dependencies tag for proper nouns; the legacy
    # spelling 'PNOUN' is kept so nothing that matched before is lost.
    noun_tags = ('NOUN', 'PROPN', 'PNOUN')
    modifier_tags = ('ADJ', 'ADV')

    nouns = []
    adjectives = []
    # Walk the markup of every sentence (not only the first one); an empty
    # chunk simply produces no markup instead of raising StopIteration.
    for markup in morph.map(chunk):
        for token in markup.tokens:
            if token.pos in noun_tags and not re.search(r'\d', token.text):
                nouns.append(token.text)
            if token.pos in modifier_tags:
                adjectives.append(token.text)
    return digit_terms, nouns, adjectives

In [None]:
def improved_cosine_similarity(record, addendums_coef, adj_coef, query_encodings=None):
    """Compute the weighted similarity bonus between a query and one handbook record.

    The record is split with ``element_extraction`` into digit terms, nouns and
    adjectives; each group is joined with spaces, encoded with ``rubert_model``
    and compared by cosine similarity with the matching query encoding.

    Args:
        record: handbook text to compare with the query.
        addendums_coef: weight applied to the whole bonus.
        adj_coef: additional weight applied to the adjective similarity.
        query_encodings: optional ``(digit_terms, nouns, adjectives)`` tuple of
            query encodings. When omitted, the module-level variables
            ``query_digit_terms_encoded``, ``query_nouns_encoded`` and
            ``query_adjectives_encoded`` are used, so they must have been
            assigned before the call (the matching loop does this).

    Returns:
        ``addendums_coef * (digits + nouns + adj_coef * adjectives)`` where each
        term is the CPU tensor returned by ``util.pytorch_cos_sim``.
    """
    if query_encodings is None:
        # Backward-compatible fallback to the globals set by the calling cell.
        query_encodings = (query_digit_terms_encoded,
                           query_nouns_encoded,
                           query_adjectives_encoded)
    digits_query_vec, nouns_query_vec, adjectives_query_vec = query_encodings

    def _group_similarity(query_vec, terms):
        # Encode one group of record terms and compare it with the query's group.
        encoded = rubert_model.encode(' '.join(terms), convert_to_tensor=True)
        return util.pytorch_cos_sim(query_vec, encoded).cpu()

    digit_terms, nouns, adjectives = element_extraction(record)
    digits_cos_scores = _group_similarity(digits_query_vec, digit_terms)
    nouns_cos_scores = _group_similarity(nouns_query_vec, nouns)
    adj_cos_scores = _group_similarity(adjectives_query_vec, adjectives)

    return addendums_coef * (digits_cos_scores + nouns_cos_scores + adj_coef * adj_cos_scores)

In [None]:
# For each query: take the top_k handbook entries by cosine similarity, add the
# digit/noun/adjective bonus to each of them and keep the best-scoring candidate.
top_k = 20
addendums_coef = 0.1
adj_coef = 0.8

# Combined score of a text compared with itself (base cosine 1 plus the full
# bonus); used to normalise the final score. It does not depend on the query.
self_similarity = 1 + addendums_coef * (2 + adj_coef)
# torch.topk raises when k exceeds the number of handbook entries.
candidates_count = min(top_k, len(handbook_names))

for query in queries:
    # Defaults describe the "no suitable match" row.
    suggestion = ''
    similarity_percent = ''

    if candidates_count > 0:
        query_embedding = rubert_model.encode(query, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(query_embedding, handbook_rubert_embeddings)[0]
        cos_scores = cos_scores.cpu()

        top_results = torch.topk(cos_scores, k=candidates_count)
        top_results_values = top_results.values.numpy()
        top_results_indices = top_results.indices.numpy()

        # improved_cosine_similarity reads these three *_encoded names as
        # module-level variables, so they must keep exactly these names.
        query_digit_terms, query_nouns, query_adjectives = element_extraction(query)
        query_digit_terms_encoded = rubert_model.encode(' '.join(query_digit_terms), convert_to_tensor=True)
        query_nouns_encoded = rubert_model.encode(' '.join(query_nouns), convert_to_tensor=True)
        query_adjectives_encoded = rubert_model.encode(' '.join(query_adjectives), convert_to_tensor=True)

        # Reduce every combined score to a plain float so the ranking runs over
        # a 1-D array. Ranking the raw (1, 1) tensors with np.argsort sorted
        # along the size-1 last axis and therefore never reordered anything.
        combined_scores = np.array([
            float(top_results_values[n])
            + float(improved_cosine_similarity(handbook_names[int(top_results_indices[n])],
                                               addendums_coef,
                                               adj_coef))
            for n in range(candidates_count)
        ])
        best_position = int(np.argmax(combined_scores))
        similarity_coef = float(combined_scores[best_position]) / self_similarity

        if similarity_coef >= threshold:
            suggestion = handbook_names[int(top_results_indices[best_position])]
            similarity_percent = similarity_coef * 100

    results['Исходный текст'].append(query)
    results['Количество'].append(1)
    results['Единицы измерения'].append('шт')
    results['Предлагаемый вариант'].append(suggestion)
    results['Коэффициент сходства, %'].append(similarity_percent)

# without other information

In [None]:
# Convert the accumulated columns into a DataFrame and display it.
# NOTE(review): this rebinds `results` from the dict of lists to a DataFrame, so
# the matching loop cannot be safely re-run afterwards without re-creating the dict.
results = pd.DataFrame(results)
results

In [None]:
# Export the table as a semicolon-separated CSV. The 'utf-8-sig' codec writes a
# BOM (presumably so spreadsheet tools detect UTF-8 — confirm with the consumers).
results.to_csv('./results.csv', sep=';', encoding='utf-8-sig')

In [None]:
# Show a link to the exported CSV in the cell output.
from IPython.display import FileLink
FileLink(r'results.csv')