<a href="https://colab.research.google.com/github/Ilya2raev/NLP_model/blob/main/Bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the bert-embedding package straight from its GitHub repository (editable install).
!pip install -e git+https://github.com/negedng/bert-embedding#egg=bert_embedding
from bert_embedding import BertEmbedding
# Embedder built with its default arguments; prep() calls it to embed a sentence.
# NOTE(review): the sentences embedded later are Russian - confirm the default model/vocabulary is suitable.
bert_E = BertEmbedding()

In [None]:
# Pinned versions of spaCy and pymorphy2, installed before the 'ru2' model is imported and loaded.
!pip3 install spacy==2.1.9
!pip3 install pymorphy2==0.8

In [None]:
# Fetch the v2.1 ref of spacy-ru and copy its `ru2` directory into the working directory,
# so that `import ru2` and `spacy.load('ru2')` can find it.
!git clone -b v2.1 https://github.com/buriy/spacy-ru.git
!cp -r ./spacy-ru/ru2/ .

In [4]:
import nltk
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
import spacy
import ru2

import math
import numpy as np

# Download the NLTK 'punkt' tokenizer data only when the lookup fails,
# so re-running the cell does not trigger a new download.
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

In [5]:
# Load the spaCy pipeline named 'ru2' (the directory copied from the spacy-ru checkout).
nlp = spacy.load('ru2')

In [6]:
def token_list(embeddings, no_sep=False):
    """
    Extract the token sequence of the first entry of a BertEmbedding result.

    Params:
        embeddings: The embedding data from BertEmbedding; the tokens are
            read from ``embeddings[0][0]``.
        no_sep: If True, the first and last tokens (the separators) are
            dropped from the returned sequence.
    Return:
        tokens: list of tokens
    """
    first_tokens = embeddings[0][0]
    return first_tokens[1:-1] if no_sep else first_tokens

def sentence_embs(embeddings):
    """
    Return the vector used as the sentence-level embedding.

    This is the first vector of the first entry, i.e. ``embeddings[0][1][0]``
    (presumably the vector of the leading special token when special tokens
    are kept - confirm against the BertEmbedding output format).
    """
    first_entry = embeddings[0]
    vectors = first_entry[1]
    return vectors[0]

def prep(sentence):
    """
    Embed a single sentence with the module-level ``bert_E`` embedder.

    Special tokens are kept during embedding (``filter_spec_tokens=False``);
    the returned token list has its first and last tokens trimmed.

    Return:
        (tokens, sentence_embedding) as produced by ``token_list`` and
        ``sentence_embs``.
    """
    result = bert_E([sentence], filter_spec_tokens=False)
    return token_list(result, no_sep=True), sentence_embs(result)

def map_function(x, type='exp', n=0.2):
    """
    Map a distance from [0, inf) onto a similarity-like score where 0 maps to 1
    and larger inputs approach 0 (for x >= 0 and n > 0).

    Params:
        x: The non-negative value to map (e.g. a Euclidean distance).
        type: Name of the mapping function:
            inverse: 1/(1+n*x)
            arctan: 1-2/pi*arctan(n*x)
            exp: (1/(1+n))^x
        n: Steepness parameter; a larger n makes the score fall off faster.
    Return:
        The mapped value.
    Raises:
        NotImplementedError: if ``type`` is not one of the names listed above.
    """
    if type == 'inverse':
        return 1 / (1 + n*x)
    if type == 'arctan':
        return 1 - 2 / math.pi * math.atan(n*x)
    if type == 'exp':
        return (1 / (1 + n)) ** x
    # Name the rejected value so a typo in `type` is easy to spot.
    raise NotImplementedError("Function not implemented: {!r}".format(type))

def square_root(x):
    """Return the Euclidean (L2) norm of ``x``: the square root of the sum of its squared elements."""
    return math.sqrt(sum(i * i for i in x))

def cosine_similarity(x, y):
    """
    Cosine similarity of two vectors: dot(x, y) / (|x| * |y|).

    Params:
        x, y: Sequences of numbers. They are expected to have equal length;
            the dot product only covers the common prefix because of ``zip``.
    Return:
        The cosine similarity, or 0.0 when either vector has zero norm
        (the ratio is undefined there; this avoids a division by zero).
    """
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_root(x) * square_root(y)
    if denominator == 0:
        # A zero-norm vector has no direction; report "no similarity" instead of dividing by zero.
        return 0.0
    return numerator / float(denominator)

In [36]:
# The two Russian-language definitions of ecology that are compared below.
# In the BLEU/GLEU calls s1's tokens are passed as the reference and s0's as the hypothesis.
s0 = 'Экология - наука, изучающая взаимодействие живых, а также организацию и функционирование биологических систем'
s1 = 'Экология - наука, изучающая живые организмы, их функционирование и внутривидовое и внешневидовое воздействие'

In [37]:
# r*: BERT tokens with separators trimmed; e*: sentence-level embedding (both returned by prep()).
r0, e0 = prep(s0)
r1, e1 = prep(s1)

In [38]:
# Run both sentences through the spaCy pipeline; t0/t1 are compared later with .similarity().
t0 = nlp(s0)
t1 = nlp(s1)

In [39]:
# Instance of NLTK's BLEU SmoothingFunction; its method0/method2 are passed as `smoothing_function` below.
# Note: this variable deliberately or not reuses the class's own name.
SmoothingFunction = nltk.translate.bleu_score.SmoothingFunction()

In [None]:
# Report every metric for the sentence pair. Each "x0-x0" row compares a sentence
# with itself and serves as the best-case baseline for the "x0-x1" row below it.

# BLEU with smoothing method0, then with smoothing method2.
print("r0-r0 bleu score: ", bleu.sentence_bleu([r0], r0, smoothing_function=SmoothingFunction.method0))
print("r0-r1 bleu score: ", bleu.sentence_bleu([r1], r0, smoothing_function=SmoothingFunction.method0))
print("__________________________________________")
print("r0-r0 bleu score: ", bleu.sentence_bleu([r0], r0, smoothing_function=SmoothingFunction.method2))
print("r0-r1 bleu score: ", bleu.sentence_bleu([r1], r0, smoothing_function=SmoothingFunction.method2))
print("__________________________________________")
print("r0-r0 gleu score: ", gleu.sentence_gleu([r0], r0))
print("r0-r1 gleu score: ", gleu.sentence_gleu([r1], r0))
print("__________________________________________")
# Raw Euclidean distance between the sentence embeddings (0 = identical).
print("e0-e0 Euclid distance:", np.linalg.norm(e0-e0))
print("e0-e1 Euclid distance:", np.linalg.norm(e1-e0))
print("__________________________________________")
# The same distance passed through map_function, so 1 = identical and values fall towards 0.
# Labelled as a similarity: this value is no longer a distance.
print("e0-e0 mapped Euclid similarity:", map_function(np.linalg.norm(e0-e0)))
print("e0-e1 mapped Euclid similarity:", map_function(np.linalg.norm(e1-e0)))
print("__________________________________________")
print("e0-e0 cosine-similarity:", cosine_similarity(e0,e0))
print("e0-e1 cosine-similarity:", cosine_similarity(e1,e0))
print("__________________________________________")
print("t0-t0 spacy similarity", t0.similarity(t0))
print("t0-t1 spacy similarity", t1.similarity(t0))

In [41]:
def validate_bleu_gleu(value, threshold=0.2):
  """
  Check a BLEU or GLEU score against a minimum threshold.

  Params:
      value: The score to check.
      threshold: The score must be strictly greater than this (default 0.2).
  Return:
      1 if value > threshold, otherwise 0.
  """
  if value > threshold:
    return 1
  return 0

def validate_euclid(value, threshold=0.3):
  """
  Check a mapped Euclidean-distance score against a minimum threshold.

  Params:
      value: The score to check; a number or a numeric string (it is
          converted with float() before the comparison).
      threshold: The score must be strictly greater than this (default 0.3).
  Return:
      1 if float(value) > threshold, otherwise 0.
  """
  if float(value) > threshold:
    return 1
  return 0

def validate_cosine(value, threshold=0.9):
  """
  Check a cosine similarity against a minimum threshold.

  Params:
      value: The cosine similarity to check.
      threshold: The similarity must be strictly greater than this (default 0.9).
  Return:
      1 if value > threshold, otherwise 0.
  """
  if value > threshold:
    return 1
  return 0

def validate_spacy(value, threshold=0.65):
  """
  Check a spaCy document similarity against a minimum threshold.

  Params:
      value: The similarity to check.
      threshold: The similarity must be strictly greater than this (default 0.65).
  Return:
      1 if value > threshold, otherwise 0.
  """
  if value > threshold:
    return 1
  return 0

In [None]:
# Recommend the answer only when every metric clears its threshold.
# All five checks are evaluated up front (list literal), then combined with all().
print(
    'Решение можно рекомендовать'
    if all([
        validate_bleu_gleu(bleu.sentence_bleu([r1], r0, smoothing_function=SmoothingFunction.method2)),
        validate_bleu_gleu(gleu.sentence_gleu([r1], r0)),
        # The mapped distance is passed as a string; validate_euclid converts it back with float().
        validate_euclid(str(map_function(np.linalg.norm(e1-e0)))),
        validate_cosine(cosine_similarity(e1,e0)),
        validate_spacy(t1.similarity(t0)),
    ])
    else 'Ответ неточный или требуется пояснение'
)