In [1]:
from tqdm import tqdm
from typing import List
import sacrebleu

In [2]:
def bleu(hypotheses, references, tokenize="13a"):
    """
    Raw corpus BLEU from sacrebleu (without tokenization)
    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :param tokenize: one of {'none', '13a', 'intl', 'zh', 'ja-mecab'}
    :return:
    """
    return sacrebleu.corpus_bleu(sys_stream=hypotheses,
                                 ref_streams=[references],
                                 tokenize=tokenize).score

In [3]:
!wget https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py

--2021-04-06 20:33:49--  https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10474 (10K) [text/plain]
Saving to: 'generate_transformers.py'


2021-04-06 20:33:50 (6.01 MB/s) - 'generate_transformers.py' saved [10474/10474]



In [4]:
!wget https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/pretrain_transformers.py

--2021-04-06 20:33:51--  https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/pretrain_transformers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34150 (33K) [text/plain]
Saving to: 'pretrain_transformers.py'


2021-04-06 20:33:51 (572 KB/s) - 'pretrain_transformers.py' saved [34150/34150]



In [3]:
!export CUDA_HOME=/usr/local/cuda
#!git clone https://github.com/NVIDIA/apex

In [4]:
from generate_transformers import *

class Args:
    def __init__(self):
        self.model_type = 'gpt2'
        self.model_name_or_path = 'sberbank-ai/rugpt3large_based_on_gpt2'

        self.prompt = ''
        self.length = 50
        self.stop_token = '</s>'

        self.k = 5
        self.p = .95
        self.temperature = 1

        self.repetition_penalty = 1
        self.num_return_sequences = 1

        self.device='cuda'
        self.seed=42

In [5]:
def generate_sequences(prompt_text, args, delimiter='>>>'):
    args.prompt_text = prompt_text

    
    if prompt_text.endswith('.txt'):
      with open(prompt_text, 'r') as f:
        prompt_text = f.read()

    # print(f'Input:\n{prompt_text}\n')
    
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(args.device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=args.length + len(encoded_prompt[0]),
        temperature=args.temperature,
        top_k=args.k,
        top_p=args.p,
        repetition_penalty=args.repetition_penalty,
        do_sample=True,
        num_return_sequences=args.num_return_sequences,
    )

    if len(output_sequences.shape) > 2:
            output_sequences.squeeze_()

    generated_sequences = []
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        text = text[: text.find(args.stop_token) if args.stop_token else None]
        text = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]

        if delimiter in text:
            text = text.split(delimiter)[0].rstrip()
        else:
            text = text.split('\n')[0].rstrip()

        generated_sequences.append(text)
        # print(f'[{generated_sequence_idx}]ruGPT:\n{prompt_text.split('\n')[-1] + text}')

    return generated_sequences

In [14]:
!python3 -m pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.6.0-cp38-cp38-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 1.1 MB/s eta 0:00:01
[?25hCollecting tensorflow<2.7,>=2.6.0
  Downloading tensorflow-2.6.0-cp38-cp38-manylinux2010_x86_64.whl (458.4 MB)
[K     |████████████████████████████████| 458.4 MB 45 kB/s  eta 0:00:012
[?25hCollecting tensorflow-hub>=0.8.0
  Downloading tensorflow_hub-0.12.0-py2.py3-none-any.whl (108 kB)
[K     |████████████████████████████████| 108 kB 10.0 MB/s eta 0:00:01
[?25hCollecting six~=1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting numpy~=1.19.2
  Downloading numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 10.1 MB/s eta 0:00:01
[?25hCollecting tensorboard~=2.6
  Downloading tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 13.4 MB/s eta 0:00:01
[?25hCollecting termcolor~=1.1.0
  Down

In [7]:
import tensorflow_text
import tensorflow_hub as hub

In [8]:
def compute_use(target_comment, generated_comments):
    target_comment = embed(list([target_comment]))
    generated_comments = list(map(embed, generated_comments))

    return [np.inner(target_comment, gc)[0][0] for gc in generated_comments]


def compare_results(source_comment, target_comment, generated_comments, scores):
    print(f'Toxic : {source_comment}')
    print(f'Polite: {target_comment}\n')

    print(f'Score  Generated Comment')
    for i in np.argsort(scores):
        print(np.round(scores[i], 3), generated_comments[i])


embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

10/21/2021 11:04:13 - INFO - absl -   Using /tmp/tfhub_modules to cache modules.
2021-10-21 11:04:13.518890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-21 11:04:13.520201: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-10-21 11:04:13.520269: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2021-10-21 11:04:13.520338: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic 

In [9]:
args = Args()

model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = model_class.from_pretrained(args.model_name_or_path)
model.to(args.device)

10/21/2021 07:23:50 - INFO - filelock -   Lock 140134280355024 acquired on /root/.cache/huggingface/transformers/3a4aea75af518e10b02aac733b25c92d979cec74d3b48edb877e13d5e4d4792f.aa4141df4e4cfca5435e8aa371aea7f575eb2d0767c1395e0d4cd6209796a705.lock


Downloading:   0%|          | 0.00/609 [00:00<?, ?B/s]

10/21/2021 07:23:51 - INFO - filelock -   Lock 140134280355024 released on /root/.cache/huggingface/transformers/3a4aea75af518e10b02aac733b25c92d979cec74d3b48edb877e13d5e4d4792f.aa4141df4e4cfca5435e8aa371aea7f575eb2d0767c1395e0d4cd6209796a705.lock
10/21/2021 07:23:53 - INFO - filelock -   Lock 140133714131744 acquired on /root/.cache/huggingface/transformers/7cf248f6c39196b677fb5b9b9ee8da3ded29f363018f4ccb1d0605721c719f4c.75e651cd6468a93822a2ca422a07b480dacd0c2d13ac194fdf771f768e6a8447.lock


Downloading:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

10/21/2021 07:24:15 - INFO - filelock -   Lock 140133714131744 released on /root/.cache/huggingface/transformers/7cf248f6c39196b677fb5b9b9ee8da3ded29f363018f4ccb1d0605721c719f4c.75e651cd6468a93822a2ca422a07b480dacd0c2d13ac194fdf771f768e6a8447.lock


KeyboardInterrupt: 

In [None]:
args.length = 200
args.num_return_sequences = 10

args.k = 3
args.p = .5
args.temperature = 5


args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)

In [25]:
delimiter = " >>>> "

train_ky = []
train_ru = []
train_data = []
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/train.tok.be', 'r') as f:
    for line in f:
        line = line.strip()
        train_ky.append(line)   
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/train.tok.ru', 'r') as f:
    for line in f:
        line = line.strip()
        train_ru.append(line)
for i in range(len(train_ky)):
    line = train_ky[i] + delimiter + train_ru[i]
    train_data.append(line)
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/gpt_data/train.txt', 'w') as f:
    f.write("\n".join(train_data[:100000]))
    
    
    

valid_ky = []
valid_ru = []
valid_data = []
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/test.tok.be', 'r') as f:
    for line in f:
        line = line.strip()
        valid_ky.append(line)   
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/test.tok.ru', 'r') as f:
    for line in f:
        line = line.strip()
        valid_ru.append(line)
for i in range(len(valid_ky)):
    line = valid_ky[i] + delimiter + valid_ru[i]
    valid_data.append(line)
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/gpt_data/valid.txt', 'w') as f:
    f.write("\n".join(valid_data))
    

In [11]:
!python3 pretrain_transformers.py \
    --line_by_line \
    --output_dir=gpt_translation_model \
    --model_type=gpt2 \
    --model_name_or_path=sberbank-ai/rugpt3medium_based_on_gpt2 \
    --do_train \
    --train_data_file=/mnt/DATA2/grashchenkov/Checheny-Russki/sah_ru_data/gpt_data/train.txt \
    --fp16 \
    --eval_data_file=/mnt/DATA2/grashchenkov/Checheny-Russki/sah_ru_data/gpt_data/valid.txt \
    --per_gpu_train_batch_size 5 \
    --gradient_accumulation_steps 1 \
    --num_train_epochs 25 \
    --block_size 2048 \
    --overwrite_output_dir

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
10/11/2021 14:10:25 - INFO - __main__ -   Training/evaluation parameters Namespace(adam_epsilon=1e-08, block_size=2048, cache_dir=None, config_name=None, device=device(type='cuda'), do_eval=False, do_train=True, eval_all_checkpoints=False, eval_data_file='/mnt/DATA2/grashchenkov/Checheny-Russki/sah_ru_data/gpt_data/valid.txt', evaluate_during_training=False, fp16=True, fp16_opt_level='O1', gradient_accumulation_steps=1, learning_rate=5e-05, line_by_line=True, local_rank=-1, logging_steps=500, max_grad_norm=1.0, max_steps=-1, mlm=False, mlm_probability=0.15, model_name_or_path='sberbank-ai/rugpt3medium_based_on_gpt2', model_type='gpt2', n_gpu=1, no_cuda=False, num_train_epochs=25.0, output_dir='gpt_translation_model', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=4, per_gpu_train_batch_size=5, save_steps=500, save_total_limit=None, seed=42, se

In [9]:
args = Args()
args.model_name_or_path = 'gpt_translation_model'

model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = model_class.from_pretrained(args.model_name_or_path)
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
     

In [10]:
args.length = 50
args.num_return_sequences = 3

args.k = 3
args.p = .5
args.temperature = 5.2


args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)

In [11]:
delimiter = '>>>>'

generated_sequences = generate_sequences('Мне пара ісці спаць .' + f' {delimiter} ', args)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [12]:
generated_sequences

[' Мне пора идти спать.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad',
 ' Мне пора идти спать.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad',
 ' Мне пора идти спать.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad']

In [13]:
test_ky = []
test_ru = []
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/test.tok.be', 'r') as f:
    for line in f:
        line = line.strip()
        test_ky.append(line)   
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/test.tok.ru', 'r') as f:
    for line in f:
        line = line.strip()
        test_ru.append(line)
        

In [14]:
translated_ru = []
for i in tqdm(range(len(test_ky))):
        generated_sequences = generate_sequences(test_ky[i] + f' {delimiter} ', args)
        translation = generated_sequences[1].strip().split('<pad>')[0]
        translated_ru.append(translation.replace('.','').replace('_',''))

  0%|          | 0/255 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/255 [00:00<03:11,  1.32it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 2/255 [00:01<03:32,  1.19it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 3/255 [00:02<03:18,  1.27it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 4/255 [00:03<03:10,  1.32it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 5/255 [00:03<03:06,  1.34it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 6/255 [00:04<03:05,  1.34it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 7/255 [00:05<03:03,  1.35it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 8/255 [00:05<03:00,  1.37it/s]Setting `

In [65]:
translated_ru[:10]

['Все говорят, что нет денег, а ты возьми и купи слана',
 'Это хамство! Дайте мне книгу!',
 'Она живёт на улице Будановцев',
 'Она живёт на улице Будавинко',
 'Дайте, пожалуйста, гуляш с картофелем',
 '',
 'Я купила два билета на поезд до Молодечно на львовском поезде',
 'Я купила два билета до Молодечно на львовском поезде',
 'Паспорт выдан Молодечненским РОВД Минской области 12 октября 2015 года и настоящий является 12 октября 2015 года',
 'Этот сервер доступен только через наш VPN']

In [16]:
test_ru[:10]

['Все говорят , что у них нет денег , а ты возьми и купи слона .',
 'Это хамство ! Дайте мне книгу жалоб !',
 'Она живёт на улице Будавников .',
 'Она живёт на улице Строителей .',
 'Дайте , пожалуйста , гуляш с картошкой .',
 'Два билета до Минска , пожалуйста .',
 'Я купила два билета до Молодечно на вильнюсском поезде .',
 'Я купила два билета до Молодечно на вильнюсском поезде .',
 'Паспорт выдан Молодечненским РОВД Минской области 12 марта 2015 года и действителен до 12 марта 2025 года .',
 'Этот сервер доступен только через наш VPN .']

In [66]:
sacrebleu.corpus_bleu(translated_ru[:], [test_ru[:]]).score

35.908630027379814

In [27]:
P,R,F1 = score(translated_ru, test_ru, lang='ru', verbose=True)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 0.30 seconds, 839.90 sentences/sec




In [28]:
P.mean()

tensor(0.8276)

In [29]:
R.mean()

tensor(0.8244)

In [30]:
F1.mean()

tensor(0.8257)

In [45]:
#Moses

In [21]:
moses_predict = []
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/TEST_TRANSLATED') as f:
    for line in f:
        line = line.strip()
        moses_predict.append(line)  
        
        
moses_true = []
with open('/mnt/DATA2/grashchenkov/Checheny-Russki/be_ru/test.tok.ru') as f:
    for line in f:
        line = line.strip()
        moses_true.append(line)  

In [22]:
moses_predict[:10]


['Все говорят , что не имеют денег , а ты вазьмі и купи слона .',
 'Это хамства ! Дайте мне книгу скарг !',
 'Она живёт на улице Будаўнікоў .',
 'Она живёт на улице Будаўнікоў .',
 'Дайте , пожалуйста , гуляш с бульбай .',
 'Два билеты к Мінска , пожалуйста .',
 'Я купила два билеты к Молодечно на вільнюсскім поезде .',
 'Я купила два билеты к Молодечно на вільнюсскім поезде .',
 'Паспорт выдан Маладзечанскім РОВД Минской области двенадцать марта 2015 года и настоящий к двенадцать марта 2025 года .',
 'Этот сервер даступны только из-за наш VPN .']

In [23]:
moses_true[30:40]


['Моя цель — сделать так , чтобы вы свободно заговорили на португальском языке и чтобы это было весело .',
 'С твоего позволения .',
 'Груша цвела последний год .',
 'Мы с Петром часто ходим в кино .',
 'Вы ему ничего не сказали ?',
 'Франциск Скорина из Полоцка — восточнославянский первопечатник . Он принёс предкам белорусов , украинцев и русских технологию книги в 1517 году .',
 'Хорошо . Продолжим !',
 'Почему он поехал в Алжир ?',
 'Этот сад красивый .',
 'Протоны имеют положительный электрический заряд .']

In [31]:
sacrebleu.corpus_bleu(moses_predict[:], [moses_true[:]]).score




33.97506862176753

In [32]:
P,R,F1 = score(moses_predict, moses_true, lang='ru', verbose=True)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 0.26 seconds, 999.19 sentences/sec


In [33]:
P.mean()

tensor(0.8840)

In [34]:
R.mean()

tensor(0.8961)

In [35]:
F1.mean()

tensor(0.8899)

In [None]:
# Bert BlUE

In [36]:
from bert_embedding import BertEmbedding
import math
bert_E = BertEmbedding()

In [37]:
def token_list(embeddings, no_sep=False):
    """
    Returns with the tokens of the embedding data from the BertEmbedding.

    Params:
        embeddings: The embedding data from BertEmbedding
        no_sep: If True, the separators are trimmed.
    Return:
        tokens: list of tokens
    """
    if no_sep:
        return embeddings[0][0][1:-1]
    return embeddings[0][0]


def sentence_embs(embeddings):
    """Return with the sentence level embeddings"""
    return embeddings[0][1][0]

def prep(sentence):
    """Return with tokens and sentence level embeddings"""
    embs = bert_E([sentence])
    tokens = token_list(embs, no_sep=True)
    se = sentence_embs(embs)
    return tokens, se

def cosine_similarity(x,y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x)*square_rooted(y)
    return numerator/float(denominator)

def square_rooted(x):
    return math.sqrt(sum([a*a for a in x]))

In [38]:
s0 = "James Cook was a very good man and a loving husband."
s1 = "James Cook was a very nice man and a loving husband."
s2 = "James Cook was a bad man and a terrible husband."
s3 = "James Cook was a nice person and a good husband."
s4 = "The sky is blue today and learning history is important."

In [39]:
r0, e0 = prep(s0)
r1, e1 = prep(s1)
r2, e2 = prep(s2)
r3, e3 = prep(s3)
r4, e4 = prep(s4)

In [40]:
import nltk
SmoothingFunction = nltk.translate.bleu_score.SmoothingFunction()
import nltk.translate.bleu_score as bleu
from  nltk.translate.bleu_score import corpus_bleu 

In [41]:
print("r0-r0 bleu score: ", bleu.sentence_bleu([r0], r0, smoothing_function=SmoothingFunction.method2))
print("r0-r1 bleu score: ", bleu.sentence_bleu([r1], r0, smoothing_function=SmoothingFunction.method2))
print("r0-r2 bleu score: ", bleu.sentence_bleu([r2], r0, smoothing_function=SmoothingFunction.method2))
print("r0-r3 bleu score: ", bleu.sentence_bleu([r3], r0, smoothing_function=SmoothingFunction.method2))
print("r0-r4 bleu score: ", bleu.sentence_bleu([r4], r0, smoothing_function=SmoothingFunction.method2))

r0-r0 bleu score:  1.0
r0-r1 bleu score:  0.6999271023161167
r0-r2 bleu score:  0.3475075148610631
r0-r3 bleu score:  0.29697089145035693
r0-r4 bleu score:  0.10855926040543844


In [42]:
print("e0-e0 cosine-similarity:", cosine_similarity(e0,e0))
print("e0-e1 cosine-similarity:", cosine_similarity(e1,e0))
print("e0-e2 cosine-similarity:", cosine_similarity(e2,e0))
print("e0-e3 cosine-similarity:", cosine_similarity(e3,e0))
print("e0-e4 cosine-similarity:", cosine_similarity(e4,e0))

e0-e0 cosine-similarity: 1.0
e0-e1 cosine-similarity: 0.9900622593588156
e0-e2 cosine-similarity: 0.965961241983015
e0-e3 cosine-similarity: 0.9760124353647561
e0-e4 cosine-similarity: 0.30749654649663904


In [46]:
gpt_cosines = []
moses_cosines = []
for i in tqdm(range(len(translated_ru[:]))):#
    try:
        gpt_r0, gpt_e0 = prep(translated_ru[i])
        test_r0, test_e0 = prep(test_ru[i])
        moses_r0, moses_e0 = prep(moses_predict[i])

        gpt_cosines.append(cosine_similarity(test_e0,gpt_e0))
        moses_cosines.append(cosine_similarity(test_e0,moses_e0))
    except:
        pass

100%|██████████| 255/255 [00:58<00:00,  4.36it/s]


In [47]:
np.mean(moses_cosines)

0.9563928450112729

In [48]:
np.mean(gpt_cosines)

0.9208955715238076

In [93]:
# LaBSE

In [49]:
from sentence_transformers import SentenceTransformer

In [61]:
# sentences = ["This is an example sentence", "Each sentence is converted"]
# model = SentenceTransformer('sentence-transformers/LaBSE')
# #embeddings = model.encode(sentences)

# gpt_cosines_labse = []
# moses_cosines_labse = []
# for i in tqdm(range(260)):#len(translated_ru))
#     try:
#         gpt_embed = model.encode(translated_ru[i])
#         test_embed = model.encode(test_ru[i])
#         moses_embed = model.encode(moses_predict[i])

#         gpt_cosines.append(cosine_similarity(test_embed,gpt_embed))
#         moses_cosines.append(cosine_similarity(test_embed,moses_embed))
#     except:
#         pass

In [62]:
np.mean(moses_cosines)


0.9509852994308419

In [63]:
np.mean(gpt_cosines)

0.8515383112034733

In [55]:
from bert_score import score

In [56]:
original = 'My mother washed the car'
s_1 = 'Моя мать мыла машину'
s_2 = 'My mother мыла car'

orig_embed = model.encode(original)
s1_embed = model.encode(s_1)
s2_embed = model.encode(s_2)

print('Orig vs s1',cosine_similarity(orig_embed,s1_embed))
print('Orig vs s2',cosine_similarity(orig_embed,s2_embed))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Orig vs s1 0.9252627695789337
Orig vs s2 0.9089925800423423


In [57]:
P, R, F1 = score([s_1], [original], lang='en', verbose=True)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.19 sentences/sec


In [58]:
P, R, F1

(tensor([0.6758]), tensor([0.8634]), tensor([0.7582]))

In [59]:
P, R, F1 = score([s_2], [original], lang='en', verbose=True)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 27.52 sentences/sec


In [60]:
P, R, F1

(tensor([0.7891]), tensor([0.9188]), tensor([0.8491]))