In [1]:
import os
import re
import sys
import Levenshtein
import openai
import torch
import torch.nn as nn
import math
import copy
import time
import pandas as pd
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
import torch.distributed as dist
import torch.multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt


from transformers import MarianMTModel, MarianTokenizer, GPT2Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from nltk.tokenize import sent_tokenize, LineTokenizer
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from pathlib import Path
from torch.nn.functional import log_softmax, pad
from Levenshtein import distance
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from translate.storage.tmx import tmxfile
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization, UtilityFunction
import warnings


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True



In [2]:
#szybkie sprawdzenie czy model działa

model_checkpoint = "Helsinki-NLP/opus-mt-en-de"
translator = pipeline("translation", model=model_checkpoint)
translator("How are you?")

[{'translation_text': 'Wie geht es dir?'}]

In [3]:
#przygotowanie plików .txt z korpusu równoległego .tmx

with open("de-en.tmx", 'rb') as fin:
    tmx_file = tmxfile(fin, 'en', 'de')
file_de = open("de_origin/de.txt",'w', encoding="utf-8")
file_en = open("en_origin/en.txt",'w', encoding="utf-8")

for node in tmx_file.unit_iter():
    file_en.write(node.target + '\n')
    file_de.write(node.source + '\n')
    # print(node.target, node.source)
    
file_de.close()
file_en.close()

In [10]:
#sprawdzenie czy pliki zostały odpowiednio utworzone

with open("en_origin/en.txt", encoding="utf-8") as myfile:
    head = [next(myfile) for x in range(3)]
print(head)

with open("de_origin/de.txt", encoding="utf-8") as myfile:
    head = [next(myfile) for x in range(3)]
print(head)

['ANNUAL ACTIVITY REPORT 2007 EUROPEAN COURT OF AUDITORS \n', 'Luxembourg: Office for Official Publications of the European Communities, 2008 ISBN 978-92-9207-005-2 © European Communities, 2008 Reproduction is authorised provided the source is acknowledged. \n', '3 4–5 6–7 8–13 14–17 18–25 26–27 28–29 30–35 36–37 38–43 44–45 PRESIDENT’S FOREWORD MISSION, VISION, VALUES AND STRATEGIC OBJECTIVES THE COURT’S ROLE AND WORK GOVERNANCE AND ORGANISATION OVERVIEW OF AUDIT REPORTS AND OPINIONS FOLLOW-UP AND IMPACT THE COURT’S VIEW THE COURT’S WORK IN 2007 AND BEYOND INTERNATIONAL COOPERATION HUMAN RESOURCES FINANCIAL INFORMATION CONTENTS \n', 'PRESIDENT’S FOREWORD 4 Bringing the European Court of Auditors closer to EU citizens is one of our key objectives and part of our mission to promote transparency and accountability. \n', 'I, therefore, take great pleasure in welcoming you to the first annual activity report of the European Court of Auditors. \n', 'It provides an overview of the Court and 

In [11]:
#ograniczenie wielkości dostępnego korpusu do 10 tys. zdań w celu przyśpieszenia obliczeń.

with open("de-en.tmx", 'rb') as fin:
    tmx_file = tmxfile(fin, 'en', 'de')
file_de = open("de_origin/de_10000.txt",'w', encoding="utf-8")

i = 0
for node in tmx_file.unit_iter():
    if(i<10000):
        file_de.write(node.source + '\n')
    i+=1
    
file_de.close()

In [10]:
###Direct translation to English

#przygotowanie modelu 

model_checkpoint = "Helsinki-NLP/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 
#zainicjowanie CUDA
    
dev = "cuda:0"
device = torch.device(dev)
model.to(device)

#odpowiednio przygotowane parametry modeli
args = [
        'model.generate(**model_inputs)', 
        'model.generate(**model_inputs, do_sample=True, top_k=0)',
        'model.generate(**model_inputs, do_sample=True, top_k=0, temperature=1)',
        'model.generate(**model_inputs, do_sample=True, top_k=50)',
        'model.generate(**model_inputs, do_sample=True, top_k=6)',
        'model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.40)',
        'model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92)',
        'model.generate(**model_inputs, do_sample=True, top_k=0, repetition_penalty=0.5)',
        'model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92, repetition_penalty=0.5)'
       ]

#nazwy plików zapisowych
titles = [
          'no_params',
          'sample',
          'temperature', 
          'top_k50',
          'top_k6',
          'top_p40',
          'top_p92',
          'repetition_penalty05',
          'repetition_penalty_top_k6'
         ]

for arg,title in zip(args,titles):
    print(arg, title)

    if Path("Translation\Auto_"+title+".txt").is_file():
        print("File exists: " + "Translation\Auto_"+title+".txt \n")
    else:
        print("No file: " + "Translation\Auto_"+title+".txt")
        file = open("Translation\Auto_"+title+".txt",'w', encoding="utf-8")
        with open("de-en.tmx", 'rb') as fin:
            tmx_file = tmxfile(fin, 'en', 'de')

        i = 0
        for node in tmx_file.unit_iter():
            if i == 10000:
                break

            model_inputs = tokenizer(node.source, return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
            translated = eval(arg)
            for t in translated:
                if(i%100==0):
                    print(repr(i) + " " + tokenizer.decode(t, skip_special_tokens=True) + '\n')
                file.write(tokenizer.decode(t, skip_special_tokens=True) + '\n') 
            i+=1


        file.close()

model.generate(**model_inputs) no_params
File exists: Translation\Auto_no_params.txt 

model.generate(**model_inputs, do_sample=True, top_k=0) sample
File exists: Translation\Auto_sample.txt 

model.generate(**model_inputs, do_sample=True, top_k=0, temperature=1) temperature
File exists: Translation\Auto_temperature.txt 

model.generate(**model_inputs, do_sample=True, top_k=50) top_k50
File exists: Translation\Auto_top_k50.txt 

model.generate(**model_inputs, do_sample=True, top_k=6) top_k6
File exists: Translation\Auto_top_k6.txt 

model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.40) top_p40
File exists: Translation\Auto_top_p40.txt 

model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92) top_p92
File exists: Translation\Auto_top_p92.txt 

model.generate(**model_inputs, do_sample=True, top_k=0, repetition_penalty=0.5) repetition_penalty05
File exists: Translation\Auto_repetition_penalty05.txt 

model.generate(**model_inputs, do_sample=True, top_k=6, top_p=0

In [26]:
###BLEU for Single language translation

total = 10000

file_org = open("En_origin\en.txt",'r+', encoding="utf-8")
file_eu = open("En_origin\en_EU.txt",'r+', encoding="utf-8")
file_google = open("En_origin\en_google.txt",'r+', encoding="utf-8")

reference1 = [] 
reference2 = []
reference3 = []

for i in range (total):
    reference1.append(file_org.readline().split())
    reference2.append(file_eu.readline().split())
    reference3.append(file_google.readline().split())

for x,filename in enumerate(os.listdir(Path(os.getcwd() + "\Translation"))):
    with open(Path(os.getcwd() + "\Translation\\"+filename), 'r+', encoding="utf-8") as results:
        score = 0
        for i in range (total):
            translated = (results.readline().split())
            score += sentence_bleu([reference1[i],reference2[i],reference3[i]], translated)
        print(filename + ": BLEU = " + repr(score/total))
    results.close()
file_org.close()
file_eu.close()
file_google.close()

Auto_no_params.txt: BLEU = 0.7150162205110255
Auto_repetition_penalty05.txt: BLEU = 0.6606883748361918
Auto_repetition_penalty_top_k6.txt: BLEU = 0.6843873512995613
Auto_sample.txt: BLEU = 0.6916898352809548
Auto_temperature.txt: BLEU = 0.6901345568236875
Auto_top_k50.txt: BLEU = 0.7016499119886753
Auto_top_k6.txt: BLEU = 0.7068425277875855
Auto_top_p40.txt: BLEU = 0.6813655959475835
Auto_top_p92.txt: BLEU = 0.7056663074717361


In [30]:
###Two languages translation

model_checkpoint_pl = "Helsinki-NLP/opus-mt-de-pl"
model_checkpoint_en = "Helsinki-NLP/opus-mt-pl-en"

tokenizerPl = AutoTokenizer.from_pretrained(model_checkpoint_pl)
modelPl = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_pl)

tokenizerEn = AutoTokenizer.from_pretrained(model_checkpoint_en)
modelEn = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_en)
 
dev = "cuda:0"
device = torch.device(dev)
modelPl.to(device)
modelEn.to(device)

argsPl = [
        'modelPl.generate(**model_inputs)', 
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=0)',
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=0, temperature=1)',
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=50)',
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=6)',
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.40)',
#        'modelPl.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92)',
        'modelPl.generate(**model_inputs, do_sample=True, top_k=0, repetition_penalty=0.5)',
        'modelPl.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92, repetition_penalty=0.5)'
       ]
argsEn = [
        'modelEn.generate(**model_inputs)', 
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=0)',
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=0, temperature=1)',
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=50)',
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=6)',
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.40)',
#        'modelEn.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92)',
        'modelEn.generate(**model_inputs, do_sample=True, top_k=0, repetition_penalty=0.5)',
        'modelEn.generate(**model_inputs, do_sample=True, top_k=6, top_p=0.92, repetition_penalty=0.5)'
       ]
titles = [
          'no_params',
#          'sample',
#          'temperature', 
#          'top_k50',
#          'top_k6',
#          'top_p40',
#          'top_p92',
          'repetition_penalty05',
          'repetition_penalty_top_k6'
         ]


for i in range(len(titles)):
    
    if Path("TranslationTwoLanguages\Marian_"+titles[i]+".txt").is_file():
        print("File exists: " + "TranslationTwoLanguages\Marian_"+titles[i]+".txt \n")
    else:
        print(argsPl[i], titles[i])

        file = open("TranslationTwoLanguages\Marian_"+titles[i]+".txt",'w', encoding="utf-8")
        with open("de-en.tmx", 'rb') as fin:
            tmx_file = tmxfile(fin, 'en', 'de')
            
        x = 0
        for node in tmx_file.unit_iter():
            if x == 10000:
                break

            model_inputs = tokenizerPl(node.source, return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
            translatedPl = eval(argsPl[i])

            textPl = ''
            for t in translatedPl:
                textPl += tokenizerPl.decode(t, skip_special_tokens=True)

            model_inputs = tokenizerEn(textPl, return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
            translatedEn = eval(argsEn[i])

            for t in translatedEn:
                file.write(tokenizerEn.decode(t, skip_special_tokens=True) + '\n') 
                if(x%100==0):
                    print(repr(x) + " " + tokenizerEn.decode(t, skip_special_tokens=True) + '\n')
            x+=1


        file.close()

File exists: TranslationTwoLanguages\Marian_no_params.txt 

File exists: TranslationTwoLanguages\Marian_repetition_penalty05.txt 

File exists: TranslationTwoLanguages\Marian_repetition_penalty_top_k6.txt 



In [33]:
###BLEU for Two languages translation
total = 10000



file_org = open("En_origin\en.txt",'r+', encoding="utf-8")
file_eu = open("En_origin\en_EU.txt",'r+', encoding="utf-8")
file_google = open("En_origin\en_google.txt",'r+', encoding="utf-8")

reference1 = [] 
reference2 = []
reference3 = []

for i in range (total):
    reference1.append(file_org.readline().split())
    reference2.append(file_eu.readline().split())
    reference3.append(file_google.readline().split())

for x,filename in enumerate(os.listdir(Path(os.getcwd() + "\TranslationTwoLanguages"))):
    with open(Path(os.getcwd() + "\TranslationTwoLanguages\\"+filename), 'r+', encoding="utf-8") as results:
        score = 0
        for i in range (total):
            translated = (results.readline().split())
            score += sentence_bleu([reference1[i],reference2[i],reference3[i]], translated)

        print(filename + ": BLEU = " + repr(score/total))

    results.close()
file_org.close()
file_eu.close()
file_google.close()

Marian_no_params.txt: BLEU = 0.5095654999135777
Marian_repetition_penalty05.txt: BLEU = 0.46046313477259965
Marian_repetition_penalty_top_k6.txt: BLEU = 0.47982806203191647


In [5]:
## https://github.com/fmfn/BayesianOptimization
#BayesianOptimization

model_checkpoint = "Helsinki-NLP/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 
dev = "cuda:0"
device = torch.device(dev)
model.to(device)

total = 100

file_org = open("En_origin\en.txt",'r+', encoding="utf-8")
file_eu = open("En_origin\en_EU.txt",'r+', encoding="utf-8")
file_google = open("En_origin\en_google.txt",'r+', encoding="utf-8")
file_de = open("De_origin/de_10000.txt", 'r+', encoding="utf-8")

to_translate = []
reference1 = []
reference2 = []
reference3 = []

for i in range (total):
    to_translate.append(file_de.readline().split("\n")[0])
    reference1.append(file_org.readline().split())
    reference2.append(file_eu.readline().split())
    reference3.append(file_google.readline().split())
    
def black_box_function(top_p=1,top_k=6,repetition_penalty=1):
    score = 0
    for i in range (total):
        model_inputs = tokenizer(to_translate[i], return_tensors="pt", padding=True, truncation=True, max_length=500).to(device)
        translated = model.generate(**model_inputs, do_sample=True, top_k=math.floor(top_k), top_p=top_p,repetition_penalty=repetition_penalty)
        for t in translated:
            translated = tokenizer.decode(t, skip_special_tokens=True).split()
        score += sentence_bleu([reference1[i],reference2[i],reference3[i]], translated)
    return(score/total)


pbounds = {#"top_p": [0.1, 1.0],
            "top_k": [1, 50],
          #  "repetition_penalty": [0.1, 1.0],
          #  "temperature": [0.1, 1.0]
          }

optimizer = BayesianOptimization(f = black_box_function,
                                     pbounds = pbounds, verbose = 2,
                                     random_state = 4)


optimizer.maximize(init_points = 5, n_iter = 10)

print(optimizer.max)

file_de.close()
file_org.close()
file_eu.close()
file_google.close()

|   iter    |  target   |   top_k   |
-------------------------------------
| [0m 1       [0m | [0m 0.7434  [0m | [0m 48.38   [0m |
| [0m 2       [0m | [0m 0.7362  [0m | [0m 27.81   [0m |
| [0m 3       [0m | [0m 0.7326  [0m | [0m 48.66   [0m |
| [0m 4       [0m | [0m 0.7304  [0m | [0m 36.03   [0m |
| [0m 5       [0m | [0m 0.7409  [0m | [0m 35.19   [0m |
| [0m 6       [0m | [0m 0.7251  [0m | [0m 48.76   [0m |
| [0m 7       [0m | [0m 0.732   [0m | [0m 48.37   [0m |
| [0m 8       [0m | [0m 0.7252  [0m | [0m 42.2    [0m |
| [0m 9       [0m | [0m 0.7334  [0m | [0m 35.54   [0m |
| [0m 10      [0m | [0m 0.737   [0m | [0m 20.67   [0m |
| [0m 11      [0m | [0m 0.7408  [0m | [0m 44.9    [0m |
| [0m 12      [0m | [0m 0.7398  [0m | [0m 42.62   [0m |
| [0m 13      [0m | [0m 0.7244  [0m | [0m 35.3    [0m |
| [0m 14      [0m | [0m 0.7398  [0m | [0m 26.81   [0m |
| [0m 15      [0m | [0m 0.7201  [0m | [0m 18.7    