# Config

In [None]:
import torch
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

model_checkpoint, batch_size = [
    [("google/t5-v1_1-small",200),("google/t5-v1_1-base",180),("google/t5-v1_1-large",70)],
    [("facebook/bart-base",300),("facebook/bart-large",40)],
    [("gpt2-medium",100),("gpt2-large",20)],
    [("tuner007/pegasus_paraphrase",100)],
    [("idm",100)],
    [("identity",100)]] [1][0]

#batch_size = math.ceil(batch_size/3) #force different batchsize if GPU not empty

model_name = model_checkpoint.split("/")[-1]
print('model: ',model_name)
dataset_name= ['para-1-1-small','para-1-1','idm-small','idm'][3]
print('dataset: ',dataset_name)
torch_device='cuda:0' if torch.cuda.is_available() else 'cpu'

learning_rate = 2e-05 #'-' for GPT or identity
weight_decay= 0.001   #'-' for GPT

model_output_name =f'{model_name}-finetuned-{dataset_name}-lr-{learning_rate}-wd-{weight_decay}'
model_path = f"/media/data3/proj_scisen/models/style/{model_output_name}"
dataset_path = '/media/nvme3n1/proj_scisen/datasets/'
output_path = '/media/data3/proj_scisen/results/'

In [None]:
#remove if converted to python file
# `%env` is an IPython line magic that sets an environment variable for the kernel
# process; it is not valid syntax in a plain .py script.
# NOTE(review): presumably a workaround for a protobuf incompatibility when loading
# the T5/BART tokenizers — confirm before removing.
if 't5' in model_name or 'bart' in model_name:
    %env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Load (fine-tuned) model


## Pegasus Paraphraser

Load the model and define a helper that generates sample outputs.

In [None]:
# Load the pretrained Pegasus paraphraser (tokenizer + model) straight from the
# configured checkpoint and move the model to the selected device.
# `tokenizer` and `model` are only created when a Pegasus checkpoint is configured.
if 'pegasus' in model_name:
    import torch
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer
    tokenizer = PegasusTokenizer.from_pretrained(model_checkpoint)
    model = PegasusForConditionalGeneration.from_pretrained(model_checkpoint)
    model.to(torch_device)

#create sample outputs for single sentences
def get_response(input_text,num_return_sequences,num_beams):
    """Generate rewrites of a single sentence with the globally loaded model.

    Uses the module-level `tokenizer`, `model` and `torch_device`.

    Args:
        input_text: The sentence to rewrite.
        num_return_sequences: Number of generated sequences to return.
        num_beams: Beam width passed to `model.generate`.

    Returns:
        The decoded generated sequences (special tokens removed), as returned by
        `tokenizer.batch_decode`.
    """
    # Input and output are both capped at 60 tokens.
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    ).to(torch_device)
    generation_options = dict(
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    generated = model.generate(**encoded, **generation_options)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

## IDM

In [None]:
# The IDM helpers below (`modify*`) need `random`, `Dataset`, `KeyDataset` and
# `unmasker`. The 'identity' configuration applies `modify` to the GYAFC sentences
# in the self-idm cell, so it needs this setup as well.
if 'idm' in model_name or 'identity' in model_name:
    import random
    from transformers import pipeline
    from transformers.pipelines.pt_utils import KeyDataset
    import datasets
    from datasets import Dataset

    # Pipeline device ids: 0 is the first GPU, -1 the CPU. Mirror the CPU fallback
    # used for `torch_device` instead of failing on machines without CUDA.
    unmasker = pipeline('fill-mask', model='bert-base-uncased',
                        device=0 if torch.cuda.is_available() else -1)

In [None]:
def modify_replace(sentences,replace=1,add_mask=True):#example: sentences=['Sentence one.','I am a sentence.']
    """Cut words out of each sentence at a random position and optionally refill the gap.

    Args:
        sentences: Sequence of sentences (strings); words are separated by single spaces.
        replace: Number of consecutive words removed at the chosen position
            (0 removes nothing, which turns the operation into an insertion).
        add_mask: If True, a `[MASK]` token is placed at the position and filled by
            the module-level fill-mask pipeline `unmasker`; if False the shortened
            sentences are returned as they are.

    Returns:
        A list with one modified sentence per input sentence.
    """
    edited = []
    for original in sentences:
        # The fill-mask pipeline accepts exactly one [MASK] per input, so a literal
        # "[MASK]" in the source text is downgraded to the ordinary word "MASK".
        words = original.replace('[MASK]', 'MASK').split(' ')

        position = random.randrange(0, len(words))
        # When a word is actually removed, redraw until the pick has at least four
        # characters — but only if the sentence contains such a word at all.
        while True:
            if len(words[position]) >= 4:
                break
            if replace <= 0:
                break
            if max(len(word) for word in words) <= 3:
                break
            position = random.randrange(0, len(words))

        gap = ['[MASK]'] if add_mask else []
        edited.append(' '.join(words[:position] + gap + words[position + replace:]))

    if not add_mask:
        return list(edited)

    # Batched unmasking of all sentences; the three best fillers are requested so
    # that there are fallbacks when the best one is rejected below.
    masked_dataset = Dataset.from_dict({"text": edited})
    predictions = unmasker(KeyDataset(masked_dataset, "text"), batch_size=1000, top_k=3)

    out_sentences = []
    for idx, candidates in enumerate(predictions):
        # Take the first of the two best candidates that differs from the input
        # and contains no apostrophe; otherwise settle for the third candidate.
        for rank in (0, 1):
            text = candidates[rank]['sequence']
            if sentences[idx] != text and '\'' not in text:
                chosen = text
                break
        else:
            chosen = candidates[2]['sequence']
        out_sentences.append(chosen)
    return out_sentences

In [None]:
def modify_delete(sentence):
    """Delete one randomly chosen word from every sentence of the batch.

    Thin wrapper around `modify_replace` that removes one word and skips the
    mask-filling step. Despite its name, `sentence` is the batch of sentences.
    """
    return modify_replace(sentence, replace=1, add_mask=False)

In [None]:
def modify_insert(sentence):
    """Insert one predicted word at a random position of every sentence of the batch.

    Thin wrapper around `modify_replace` that removes no word and lets the
    fill-mask model fill the inserted mask. Despite its name, `sentence` is the
    batch of sentences.
    """
    return modify_replace(sentence, replace=0, add_mask=True)

In [None]:
def modify(sentences):# sentences=['Sentence one.','I am a sentence.']
    """Apply a random number of replace/delete/insert edits to each sentence.

    Every sentence draws a change ratio from {0.0, 0.1, ..., 0.5}. Its change
    budget is `floor(word_count / 2) * ratio`, and the sentence takes part in round
    `k` (k = 0, 1, ...) as long as its budget is at least `k`. Each round applies
    one randomly chosen operation to all participating sentences.

    Args:
        sentences: Sequence of sentences (strings).

    Returns:
        Tuple `(modified, buckets)`: `modified` is a NumPy object array holding
        the edited sentences in input order, `buckets` a list of
        `(ratio, word_count)` tuples, one per input sentence.
    """
    # dtype=object is essential: a default string array has a fixed width equal to
    # the longest input, so any sentence growing beyond it is silently truncated.
    sentences = np.array(sentences, dtype=object)
    if len(sentences) == 0:
        return sentences, []

    buckets = [(random.randrange(0,6)/10 ,len(x.split(' '))) for x in sentences]
    changes_bucket = [math.floor(length/2)*changes for changes, length in buckets]

    # Size the number of rounds by the longest sentence of the batch (not just the
    # first one), so that long sentences can reach their budget.
    max_changes_all = math.ceil(max(length for _, length in buckets)/2)

    operations = (modify_replace, modify_delete, modify_insert)
    for changes in range(0,max_changes_all):
        #change all sentences that have not the required amount of changes for their bucket
        selected_sentences = np.array([bucket >= changes for bucket in changes_bucket], dtype=bool)
        if not selected_sentences.any():
            # Budgets are fixed, so no later round can select anything either.
            break
        operation = operations[random.randrange(0,3)]
        sentences[selected_sentences] = operation(sentences[selected_sentences])
    return sentences, buckets

## T5 or BART

In [None]:
if 't5' in model_name or 'bart' in model_name:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
    import torch
    import os
    import re

In [None]:
if 't5' in model_name or 'bart' in model_name:
    #get latest checkpoint
    def extract_number(f):
        """Sort key: (trailing integer of the name, name); names without one sort first."""
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        s = re.findall(r"\d+$",f)
        return (int(s[0]) if s else -1,f)
    # The entry of the model directory with the highest trailing number is used.
    latest_checkpoint = max(os.listdir(model_path),key=extract_number)
    print(latest_checkpoint)
    # From here on `model_checkpoint` points at the fine-tuned checkpoint directory
    # instead of the hub id chosen in the config cell.
    model_checkpoint=model_path+'/'+latest_checkpoint

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    model.to(torch_device)
    model.num_parameters() #reduces output

## GPT 2

In [None]:
if 'gpt' in model_name:
    from transformers import pipeline, set_seed
    # Pipeline device ids: 0 is the first GPU, -1 the CPU. Mirror the CPU fallback
    # used for `torch_device` instead of failing on machines without CUDA.
    # Note that the pipeline always loads the plain 'gpt2' checkpoint.
    generator = pipeline('text-generation',model='gpt2',
                         device=0 if torch.cuda.is_available() else -1)

In [None]:
# Manual prompt experiments: print the first GYAFC sentences followed by the GPT-2
# continuation for a given task prefix.
# NOTE: `gyafc` is only created in the "GYAFC" data-loading cell further down, so
# this cell cannot run before that cell has been executed.
if 'gpt' in model_name:
    set_seed(42)
    entry_for_manual_testing = 10
    for input_text in gyafc['f_r']['text'][:entry_for_manual_testing]: #,dataset['test']['text'][entry_for_manual_testing]
        print(input_text)
        #task_prefix = "scientific version: " # 2 of 10 were rephrasings/usable
        #task_prefix = "more scientific: " #0,0!
        #task_prefix = "write scientific text: " # 4 of 10
        #task_prefix = "write as a scientific sentence: " # 2 of 10
        #task_prefix = "In scientific language, " # 4 of 10
        task_prefix = "Write in shakespeare language: "
        res = generator(task_prefix + input_text , max_length = 45, num_return_sequences = 1)
        for x in res:
            # Strip the prompt, drop line breaks and keep only the text up to the first '.'.
            print(x['generated_text'].replace(task_prefix+input_text,'').replace('\n','').split('.')[0])
        print()

# Define output function

In [None]:
def print_output(results):
    """Format the metric results as one LaTeX table row, print it and log it.

    The row contains, separated by ` & `: model name, learning rate, weight decay,
    BLEU, self-BLEU, METEOR and mean BERTScore F1 (all scaled by 100 and rounded to
    two decimals) and the natural log of the mean perplexity (three decimals).
    Model name and hyperparameters are read from the module-level configuration.

    Args:
        results: Mapping as returned by `calculate_all_metrics`, with the keys
            'score_bleu', 'score_self_bleu', 'score_meteor', 'score_bertscore'
            and 'perplexity'.

    Returns:
        The formatted row, which is also appended to `eval_results_style.txt` in
        the current working directory.
    """
    columns = [
        model_name,
        str(learning_rate),
        str(weight_decay),
        str(round(results['score_bleu']['bleu']*100,2)),
        str(round(results['score_self_bleu']['bleu']*100,2)),
        str(round(results['score_meteor']['meteor']*100,2)),
        str(round(np.mean(results['score_bertscore']['f1'])*100,2)),
        str(round(math.log(results['perplexity']['mean_perplexity']),3)),
    ]
    res_string = '\n' + ' & '.join(columns) + ' \\\\\n'
    print(res_string)
    # Context manager so the handle is closed even if the write fails.
    with open("eval_results_style.txt", "a") as f:
        f.write(res_string)
    return res_string

# Load data
We only consider the Family & Relationships domain because model outputs are available for it, which makes the results easy to compare.

## GYAFC

In [None]:
#load datasets to test metrics
from datasets import load_from_disk, load_dataset, DatasetDict, concatenate_datasets
# Informal test sentences; the "text" loader exposes them as column 'text' of a
# 'train' split with one row per line of the file.
fr_informal = load_dataset("text",data_files=f'{dataset_path}GYAFC_Corpus/Family_Relationships/test/informal')

# Attach the four formal reference rewrites as columns 'ref0' ... 'ref3'.
for rewriter in range(4):
    load = load_dataset("text",data_files=f'{dataset_path}GYAFC_Corpus/Family_Relationships/test/formal.ref{rewriter}')['train']['text']
    fr_informal['train'] = fr_informal['train'].add_column(f'ref{rewriter}', load)

# Attach the model outputs stored in the corpus' model_outputs directory, one column per system.
model_outputs = ['nmt_baseline','nmt_combined','nmt_copy','pbmt','rule_based']
for model_output in model_outputs:
    load = load_dataset("text",data_files=f'{dataset_path}GYAFC_Corpus/Family_Relationships/model_outputs/formal.{model_output}')['train']['text']
    fr_informal['train'] = fr_informal['train'].add_column(f'{model_output}', load)

# Attach the outputs stored in the tst_survey directory. Only the last
# len(test set) lines of each file are kept.
# NOTE(review): presumably these files contain more lines than the test set, with the
# matching ones at the end — confirm against the file layout.
model_outputs = ['dast-c','dualRL','drlst']
for model_output in model_outputs:
    load = load_dataset("text",data_files=f'{dataset_path}GYAFC_Corpus/tst_survey/{model_output}')['train']['text']
    fr_informal['train'] = fr_informal['train'].add_column(f'{model_output}', load[len(load)-len(fr_informal['train']['text']):])

# Expose everything under the single split name 'f_r'.
gyafc = DatasetDict({'f_r':fr_informal['train']})
gyafc

In [None]:
# Spot check of the loaded data. Only the last expression of a cell is displayed, so
# the list of the four references of entry `entrie` is computed but not shown.
entrie = 5
[gyafc['f_r'][entrie].get(key) for key in ['ref0','ref1','ref2','ref3']]
gyafc['f_r']['drlst'][:3]

In [None]:
# First three entries of the first reference column, for visual comparison with the cell above.
gyafc['f_r']['ref0'][:3]

## Dataset idm or pegasus

In [None]:
from datasets import load_from_disk
dataset = load_from_disk(f'{dataset_path}style/{dataset_name}')

# The rest of the notebook always reads the model input from the column 'para-1-1'.
# For the IDM datasets the corresponding column is renamed accordingly: 'idm' for
# the small variant, 'idmBucket' (the bucketed variant) for the full dataset.
# idmBucket is only generated for the test split, so only that split is renamed.
if 'idm' in dataset_name:
    idm_source_column = 'idm' if 'idm-small' in dataset_name else 'idmBucket'
    dataset['test'] = dataset['test'].rename_column(idm_source_column, 'para-1-1')
    dataset = dataset.shuffle(seed=42)

# Self-assignment kept from the original cell; it does not change the data.
dataset['test'] = dataset['test']

In [None]:
# Display the splits and columns of the loaded dataset.
dataset

# Load metrics

In [None]:
import nltk
from datasets import load_metric
#https://huggingface.co/metrics

def calculate_all_metrics(model_input,model_output,model_reference):
    """Compute BLEU, self-BLEU, METEOR, BERTScore and perplexity for model outputs.

    All texts are lower-cased. BLEU/METEOR work on tokens obtained by splitting on
    single spaces; BERTScore works on the raw strings cut to 512 characters.

    Args:
        model_input: List of input sentences (strings); reference for self-BLEU.
        model_output: List of generated sentences (strings), aligned with `model_input`.
        model_reference: List of lists of reference sentences, one list per entry.

    Returns:
        Dict with the keys 'score_bleu', 'score_self_bleu', 'score_meteor',
        'score_bertscore' and 'perplexity', each holding the result of the
        corresponding metric's `compute()`.
    """
    metric_bleu = load_metric("bleu")
    metric_self_bleu = load_metric("bleu")
    metric_meteor = load_metric("meteor")
    metric_bertscore = load_metric("bertscore")
    metric_ppl = load_metric("perplexity")

    for entry in range(len(model_input)):
        x_in = model_input[entry].lower().split(' ')
        x_out = model_output[entry].lower().split(' ')
        x_ref = [x.lower().split(' ') for x in model_reference[entry]]

        x_out_bert = model_output[entry][:512].lower()
        # Cut every reference string to 512 characters, like the prediction above
        # (slicing the list itself would only limit the number of references).
        # The strings are deliberately not split into tokens here.
        x_ref_bert = [x[:512].lower() for x in model_reference[entry]]

        metric_bleu.add_batch(predictions = [x_out], references= [x_ref])
        # Self-BLEU: overlap of the output with the model input itself.
        metric_self_bleu.add_batch(predictions = [x_out], references= [[x_in]])
        metric_meteor.add_batch(predictions = [x_out], references= [x_ref])
        metric_bertscore.add_batch(predictions = [x_out_bert], references= [[x_ref_bert]],)

    return {'score_bleu' : metric_bleu.compute(),
    'score_self_bleu' : metric_self_bleu.compute(),
    'score_meteor' : metric_meteor.compute(),
    'score_bertscore' : metric_bertscore.compute(model_type='allenai/scibert_scivocab_uncased',),
    # Perplexity is only computed for outputs with more than two words.
    'perplexity': metric_ppl.compute(input_texts = [x[:512].lower().split(' ') for x in model_output if (len(x.lower().split(' '))>2) ], model_id='allenai/scibert_scivocab_uncased',add_start_token=False)}

# Human evaluation

Checklist:
* Is the sentence a correct English sentence?
    * Are words meaninglessly repeated?
* Is the sentence meaningful?
* Is the original meaning preserved?
    * loss of information 
    * addition of unrelated information
* Is it more scientific?
    * scientific words/wording

In [None]:
# Generation settings shared by the manual check below and the batch generation cells.
num_beams = 5 # less word repetition if larger 1 
num_return_sequences = 1
# Index of the example that is fed through the model in the next cell.
entry_for_manual_testing = 13

In [None]:
# Run three inputs through the loaded seq2seq model for manual inspection: one GYAFC
# sentence, one gold sentence of the test set and the corresponding model input.
# Requires `tokenizer` and `model`, which are only created by the Pegasus and
# T5/BART loading cells.
for input_text in [gyafc['f_r']['text'][entry_for_manual_testing],dataset['test']['text'][entry_for_manual_testing],dataset['test']['para-1-1'][entry_for_manual_testing]]:
    #input_text = '>> My dog ate my homework'
    print(input_text)
    # Inputs are padded/truncated to exactly 100 tokens; outputs are capped at 100 tokens.
    batch = tokenizer([input_text],truncation=True,padding='max_length',max_length=100, return_tensors="pt").to(torch_device)
    translated = model.generate(**batch,max_length=100,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5) #TODO test output_scores = True
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    print(tgt_text)
    print()

# Apply metric to test set

### T5 or BART

In [None]:
tgt_text= []
# Generate only for the seq2seq models, and only if no cached result exists on disk.
# (The stray ':' inside the parenthesised condition was a syntax error.)
if (('t5' in model_name or 'bart' in model_name)
    and not os.path.exists(f'{output_path}style/{model_output_name}')):
    # Materialise the input column once instead of re-reading it for every batch.
    test_inputs = dataset['test']['para-1-1']
    for x in tqdm(range(0, len(test_inputs), batch_size)):
        subset=test_inputs[x:x+batch_size]
        batch = tokenizer(subset,truncation=True,padding='max_length',max_length=100, return_tensors="pt").to(torch_device)
        translated = model.generate(**batch,max_length=100,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    # The generated text is stored in a column named after the model configuration.
    dataset['test']=dataset['test'].add_column(f'{model_output_name}', tgt_text)

    # Saves the whole dataset (not only the new column), which is redundant but simple.
    dataset.save_to_disk(f'{output_path}style/{model_output_name}')

In [None]:
#load results if save task was performed before
# Only the T5/BART generation cell writes this cache. For every other model there is
# nothing to load, so the in-memory dataset is kept instead of failing here.
if 't5' in model_name or 'bart' in model_name:
    dataset = dataset.load_from_disk(f'{output_path}style/{model_output_name}')
dataset

### GPT 2

In [None]:
tgt_text= []
if 'gpt' in model_name:
    # The prompt prefix is the same for every sentence, so it is set once up front.
    # This also keeps it defined after the loop when the dataset is empty.
    #task_prefix = "Write in shakespeare language:  
    #task_prefix = "write scientific text: "
    task_prefix = "In scientific language, "
    for input_text in tqdm(dataset['test']['text']):
        res = generator(task_prefix + input_text , max_length = 45, num_return_sequences = 1)
        # Strip the prompt, drop line breaks and keep only the text up to the first '.'.
        tgt_text.append(res[0]['generated_text'].replace(task_prefix+input_text,'').replace('\n','').split('.')[0])

    dataset['test']= dataset['test'].add_column(f'gpt2 + {task_prefix}', tgt_text)
    # Later cells look the outputs up under this column name.
    model_output_name = f'gpt2 + {task_prefix}'

### identity

In [None]:
# The identity baseline generates nothing: the input column 'para-1-1' itself is
# evaluated as the "model output".
if 'identity' in model_name: 
    model_output_name = 'para-1-1'

### with buckets

In [None]:
# Load the 'wer-score' column of the para-1-1 test split. It is always read from the
# para-1-1 dataset, regardless of the dataset configured above.
werScores = dataset.load_from_disk(f'{dataset_path}style/para-1-1')['test']
werScores = werScores['wer-score']

In [None]:
# Assign every test example to a bucket; `bins` holds one bucket label per example.
#Pegasus dataset
if 'para-1-1' in dataset_name:
    # Six quantile-based bins over the WER scores, labelled 0..5.
    buckets=[0,1,2,3,4,5]#[0,0.1,0.2,0.3,0.4,0.5]
    bins = pd.qcut(werScores,6,labels=buckets)
#IDM dataset
elif 'idm' in dataset_name:
    # The bucket label is read directly from the dataset's 'bucket' column.
    buckets=[0,0.1,0.2,0.3,0.4,0.5]
    bins = dataset['test']['bucket']

In [None]:
# Split the test set into one sub-dataset per bucket label.
dataset_bucket= []
for bucket in buckets:
    idx_buckets = [idx for idx, bbin in enumerate(bins) if (bucket == bbin)]
    dataset_bucket.append(dataset['test'].select(idx_buckets))

# Not displayed, because it is not the last expression of the cell.
dataset_bucket[0]

#use the entire test set e.g. this was used to calculate the overall metrics during hyperparameter tuning
# NOTE: this override discards the per-bucket split built above, leaving a single
# entry. Cells that index buckets 0..5 (the LaTeX table loop below) need the
# per-bucket list, so skip the next two lines for a bucket-wise evaluation.
dataset_bucket = []
dataset_bucket.append(dataset['test'])

In [None]:
#Information on how the different buckets look
# Prints gold sentence, model input and model output of one example of one bucket.
selector = 0 # selects a bucket
sentence_idx=3
print('Config: ',selector,sentence_idx,model_output_name)
print('Gold Standard:')
print(dataset_bucket[selector]['text'][sentence_idx])
print('\nInput')
print(dataset_bucket[selector]['para-1-1'][sentence_idx])
#print(dataset_bucket[selector]['idm'][sentence_idx])
print('\nModel output')
# The generated text lives in the column named after the model configuration.
print(dataset_bucket[selector][model_output_name][sentence_idx])

In [None]:
def output_idx(idx):
    """Print the LaTeX table row(s) of one fine-tuned model for the current example.

    Loads the saved outputs of model number `idx` on the 'idm' dataset, splits them
    into buckets and prints the row for the example selected by the module-level
    variables `selector` (bucket), `sentence_idx` and `i` (offset). For `idx == 0`
    the multirow header, the gold sentence and the input sentence are printed first.
    `learning_rate`, `weight_decay` and `output_path` are read from the
    configuration cell.

    Args:
        idx: Index into the fixed model list (0-3).

    Returns:
        The list of per-bucket datasets built so far; empty if the saved outputs
        could not be loaded.
    """
    model_checkpoint = ["google/t5-v1_1-small","google/t5-v1_1-base","google/t5-v1_1-large","facebook/bart-base"][idx]
    model_name = model_checkpoint.split("/")[-1]
    dataset_name = 'idm'

    model_output_name =f'{model_name}-finetuned-{dataset_name}-lr-{learning_rate}-wd-{weight_decay}'

    # Initialised before the try block so that the final return is valid even when
    # loading fails (previously an UnboundLocalError).
    dataset_bucket = []
    try:
        dataset = load_from_disk(f'{output_path}style/{model_output_name}')

        #Pegasus dataset
        if 'para-1-1' in dataset_name:
            buckets=[0,1,2,3,4,5]#[0,0.1,0.2,0.3,0.4,0.5]
            bins = pd.qcut(werScores,6,labels=buckets)
        #IDM dataset
        elif 'idm' in dataset_name:
            buckets=[0,0.1,0.2,0.3,0.4,0.5]
            bins = dataset['test']['bucket']

        for bucket in buckets:
            row_ids = [row for row, bbin in enumerate(bins) if (bucket == bbin)]
            dataset_bucket.append(dataset['test'].select(row_ids))

        if(idx==0):
            # Raw f-string: "\m" in a normal string literal is an invalid escape sequence.
            print(rf'\multirow{{6}}{{*}}{{{selector}}}')
            print(' & gold standard & ',dataset_bucket[selector]['text'][sentence_idx+i],'\\\\')
            print(' & input sentence & ',dataset_bucket[selector]['para-1-1'][sentence_idx+i],'\\\\')
        print('&',model_name, '&',dataset_bucket[selector][model_output_name][sentence_idx+i],'\\\\')

    except (FileNotFoundError, KeyError):
        # Missing result directory or missing column: report the model and carry on.
        print('\nERROR\n',model_name)
    return dataset_bucket

In [None]:
#Information on how the different buckets look
#pegasus bucket: 0 sentence: 80, 110, 1123
#pegasus bucket: 1 sentence: 4
#pegasus bucket: 2-5 sentence: 3

#idm bucket: 0 sentence: 0, 1002?
#idm bucket: 1 sentence: 1001
#idm bucket: 2 sentence: 
#idm bucket: 3 sentence: 1002
#idm bucket: 4 sentence: 1001
#idm bucket: 5 sentence: 1051-> good, 1298, change meaning: 1323, 1440

# Print LaTeX rows for every short example (gold sentence under 115 characters) in a
# window of up to 1000 examples starting at `sentence_idx`, for all six buckets.
# `selector`, `sentence_idx` and `i` are read as globals by `output_idx`, so these
# names must not be changed. Requires `dataset_bucket` to hold the six per-bucket
# subsets, i.e. without the whole-test-set override.
sentence_idx=1001
for selector in range(0,6):
    # Read the text column once per bucket instead of once per example.
    bucket_texts = dataset_bucket[selector]['text']
    # Never run past the end of the bucket.
    for i in range(0, min(1000, len(bucket_texts) - sentence_idx)):
        if(len(bucket_texts[sentence_idx+i])<115):
            for idx in range(0,4):
                output_idx(idx)
            # Raw string: "\m" in a normal string literal is an invalid escape sequence.
            print(r'\midrule')

### calculate metrics

In [None]:
# Evaluate every entry of `dataset_bucket` separately and collect the formatted rows
# in `results_test_set`, which the "Combined output" cell prints again at the end.
results_test_set= f"\n\nRESULTS\n"
for idx, bucket in enumerate(dataset_bucket):
    print('Bucket: ',idx)
    model_input=bucket['para-1-1']#[String]
    model_output=bucket[model_output_name]#[String]
    # One gold sentence per example, wrapped in a list because the metrics expect a
    # list of references for every prediction.
    model_reference=[[x] for x in bucket['text']]#[[String]]
    
    results = calculate_all_metrics(model_input,model_output,model_reference)
    # print_output also appends the row to eval_results_style.txt.
    resAsString = print_output(results)
    results_test_set=f'{results_test_set} Bucket:{idx}\n {resAsString}\n'
print(results_test_set)

#  Generate gyafc output with our models

### T5 or BART

In [None]:
from tqdm import tqdm
tgt_text= []
#max_length=60 for Pegasus 
num_beams=5
# `tokenizer` and `model` are only created for the seq2seq model families. For GPT-2
# and the identity baseline this cell must be skipped; they have their own cells below.
if any(family in model_name for family in ('t5', 'bart', 'pegasus')):
    # Materialise the input column once instead of re-reading it for every batch.
    gyafc_inputs = gyafc['f_r']['text']
    for x in tqdm(range(0, len(gyafc_inputs), batch_size)):
        subset=gyafc_inputs[x:x+batch_size]
        batch = tokenizer(subset,truncation=True,padding='max_length',max_length=100, return_tensors="pt").to(torch_device)
        translated = model.generate(**batch,max_length=100,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
        tgt_text.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    # The generated text is stored in a column named after the model configuration.
    gyafc['f_r']=gyafc['f_r'].add_column(f'{model_output_name}', tgt_text)

### self-idm

In [None]:
#self-idm
# For the identity baseline the "model output" on GYAFC is the IDM-modified input.
# `modify` relies on names created by the IDM setup cell (`random`, `unmasker`, ...).
if 'identity' in model_name:
    # modify() returns (modified sentences, per-sentence buckets). Only the
    # sentences become the new column, converted to a plain list for add_column.
    idm_sentences, idm_buckets = modify(gyafc['f_r']['text'])
    tgt_text = list(idm_sentences)
    gyafc['f_r']=gyafc['f_r'].add_column(f'{model_output_name}', tgt_text)

### GPT 2

In [None]:
tgt_text= []
from tqdm import tqdm
if 'gpt' in model_name:
    # The prompt prefix is the same for every sentence, so it is set once up front.
    # This also keeps it defined after the loop when the dataset is empty.
    #task_prefix = "Write in shakespeare language:  
    #task_prefix = "write scientific text: "
    task_prefix = "In scientific language, "
    for input_text in tqdm(gyafc['f_r']['text']):
        #subset=gyafc['f_r']['text'][x:x+batch_size]
        res = generator(task_prefix + input_text , max_length = 45, num_return_sequences = 1)
        # Strip the prompt, drop line breaks and keep only the text up to the first '.'.
        tgt_text.append(res[0]['generated_text'].replace(task_prefix+input_text,'').replace('\n','').split('.')[0])
        
        #translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
        #tgt_text.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))

    gyafc['f_r']=gyafc['f_r'].add_column(f'gpt2 + {task_prefix}', tgt_text)
    # Later cells look the outputs up under this column name.
    model_output_name = f'gpt2 + {task_prefix}'

## Apply metric to gyafc dataset

In [None]:
import math
# Evaluate on GYAFC: the informal sentence is the model input and the four formal
# rewrites 'ref0'..'ref3' are the references of each example.
model_input=gyafc['f_r']['text']#[String]
model_reference=[[x.get(key)  for key in ['ref0','ref1','ref2','ref3']] for x in gyafc['f_r']] #[[String]]

#allows to get the metrics for the reference datasets
model_outputs = [model_output_name]#['text','nmt_combined','rule_based','dualRL','drlst','dast-c']#[model_output_name]#['t5finetuned-1-1','pegasus-1-1','nmt_combined','rule_based','dast-c','dualRL','drlst','text']#[]#['text']
# NOTE: the loop variable rebinds the global `model_output_name`. After the loop it
# holds the last evaluated column name.
for model_output_name in model_outputs:
    model_output=gyafc['f_r'][model_output_name]#[String]
    results = calculate_all_metrics(model_input,model_output,model_reference)
    results_gyafc_set=results # to access results later for pretty and combined print
    # print_output also appends the row to eval_results_style.txt.
    print_output(results_gyafc_set)

# Combined output

In [None]:
# Final summary: the per-bucket rows collected for the test set, followed by the row
# of the GYAFC evaluation.
print('test set:')
print(results_test_set)
print()
print('gyafc dataset:')
# NOTE: print_output appends to eval_results_style.txt on every call, so this call
# writes the GYAFC row to the file a second time.
print_output(results_gyafc_set)