# evaluate performance along various axes of sentence complexity

## data

1. Load the dataset from a CSV file.
2. Load the model.
3. Run the model on each sentence in the dataset.
4. Batch this process.

In [1]:
# Prints a marker string; this cell has no other effect.
print("start")

start


In [2]:
# Language code: selects both the input CSV file name and the model checkpoint
# looked up in later cells.
# NOTE(review): the per-language numbers in the trailing comment are not explained
# anywhere in this notebook — confirm what they refer to.
lang='de' #cs: 19359 #en: 19360  #de: 19387
# Length value embedded in the data file name, passed to the dataset class, and
# used to derive the generation max_length.
dataset_max_len = 200
# Number of rows sampled for evaluation; also embedded in the output file name.
len_of_dataset = 100000


In [3]:
import pandas as pd

# Read the merged CSV for the configured language / length and report its size.
data_path = f"news.2013.{lang}.trainlen.{dataset_max_len}.merged.csv"
data = pd.read_csv(data_path)
print("len:", len(data))

# Apply the two row filters in order, reporting the remaining row count after
# each one: keep rows where 'lang' is True, then rows where 'weird' is False.
for column, wanted in (('lang', True), ('weird', False)):
    data = data[data[column] == wanted]
    print("len:", len(data))
print("data loaded")

# Draw a fixed-seed random subset of len_of_dataset rows for evaluation.
data = data.sample(n=len_of_dataset, random_state=1)
print("len:", len(data))



len: 898502
len: 850701
len: 847588
data loaded
len: 256


## load model

In [10]:
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
from src.utils import levensthein_distance, print_avg_median_mode_error
from transformers import pipeline, logging
from src.ByT5Dataset import ByT5CaesarRandomDataset, ByT5ConstEnigmaDataset
import torch
import time

# Restrict transformers' logging to errors only.
logging.set_verbosity(logging.ERROR)

tokenizer = ByT5Tokenizer()

# Run directory (under ./logs/) that holds the saved model for each key.
model_ids = {
    'caesar': 'slurm_16677',
    'en': 'slurm_17510',
    'de': 'slurm_18065',
    'cs': 'slurm_18066'
}
model_id = model_ids[lang]
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(f"./logs/{model_id}/model")
model.to(device)

# Re-label rows 0..n-1 *before* building the dataset. The frame was sampled
# earlier, so its index is shuffled; resetting first guarantees that whatever
# indexing the dataset class uses internally, dataset item k and DataFrame row k
# refer to the same sentence when results are written back below.
data = data.reset_index(drop=True)

# The original note tied this class to the 17510 ('en') model.
# NOTE(review): presumably it is also the right class for 'de' / 'cs' — confirm.
dataset_class = ByT5ConstEnigmaDataset
# dataset_class = ByT5CaesarRandomDataset  # for the 16677 ('caesar') model
dataset = dataset_class(data.text, dataset_max_len)

# Result columns, filled in batch by batch.
data['generated_text'] = ''
data['error_count'] = 0

translate = pipeline("translation", model=model, tokenizer=tokenizer, device=device)

batch_size = 64
num_samples = len(dataset)
# Ceiling division so a trailing partial batch is counted in the progress total.
num_batches = (num_samples + batch_size - 1) // batch_size

for batch_no, start in enumerate(range(0, num_samples, batch_size), start=1):
    t0 = time.perf_counter()
    batch = dataset[start:start + batch_size]
    input_texts = batch['input_text']
    output_texts = batch['output_text']

    # Generate translations for this chunk of inputs.
    # NOTE(review): no batch_size is passed to the pipeline call, so the pipeline
    # uses its own default batching for this list — consider passing
    # batch_size=batch_size after checking that results are unchanged.
    generated = translate(input_texts, max_length=(dataset_max_len + 1) * 2)
    generated_texts = [t['translation_text'] for t in generated]

    # Per-sentence edit distance between the generated and the expected text.
    errors = [levensthein_distance(gen, out) for gen, out in zip(generated_texts, output_texts)]

    # .loc slices are label-based and end-inclusive, hence the "- 1".
    last = start + batch_size - 1
    data.loc[start:last, 'generated_text'] = generated_texts
    data.loc[start:last, 'error_count'] = errors
    elapsed = time.perf_counter() - t0

    print(f"Processed batch {batch_no}/{num_batches} in {elapsed:.2f} seconds")


# Summary statistics over all per-sentence error counts.
avg, med, mode = print_avg_median_mode_error(data['error_count'].tolist())
print("#############################################")

print("avg:", avg)
print("med:", med)
print("mode:", mode)



Processed batch 1/16 in 77.57 seconds
Processed batch 2/16 in 153.34 seconds
Processed batch 3/16 in 218.47 seconds
Processed batch 4/16 in 285.92 seconds
Processed batch 5/16 in 353.50 seconds
Processed batch 6/16 in 430.63 seconds
Processed batch 7/16 in 491.84 seconds
Processed batch 8/16 in 551.48 seconds
Processed batch 9/16 in 614.72 seconds
Processed batch 10/16 in 674.50 seconds
Processed batch 11/16 in 731.88 seconds
Processed batch 12/16 in 797.07 seconds
Processed batch 13/16 in 871.31 seconds
Processed batch 14/16 in 941.20 seconds
Processed batch 15/16 in 1013.33 seconds
Processed batch 16/16 in 1087.18 seconds
Average errors: 88.78125
Median errors: 87
Mode errors: 71
#############################################
avg: 88.78125
med: 87
mode: 71


In [None]:

# Write the evaluated frame to CSV without the index column; the file name
# encodes the language, length setting, and configured sample size.
evaluation_path = f"news.2013.{lang}.trainlen.{dataset_max_len}.evaluation.{len_of_dataset}.csv"
data.to_csv(evaluation_path, index=False)