# evaluate performance along various axes of sentence complexity

## Data

This notebook covers the case where we already have the clean dataset from a previous run and just want to evaluate other models on it.

1. Load the dataset from the CSV file.
2. Load the model.
3. Run the model on each sentence in the dataset.
4. Batch this process.
5. Save the results (and hope there is no race condition when saving).

In [1]:
# Print a start marker.
print("start")

start


In [2]:
# Evaluation configuration shared by the cells below.

# Passed as `max_length` when the dataset wrapper is built; the generation
# length limit is also derived from it.
dataset_max_len = 200

# Presumably the row count encoded in the evaluation CSV file name
# ("...evaluation.100000...") -- TODO confirm.
len_of_dataset = 100_000


In [3]:
import pandas as pd
from src.ByT5Dataset import ByT5ConstEnigmaDataset, ByT5CaesarRandomDataset, ByT5NoisyVignere2Dataset
from src.evaluation import Model
from src.ByT5Dataset import ByT5Dataset

# Registry of the checkpoints that can be evaluated, keyed by model name.
# The positional arguments are passed to `Model` unchanged; its exact signature
# lives in src.evaluation. Later cells read `.language`, `.name`,
# `.dataset_class`, `.noise_proportion` and call `.path()` on the entries.
models = {
    'caesar': Model(ByT5CaesarRandomDataset, 'caesar', 'en', 16677),
    'en_constenigma': Model(ByT5ConstEnigmaDataset, 'en_constenigma', 'en', 17510),
    'de_constenigma': Model(ByT5ConstEnigmaDataset, 'de_constenigma', 'de', 18065),
    'cs_constenigma': Model(ByT5ConstEnigmaDataset, 'cs_constenigma', 'cs', 18066),
    'en_noisevignere_checkpoint-5000': Model(ByT5NoisyVignere2Dataset, 'en_noisevignere_checkpoint-5000', 'en', 20145, True, 5000, .15), #20196
    'en_noisevignere_checkpoint-10000': Model(ByT5NoisyVignere2Dataset, 'en_noisevignere_checkpoint-10000', 'en', 20145, True, 10000, .15) # 20228
}

# Model evaluated in this run; must be one of the keys of `models`.
evaluated_name = 'en_noisevignere_checkpoint-10000'
model_metadata = models[evaluated_name]

# Build the input file name from the shared configuration values rather than
# repeating the literals 200 / 100000, so the path cannot drift from the
# settings used when the dataset wrapper is constructed.
data_path = (
    f'news.2013.{model_metadata.language}'
    f'.trainlen.{dataset_max_len}.evaluation.{len_of_dataset}.csv'
)
data = pd.read_csv(data_path)

print("data loaded")
print(model_metadata)




len: 898502
len: 850701
len: 847588
data loaded
len: 256


## Load the model and run the evaluation

In [10]:
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
from src.utils import levensthein_distance, print_avg_median_mode_error
from transformers import pipeline, logging
from src.ByT5Dataset import ByT5CaesarRandomDataset, ByT5ConstEnigmaDataset
import torch
import time

# Only show errors from transformers.
logging.set_verbosity(logging.ERROR)

tokenizer = ByT5Tokenizer()

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(model_metadata.path())
model.to(device) # type: ignore

# Build the dataset wrapper around the raw text column. `noise_proportion` is
# only forwarded when the model metadata defines one.
dataset_class = model_metadata.dataset_class
dataset_kwargs = {'dataset': data.text, 'max_length': dataset_max_len}
if model_metadata.noise_proportion is not None:
    dataset_kwargs['noise_proportion'] = model_metadata.noise_proportion
dataset = dataset_class(**dataset_kwargs) # type: ignore , not sound but will work in my usecase

translate = pipeline("translation", model=model, tokenizer=tokenizer, device=device)

error_col_name = f'{model_metadata.name}_error_count'
generated_text_col_name = f'{model_metadata.name}_generated_text'

# Placeholder values; rows are overwritten batch by batch in the loop below.
data[error_col_name] = 0
data[generated_text_col_name] = ''

batch_size = 64
num_samples = len(dataset)
# Ceiling division so a trailing partial batch is counted in the progress total.
num_batches = (num_samples + batch_size - 1) // batch_size

# Positional labels 0..n-1 are required for the label-based `.loc` slices below.
# NOTE(review): results are written to data rows i..i+k-1, which assumes that
# dataset item j corresponds to data row j -- confirm the dataset wrapper does
# not drop or reorder rows.
data = data.reset_index(drop=True)
for batch_number, i in enumerate(range(0, num_samples, batch_size), start=1):
    t0 = time.time()
    batch = dataset[i:i+batch_size]
    input_texts = batch['input_text']
    output_texts = batch['output_text']

    # Generate translations in batches. Without an explicit `batch_size` the
    # pipeline feeds the model one input at a time.
    generated_texts = translate(
        input_texts,
        max_length=(dataset_max_len + 1) * 2,
        batch_size=batch_size,
    )
    generated_texts = [t['translation_text'] for t in generated_texts] # type: ignore

    # Calculate errors and update DataFrame. `.loc` slices include the end
    # label, so the end is derived from the real batch length; the final batch
    # may hold fewer than `batch_size` items.
    errors = [levensthein_distance(gen, out) for gen, out in zip(generated_texts, output_texts)]
    last_row = i + len(generated_texts) - 1
    data.loc[i:last_row, generated_text_col_name] = generated_texts
    data.loc[i:last_row, error_col_name] = errors
    t1 = time.time()

    print(f"Processed batch {batch_number}/{num_batches} in {t1 - t0:.2f} seconds")


# Summarise only the rows that were evaluated, so rows still holding the 0
# placeholder cannot skew the statistics.
evaluated_errors = data[error_col_name].iloc[:num_samples].tolist()
avg, med, mode = print_avg_median_mode_error(evaluated_errors)
print("#############################################")

print("avg:", avg)
print("med:", med)
print("mode:", mode)



Processed batch 1/16 in 77.57 seconds
Processed batch 2/16 in 153.34 seconds
Processed batch 3/16 in 218.47 seconds
Processed batch 4/16 in 285.92 seconds
Processed batch 5/16 in 353.50 seconds
Processed batch 6/16 in 430.63 seconds
Processed batch 7/16 in 491.84 seconds
Processed batch 8/16 in 551.48 seconds
Processed batch 9/16 in 614.72 seconds
Processed batch 10/16 in 674.50 seconds
Processed batch 11/16 in 731.88 seconds
Processed batch 12/16 in 797.07 seconds
Processed batch 13/16 in 871.31 seconds
Processed batch 14/16 in 941.20 seconds
Processed batch 15/16 in 1013.33 seconds
Processed batch 16/16 in 1087.18 seconds
Average errors: 88.78125
Median errors: 87
Mode errors: 71
#############################################
avg: 88.78125
med: 87
mode: 71


In [None]:

# Save the DataFrame, including this model's error-count and generated-text
# columns, to a model-specific CSV. The file name is built from the shared
# configuration values rather than repeating the literals 200 / 100000.
data.to_csv(
    f"news.2013.{model_metadata.language}"
    f".trainlen.{dataset_max_len}.evaluation.{len_of_dataset}"
    f".intermediate.{model_metadata.name}.csv",
    index=False,
)