In [125]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore

In [126]:
base_folder = Path('./result/readability-indexes')

scaler = MinMaxScaler()

generated_simplified = pd.read_csv(base_folder / 'generated-simplified.csv')
reference_simplified  = pd.read_csv(base_folder / 'reference-simplified.csv')
reference_complete = pd.read_csv(base_folder / 'reference-complete.csv')

In [127]:
reference_complete['name']

0    2025_ufc_inova_1
1    2025_ufc_inova_8
2    2025_ufc_inova_7
3    2025_ufc_inova_3
4    2025_ufc_inova_2
5    2025_ufc_inova_6
6    2025_ufc_inova_5
7    2025_ufc_inova_4
Name: name, dtype: object

In [128]:
scale_readability_cols = ['flesch_ease','gulpease'] 
grade_level_readability_cols = ['flesch_kincaid','ari','gunning_fog','coleman_liau']
metrics_cols = scale_readability_cols + grade_level_readability_cols


### Comparando arquivos originais com os gerados por cada modelo

In [134]:
merged = reference_complete.merge(generated_simplified,on='name',how='inner',suffixes=("",'_generated'))

for col in scale_readability_cols:
    merged[col+"_improvement"] = ((merged[col+"_generated"]- merged[col]) / merged[col+"_generated"]) * 100
for col in grade_level_readability_cols:
    merged[col+"_improvement"] = ((merged[col] -merged[col+"_generated"]) / merged[col]) * 100 

improvement_methods = [col+"_improvement" for col in metrics_cols]
agg_methods = {col:"mean" for col in improvement_methods}

average_improvement_per_model = merged.groupby('generated_with_generated').agg(agg_methods)

zscores = average_improvement_per_model[improvement_methods].apply(zscore)

exclude_invalid_models = ~((zscores.index.str.startswith('gemini')) | (zscores.index.str.startswith('phi3')))
zscores['zscore_mean'] = zscores[exclude_invalid_models].sum(axis=1)
best_zscore = zscores.loc[zscores['zscore_mean'].idxmax()]

minmaxes = pd.DataFrame(scaler.fit_transform(average_improvement_per_model[improvement_methods]), columns = improvement_methods, index = average_improvement_per_model.index)
exclude_invalid_models = ~(minmaxes.index.str.startswith('gemini') |(minmaxes.index.str.startswith("phi3")))
minmaxes['minmax_mean'] = minmaxes[exclude_invalid_models].sum(axis=1)
best_minmax = minmaxes.loc[minmaxes['minmax_mean'].idxmax()]


zscores.sort_values(by='zscore_mean',ascending=False)['zscore_mean'],minmaxes.sort_values(by='minmax_mean',ascending=False)['minmax_mean']

(generated_with_generated
 phi4:latest                       1.165699
 qwen2.5:14b                       0.839890
 deepseek-r1:14b                  -0.130338
 qwen2.5-coder:32b                -1.589942
 gemma3:4b                        -2.167815
 llama3.2:latest                  -3.365564
 granite3-dense:8b                -3.598144
 cow/gemma2_tools:2b              -4.451446
 granite3-dense:2b                -5.360311
 granite-code:8b                  -6.026275
 gemini-2.5-flash-preview-04-17         NaN
 gemini-2.5-pro-preview-05-06           NaN
 phi3:latest                            NaN
 Name: zscore_mean, dtype: float64,
 generated_with_generated
 phi4:latest                       2.510296
 qwen2.5:14b                       2.424960
 deepseek-r1:14b                   2.148312
 qwen2.5-coder:32b                 1.703727
 gemma3:4b                         1.512010
 llama3.2:latest                   1.179181
 granite3-dense:8b                 1.106616
 cow/gemma2_tools:2b            

### Comparando arquivos originais resumidos com os gerados por cada modelo

In [133]:
merged = reference_simplified.merge(generated_simplified,on='name',how='inner',suffixes=("",'_generated'))

for col in scale_readability_cols:
    merged[col+"_improvement"] = ((merged[col+"_generated"]- merged[col]) / merged[col+"_generated"]) * 100
for col in grade_level_readability_cols:
    merged[col+"_improvement"] = ((merged[col] -merged[col+"_generated"]) / merged[col]) * 100 

improvement_methods = [col+"_improvement" for col in metrics_cols]
agg_methods = {col:"mean" for col in improvement_methods}

average_improvement_per_model = merged.groupby('generated_with_generated').agg(agg_methods)

zscores = average_improvement_per_model[improvement_methods].apply(zscore)

zscores['zscore_mean'] = zscores[~zscores.index.str.startswith('gemini')].sum(axis=1)
best_zscore = zscores.loc[zscores['zscore_mean'].idxmax()]

minmaxes = pd.DataFrame(scaler.fit_transform(average_improvement_per_model[improvement_methods]), columns = improvement_methods, index = average_improvement_per_model.index)
minmaxes['minmax_mean'] = minmaxes[~minmaxes.index.str.startswith('gemini')].sum(axis=1)
best_minmax = minmaxes.loc[minmaxes['minmax_mean'].idxmax()]

zscores.sort_values(by='zscore_mean',ascending=False)['zscore_mean'],minmaxes.sort_values(by='minmax_mean',ascending=False)['minmax_mean']

(generated_with_generated
 qwen2.5:14b                        1.628393
 phi4:latest                        0.838930
 deepseek-r1:14b                    0.041306
 granite3-dense:2b                 -0.019110
 gemma3:4b                         -0.094447
 qwen2.5-coder:32b                 -0.118496
 granite3-dense:8b                 -1.250926
 llama3.2:latest                   -1.515129
 granite-code:8b                   -4.498649
 cow/gemma2_tools:2b               -5.581902
 phi3:latest                      -10.396431
 gemini-2.5-flash-preview-04-17          NaN
 gemini-2.5-pro-preview-05-06            NaN
 Name: zscore_mean, dtype: float64,
 generated_with_generated
 qwen2.5:14b                       3.415003
 phi4:latest                       3.151866
 deepseek-r1:14b                   2.953923
 qwen2.5-coder:32b                 2.895531
 granite3-dense:2b                 2.881012
 gemma3:4b                         2.875202
 granite3-dense:8b                 2.534994
 llama3.2:latest   