# Translation metrics analysis

## Setup

In [1]:
import re
from pathlib import Path

import pandas as pd
import plotly.express as px

## Prepare data

In [2]:
# Summarizes one metric column across all TSV files in the given directory
def summarize_metric(directory_path: str, metric_name: str) -> pd.DataFrame:
    """
    Summarize one metric column from every TSV file in a directory.

    Each ``*.tsv`` file directly inside ``directory_path`` is read with pandas
    and ``pandas.Series.describe()`` is applied to its ``metric_name`` column.
    Files are visited in sorted path order so that the row order (and the
    integer index) of the result is reproducible; the raw ``glob`` order
    depends on the file system.

    Files that lack the column, or that raise while being read or summarized,
    are reported on stdout and skipped rather than aborting the whole run.

    Args:
        directory_path (str): Path to the directory containing TSV files.
        metric_name (str): The name of the column (metric) to summarize.

    Returns:
        pd.DataFrame: One row per summarized file, with ``file_name`` as the
        first column followed by the statistics produced by describe(), and a
        default integer index. An empty DataFrame (no columns) if no file
        could be summarized.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    directory = Path(directory_path)
    if not directory.is_dir():
        raise ValueError(f"The provided path '{directory_path}' is not a valid directory.")

    summary_rows = []

    for tsv_file in sorted(directory.glob("*.tsv")):
        try:
            df = pd.read_csv(tsv_file, sep='\t')
            if metric_name not in df.columns:
                print(f"Metric '{metric_name}' not found in file '{tsv_file.name}'. Skipping.")
                continue

            # describe() yields a Series of statistics; transpose it into a
            # single-row frame and tag that row with the originating file.
            stats = df[metric_name].describe()
            summary_rows.append(stats.to_frame().T.assign(file_name=tsv_file.name))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not prevent
            # the remaining files from being summarized.
            print(f"Error processing file '{tsv_file.name}': {e}")

    if not summary_rows:
        print("No valid TSV files processed.")
        return pd.DataFrame()

    # Combine all summaries into a single DataFrame with a fresh 0..n-1 index
    result_df = pd.concat(summary_rows, ignore_index=True)
    # Move file_name to the front so it reads as the row identifier
    result_df.insert(0, 'file_name', result_df.pop('file_name'))
    return result_df

In [25]:
def split_filename(filename, shorten: bool = False):
    """
    Parse an experiment result file name into its underscore-separated fields.

    Two layouts are recognized after the optional ``.tsv`` extension is removed:

    * 10 parts: model, configs, metaphorical system prompt, literal system
      prompt, user prompt, dataset name, partition, metaphoricity, type, size.
    * 9 parts: the same, but with a single system prompt that is reported as
      both the metaphorical and the literal system prompt.

    Names starting with ``meta-llama`` carry one extra leading part (the
    ``meta-llama`` prefix followed by the model name). That prefix is merged
    into the model (or dropped when ``shorten`` is true) for both layouts
    before the parts are counted.

    Args:
        filename (str): File name to parse, with or without the ``.tsv`` suffix.
        shorten (bool): If true, drop the ``meta-llama`` prefix from the model
            instead of keeping it as ``meta-llama_<model>``.

    Returns:
        dict: Mapping with the keys 'model', 'configs',
        'metaphorical_system_prompt', 'literal_system_prompt', 'user_prompt',
        'dataset_name', 'partition', 'metaphoricity', 'type' and 'size'.
        All values are strings, or all are None when the name matches neither
        layout.
    """
    fields = (
        'model',
        'configs',
        'metaphorical_system_prompt',
        'literal_system_prompt',
        'user_prompt',
        'dataset_name',
        'partition',
        'metaphoricity',
        'type',
        'size',
    )
    extension = '.tsv'

    # Strip only a trailing extension; a '.tsv' occurring inside the name is
    # part of a field and must be kept.
    base = filename[:-len(extension)] if filename.endswith(extension) else filename
    parts = base.split('_')

    # The 'meta-llama' prefix adds one underscore-separated part in front of
    # the model name, so prefixed names have 10 or 11 raw parts.
    if parts[0] == 'meta-llama' and len(parts) in (10, 11):
        if shorten:
            parts = parts[1:]
        else:
            parts = [parts[0] + '_' + parts[1]] + parts[2:]

    # Single-system-prompt layout: reuse that prompt for the literal slot so
    # both layouts share the same 10-field shape.
    if len(parts) == 9:
        parts.insert(3, parts[2])

    if len(parts) != len(fields):
        # Return None for all fields if the format is unexpected
        return dict.fromkeys(fields)
    return dict(zip(fields, parts))

In [None]:
# TODO: sort by median
# TODO: use colors to differentiate between configs and prompts combinations
def plot_metric_distribution(directory_path: str, metric_name: str, metaphoricity: str = None):
    """
    Show a box plot of one metric with one box per experiment label.

    Every ``*.tsv`` file directly inside ``directory_path`` that contains the
    ``metric_name`` column contributes all of its values. Each file is labelled
    with a name built from its file name via ``split_filename(..., shorten=True)``;
    files that produce the same label share a box. Files are visited in sorted
    path order so the data are assembled reproducibly.

    Files that lack the column, or that raise while being processed, are
    reported on stdout and skipped.

    Args:
        directory_path (str): Path to the directory containing TSV files.
        metric_name (str): The name of the column (metric) to plot.
        metaphoricity (str, optional): If given (and non-empty), only files
            whose parsed 'metaphoricity' field equals this value are plotted.

    Returns:
        None. The figure is displayed with ``fig.show()``; if no values were
        collected a message is printed instead.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    directory = Path(directory_path)
    if not directory.is_dir():
        raise ValueError(f"The provided path '{directory_path}' is not a valid directory.")

    # Fields of the parsed file name that make up the x-axis label, in order
    label_fields = (
        'model',
        'configs',
        'metaphorical_system_prompt',
        'literal_system_prompt',
        'user_prompt',
        'metaphoricity',
    )

    frames = []
    for tsv_file in sorted(directory.glob("*.tsv")):
        try:
            df = pd.read_csv(tsv_file, sep='\t')
            if metric_name not in df.columns:
                print(f"Metric '{metric_name}' not found in file '{tsv_file.name}'. Skipping.")
                continue

            # The label and the metaphoricity filter depend only on the file
            # name, so compute them once per file rather than once per value.
            experiment_info = split_filename(tsv_file.name, shorten=True)
            if metaphoricity and experiment_info['metaphoricity'] != metaphoricity:
                continue

            values = df[metric_name]
            if values.empty:
                continue

            # Shorten each field to 15 characters and stack the fields on
            # separate lines (<br>) so the tick labels stay readable.
            experiment_name = '<br>'.join(
                str(experiment_info[field])[:15]
                for field in label_fields
                if experiment_info[field]
            )
            frames.append(
                pd.DataFrame({'experiment_name': experiment_name, metric_name: values.to_numpy()})
            )
        except Exception as e:
            # Best-effort: a single bad file must not abort the whole plot
            print(f"Error processing file '{tsv_file.name}': {e}")

    if not frames:
        print("No valid metric data found.")
        return

    plot_df = pd.concat(frames, ignore_index=True)
    fig = px.box(
        plot_df,
        x="experiment_name",
        y=metric_name,
        points="all",
        title=f"Box plot of '{metric_name}' values per experiment",
        labels={"experiment_name": "Experiment", metric_name: metric_name},
    )
    fig.update_layout(
        xaxis_tickangle=-45,
        # Allow 120 px per box, but never make the figure narrower than 1800 px
        width=max(1800, 120 * plot_df['experiment_name'].nunique()),
        height=600,
        # Large bottom margin leaves room for the multi-line, rotated tick labels
        margin=dict(l=40, r=40, t=80, b=200),
        xaxis=dict(tickfont=dict(size=10)),
    )
    fig.show()

In [5]:
# Build the per-file describe() summary of the 'comet_ref' column for every TSV in the en2es results directory
df = summarize_metric('results/en2es/full_results', 'comet_ref')

In [6]:
# Sanity check: parse one example file name into its experiment fields
split_filename('gemma2-9b-it_r3_spt4m_spt4l_upt1_meta4xnli_train_literal_sample_1000.tsv')

{'model': 'gemma2-9b-it',
 'configs': 'r3',
 'metaphorical_system_prompt': 'spt4m',
 'literal_system_prompt': 'spt4l',
 'user_prompt': 'upt1',
 'dataset_name': 'meta4xnli',
 'partition': 'train',
 'metaphoricity': 'literal',
 'type': 'sample',
 'size': '1000'}

In [7]:
# Parse every file name into its experiment fields and expand the resulting dicts into one column per field
split_cols = df['file_name'].apply(split_filename).apply(pd.Series)
# Attach the parsed fields side by side with the describe() statistics
df = pd.concat([df, split_cols], axis=1)

## Analysis

In [None]:
# Statistics columns produced by describe()
describe_columns = [
    'count',
    'mean',
    'std',
    'min',
    '25%',
    '50%',
    '75%',
    'max',
]

# Parsed file-name fields shown in the result tables
main_experiment_columns = [
    'model',
    'configs',
    'metaphorical_system_prompt',
    'literal_system_prompt',
    'user_prompt',
    'metaphoricity',
    'size',
]

# Remaining parsed file-name fields, not included in the tables below
additional_experiment_columns = [
    'dataset_name',
    'partition',
    'type',
]

# Columns shown in the tables: experiment identity first, then the statistics
display_columns = [*main_experiment_columns, *describe_columns]

### Literal examples

In [46]:
# Show the five literal-example experiments with the highest mean score;
# max_colwidth=None keeps long cell values from being truncated in the output
with pd.option_context('display.max_colwidth', None):
    display(df[df.metaphoricity.eq('literal')][display_columns].sort_values('mean', ascending=False).head(5))

Unnamed: 0,model,configs,metaphorical_system_prompt,literal_system_prompt,user_prompt,metaphoricity,size,count,mean,std,min,25%,50%,75%,max
24,gpt-4.1-nano,r3,spt4m,spt4l,upt1,literal,1000,1000.0,0.970698,0.057449,0.545068,0.971431,0.9912,1.0,1.0
27,meta-llama_llama-4-scout-17b-16e-instruct,r4l,spt4m,spt4l,upt1,literal,1000,1000.0,0.97023,0.059183,0.373493,0.970412,0.9912,1.0,1.0
22,meta-llama_llama-4-scout-17b-16e-instruct,r5l,spt4m,spt4l,upt1,literal,1000,1000.0,0.969661,0.059987,0.373493,0.969706,0.9912,1.0,1.0
7,meta-llama_llama-4-scout-17b-16e-instruct,r3,spt4m,spt4l,upt1,literal,1000,1000.0,0.969648,0.061357,0.333111,0.969975,0.9912,1.0,1.0
17,meta-llama_llama-4-maverick-17b-128e-instruct,r5l,spt4m,spt4l,upt1,literal,1000,1000.0,0.968827,0.059328,0.493226,0.966798,0.9912,1.0,1.0


In [None]:
# Box plot of 'comet_ref' per experiment, restricted to files whose metaphoricity field is 'literal'
plot_metric_distribution('results/en2es/full_results', 'comet_ref', 'literal')

### Metaphorical examples

In [48]:
# Show the five metaphorical-example experiments with the highest mean score;
# max_colwidth=None keeps long cell values from being truncated in the output
with pd.option_context('display.max_colwidth', None):
    display(df[df.metaphoricity.eq('metaphorical')][display_columns].sort_values('mean', ascending=False).head(5))

Unnamed: 0,model,configs,metaphorical_system_prompt,literal_system_prompt,user_prompt,metaphoricity,size,count,mean,std,min,25%,50%,75%,max
20,gpt-4.1-nano,r3,spt4m,spt4l,upt1,metaphorical,1000,1000.0,0.959605,0.070247,0.49332,0.956625,0.98943,1.0,1.0
10,meta-llama_llama-4-scout-17b-16e-instruct,r3,spt4m,spt4l,upt1,metaphorical,1000,1000.0,0.958324,0.074486,0.231261,0.9555,0.988597,1.0,1.0
1,meta-llama_llama-4-maverick-17b-128e-instruct,r6m,spt4m,spt4l,upt1,metaphorical,1000,1000.0,0.958008,0.072298,0.511672,0.95289,0.989601,1.0,1.0
25,meta-llama_llama-4-maverick-17b-128e-instruct,r7m,spt4m,spt4l,upt1,metaphorical,1000,1000.0,0.957881,0.072383,0.492707,0.951127,0.990218,1.0,1.0
11,meta-llama_llama-4-maverick-17b-128e-instruct,r3,spt4m,spt4l,upt1,metaphorical,1000,1000.0,0.957382,0.071643,0.493566,0.948683,0.9894,1.0,1.0


In [41]:
# Box plot of 'comet_ref' per experiment, restricted to files whose metaphoricity field is 'metaphorical'
plot_metric_distribution('results/en2es/full_results', 'comet_ref', 'metaphorical')