In [1]:
import pandas as pd
from sklearn.metrics import classification_report

In [32]:
def generate_classification_report_per_dataset(dataset, model_columns, true_label_column='label'):
    """Build one long-format table of per-class metrics for several models.

    For every column in ``model_columns`` the predictions are scored against
    ``dataset[true_label_column]`` with
    ``sklearn.metrics.classification_report``. Predictions equal to
    ``'error'`` or ``''`` are relabelled as the pseudo-class
    ``'error_class'`` before scoring so they show up as their own row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the true labels and one prediction column per model.
    model_columns : iterable of str
        Names of the prediction columns to evaluate.
    true_label_column : str, default 'label'
        Name of the column holding the true labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``Class``, ``Precision``, ``Recall``, ``F1-Score``
        and ``Support``, one row per dict-valued entry of the report
        (non-dict entries such as the scalar accuracy are skipped). For
        ``'error_class'`` rows, ``Support`` is the number of error
        predictions of that model, overriding the value from the report.
        The columns are present even when no rows are produced.
    """
    result_columns = ['Model', 'Class', 'Precision', 'Recall', 'F1-Score', 'Support']
    error_markers = ['error', '']
    error_label = 'error_class'

    true_labels = dataset[true_label_column]
    detailed_results = []

    for model_col in model_columns:
        predictions = dataset[model_col]

        # Count failed generations before they are relabelled.
        error_count = predictions.isin(error_markers).sum()

        # Map every error marker onto one dedicated class for the report.
        predictions = predictions.replace(error_markers, error_label)
        report = classification_report(true_labels, predictions, output_dict=True, zero_division=0)

        for label, metrics in report.items():
            if not isinstance(metrics, dict):
                continue
            # The error class gets the counted number of error predictions
            # as its support instead of the value supplied by the report.
            support = error_count if label == error_label else metrics.get('support')
            detailed_results.append({
                'Model': model_col,
                'Class': label,
                'Precision': metrics.get('precision'),
                'Recall': metrics.get('recall'),
                'F1-Score': metrics.get('f1-score'),
                'Support': support,
            })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(detailed_results, columns=result_columns)


In [43]:
# Base name of the dataset: the data is read from datasets/<ds_name>.feather
# and the report is written to results/<ds_name>_classrepo.csv.
# NOTE(review): the original assignment had no right-hand side (SyntaxError);
# replace the placeholder with the real dataset name before running.
ds_name = 'REPLACE_WITH_DATASET_NAME'

In [44]:
# Load the dataset named by `ds_name` from its Feather file under datasets/.
df = pd.read_feather(path=f'datasets/{ds_name}.feather')

In [15]:
# Prediction columns (one per evaluated model) to score in the dataset.
models = [
    'Llama-3.1-8B-Instruct',
    'Llama-3.1-70B-Instruct',
    'Ministral-8B-Instruct-2410',
    'gemma-2-9b-it',
    'SauerkrautLM-gemma-2-9b-it',
    'Teuken-7B-instruct-research-v0.4',
    'Llama-3.3-70B-Instruct',
    'Llama-3.1-SauerkrautLM-8b-Instruct',
]

In [51]:
# Score every model column listed in `models` against the default 'label' column of `df`.
class_repo = generate_classification_report_per_dataset(dataset=df, model_columns=models)

In [53]:
# Save the per-class report as results/<ds_name>_classrepo.csv (row index included).
class_repo.to_csv(path_or_buf=f'results/{ds_name}_classrepo.csv')

In [54]:
import os

# Folder holding the per-dataset classification reports: CSV files with
# 'Model', 'Class' and 'Support' columns.
# NOTE(review): `folder_path` was never defined in the notebook (NameError on
# a fresh kernel). 'results' is assumed because that is where the
# '<ds_name>_classrepo.csv' reports are written -- confirm.
folder_path = 'results'

# One record per (report file, model), plus a running total per model.
error_per_model_per_dataset = []
error_per_model_total = {}

# sorted() makes the output row order reproducible; os.listdir returns
# entries in arbitrary order.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith('.csv'):
        continue

    # Dedicated name so the dataset frame `df` loaded earlier is not clobbered.
    report_df = pd.read_csv(os.path.join(folder_path, file))
    error_rows = report_df[report_df['Class'] == 'error_class']

    # Sum the error support per model within this report file.
    for model, support_sum in error_rows.groupby('Model')['Support'].sum().items():
        error_per_model_per_dataset.append({
            'Dataset': file,
            'Model': model,
            'Total Errors': support_sum
        })

        # Accumulate the total per model across all report files.
        error_per_model_total[model] = error_per_model_total.get(model, 0) + support_sum

# Explicit columns keep the CSV headers stable even when nothing was found.
df_per_dataset = pd.DataFrame(
    error_per_model_per_dataset,
    columns=['Dataset', 'Model', 'Total Errors'],
)
df_total_errors = pd.DataFrame(
    list(error_per_model_total.items()),
    columns=['Model', 'Total Errors Across Datasets'],
)

df_per_dataset.to_csv('error_ds.csv', index=False)
df_total_errors.to_csv('error_sum.csv', index=False)