# Converting the result .txt file into a .csv file

In [None]:
import csv
import re
import os
from pathlib import Path
import pandas as pd

In [None]:
def _split_table_row(line):
    """Split one pipe-delimited table row into a list of stripped cells.

    Only one leading and one trailing ``|`` are removed, so a completely
    empty first or last cell (e.g. ``||x|``) keeps its column position.
    """
    if line.startswith('|'):
        line = line[1:]
    if line.endswith('|'):
        line = line[:-1]
    return [cell.strip() for cell in line.split('|')]


def clean_raw_result_file(input_file_path, output_file_path):
    """Convert a pipe-delimited results table (.txt) into a CSV file.

    Blank lines and lines consisting solely of ``|`` and ``-`` are skipped.
    The first remaining line is the header row; columns whose header cell is
    blank are dropped from the header and from every data row. When the
    first kept cell of a data row is blank it is filled with the most recent
    non-blank value seen in that position.

    Args:
        input_file_path: Path (``str`` or ``Path``) of the UTF-8 text table.
        output_file_path: Path (``str`` or ``Path``) of the CSV to write.
            Missing parent directories are created.

    Raises:
        ValueError: If the input has no table rows or no named columns.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        stripped = (raw.strip() for raw in f)
        # NOTE: only rules made purely of '|' and '-' are skipped here; a
        # rule carrying ':' alignment markers is kept as an ordinary row.
        lines = [
            line for line in stripped
            if line and not set(line) <= {'|', '-'}
        ]

    if not lines:
        raise ValueError(f"No table rows found in {input_file_path}")

    all_headers = _split_table_row(lines[0])
    # Positions of the columns that have a non-blank header.
    kept_columns = [i for i, header in enumerate(all_headers) if header]
    if not kept_columns:
        raise ValueError(f"No named columns found in {input_file_path}")
    headers = [all_headers[i] for i in kept_columns]

    rows = []
    last_task = ""
    for line in lines[1:]:
        cells = _split_table_row(line)
        # Rows shorter than the header simply yield fewer cells.
        items = [cells[i] for i in kept_columns if i < len(cells)]
        if not items:
            continue

        if items[0] == '':
            items[0] = last_task
        else:
            last_task = items[0]
        rows.append(items)

    output_file_path = Path(output_file_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)

In [None]:
def further_clean_csv_file(csv_path):
    """Trim a converted results CSV in place.

    Steps, in order:
      1. Discard rows whose ``Filter`` value is ``remove_whitespace``
         (only when a ``Filter`` column exists).
      2. Remove the ``Filter``, ``n-shot`` and ``Version`` columns where
         present.
      3. Discard the first remaining row.
      4. Overwrite ``csv_path`` with the result, without the index.

    NOTE(review): step 3 presumably removes the table's alignment rule
    (a row such as ``------:``) left over from the raw table; confirm
    before relying on it for other inputs.
    """
    frame = pd.read_csv(csv_path)

    if 'Filter' in frame.columns:
        keep_mask = frame['Filter'] != 'remove_whitespace'
        frame = frame[keep_mask]

    unwanted = [
        name for name in ('Filter', 'n-shot', 'Version')
        if name in frame.columns
    ]
    frame = frame.drop(columns=unwanted)

    # Everything except the first remaining row is written back.
    frame.iloc[1:].to_csv(csv_path, index=False)


In [None]:
def process_all_raw_results(raw_dir='data/results/result_raw',
                            cleaned_dir='data/results/result_cleaned'):
    """Convert every raw ``.txt`` result table into a cleaned CSV.

    Each sub-directory of ``raw_dir`` is treated as one model. Every
    ``*.txt`` file inside it is converted with ``clean_raw_result_file``
    and then trimmed with ``further_clean_csv_file``; the output is written
    to ``cleaned_dir/<model>/<txt stem>_<model lower-cased>.csv``.

    Failures are reported with ``print`` and do not stop the run. When the
    conversion step fails, the trimming step is skipped for that file so a
    CSV left over from an earlier run is not trimmed a second time.

    Args:
        raw_dir: Directory holding one sub-directory per model.
        cleaned_dir: Directory that receives the cleaned CSV files.
    """
    raw_dir = Path(raw_dir)
    cleaned_dir = Path(cleaned_dir)

    if not raw_dir.exists():
        print(f"Raw results directory not found: {raw_dir}")
        return

    # Sorted so files are processed (and errors reported) in a stable order.
    for model_dir in sorted(raw_dir.iterdir()):
        if not model_dir.is_dir():
            continue
        model_name = model_dir.name

        cleaned_model_dir = cleaned_dir / model_name
        cleaned_model_dir.mkdir(parents=True, exist_ok=True)

        for txt_file in sorted(model_dir.glob('*.txt')):
            task_name = txt_file.stem
            output_file = cleaned_model_dir / f"{task_name}_{model_name.lower()}.csv"

            try:
                clean_raw_result_file(txt_file, output_file)
            except Exception as e:
                print(f"Error processing {txt_file}: {e}")
                # Nothing fresh was written; do not touch output_file.
                continue

            try:
                further_clean_csv_file(output_file)
            except Exception as e:
                print(f"Error further cleaning {output_file}: {e}")


# Result Analysis

In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

def extract_language(task_name):
    """Return the language code embedded in ``task_name``, or ``None``.

    A code is recognised only when it is one of the listed three-letter
    codes and has an underscore on both sides; the first such occurrence
    wins. Names that merely end in a code (``foo_amh``) therefore yield
    ``None``.
    """
    found = re.search(
        r'_(amh|eng|ewe|fra|hau|ibo|kin|lin|lug|orm|sna|sot|swa|twi|vai|wol|xho|yor|zul)_',
        task_name,
    )
    return found.group(1) if found else None

def load_and_process_data(results_dir='data/results/result_cleaned'):
    """Load every cleaned result CSV into one DataFrame.

    Each sub-directory of ``results_dir`` is treated as one model and each
    ``*.csv`` inside it as one task file. Three columns are added to every
    loaded frame:

    * ``Model``    - the sub-directory name.
    * ``Task``     - the file stem with a trailing ``_<model lower-cased>``
      removed when present. ``process_all_raw_results`` appends that suffix
      to its output names, so stripping it lets the same task line up
      across models.
    * ``Language`` - ``extract_language`` applied to the ``Tasks`` column.

    Files that fail to load (including files without a ``Tasks`` column)
    are reported with ``print`` and skipped.

    Args:
        results_dir: Directory holding one sub-directory per model.

    Returns:
        The concatenation of all loaded frames with a fresh integer index.

    Raises:
        ValueError: If no CSV file could be loaded.
    """
    results_dir = Path(results_dir)
    all_data = []

    # Sorted so the concatenated rows come out in a stable order.
    for model_dir in sorted(results_dir.iterdir()):
        if not model_dir.is_dir():
            continue
        model_name = model_dir.name
        model_suffix = f"_{model_name.lower()}"

        for csv_file in sorted(model_dir.glob('*.csv')):
            task_name = csv_file.stem
            if len(task_name) > len(model_suffix) and task_name.endswith(model_suffix):
                task_name = task_name[:-len(model_suffix)]

            try:
                df = pd.read_csv(csv_file)

                df['Model'] = model_name
                df['Task'] = task_name
                df['Language'] = df['Tasks'].apply(extract_language)

                all_data.append(df)

            except Exception as e:
                print(f"Error processing {csv_file}: {e}")

    if not all_data:
        raise ValueError(f"No result CSV files could be loaded from {results_dir}")

    return pd.concat(all_data, ignore_index=True)

def _summarise_by(df, key):
    """Return mean/count of ``Value`` and mean of ``Stderr`` per ``key`` group."""
    stats = df.groupby(key).agg({
        'Value': ['mean', 'count'],
        'Stderr': 'mean'
    }).round(4)
    # Flatten the two-level column index produced by the dict aggregation.
    stats.columns = ['Value_Avg', 'Count', 'Stderr_Avg']
    return stats.reset_index()


def calculate_averages(df):
    """Aggregate results per language, per model and per task.

    ``Value`` and ``Stderr`` are converted to numbers first; entries that
    cannot be parsed become NaN and are ignored by the means and the count.
    The conversion is done on a copy, so ``df`` itself is left unchanged.

    Args:
        df: DataFrame with ``Language``, ``Model``, ``Task``, ``Value`` and
            ``Stderr`` columns.

    Returns:
        A tuple ``(language_stats, model_stats, task_stats)``. Each frame
        has the grouping column followed by ``Value_Avg``, ``Count`` and
        ``Stderr_Avg``, rounded to four decimals.
    """
    numeric = df.copy()
    for column in ('Value', 'Stderr'):
        numeric[column] = pd.to_numeric(numeric[column], errors='coerce')

    language_stats = _summarise_by(numeric, 'Language')
    model_stats = _summarise_by(numeric, 'Model')
    task_stats = _summarise_by(numeric, 'Task')

    return language_stats, model_stats, task_stats

def main():
    """Load the cleaned results, print summary tables and save them as CSV.

    Prints the record count and the distinct languages, models and tasks,
    then the per-language, per-model and per-task averages returned by
    ``calculate_averages``. The three tables are written to
    ``results/analysis/``, which is created if it does not exist.
    """
    df = load_and_process_data()

    print(f"Total records loaded: {len(df)}")
    # Rows whose task name carries no recognised language have a missing
    # Language value; drop those so sorted() only compares strings.
    print(f"Languages found: {sorted(df['Language'].dropna().unique())}")
    print(f"Models found: {sorted(df['Model'].unique())}")
    print(f"Tasks found: {sorted(df['Task'].unique())}")
    print("\n" + "="*50)

    # calculate_averages returns exactly three frames.
    language_stats, model_stats, task_stats = calculate_averages(df)

    print("AVERAGES BY LANGUAGE:")
    print(language_stats.to_string(index=False))
    print("\n" + "="*50)

    print("AVERAGES BY MODEL:")
    print(model_stats.to_string(index=False))
    print("\n" + "="*50)

    print("AVERAGES BY TASK:")
    print(task_stats.to_string(index=False))

    # Save results to CSV files; to_csv does not create missing directories.
    Path('results/analysis').mkdir(parents=True, exist_ok=True)
    language_stats.to_csv('results/analysis/language_averages.csv', index=False)
    model_stats.to_csv('results/analysis/model_averages.csv', index=False)
    task_stats.to_csv('results/analysis/task_averages.csv', index=False)

    print("\n" + "="*50)
    print("Results saved to:")
    print("- results/analysis/language_averages.csv")
    print("- results/analysis/model_averages.csv")
    print("- results/analysis/task_averages.csv")

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main() 