In [None]:
import pandas as pd

def parse_metrics(text):
    """Parse an evaluation log into a table with one row per model.

    A line that starts with ``Processing model:`` opens a new row. Metric
    lines that follow (WER, CER, error counts and rates) are attached to the
    most recently opened row. Metric lines appearing before the first model
    header are ignored, and every other line (progress bars, "Results saved
    to ..." and so on) is skipped.

    Args:
        text: The raw log as a single string, with lines separated by
            newline characters.

    Returns:
        A ``pandas.DataFrame`` with a ``Model`` column plus one column for
        each metric found. A model whose metrics are absent from the log
        still gets a row, with missing values in the metric columns (which
        makes pandas store the integer count columns as floats).

    Raises:
        ValueError: If the text following a recognised metric label cannot
            be converted to a number.
    """
    model_marker = "Processing model:"

    # (label in the log, output column, converter). Checked in this order;
    # the first label found in a line wins.
    metric_fields = (
        ("Calculated overall WER:", 'WER', float),
        ("Calculated overall CER:", 'CER', float),
        ("Total Substitutions:", 'Total Substitutions', int),
        ("Total Insertions:", 'Total Insertions', int),
        ("Total Deletions:", 'Total Deletions', int),
        ("Total Words:", 'Total Words', int),
        ("Substitution Rate:", 'Substitution Rate', float),
        ("Insertion Rate:", 'Insertion Rate', float),
        ("Deletion Rate:", 'Deletion Rate', float),
    )

    models_data = []
    metrics = None  # Row currently being filled; None until a model header is seen

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        if line.startswith(model_marker):
            # Close the previous model's row before opening a new one.
            if metrics is not None:
                models_data.append(metrics)
            # Interrupted runs can leave several headers fused on one line
            # ("Processing model: A...Processing model: B"). The metrics that
            # follow belong to the run that actually started, i.e. the name
            # after the LAST marker, so take that one.
            model_name = line.rpartition(model_marker)[2].strip()
            metrics = {'Model': model_name}
            continue

        if metrics is None:
            # No model header yet: there is no row to attach a metric to.
            continue

        for label, column, convert in metric_fields:
            if label in line:
                # Split on the bare label (no trailing space) so that
                # "Label:value" parses just like "Label: value".
                metrics[column] = convert(line.split(label)[1].strip())
                break

    # The final model has no following header to trigger its append.
    if metrics is not None:
        models_data.append(metrics)

    return pd.DataFrame(models_data)

# Captured console log of the model evaluation runs; this is the input to parse_metrics()
text = """Processing model: asr-africa/whisper-small-Sagalee-orm-50hrs-2
Running inference: 100%|██████████| 2500/2500 [13:28<00:00,  3.09it/s]

Overall Metrics:
Calculated overall WER: 0.3680
Calculated overall CER: 0.0995
Total Substitutions: 9009
Total Insertions: 871
Total Deletions: 1293
Total Words: 30363
Substitution Rate: 0.2967
Insertion Rate: 0.0287
Deletion Rate: 0.0426
Results saved to text_expanded_whisper-small-Sagalee-orm-50hrs-2.csv
Processing model: asr-africa/whisper-small-Sagalee-orm-20hrs-3
config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]
generation_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]
merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]
normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [13:35<00:00,  3.06it/s]

Overall Metrics:
Calculated overall WER: 0.3909
Calculated overall CER: 0.0843
Total Substitutions: 9921
Total Insertions: 1023
Total Deletions: 926
Total Words: 30363
Substitution Rate: 0.3267
Insertion Rate: 0.0337
Deletion Rate: 0.0305
Results saved to text_expanded_whisper-small-Sagalee-orm-20hrs-3.csv
Processing model: asr-africa/whisper-small-Sagalee-orm-10hrs-3
config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]
generation_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]
merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]
normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [12:21<00:00,  3.37it/s]

Overall Metrics:
Calculated overall WER: 0.5418
Calculated overall CER: 0.2459
Total Substitutions: 10350
Total Insertions: 1105
Total Deletions: 4996
Total Words: 30363
Substitution Rate: 0.3409
Insertion Rate: 0.0364
Deletion Rate: 0.1645
Results saved to text_expanded_whisper-small-Sagalee-orm-10hrs-3.csv
Processing model: asr-africa/whisper-small-Sagalee-orm-5hrs-3
config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]
generation_config.json:   0%|          | 0.00/3.79k [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]
merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]
normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [11:50<00:00,  3.52it/s]

Overall Metrics:
Calculated overall WER: 0.6321
Calculated overall CER: 0.3170
Total Substitutions: 11103
Total Insertions: 1992
Total Deletions: 6098
Total Words: 30363
Substitution Rate: 0.3657
Insertion Rate: 0.0656
Deletion Rate: 0.2008
Results saved to text_expanded_whisper-small-Sagalee-orm-5hrs-3.csv
Processing model: asr-africa/whisper-small-Sagalee-orm-1hrs-3asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-1hrs-1Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-5hrs-1
config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [01:06<00:00, 37.54it/s]

Overall Metrics:
Calculated overall WER: 0.4805
Calculated overall CER: 0.1238
Total Substitutions: 11629
Total Insertions: 997
Total Deletions: 1963
Total Words: 30363
Substitution Rate: 0.3830
Insertion Rate: 0.0328
Deletion Rate: 0.0647
Results saved to text_expanded_wav2vec2-xls-r-300m-Sagalee-orm-5hrs-1.csv
Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-10hrs-1
config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [01:05<00:00, 37.94it/s]

Overall Metrics:
Calculated overall WER: 0.3778
Calculated overall CER: 0.1052
Total Substitutions: 9294
Total Insertions: 843
Total Deletions: 1335
Total Words: 30363
Substitution Rate: 0.3061
Insertion Rate: 0.0278
Deletion Rate: 0.0440
Results saved to text_expanded_wav2vec2-xls-r-300m-Sagalee-orm-10hrs-1.csv
Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-20hrs-1
config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [01:06<00:00, 37.87it/s]

Overall Metrics:
Calculated overall WER: 0.3154
Calculated overall CER: 0.0808
Total Substitutions: 8075
Total Insertions: 618
Total Deletions: 885
Total Words: 30363
Substitution Rate: 0.2659
Insertion Rate: 0.0204
Deletion Rate: 0.0291
Results saved to text_expanded_wav2vec2-xls-r-300m-Sagalee-orm-20hrs-1.csv
Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-50hrs-1
config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [01:05<00:00, 37.95it/s]

Overall Metrics:
Calculated overall WER: 0.2776
Calculated overall CER: 0.0744
Total Substitutions: 7295
Total Insertions: 489
Total Deletions: 644
Total Words: 30363
Substitution Rate: 0.2403
Insertion Rate: 0.0161
Deletion Rate: 0.0212
Results saved to text_expanded_wav2vec2-xls-r-300m-Sagalee-orm-50hrs-1.csv
Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-85hrs-4asr-africa/mms-1b-all-Sagalee-orm-85hrs-4Processing model: asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-85hrs-4
config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [01:05<00:00, 38.22it/s]

Overall Metrics:
Calculated overall WER: 0.2503
Calculated overall CER: 0.0421
Total Substitutions: 6404
Total Insertions: 477
Total Deletions: 719
Total Words: 30363
Substitution Rate: 0.2109
Insertion Rate: 0.0157
Deletion Rate: 0.0237
Results saved to text_expanded_wav2vec2-xls-r-300m-Sagalee-orm-85hrs-4.csv
Processing model: asr-africa/mms-1b-all-Sagalee-orm-85hrs-4
config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [02:37<00:00, 15.87it/s]

Overall Metrics:
Calculated overall WER: 0.2122
Calculated overall CER: 0.0356
Total Substitutions: 5552
Total Insertions: 294
Total Deletions: 597
Total Words: 30363
Substitution Rate: 0.1829
Insertion Rate: 0.0097
Deletion Rate: 0.0197
Results saved to text_expanded_mms-1b-all-Sagalee-orm-85hrs-4.csv
Processing model: asr-africa/mms-1b-all-Sagalee-orm-50hrs-1
config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [02:37<00:00, 15.83it/s]

Overall Metrics:
Calculated overall WER: 0.2886
Calculated overall CER: 0.0771
Total Substitutions: 7501
Total Insertions: 417
Total Deletions: 844
Total Words: 30363
Substitution Rate: 0.2470
Insertion Rate: 0.0137
Deletion Rate: 0.0278
Results saved to text_expanded_mms-1b-all-Sagalee-orm-50hrs-1.csv
Processing model: asr-africa/mms-1b-all-Sagalee-orm-20hrs-1
config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [02:38<00:00, 15.82it/s]

Overall Metrics:
Calculated overall WER: 0.3094
Calculated overall CER: 0.0887
Total Substitutions: 8015
Total Insertions: 895
Total Deletions: 483
Total Words: 30363
Substitution Rate: 0.2640
Insertion Rate: 0.0295
Deletion Rate: 0.0159
Results saved to text_expanded_mms-1b-all-Sagalee-orm-20hrs-1.csv
Processing model: asr-africa/mms-1b-all-Sagalee-orm-10hrs-1
config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]
tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]
vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]
added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]
special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]
preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]
Running inference: 100%|██████████| 2500/2500 [02:38<00:00, 15.82it/s]

Overall Metrics:
Calculated overall WER: 0.3033
Calculated overall CER: 0.0815
Total Substitutions: 7827
Total Insertions: 460
Total Deletions: 922
Total Words: 30363
Substitution Rate: 0.2578
Insertion Rate: 0.0152
Deletion Rate: 0.0304
Results saved to text_expanded_mms-1b-all-Sagalee-orm-10hrs-1.csv
Processing model: asr-africa/mms-1b-all-Sagalee-orm-5hrs-1
"""

# Build the per-model metrics table from the captured log text.
results_df = parse_metrics(text)

# Show the table. display() is only defined inside IPython/Jupyter/Colab,
# where it renders a rich HTML table; when this runs as a plain Python
# script, fall back to printing the frame so the CSV below is still written.
print("Metrics Table for Models:")
try:
    display(results_df)
except NameError:
    print(results_df)

# Also persist the table to CSV (without the row index) for later use.
results_df.to_csv('model_metrics.csv', index=False)
print("\nResults saved to 'model_metrics.csv'")

Metrics Table for Models:


Unnamed: 0,Model,WER,CER,Total Substitutions,Total Insertions,Total Deletions,Total Words,Substitution Rate,Insertion Rate,Deletion Rate
0,asr-africa/whisper-small-Sagalee-orm-50hrs-2,0.368,0.0995,9009.0,871.0,1293.0,30363.0,0.2967,0.0287,0.0426
1,asr-africa/whisper-small-Sagalee-orm-20hrs-3,0.3909,0.0843,9921.0,1023.0,926.0,30363.0,0.3267,0.0337,0.0305
2,asr-africa/whisper-small-Sagalee-orm-10hrs-3,0.5418,0.2459,10350.0,1105.0,4996.0,30363.0,0.3409,0.0364,0.1645
3,asr-africa/whisper-small-Sagalee-orm-5hrs-3,0.6321,0.317,11103.0,1992.0,6098.0,30363.0,0.3657,0.0656,0.2008
4,asr-africa/whisper-small-Sagalee-orm-1hrs-3asr...,0.4805,0.1238,11629.0,997.0,1963.0,30363.0,0.383,0.0328,0.0647
5,asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-10h...,0.3778,0.1052,9294.0,843.0,1335.0,30363.0,0.3061,0.0278,0.044
6,asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-20h...,0.3154,0.0808,8075.0,618.0,885.0,30363.0,0.2659,0.0204,0.0291
7,asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-50h...,0.2776,0.0744,7295.0,489.0,644.0,30363.0,0.2403,0.0161,0.0212
8,asr-africa/wav2vec2-xls-r-300m-Sagalee-orm-85h...,0.2503,0.0421,6404.0,477.0,719.0,30363.0,0.2109,0.0157,0.0237
9,asr-africa/mms-1b-all-Sagalee-orm-85hrs-4,0.2122,0.0356,5552.0,294.0,597.0,30363.0,0.1829,0.0097,0.0197



Results saved to 'model_metrics.csv'
