In [1]:
import pandas as pd
from collections import defaultdict

In [7]:
def analyze_ocr_summary(file_path):
    """
    Print a character-level summary of OCR errors found in a CSV file.

    Each row's 'reference' text is compared with its 'generated' text
    position by position:

    * a differing character at the same index is counted against the
      reference character;
    * reference characters beyond the end of the generated text are counted
      against that reference character as '<MISSING>';
    * generated characters beyond the end of the reference text are counted
      as extras and reported as "EXTRA:'<char>'".

    Note: the comparison is purely positional (no alignment is performed), so
    a single inserted or dropped character shifts every later comparison in
    that row and inflates the counts.

    Args:
        file_path (str): The path to the CSV file. It must contain
            'reference' and 'generated' columns.

    Returns:
        None. The summary (or an error message) is printed to stdout.
    """
    try:
        # Read every cell verbatim as text. With the pandas defaults, empty
        # cells (and strings such as "NA" or "null") become NaN, which str()
        # would turn into the literal text "nan", and numeric-looking text is
        # re-formatted (e.g. "007" -> 7 -> "7"); both corrupt the comparison.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row to parse.
        print(f"Error: The file '{file_path}' is empty.")
        return

    if 'reference' not in df.columns or 'generated' not in df.columns:
        print("Error: The CSV file must contain 'reference' and 'generated' columns.")
        return

    # To store incorrect transformations: {'actual_char': {'incorrect_char': count}}
    error_counts = defaultdict(lambda: defaultdict(int))
    total_misidentifications = 0

    # Defensive: any value that is still missing is treated as empty text
    # rather than being stringified to "nan".
    reference_texts = df['reference'].fillna('').astype(str)
    generated_texts = df['generated'].fillna('').astype(str)

    for reference_text, generated_text in zip(reference_texts, generated_texts):
        # zip stops at the shorter string, i.e. it covers the shared prefix length.
        for ref_char, gen_char in zip(reference_text, generated_text):
            if ref_char != gen_char:
                error_counts[ref_char][gen_char] += 1
                total_misidentifications += 1

        # At most one of the two tails below is non-empty.
        shared_length = min(len(reference_text), len(generated_text))

        # Characters present in reference but missing in generated
        for ref_char in reference_text[shared_length:]:
            error_counts[ref_char]['<MISSING>'] += 1
            total_misidentifications += 1

        # Extra characters in generated (not present in reference)
        for gen_char in generated_text[shared_length:]:
            error_counts['<EXTRA>'][gen_char] += 1
            total_misidentifications += 1

    print(f"Total Number of Misidentifications: {total_misidentifications}")
    print("\nMost Erroneous Characters (based on total errors):")
    print("-" * 50)

    if total_misidentifications == 0:
        print("No character mismatches found.")
        return

    # Collapse the nested counts into one total per reported label.
    overall_problematic_chars = defaultdict(int)
    for ref_char, errors in error_counts.items():
        if ref_char == '<EXTRA>':
            # Extras have no reference character; report them by the extra char itself.
            for gen_char, count in errors.items():
                overall_problematic_chars[f"EXTRA:'{gen_char}'"] += count
        else:
            overall_problematic_chars[ref_char] += sum(errors.values())

    # Stable sort: labels with equal totals keep first-seen order.
    sorted_problematic = sorted(
        overall_problematic_chars.items(), key=lambda item: item[1], reverse=True
    )

    for char, total_errors in sorted_problematic:
        print(f"Character '{char}': {total_errors} errors")

In [9]:
# Print the character-level error summary for the CSV file named below.
analyze_ocr_summary(file_path='Surya_OCR_Results.csv')

Total Number of Misidentifications: 127225

Most Erroneous Characters (based on total errors):
--------------------------------------------------
Character ' ': 17298 errors
Character '්': 9520 errors
Character 'ි': 7181 errors
Character 'න': 6686 errors
Character 'ර': 6164 errors
Character 'ය': 5864 errors
Character 'ා': 5661 errors
Character 'ව': 5412 errors
Character 'ක': 5358 errors
Character 'ම': 4569 errors
Character 'ත': 4420 errors
Character 'ස': 3440 errors
Character 'ු': 3412 errors
Character 'ල': 2921 errors
Character 'ද': 2752 errors
Character 'ප': 2727 errors
Character 'ේ': 2546 errors
Character 'ෙ': 2271 errors
Character 'ට': 2121 errors
Character 'හ': 2065 errors
Character 'ී': 1898 errors
Character 'ග': 1761 errors
Character 'ැ': 1365 errors
Character 'අ': 1109 errors
Character 'බ': 1108 errors
Character '.': 857 errors
Character 'ො': 791 errors
Character 'ණ': 781 errors
Character 'ශ': 751 errors
Character 'ං': 727 errors
Character 'ඩ': 650 errors
Character 'ජ': 632 err