In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
def analyze_ocr_summary(file_path):
    """
    Print a per-character summary of OCR errors found in a CSV file.

    Each row's 'reference' text is compared with its 'generated' text position
    by position (index i against index i; no alignment is attempted):

    * a differing position counts as one error against the reference character;
    * reference characters beyond the end of the generated text are counted
      against that reference character (tallied internally as '<MISSING>');
    * generated characters beyond the end of the reference text are reported
      as "EXTRA:'<char>'".

    Limitations: because the comparison is purely positional, one inserted or
    dropped character shifts every later position in that row, so it can be
    counted as many substitutions. Comparison is per code point, so combining
    marks are tallied separately from the character they attach to.

    Args:
        file_path (str): The path to the CSV file. It must contain
            'reference' and 'generated' columns.

    Returns:
        None. The summary, or an error message when the file is missing or
        lacks the required columns, is printed to standard output.
    """
    try:
        # Read every cell as the exact text stored in the file. With pandas'
        # default inference an empty cell becomes NaN (and str(NaN) == 'nan'
        # would then be scored as the letters n/a/n), tokens such as 'NA' or
        # 'null' are turned into NaN, and numeric-looking text such as '007'
        # is reparsed as the number 7.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return

    if 'reference' not in df.columns or 'generated' not in df.columns:
        print("Error: The CSV file must contain 'reference' and 'generated' columns.")
        return

    # To store incorrect transformations: {'actual_char': {'incorrect_char': count}}
    error_counts = defaultdict(lambda: defaultdict(int))
    total_misidentifications = 0

    # fillna('') is a safety net for any value that still arrives as missing,
    # so a missing cell is always treated as empty text.
    reference_column = df['reference'].fillna('')
    generated_column = df['generated'].fillna('')

    for reference_value, generated_value in zip(reference_column, generated_column):
        reference_text = str(reference_value)
        generated_text = str(generated_value)

        # Positions present in both strings: count substitutions.
        for ref_char, gen_char in zip(reference_text, generated_text):
            if ref_char != gen_char:
                error_counts[ref_char][gen_char] += 1
                total_misidentifications += 1

        # At most one of the two tails below is non-empty.
        overlap = min(len(reference_text), len(generated_text))

        # Characters present in reference but missing in generated.
        for ref_char in reference_text[overlap:]:
            error_counts[ref_char]['<MISSING>'] += 1
            total_misidentifications += 1

        # Extra characters in generated (not present in reference).
        for gen_char in generated_text[overlap:]:
            error_counts['<EXTRA>'][gen_char] += 1
            total_misidentifications += 1

    print(f"Total Number of Misidentifications: {total_misidentifications}")
    print("\nMost Erroneous Characters (based on total errors):")
    print("-" * 50)

    if total_misidentifications == 0:
        print("No character mismatches found.")
        return

    overall_problematic_chars = defaultdict(int)
    for ref_char, errors in error_counts.items():
        if ref_char == '<EXTRA>':
            # Extra characters have no reference character to blame, so they
            # are reported under the extra character itself.
            for gen_char, count in errors.items():
                overall_problematic_chars[f"EXTRA:'{gen_char}'"] += count
        else:
            overall_problematic_chars[ref_char] += sum(errors.values())

    # Highest error count first; sorted() is stable, so ties keep the order in
    # which the characters were first seen.
    sorted_problematic = sorted(
        overall_problematic_chars.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    for char, total_errors in sorted_problematic:
        print(f"Character '{char}': {total_errors} errors")

In [3]:
# Print the character-level error summary for the OCR results CSV.
analyze_ocr_summary(file_path='Google_DocumentAI_OCR_Results.csv')

Total Number of Misidentifications: 36349

Most Erroneous Characters (based on total errors):
--------------------------------------------------
Character '்': 7461 errors
Character 'க': 3248 errors
Character ' ': 2998 errors
Character 'த': 2458 errors
Character 'ன': 1769 errors
Character 'ர': 1627 errors
Character 'ம': 1599 errors
Character 'ட': 1552 errors
Character 'ப': 1511 errors
Character 'வ': 1219 errors
Character 'ல': 1165 errors
Character 'ற': 978 errors
Character 'ள': 962 errors
Character 'ய': 921 errors
Character 'ந': 823 errors
Character 'ச': 655 errors
Character 'அ': 557 errors
Character 'இ': 504 errors
Character 'எ': 490 errors
Character 'EXTRA:'்'': 470 errors
Character 'ண': 414 errors
Character 'ஒ': 407 errors
Character 'ங': 387 errors
Character 'உ': 241 errors
Character 'ு': 234 errors
Character 'ீ': 194 errors
Character 'ழ': 168 errors
Character 'ா': 164 errors
Character 'EXTRA:'த'': 149 errors
Character 'ி': 130 errors
Character 'ஆ': 104 errors
Character 'ே': 82 erro