In [2]:
import os
import spacy
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import jiwer

##### The cell below uses the spaCy library to tokenize the texts and scikit-learn to calculate the evaluation metrics (accuracy, precision, recall, F1 score).

In [3]:
# Load the spaCy English model whose tokenizer is used by evaluate_ocr below.
nlp = spacy.load("en_core_web_sm")
# nlp.max_length is also read by evaluate_ocr as the chunk size (in characters)
# when it splits texts that are longer than this limit.
nlp.max_length = 1500000  # Increase this value if needed

def read_text_from_file(file_path):
    """
    Load a UTF-8 text file and return its contents without surrounding whitespace.

    :param file_path: Location of the file to load.
    :return: The file's text with leading and trailing whitespace removed.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read().strip()

def split_text(text, max_length):
    """
    Cut a text into consecutive pieces of at most max_length characters.

    :param text: The text to cut up.
    :param max_length: Upper bound on the length of every piece.
    :return: List of pieces in their original order (empty list for empty text).
    """
    pieces = []
    # Walk over the start offset of every piece; only the last piece may be shorter.
    for start in range(0, len(text), max_length):
        stop = start + max_length
        pieces.append(text[start:stop])
    return pieces

def align_sequences(seq1, seq2):
    """
    Produce two equal-length token lists from the SequenceMatcher opcodes.

    Matching runs are copied to both outputs. Tokens that occur on only one
    side (deleted, inserted or replaced) are paired with an empty string on
    the other side; for a replacement the seq1 tokens come first, followed by
    the seq2 tokens.

    :param seq1: First sequence (list of tokens).
    :param seq2: Second sequence (list of tokens).
    :return: Tuple of the two aligned lists.
    """
    padded_first, padded_second = [], []
    opcodes = SequenceMatcher(None, seq1, seq2).get_opcodes()
    for operation, a_lo, a_hi, b_lo, b_hi in opcodes:
        first_run = seq1[a_lo:a_hi]
        second_run = seq2[b_lo:b_hi]
        if operation == 'equal':
            padded_first.extend(first_run)
            padded_second.extend(second_run)
            continue
        # 'replace', 'delete' and 'insert' share one shape: the seq1 run is
        # paired with blanks, then blanks are paired with the seq2 run.
        # For 'delete' the seq2 run is empty, for 'insert' the seq1 run is.
        padded_first.extend(first_run)
        padded_second.extend([''] * len(first_run))
        padded_first.extend([''] * len(second_run))
        padded_second.extend(second_run)
    return padded_first, padded_second

def evaluate_ocr(ocr_text, ground_truth_text):
    """
    Evaluate OCR result against ground truth data using spaCy.

    Both texts are cut into chunks of at most ``nlp.max_length`` characters,
    each pair of chunks is tokenized and aligned, and the token-level metrics
    are computed over the concatenation of all aligned chunks.

    :param ocr_text: OCR generated text.
    :param ground_truth_text: Ground truth text.
    :return: Dictionary with evaluation metrics
             ('Accuracy', 'Precision', 'Recall', 'F1 Score').
    """
    chunk_size = nlp.max_length

    # split_text returns [] for an empty string; keep one (empty) chunk so an
    # empty text is still compared against the other side.
    ocr_text_chunks = split_text(ocr_text, chunk_size) or ['']
    ground_truth_text_chunks = split_text(ground_truth_text, chunk_size) or ['']

    # Pad the shorter list with empty chunks. A plain zip() would stop at the
    # shorter list and silently ignore the tail of the longer text, hiding
    # missing or hallucinated content from the metrics.
    chunk_count = max(len(ocr_text_chunks), len(ground_truth_text_chunks))
    ocr_text_chunks += [''] * (chunk_count - len(ocr_text_chunks))
    ground_truth_text_chunks += [''] * (chunk_count - len(ground_truth_text_chunks))

    all_aligned_ocr_tokens, all_aligned_ground_truth_tokens = [], []

    for ocr_chunk, ground_truth_chunk in zip(ocr_text_chunks, ground_truth_text_chunks):
        # Only token texts are needed, so run the tokenizer alone (make_doc)
        # instead of the full pipeline whose other annotations are discarded.
        ocr_tokens = [token.text for token in nlp.make_doc(ocr_chunk)]
        ground_truth_tokens = [token.text for token in nlp.make_doc(ground_truth_chunk)]

        # Align sequences so both lists have one entry per position
        aligned_ocr_tokens, aligned_ground_truth_tokens = align_sequences(ocr_tokens, ground_truth_tokens)
        all_aligned_ocr_tokens.extend(aligned_ocr_tokens)
        all_aligned_ground_truth_tokens.extend(aligned_ground_truth_tokens)

    # Calculate evaluation metrics; every distinct token text acts as a class label
    accuracy = accuracy_score(all_aligned_ground_truth_tokens, all_aligned_ocr_tokens)
    precision = precision_score(all_aligned_ground_truth_tokens, all_aligned_ocr_tokens, average='weighted', zero_division=0)
    recall = recall_score(all_aligned_ground_truth_tokens, all_aligned_ocr_tokens, average='weighted', zero_division=0)
    f1 = f1_score(all_aligned_ground_truth_tokens, all_aligned_ocr_tokens, average='weighted', zero_division=0)

    # Return results as a dictionary
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

def process_directory(ocr_dir, ground_truth_dir, output_csv):
    """
    Process a directory of OCR and ground truth texts and save the evaluation results to a CSV file.

    Files are paired by identical file name. Entries of ``ocr_dir`` that are
    not regular files are skipped silently; OCR files without a ground truth
    file of the same name are reported on stdout and skipped.

    :param ocr_dir: Directory containing OCRed texts.
    :param ground_truth_dir: Directory containing ground truth texts.
    :param output_csv: Path to the output CSV file.
    """
    results = []

    # os.listdir returns entries in arbitrary order; sort so the CSV rows are
    # in the same order on every run.
    for filename in sorted(os.listdir(ocr_dir)):
        ocr_file_path = os.path.join(ocr_dir, filename)
        if not os.path.isfile(ocr_file_path):
            # Sub-directories are not transcriptions, so there is nothing to
            # evaluate and no ground truth file is "missing" for them.
            continue

        ground_truth_file_path = os.path.join(ground_truth_dir, filename)
        if not os.path.isfile(ground_truth_file_path):
            print(f"Missing ground truth file for {filename}")
            continue

        ocr_text = read_text_from_file(ocr_file_path)
        ground_truth_text = read_text_from_file(ground_truth_file_path)

        eval_results = evaluate_ocr(ocr_text, ground_truth_text)
        eval_results['Filename'] = filename
        results.append(eval_results)

    # Convert results to a DataFrame and save to CSV
    df_results = pd.DataFrame(results)
    df_results.to_csv(output_csv, index=False)

if __name__ == "__main__":
    # Directories containing OCR and ground truth texts
    # NOTE(review): these are absolute paths on one machine, so this cell only
    # runs where they exist.
    ocr_directory = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3.5VTranscriptions/'
    ground_truth_directory = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/MobyDickTranscriptions/'
    # Relative path: the CSV is written to the current working directory.
    output_csv_file = 'Phi35_Evaluation.csv'

    # Process the directory and save results to CSV
    process_directory(ocr_directory, ground_truth_directory, output_csv_file)

##### The cell below uses the jiwer library to calculate the character error rate (CER) and word error rate (WER).

In [23]:
# Directories holding the reference transcriptions and the OCR output.
ground_truth = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/MobyDickTranscriptions/'
ocr = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3VTranscriptions/'

# Sorted names of the regular files in each directory. Sub-directories are
# left out: they would distort the file-count check and make open() fail.
ground_truth_dir = sorted(
    name for name in os.listdir(ground_truth)
    if os.path.isfile(os.path.join(ground_truth, name))
)
ocr_dir = sorted(
    name for name in os.listdir(ocr)
    if os.path.isfile(os.path.join(ocr, name))
)

if len(ground_truth_dir) != len(ocr_dir):
    raise ValueError("Directories do not contain the same number of files")

results = []

# Files are paired by their position in the sorted listings, not by name;
# when the two names differ both are recorded so the pairing can be checked.
for file1, file2 in zip(ground_truth_dir, ocr_dir):

    with open(os.path.join(ground_truth, file1), 'r', encoding = 'utf-8') as f1:
        text1 = f1.read()

    with open(os.path.join(ocr, file2), 'r', encoding = 'utf-8') as f2:
        text2 = f2.read()

    # jiwer takes the reference first and the hypothesis second.
    word_error_rate = jiwer.wer(text1, text2)
    char_error_rate = jiwer.cer(text1, text2)

    if file1 == file2:
        results.append({
            "Filename": file1,
            "CER": char_error_rate,
            "WER": word_error_rate
        })
    else:
        results.append({
            "Ground Truth": file1,
            "OCR": file2,
            "CER": char_error_rate,
            "WER": word_error_rate
        })

df = pd.DataFrame(results)

output_csv_path = "Phi_Error_Rate.csv"
df.to_csv(output_csv_path, index = False)

print(f"Results saved to {output_csv_path}")

Results saved to Phi_Error_Rate.csv


##### This cell was only used for calculating the evaluation metrics for Phi 3.5 Vision and merging that DataFrame with the page counts for each chapter.

In [15]:
import os
import pandas as pd
from difflib import SequenceMatcher
import spacy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load spaCy model and increase max_length
# NOTE(review): an earlier cell already binds the same model to `nlp`; this
# reload lets the cell run on its own.
nlp = spacy.load("en_core_web_sm")
# The evaluate_ocr defined in this cell passes each text to nlp in one piece
# (no chunking), so presumably texts longer than this limit are rejected by
# spaCy — TODO confirm.
nlp.max_length = 1500000  # Increase this value if needed

def benchmark_ocr_files(ocr_dir, ground_truth_dir):
    """
    Benchmark OCRed text files against ground truth texts.

    Every regular file in ``ocr_dir`` is paired with the ground truth file
    whose name is most similar (see soft_match) and evaluated against it.

    :param ocr_dir: Directory containing OCRed text files.
    :param ground_truth_dir: Directory containing ground truth text files.
    :return: DataFrame with evaluation metrics plus the 'OCR File' and
             'Ground Truth File' names, one row per evaluated OCR file.
    """
    # Only regular files are valid match candidates: a sub-directory chosen
    # as "best match" could not be read. Sorting makes tie-breaking in
    # soft_match independent of the arbitrary os.listdir order.
    ground_truth_files = sorted(
        name for name in os.listdir(ground_truth_dir)
        if os.path.isfile(os.path.join(ground_truth_dir, name))
    )
    results = []

    # Sorted so the rows come out in the same order on every run.
    for ocr_file in sorted(os.listdir(ocr_dir)):
        ocr_file_path = os.path.join(ocr_dir, ocr_file)
        if not os.path.isfile(ocr_file_path):
            continue

        best_match = soft_match(ocr_file, ground_truth_files)
        if not best_match:
            print(f"No matching ground truth file found for {ocr_file}")
            continue

        # Read the texts only once a partner file is known.
        ocr_text = read_text_from_file(ocr_file_path)
        ground_truth_file_path = os.path.join(ground_truth_dir, best_match)
        ground_truth_text = read_text_from_file(ground_truth_file_path)

        eval_results = evaluate_ocr(ocr_text, ground_truth_text)
        eval_results['OCR File'] = ocr_file
        eval_results['Ground Truth File'] = best_match
        results.append(eval_results)

    return pd.DataFrame(results)

def count_pages_and_match(pages_dir, ground_truth_dir):
    """
    Count the number of pages in subfolders and match them to ground truth text files.

    The page count of a subfolder is the number of regular files directly
    inside it. Each subfolder name is matched to the most similar ground
    truth file name (see soft_match).

    :param pages_dir: Directory containing subfolders with pages.
    :param ground_truth_dir: Directory containing ground truth text files.
    :return: DataFrame with columns 'Subfolder', 'Number of Pages' and
             'Ground Truth File' (None when no candidate matched).
    """
    # Only regular files are valid match candidates. Sorting makes
    # tie-breaking in soft_match independent of the arbitrary os.listdir order.
    ground_truth_files = sorted(
        name for name in os.listdir(ground_truth_dir)
        if os.path.isfile(os.path.join(ground_truth_dir, name))
    )
    subfolder_counts = []

    # Sorted so the rows come out in the same order on every run.
    for subfolder in sorted(os.listdir(pages_dir)):
        subfolder_path = os.path.join(pages_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Count regular files only; nested directories are not pages.
        file_count = sum(
            1 for name in os.listdir(subfolder_path)
            if os.path.isfile(os.path.join(subfolder_path, name))
        )
        best_match = soft_match(subfolder, ground_truth_files)
        subfolder_counts.append({
            'Subfolder': subfolder,
            'Number of Pages': file_count,
            'Ground Truth File': best_match
        })

    return pd.DataFrame(subfolder_counts)

def merge_dataframes(pages_df, ocr_df, output_csv):
    """
    Join the page-count table with the OCR metrics table and write the result to CSV.

    The join is an inner join on the 'Ground Truth File' column, so only
    ground truth files present in both tables end up in the output.

    :param pages_df: DataFrame with page counts and matches.
    :param ocr_df: DataFrame with OCR evaluation metrics.
    :param output_csv: Path to the final merged CSV file.
    """
    combined = pages_df.merge(ocr_df, how='inner', on='Ground Truth File')
    combined.to_csv(output_csv, index=False)

def soft_match(subfolder_name, ground_truth_files, min_ratio=0.0):
    """
    Perform a soft match between a subfolder name and ground truth file names.

    Similarity is the difflib SequenceMatcher ratio between the two names.
    The first candidate with the highest ratio wins.

    :param subfolder_name: Name of the subfolder (or file) to match.
    :param ground_truth_files: List of ground truth file names.
    :param min_ratio: A candidate is only accepted if its ratio is strictly
                      greater than this value. The default of 0.0 accepts any
                      candidate that shares at least one character; raise it
                      to reject weak, probably wrong matches.
    :return: Best matching ground truth file name, or None when there are no
             candidates or none exceeds min_ratio.
    """
    best_match = None
    # Starting from the threshold means a candidate has to beat it to be chosen.
    highest_ratio = min_ratio
    for gt_file in ground_truth_files:
        ratio = SequenceMatcher(None, subfolder_name, gt_file).ratio()
        if ratio > highest_ratio:
            highest_ratio = ratio
            best_match = gt_file
    return best_match

def read_text_from_file(file_path):
    """
    Return the stripped contents of a UTF-8 encoded text file.

    :param file_path: Path of the file to read.
    :return: File contents with surrounding whitespace stripped.
    """
    with open(file_path, mode='r', encoding='utf-8') as stream:
        contents = stream.read()
    return contents.strip()

def evaluate_ocr(ocr_text, ground_truth_text):
    """
    Evaluate OCR result against ground truth data using spaCy.

    Both texts are tokenized, the token lists are aligned to equal length,
    and token-level metrics are computed with the ground truth as reference.

    :param ocr_text: OCR generated text.
    :param ground_truth_text: Ground truth text.
    :return: Dictionary with evaluation metrics
             ('Accuracy', 'Precision', 'Recall', 'F1 Score').
    """
    # Only token texts are needed, so run the tokenizer alone (make_doc)
    # instead of the full pipeline whose other annotations are discarded.
    ocr_tokens = [token.text for token in nlp.make_doc(ocr_text)]
    ground_truth_tokens = [token.text for token in nlp.make_doc(ground_truth_text)]

    # Align sequences so both lists have one entry per position
    aligned_ocr_tokens, aligned_ground_truth_tokens = align_sequences(ocr_tokens, ground_truth_tokens)

    # Calculate evaluation metrics; every distinct token text acts as a class label
    accuracy = accuracy_score(aligned_ground_truth_tokens, aligned_ocr_tokens)
    precision = precision_score(aligned_ground_truth_tokens, aligned_ocr_tokens, average='weighted', zero_division=0)
    recall = recall_score(aligned_ground_truth_tokens, aligned_ocr_tokens, average='weighted', zero_division=0)
    f1 = f1_score(aligned_ground_truth_tokens, aligned_ocr_tokens, average='weighted', zero_division=0)

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

def align_sequences(seq1, seq2):
    """
    Line up two token sequences position by position using difflib's SequenceMatcher.

    Tokens in a matching run are paired with each other. A token that has no
    counterpart (deleted, inserted or replaced) is paired with an empty
    string; within a replaced run the seq1 tokens are emitted before the
    seq2 tokens.

    :param seq1: First sequence (list of tokens).
    :param seq2: Second sequence (list of tokens).
    :return: Two aligned sequences of equal length.
    """
    # Collect (seq1 token, seq2 token) pairs first, then split them into two lists.
    pairs = []
    for op, start1, end1, start2, end2 in SequenceMatcher(None, seq1, seq2).get_opcodes():
        if op == 'equal':
            # Both slices of an equal run have the same length.
            pairs.extend(zip(seq1[start1:end1], seq2[start2:end2]))
        else:
            # For 'delete' the seq2 slice is empty and for 'insert' the seq1
            # slice is empty, so the same two steps cover all three cases.
            pairs.extend((token, '') for token in seq1[start1:end1])
            pairs.extend(('', token) for token in seq2[start2:end2])
    aligned_first = [left for left, _ in pairs]
    aligned_second = [right for _, right in pairs]
    return aligned_first, aligned_second

# Example usage
if __name__ == "__main__":
    # Directories containing pages, OCR transcriptions, and ground truth texts
    # NOTE(review): these are absolute paths on one machine, so this cell only
    # runs where they exist.
    pages_directory = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/LowResolutionMobyDickImages/'
    ocr_directory = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestPhi3.5VTranscriptions/'
    ground_truth_directory = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/MobyDickTranscriptions/'
    output_csv_file = '/home/darshewskijadmin@consilio.com/ExperimentalLLMs/Data/final_merged_results.csv'

    # Benchmark OCR files against ground truth texts
    ocr_df = benchmark_ocr_files(ocr_directory, ground_truth_directory)

    # Count pages in subfolders and match to ground truth texts
    pages_df = count_pages_and_match(pages_directory, ground_truth_directory)

    # Merge the DataFrames and save the final output CSV
    # (the two frames are joined on their shared 'Ground Truth File' column)
    merge_dataframes(pages_df, ocr_df, output_csv_file)