## Searchability

##### This notebook looks into the searchability of the models' transcriptions. For the eDiscovery process, searchability is the main reason we want to look into using generative AI.

In [15]:
import os
import re
import csv

In [32]:
def clean_and_tokenize(text):
    """Normalise ``text`` and return its words as a list.

    The text is lower-cased, every character that is neither a word
    character (letter, digit or underscore) nor whitespace is deleted, and
    the remainder is split on runs of whitespace.

    Note that punctuation is deleted rather than replaced by a space, so a
    token such as "ago--never" collapses into the single word "agonever".
    """
    return re.sub(r'[^\w\s]', '', text.lower()).split()

def read_text_from_file(file_path):
    """Return the full contents of the file at ``file_path`` as a string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def calculate_missing_and_changed_percentage(ground_truth_text, ocr_text):
    """Measure how many ground-truth words cannot be found in the OCR text.

    Both texts are normalised with ``clean_and_tokenize``. A distinct
    ground-truth word that does not appear as a token in the OCR output is
    "initially missing". Each initially missing word is then classified as:

    * changed -- the word still occurs as a substring of some OCR token
      (e.g. "moby" inside "mobydick"), or
    * missing -- the word does not occur anywhere in the OCR text.

    Args:
        ground_truth_text: Reference transcription.
        ocr_text: Transcription produced by the model under test.

    Returns:
        A ``(missing_percentage, changed_percentage)`` tuple of floats:

        * ``missing_percentage`` -- distinct missing words as a percentage
          of the total number of ground-truth tokens.
        * ``changed_percentage`` -- changed words as a percentage of the
          initially missing words (``0.0`` when nothing is missing).

        ``(0.0, 0.0)`` is returned when the ground truth contains no words.
    """
    ground_truth_words = clean_and_tokenize(ground_truth_text)
    ocr_words = clean_and_tokenize(ocr_text)

    # An empty ground truth has nothing that could be missing; returning
    # early also avoids dividing by zero below.
    total_words_in_ground_truth = len(ground_truth_words)
    if total_words_in_ground_truth == 0:
        return 0.0, 0.0

    # Tokens joined by single spaces. Tokens never contain whitespace, so a
    # substring hit always lies inside one OCR token.
    ocr_text_string = ' '.join(ocr_words)

    # Distinct ground-truth words with no exact token match in the OCR text.
    initially_missing_words = set(ground_truth_words) - set(ocr_words)

    # Words that survive only as part of a longer/merged OCR token.
    changed_words = {
        word for word in initially_missing_words if word in ocr_text_string
    }
    final_missing_words = initially_missing_words - changed_words

    # Fix: the numerator used to be the words that appear only in the OCR
    # output, which says nothing about what a search of the OCR text misses.
    # NOTE(review): the numerator counts distinct words while the denominator
    # counts every token (duplicates included); the denominator is kept from
    # the original code -- confirm this is the intended definition.
    missing_percentage = (len(final_missing_words) / total_words_in_ground_truth) * 100

    if initially_missing_words:
        changed_percentage = (len(changed_words) / len(initially_missing_words)) * 100
    else:
        changed_percentage = 0.0

    return missing_percentage, changed_percentage

def process_directories_and_save_to_csv(ground_truth_dir, ocr_dir, csv_file_path):
    """Compare same-named files in two directories and write the scores to CSV.

    For every regular file in ``ground_truth_dir`` that has a regular file
    with the same name in ``ocr_dir``, both files are read and scored with
    ``calculate_missing_and_changed_percentage``. One CSV row is written per
    matched pair, in sorted file-name order so repeated runs produce the same
    file. Entries without a counterpart, and entries that are not regular
    files (e.g. sub-directories), are skipped.

    Args:
        ground_truth_dir: Directory containing the reference transcriptions.
        ocr_dir: Directory containing the transcriptions to evaluate.
        csv_file_path: Destination CSV file; overwritten if it exists.

    Returns:
        None. Prints a confirmation message once the CSV has been written.
    """
    fieldnames = ['Ground Truth File', 'OCR File', 'Missing Percentage', 'Changed Percentage']

    # A set gives constant-time name matching instead of scanning a list for
    # every ground-truth file.
    ocr_file_names = set(os.listdir(ocr_dir))

    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for gt_file in sorted(os.listdir(ground_truth_dir)):
        if gt_file not in ocr_file_names:
            continue

        gt_file_path = os.path.join(ground_truth_dir, gt_file)
        ocr_file_path = os.path.join(ocr_dir, gt_file)

        # listdir also yields directories; opening one would raise an error.
        if not (os.path.isfile(gt_file_path) and os.path.isfile(ocr_file_path)):
            continue

        ground_truth_text = read_text_from_file(gt_file_path)
        ocr_text = read_text_from_file(ocr_file_path)

        missing_percentage, changed_percentage = calculate_missing_and_changed_percentage(
            ground_truth_text, ocr_text
        )

        results.append({
            'Ground Truth File': gt_file_path,
            'OCR File': ocr_file_path,
            'Missing Percentage': f"{missing_percentage:.2f}%",
            'Changed Percentage': f"{changed_percentage:.2f}%"
        })

    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"Results saved to {csv_file_path}")

In [34]:
# Sample usage: score the Tesseract transcriptions against the Moby Dick
# ground truth and write one CSV row per matched file.
# NOTE(review): these are absolute, machine-specific paths; anyone re-running
# the notebook elsewhere has to edit them first.
ground_truth_dir = "/home/darshewskijadmin@consilio.com/ExperimentalLLMs/MobyDickTranscriptions/"
ocr_dir = "/home/darshewskijadmin@consilio.com/ExperimentalLLMs/TestTesseractTranscriptions/"
csv_file_path = "/home/darshewskijadmin@consilio.com/ExperimentalLLMs/Data/Tesseract_Searchability.csv"

process_directories_and_save_to_csv(
    ground_truth_dir=ground_truth_dir,
    ocr_dir=ocr_dir,
    csv_file_path=csv_file_path,
)

Results saved to /home/darshewskijadmin@consilio.com/ExperimentalLLMs/Data/Tesseract_Searchability.csv
