In [None]:
import pandas as pd

def _is_missing(value):
    """Return True when *value* is a scalar missing marker (None, NaN, NaT, pd.NA)."""
    # pd.isna returns an array for list-like cells, so restrict it to scalars.
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _values_match(expected, actual):
    """Return True when two cell values are equal, treating two missing values as equal."""
    expected_missing = _is_missing(expected)
    actual_missing = _is_missing(actual)
    if expected_missing or actual_missing:
        # NaN != NaN and pd.NA == x is ambiguous, so missing values are
        # handled explicitly: a blank field matches only another blank field.
        return expected_missing and actual_missing
    return bool(expected == actual)


def calculate_field_wise_accuracy(ground_truth_df, extracted_data_df, critical_fields, trust_threshold=0.99):
    """Score extracted invoice data against ground truth, field by field.

    Rows are paired on the "Invoice Number" column. For every ground truth
    row the first extracted row with the same invoice number is compared on
    each column that exists in both frames.

    Args:
        ground_truth_df: DataFrame with the expected values; must contain an
            "Invoice Number" column.
        extracted_data_df: DataFrame with the extracted values; must contain
            an "Invoice Number" column.
        critical_fields: Names of the fields that decide trustworthiness.
            A critical field that is not a compared column counts as
            incorrect. When empty, every matched invoice is trustworthy.
        trust_threshold: Minimum fraction of critical fields that must be
            correct for an invoice to be marked trustworthy.

    Returns:
        A tuple ``(accuracy_df, field_accuracy_percentages)``.
        ``accuracy_df`` has one row per ground truth row, a 1/0 column per
        compared field and a boolean "Trustworthy" column.
        ``field_accuracy_percentages`` maps each compared field to the
        percentage of ground truth rows where it matched (0.0 for every
        field when the ground truth is empty).

    Raises:
        KeyError: If either frame lacks the "Invoice Number" column and the
            ground truth has at least one row.
    """
    # Materialize once so len() works for any iterable of field names.
    critical_fields = list(critical_fields)

    # One result list per column present in both frames, plus the trust flag.
    accuracy_report = {field: [] for field in ground_truth_df.columns if field in extracted_data_df.columns}
    accuracy_report["Trustworthy"] = []
    compared_fields = [field for field in accuracy_report if field != "Trustworthy"]
    compared_field_set = set(compared_fields)

    for _, row in ground_truth_df.iterrows():
        invoice_number = row["Invoice Number"]

        # Find corresponding rows in the extracted data by invoice number.
        matches = extracted_data_df[extracted_data_df["Invoice Number"] == invoice_number]

        if matches.empty:
            # No matching invoice: every field is inaccurate and untrusted.
            for field in compared_fields:
                accuracy_report[field].append(0)
            accuracy_report["Trustworthy"].append(False)
            continue

        # Only the first matching extracted row is evaluated.
        extracted_row = matches.iloc[0]
        for field in compared_fields:
            is_match = _values_match(row[field], extracted_row[field])
            accuracy_report[field].append(1 if is_match else 0)

        if critical_fields:
            correct_critical_fields = sum(
                accuracy_report[field][-1]
                for field in critical_fields
                if field in compared_field_set
            )
            # Dividing by all requested critical fields makes absent ones
            # count against the invoice.
            trustworthy = bool(correct_critical_fields / len(critical_fields) >= trust_threshold)
        else:
            # Nothing is required of a matched invoice, so it cannot fail.
            trustworthy = True
        accuracy_report["Trustworthy"].append(trustworthy)

    # Convert the report into a dataframe
    accuracy_df = pd.DataFrame(accuracy_report)

    # Percentage of ground truth rows on which each field matched.
    total_invoices = len(ground_truth_df)
    field_accuracy_percentages = {
        field: (sum(accuracy_report[field]) / total_invoices) * 100 if total_invoices else 0.0
        for field in compared_fields
    }

    return accuracy_df, field_accuracy_percentages

# Example usage: score one extraction run against its ground truth.
# Locations of the two CSV inputs, relative to the working directory.
ground_truth_path = './ground_truth.csv'
extracted_data_path = './extracted_data.csv'

# Read both datasets into DataFrames.
ground_truth_df = pd.read_csv(filepath_or_buffer=ground_truth_path)
extracted_data_df = pd.read_csv(filepath_or_buffer=extracted_data_path)

# Fields whose correctness decides whether an invoice is trusted.
critical_fields = ["Invoice Number", "Customer Name", "Total Amount"]

# Run the field-by-field comparison.
accuracy_df, field_accuracy_percentages = calculate_field_wise_accuracy(
    ground_truth_df,
    extracted_data_df,
    critical_fields,
)

# Show the per-invoice report under its heading.
print("Field-wise accuracy report:", accuracy_df, sep="\n")

# Show the per-field percentages, one line each.
print("\nPercentage accuracy for each field:")
for field, accuracy in field_accuracy_percentages.items():
    print(f"{field}: {accuracy:.2f}%")