# Misclassification on "none" outputs:
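We modified the WikiSQL dataset in "no_answer_questions.ipynb" and ran the tapex.base and tapex.large models on it. This notebook measures how often each model correctly predicts "none" (treated as the positive class) for those questions.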

In [17]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

## calculate misclassification:

In [61]:
import pandas as pd

def calculate_none_predictions(eval_file_path):
    """Count confusion-matrix cells for the 'none' answer in an eval file.

    The file is read as tab-separated text and must provide the columns
    ``Predict`` and ``Golden``. The literal string 'none' is the positive class.

    Args:
        eval_file_path: Path to the tab-separated evaluation file.

    Returns:
        Tuple ``(tp, fp, tn, fn, total_none_golden, recall, precision)``.
        ``recall`` and ``precision`` are percentages (0-100); each falls back
        to 0 when its denominator is zero.
    """
    frame = pd.read_csv(eval_file_path, sep='\t')

    # Element-wise masks; .eq/.ne mirror the == / != operators.
    predicted_none = frame['Predict'].eq('none')
    predicted_other = frame['Predict'].ne('none')
    golden_none = frame['Golden'].eq('none')
    golden_other = frame['Golden'].ne('none')

    # int() keeps the counts as plain Python integers.
    tp = int((predicted_none & golden_none).sum())
    fp = int((predicted_none & golden_other).sum())
    tn = int((predicted_other & golden_other).sum())
    fn = int((predicted_other & golden_none).sum())

    # Number of rows whose ground truth is 'none'.
    total_none_golden = int(golden_none.sum())

    # Percentages, guarded against empty denominators.
    recall = 0
    if (tp + fn) > 0:
        recall = (tp / (tp + fn)) * 100

    precision = 0
    if (tp + fp) > 0:
        precision = (tp / (tp + fp)) * 100

    return tp, fp, tn, fn, total_none_golden, recall, precision

# Example usage: 'none' metrics for the tapex.base test-set eval file.
eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.base/test/generate-test.txt.eval"
(
    tp,
    fp,
    tn,
    fn,
    total_none_golden,
    recall,
    precision,
) = calculate_none_predictions(eval_file_path)

# One print call; sep="\n" puts every metric on its own line.
print(
    f"True Positives (TP): {tp}",
    f"False Positives (FP): {fp}",
    f"True Negatives (TN): {tn}",
    f"False Negatives (FN): {fn}",
    f"Total 'none' ground truth: {total_none_golden}",
    f"Recall: {recall:.2f}%",
    f"Precision: {precision:.2f}%",
    sep="\n",
)




True Positives (TP): 1256
False Positives (FP): 237
True Negatives (TN): 17187
False Negatives (FN): 372
Total 'none' ground truth: 1628
Recall: 77.15%
Precision: 84.13%


## tapex.base TEST

In [62]:
# tapex.base, test split: confusion counts and percentage metrics for 'none'.
eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.base/test/generate-test.txt.eval"
(
    tp,
    fp,
    tn,
    fn,
    total_none_golden,
    recall,
    precision,
) = calculate_none_predictions(eval_file_path)

# One print call; sep="\n" puts every metric on its own line.
print(
    f"True Positives (TP): {tp}",
    f"False Positives (FP): {fp}",
    f"True Negatives (TN): {tn}",
    f"False Negatives (FN): {fn}",
    f"Total 'none' ground truth: {total_none_golden}",
    f"Recall: {recall:.2f}%",
    f"Precision: {precision:.2f}%",
    sep="\n",
)


True Positives (TP): 1256
False Positives (FP): 237
True Negatives (TN): 17187
False Negatives (FN): 372
Total 'none' ground truth: 1628
Recall: 77.15%
Precision: 84.13%


## tapex.large TEST

In [64]:
# tapex.large, test split: confusion counts and percentage metrics for 'none'.
large_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.large/test/generate-test.txt.eval"
(
    tp,
    fp,
    tn,
    fn,
    total_none_golden,
    recall,
    precision,
) = calculate_none_predictions(large_file_path)

# One print call; sep="\n" puts every metric on its own line.
print(
    f"True Positives (TP): {tp}",
    f"False Positives (FP): {fp}",
    f"True Negatives (TN): {tn}",
    f"False Negatives (FN): {fn}",
    f"Total 'none' ground truth: {total_none_golden}",
    f"Recall: {recall:.2f}%",
    f"Precision: {precision:.2f}%",
    sep="\n",
)

True Positives (TP): 1375
False Positives (FP): 253
True Negatives (TN): 17171
False Negatives (FN): 253
Total 'none' ground truth: 1628
Recall: 84.46%
Precision: 84.46%


## SCORES

In [25]:
def calculate_none_scores(eval_file_path):
    """Compute precision, recall, F1 and accuracy for the 'none' answer.

    The file is read as tab-separated text and must provide the columns
    ``Predict`` and ``Golden``. The literal string 'none' is the positive class.

    Args:
        eval_file_path: Path to the tab-separated evaluation file.

    Returns:
        Tuple ``(tp, fp, fn, tn, precision, recall, f1, accuracy)``. The four
        scores are fractions (not percentages); each falls back to 0 when its
        denominator is zero.
    """
    table = pd.read_csv(eval_file_path, sep='\t')

    # Element-wise masks; .eq/.ne mirror the == / != operators.
    said_none = table['Predict'].eq('none')
    said_other = table['Predict'].ne('none')
    is_none = table['Golden'].eq('none')
    is_other = table['Golden'].ne('none')

    # Confusion-matrix cells as plain Python integers.
    tp = int((said_none & is_none).sum())    # predicted 'none', golden 'none'
    fp = int((said_none & is_other).sum())   # predicted 'none', golden differs
    fn = int((said_other & is_none).sum())   # predicted other, golden 'none'
    tn = int((said_other & is_other).sum())  # predicted other, golden other

    predicted_positive = tp + fp
    actual_positive = tp + fn
    total = tp + fp + fn + tn

    precision = 0
    if predicted_positive > 0:
        precision = tp / predicted_positive

    recall = 0
    if actual_positive > 0:
        recall = tp / actual_positive

    f1 = 0
    if precision + recall > 0:
        f1 = (2 * precision * recall) / (precision + recall)

    accuracy = 0
    if total > 0:
        accuracy = (tp + tn) / total

    return tp, fp, fn, tn, precision, recall, f1, accuracy


## TAPEX.BASE

In [None]:
# Eval files produced by tapex.base for the validation and test splits.
base_valid_eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.base/val/generate-valid.txt.eval"
base_test_eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.base/test/generate-test.txt.eval"

# Validation split: score, then report everything with one print call.
(
    valid_tp,
    valid_fp,
    valid_fn,
    valid_tn,
    valid_precision,
    valid_recall,
    valid_f1,
    valid_accuracy,
) = calculate_none_scores(base_valid_eval_file_path)
print(
    "TAPEX.BASE VAL:",
    f"TP: {valid_tp}",
    f"FP: {valid_fp}",
    f"FN: {valid_fn}",
    f"TN: {valid_tn}",
    f"Precision: {valid_precision:.2f}",
    f"Recall: {valid_recall:.2f}",
    f"F1: {valid_f1:.2f}",
    f"Accuracy: {valid_accuracy:.2f}",
    "\n",  # trailing blank lines that separate the two reports
    sep="\n",
)

# Test split: same report layout.
(
    test_tp,
    test_fp,
    test_fn,
    test_tn,
    test_precision,
    test_recall,
    test_f1,
    test_accuracy,
) = calculate_none_scores(base_test_eval_file_path)
print(
    "TAPEX.BASE TEST:",
    f"TP: {test_tp}",
    f"FP: {test_fp}",
    f"FN: {test_fn}",
    f"TN: {test_tn}",
    f"Precision: {test_precision:.2f}",
    f"Recall: {test_recall:.2f}",
    f"F1: {test_f1:.2f}",
    f"Accuracy: {test_accuracy:.2f}",
    "\n",
    sep="\n",
)




TAPEX.BASE VAL:
TP: 635
FP: 102
FN: 173
TN: 9194
Precision: 0.86
Recall: 0.79
F1: 0.82
Accuracy: 0.97


TAPEX.BASE TEST:
TP: 1256
FP: 237
FN: 372
TN: 17187
Precision: 0.84
Recall: 0.77
F1: 0.80
Accuracy: 0.97




## TAPEX.LARGE

In [27]:
# Eval files produced by tapex.large for the validation and test splits.
large_valid_eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.large/val/generate-valid.txt.eval"
large_test_eval_file_path = "/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/tapex.large/test/generate-test.txt.eval"

# Validation split: score, then report everything with one print call.
(
    valid_tp,
    valid_fp,
    valid_fn,
    valid_tn,
    valid_precision,
    valid_recall,
    valid_f1,
    valid_accuracy,
) = calculate_none_scores(large_valid_eval_file_path)
print(
    "TAPEX.LARGE VAL:",
    f"TP: {valid_tp}",
    f"FP: {valid_fp}",
    f"FN: {valid_fn}",
    f"TN: {valid_tn}",
    f"Precision: {valid_precision:.2f}",
    f"Recall: {valid_recall:.2f}",
    f"F1: {valid_f1:.2f}",
    f"Accuracy: {valid_accuracy:.2f}",
    "\n",  # trailing blank lines that separate the two reports
    sep="\n",
)

# Test split: same report layout.
(
    test_tp,
    test_fp,
    test_fn,
    test_tn,
    test_precision,
    test_recall,
    test_f1,
    test_accuracy,
) = calculate_none_scores(large_test_eval_file_path)
print(
    "TAPEX.LARGE TEST:",
    f"TP: {test_tp}",
    f"FP: {test_fp}",
    f"FN: {test_fn}",
    f"TN: {test_tn}",
    f"Precision: {test_precision:.2f}",
    f"Recall: {test_recall:.2f}",
    f"F1: {test_f1:.2f}",
    f"Accuracy: {test_accuracy:.2f}",
    "\n",
    sep="\n",
)


TAPEX.LARGE VAL:
TP: 666
FP: 127
FN: 142
TN: 9169
Precision: 0.84
Recall: 0.82
F1: 0.83
Accuracy: 0.97


TAPEX.LARGE TEST:
TP: 1375
FP: 253
FN: 253
TN: 17171
Precision: 0.84
Recall: 0.84
F1: 0.84
Accuracy: 0.97




## LLAMA

In [55]:
def analyze_predictions(file_path):
    """Print 'none'-detection metrics for a text dump of predictions.

    The file is split into records on a line of dashes. A record is used only
    if it contains both a ``Predicted: `` and a ``True: `` marker; the text
    between the markers is the predicted label and the text after ``True: ``
    is the true label. Records lacking either marker are ignored. The literal
    label 'none' is treated as the positive class.

    Args:
        file_path: Path to the predictions text file (read as UTF-8).

    Returns:
        None. The parsed preview and all metrics are printed to stdout.
    """
    separator = '--------------------------------------------------'

    # Explicit encoding so the labels are decoded the same way on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Records span several lines, so newlines are flattened to spaces
        # before splitting on the separator.
        content = file.read().replace('\n', ' ').strip()

    data_point_strings = content.split(separator)
    print(data_point_strings[:10])  # To check the first few cleaned data points

    # Collect [predicted_label, true_label] pairs.
    data_points = []
    for data_point_string in data_point_strings:
        data_point_string = data_point_string.strip()
        if not data_point_string:
            continue  # Skip empty chunks (e.g. after the final separator)

        if 'Predicted: ' in data_point_string and 'True: ' in data_point_string:
            predicted_label = data_point_string.split('Predicted: ')[1].split('True: ')[0].strip()
            true_label = data_point_string.split('True: ')[1].strip()
            data_points.append([predicted_label, true_label])

    print(f"Processed data points (first 10): {data_points[:10]}")

    # Confusion matrix with 'none' as the positive class.
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0
    for predicted_label, true_label in data_points:
        actual_none = true_label == "none"
        predicted_none = predicted_label == "none"
        if actual_none and predicted_none:
            true_positives += 1
        elif actual_none:
            false_negatives += 1
        elif predicted_none:
            false_positives += 1
        else:
            true_negatives += 1

    # Both totals follow from the confusion matrix, so they are derived
    # instead of being tracked with separate counters.
    total_none = true_positives + false_negatives  # records whose true label is 'none'
    correct_none_predictions = true_positives

    # Recall: TP / (TP + FN)
    recall = (true_positives / total_none) if total_none > 0 else 0

    # Precision: TP / (TP + FP)
    predicted_none_total = true_positives + false_positives
    precision = (true_positives / predicted_none_total) if predicted_none_total > 0 else 0

    # Recall expressed as a percentage.
    percentage_correct_none = (correct_none_predictions / total_none * 100) if total_none > 0 else 0

    # Print the results. The first label used to say "'none' predictions",
    # but TP + FN is the number of records whose *true* label is 'none'.
    print(f"Total number of true 'none' labels (TP + FN): {total_none}")
    print(f"Correct 'none' predictions (True Positives): {correct_none_predictions}")
    print(f"Percentage of correct 'none' predictions (Recall): {percentage_correct_none:.2f}%")
    print(f"Recall (TP / (TP + FN)): {recall:.3f}")
    print(f"Precision (TP / (TP + FP)): {precision:.3f}")
    print(f"True Positives (predicted 'none' when actual 'none'): {true_positives}")
    print(f"False Positives (predicted 'none' when not 'none'): {false_positives}")
    print(f"False Negatives (predicted something else when true 'none'): {false_negatives}")
    print(f"True Negatives (predicted something else when not 'none'): {true_negatives}")


# Predictions file to analyze. Going by its name and location this is the
# LLaMA test run on the adversarial WikiSQL set (epoch 3, markdown tables) --
# TODO confirm. The path is machine-specific; point it at your own copy.
file_path = '/Users/sebastiaan/Desktop/IR2_tapex_reproducibility_study/src/Table-Pretraining-main/results/wikisql/llama/test_predictions_wikisql_adverserial_epoch_3_markdown.txt'
# Prints the parsed preview and the 'none' metrics for that file.
analyze_predictions(file_path)






['Predicted: united states True: united states ', ' Predicted: arkansas True: arkansas ', ' Predicted: michigan True: michigan ', ' Predicted: 1.0 True: 1.0 ', ' Predicted: assen True: netherlands ', ' Predicted: 1.0 True: 1.0 ', ' Predicted: 17 june True: 17 june ', ' Predicted: 1.0 True: 1.0 ', ' Predicted: canada True: canada ', ' Predicted: 1.0 True: 1.0 ']
Processed data points (first 10): [['united states', 'united states'], ['arkansas', 'arkansas'], ['michigan', 'michigan'], ['1.0', '1.0'], ['assen', 'netherlands'], ['1.0', '1.0'], ['17 june', '17 june'], ['1.0', '1.0'], ['canada', 'canada'], ['1.0', '1.0']]
Total number of 'none' predictions (TP + FN): 1627
Correct 'none' predictions (True Positives): 1313
Percentage of correct 'none' predictions (Recall): 80.70%
Recall (TP / (TP + FN)): 0.807
Precision (TP / (TP + FP)): 0.866
True Positives (predicted 'none' when actual 'none'): 1313
False Positives (predicted 'none' when not 'none'): 204
False Negatives (predicted something e