# Performance Metrics Calculation

In [1]:
import csv

`combine_csvs()` combines the results from all variants into a single file.

In [None]:
def combine_csvs(file_paths, output_path):
    """
    Combine multiple CSV files with the same header into a single file.

    The header row is written once, taken from the first input file that
    contains any rows. The first row of every later file is treated as a
    repeated header and dropped. Input files that are completely empty are
    skipped instead of aborting the merge.

    :param file_paths: List of paths to the CSV files to combine.
    :param output_path: Path to the output combined CSV file.
    """
    with open(output_path, 'w', newline='', encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        header_written = False

        for file_path in file_paths:
            # newline='' lets the csv module handle line endings itself, so
            # quoted fields containing embedded newlines survive the round trip.
            with open(file_path, 'r', newline='', encoding="utf-8") as infile:
                reader = csv.reader(infile)

                # A default avoids StopIteration when the file has no rows.
                header = next(reader, None)
                if header is None:
                    continue

                # Only the first header seen is kept; later ones are duplicates.
                if not header_written:
                    writer.writerow(header)
                    header_written = True

                writer.writerows(reader)

# Example: merge the per-variant result files (prompt variants 1 through 14)
# of one detector into a single CSV.
output_path = "combined.csv"
file_paths = []  # one entry per variant result file
for i in range(1, 15):
    input_file = f"../gpt2outputdetector/results/gpt2outputdetector_variants_prompt_{i}.csv"
    file_paths += [input_file]
combine_csvs(file_paths, output_path)

After combining all results for the same CDM, rename the following columns so that the code below works properly:
- Human classification result column to 'humanbinary'
- GPT classification result column to 'GPTbinary'

1 indicates it is Human Written

0 indicates it is AI Generated 

In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import csv

class MetricsEvaluator:
    """
    Compute per-variant classification metrics for one detector's results.

    The input CSV must contain a "variant" column plus two prediction columns:
    "humanbinary" (predictions for the human-written texts) and "GPTbinary"
    (predictions for the AI-generated texts). A prediction of 1 means
    "human written" and 0 means "AI generated", so "human" is the positive
    class in every metric below.
    """

    # Column order of the CSV written by calculate_variant().
    _CSV_HEADER = ["Variant","True Positive Rate (TPR)","False Negative Rate (FNR)","True Negative Rate (TNR)","False Positive Rate (FPR)","Accuracy","Precision","Recall","F1 Score","Area Under Curve(AUC)"]
    # Keys of the per-variant result dict, in the same order as _CSV_HEADER.
    _CSV_KEYS = ["variant", "TPR", "FNR", "TNR", "FPR", "accuracy", "precision", "recall", "f1", "auc"]

    def __init__(self, filename, show_plot = False):
        """
        Load the combined results file.

        :param filename: Path to the CSV file holding the detector results.
        :param show_plot: Stored on the instance; not used by the methods
            visible in this class.
        """
        self.show_plot = show_plot
        self.data = pd.read_csv(filename)

    def calculate_variant(self, column_name, output_file):
        """
        Evaluate every variant and write one summary row per variant to a CSV.

        :param column_name: Accepted for backward compatibility and forwarded
            to get_variant_performance(); it does not influence the result.
        :param output_file: Path of the CSV file to (over)write.
        :raises KeyError: If the data has no "variant" column.
        """
        if "variant" not in self.data.columns:
            raise KeyError("Missing required column 'variant' in the input data")

        with open(output_file, 'w', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(self._CSV_HEADER)

            # Legend for reading the confusion-matrix terms printed below.
            print("            Actual   ")
            print("       |   | 1  | 0  |")
            print("Predict| 1 | tp | fp |")
            print("       | 0 | fn | tn |")

            # Evaluate every variant first, then write the rows, so that a
            # failure while evaluating leaves only the header in the file.
            variant_data = [
                self.get_variant_performance(column_name, variant_group, key, writer)
                for key, variant_group in self.data.groupby("variant")
            ]
            writer.writerows(
                [variant[key] for key in self._CSV_KEYS] for variant in variant_data
            )

    def get_variant_performance(self, column_name, dataframe, label, writer):
        """
        Print and return the metrics for a single variant.

        :param column_name: Unused; kept so existing callers keep working.
        :param dataframe: Rows of one variant; must contain the "humanbinary"
            and "GPTbinary" columns.
        :param label: Variant identifier, used in the printout and the result.
        :param writer: Unused; kept so existing callers keep working.
        :return: Dict with the keys "variant", "TPR", "FNR", "TNR", "FPR",
            "accuracy", "precision", "recall", "f1" and "auc".
        :raises KeyError: If a prediction column is missing.
        """
        pred_human_col_name = "humanbinary"
        pred_GPT_col_name = "GPTbinary"

        missing = [
            name for name in (pred_human_col_name, pred_GPT_col_name)
            if name not in dataframe.columns
        ]
        if missing:
            raise KeyError(
                f"Missing required column(s) {missing}; rename the human and GPT "
                f"classification result columns to '{pred_human_col_name}' and "
                f"'{pred_GPT_col_name}'"
            )

        pred_human = dataframe[pred_human_col_name]
        pred_GPT = dataframe[pred_GPT_col_name]

        # Ground truth: every human text is labelled 1, every GPT text 0.
        actual_human = np.ones_like(pred_human)
        actual_GPT = np.zeros_like(pred_GPT)

        actual = np.concatenate((actual_human, actual_GPT))
        predict = np.concatenate((pred_human, pred_GPT))
        combined_auc = roc_auc_score(actual, predict)
        cfm = confusion_matrix(actual, predict)
        # For binary labels sklearn lays the matrix out as [[tn, fp], [fn, tp]].
        tn, fp, fn, tp = cfm.ravel()

        # Each rate is computed once and reused for the printout and the result.
        tpr = tp/(tp+fn)
        fnr = fn/(tp+fn)
        tnr = tn/(fp+tn)
        fpr = fp/(fp+tn)

        print("\n\nSIZE: ",actual_human.size, actual_GPT.size)
        print(f"=============== {label} Performance ===============")

        print("For human:")
        print("True Positive Rate  (TPR): ", tpr)
        print("False Negative Rate (FNR): ", fnr)
        print("For GPT:")
        print("True Negative Rate  (TNR): ", tnr)
        print("False Positive Rate (FPR): ", fpr)

        # Each sklearn metric is computed once and reused as well.
        combined = {
            "accuracy": accuracy_score(actual, predict),
            "precision": precision_score(actual, predict),
            "recall": recall_score(actual, predict, zero_division=1),
            "f1": f1_score(actual, predict),
            "auc": combined_auc
        }
        print()
        for key, value in combined.items():
            print("{:<15}: {}".format(str(key).capitalize(), value))

        return {
            "variant": label,
            "TPR": tpr,
            "FNR": fnr,
            "TNR": tnr,
            "FPR": fpr,
            **combined,
        }


Below is an example of how to run the performance metrics evaluation.

In [7]:
# EXAMPLE CODE
# Evaluate the metrics of one detector (CDM) from its combined results file.
cdm_name = "gpt2outputdetector"

# Plain string literal: the path has no placeholders, so no f-prefix is needed.
input_file = "./Gpt2outputdetector/results/gpt2outputdetector_combined_result_only.csv"
evaluator = MetricsEvaluator(input_file)
# Writes one summary row per variant to "<cdm_name>_results.csv".
evaluator.calculate_variant(cdm_name, f"{cdm_name}_results.csv")

            Actual   
       |   | 1  | 0  |
Predict| 1 | tp | fp |
       | 0 | fn | tn |


SIZE:  5069 5069
For human:
True Positive Rate  (TPR):  0.9128033142631683
False Negative Rate (FNR):  0.08719668573683172
For GPT:
True Negative Rate  (TNR):  0.09587689879660682
False Positive Rate (FPR):  0.9041231012033931

Accuracy       : 0.5043401065298876
Precision      : 0.5023887079261672
Recall         : 0.9128033142631683
F1             : 0.648084599761888
Auc            : 0.5043401065298876


SIZE:  5065 5065
For human:
True Positive Rate  (TPR):  0.9127344521224087
False Negative Rate (FNR):  0.08726554787759132
For GPT:
True Negative Rate  (TNR):  0.08825271470878579
False Positive Rate (FPR):  0.9117472852912142

Accuracy       : 0.5004935834155972
Precision      : 0.5002705334920463
Recall         : 0.9127344521224087
F1             : 0.6463022508038586
Auc            : 0.5004935834155972


SIZE:  5064 5064
For human:
True Positive Rate  (TPR):  0.9127172195892576
False Negativ

## Statistical Test
For the accuracy base, repeat the accuracy of the base variant (variant 1) 12 times so that each of the other 12 variants can be compared against it.

For the accuracy, enter the accuracy of each variant for each CDM, following the given example.

In [1]:
import numpy as np
from scipy.stats import ttest_rel

# Example input for the paired t-test.
# Both tables have one row per detector and one column per compared variant
# (variants 2 to 13). Replace the numbers with the accuracies you measured.

# Baseline table: the variant-1 accuracy of each detector, repeated across all
# 12 columns so that it lines up element-wise with `data_accuracy`.
data_accuracy_base = np.repeat(
    np.array([
        [0.5040],  # GLTR
        [0.5043],  # GPT-2 Detector
        [0.4971],  # GPTZero
        [0.6056],  # Sapling
        [0.4893],  # DetectGPT
    ]),
    12,
    axis=1,
)

# Accuracy of each detector on variants 2 to 13.
data_accuracy = np.array([
    #   2       3       4       5       6       7       8       9      10      11      12      13
    [0.4936, 0.4841, 0.6569, 0.6999, 0.6920, 0.7693, 0.4908, 0.4952, 0.4881, 0.5375, 0.5020, 0.6478],  # GLTR
    [0.5005, 0.4961, 0.5013, 0.4716, 0.4788, 0.4828, 0.5134, 0.5100, 0.5252, 0.4911, 0.4958, 0.4922],  # GPT-2 Detector
    [0.4988, 0.4986, 0.4966, 0.4969, 0.4976, 0.4989, 0.5108, 0.4972, 0.4973, 0.4970, 0.4965, 0.5824],  # GPTZero
    [0.5961, 0.6031, 0.6048, 0.5425, 0.5827, 0.6083, 0.6630, 0.6258, 0.6811, 0.6187, 0.6059, 0.6528],  # Sapling
    [0.4742, 0.4685, 0.4941, 0.4055, 0.4014, 0.5132, 0.4943, 0.5278, 0.5125, 0.5354, 0.5153, 0.4373]   # DetectGPT
])

def paired_t_test(base, data):
    """
    Run one paired t-test per row of `data` against the matching row of `base`.

    :param base: 2-D array of baseline values, indexed by the same row numbers
        as `data`.
    :param data: 2-D array of values to compare; its first dimension sets how
        many tests are run.
    :return: Dict keyed by the row index as a string ("0", "1", ...). Each
        value is a one-element list holding a (t_stat, p_val) tuple.
    """
    results = {}

    for row in range(data.shape[0]):
        # Paired test between the baseline row and the variant row
        outcome = ttest_rel(base[row], data[row])
        t_stat, p_val = outcome
        results[str(row)] = [(t_stat, p_val)]

    return results

# Paired t-test of every detector's variant accuracies against its baseline
accuracy_results = paired_t_test(data_accuracy_base, data_accuracy)

# Display names, listed in the same order as the rows of the accuracy tables
detector_dict = ["GLTR","GPT-2 Detector","GPTZero","Sapling","DetectGPT"]

# Report the outcome per detector, flagging results significant at the 5% level
for detector, res in accuracy_results.items():
    print(f"ACC Results for {detector_dict[int(detector)]}:")
    for t_stat, p_val in res:
        is_significant = p_val < 0.05
        print(f"Summary: t-stat = {t_stat:.4f}, p-value = {p_val:.4f}")
        if is_significant:
            print("SIGNIFICANT")
    print("\n")

TNR Results for GLTR:
Summary: t-stat = 21.0000, p-value = 0.1763


TNR Results for GPT-2 Detector:
Summary: t-stat = 18.0000, p-value = 0.1099


TNR Results for GPTZero:
Summary: t-stat = 19.0000, p-value = 0.1294


TNR Results for Sapling:
Summary: t-stat = 30.0000, p-value = 0.5186


TNR Results for DetectGPT:
Summary: t-stat = 34.0000, p-value = 0.7334


