In this step, the evaluation scores of the model outputs are calculated. They include:

Accuracy

F1 score

Precision

Recall

True positive

True negative

False positive

False negative

Support (how many words or word pairs there are altogether)

Detected_num (how many conventional metaphors the model has detected)

Total_Num (how many conventional metaphors there are in the manual annotation)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd

# Load the sorted labels
df = pd.read_csv('../outputs/output_0/csv/sorted/(secret)first_time_output_sorted.csv', encoding="ISO-8859-1")

# Option A (disabled): select non-deliberate metaphors (conventional metaphors)
# by dropping every row whose DELMET value is 2.
#df = df[df['DELMET'] != 2]
# Option B (active): for prompts that only aim at detecting metaphors, use the
# line below instead of Option A. It merges DELMET == 2 into the positive class.
df['DELMET'] = df['DELMET'].replace(2, 1)

# Automatically detect all the columns starting with 'label_'
label_columns = [col for col in df.columns if col.startswith('label_')]

# Initialise a DataFrame that stores one row of scores per label column
results = pd.DataFrame(index=[col.replace('label_', 'Label_') for col in label_columns],
                       columns=['Accuracy', 'F1', 'Precision', 'Recall', 'Detected_num', 'Total_Num',
                                'True_Positive', 'True_Negative', 'False_Positive', 'False_Negative'])

# Number of positive items (DELMET == 1 after the replacement above) in the manual annotation
delmet_ones_count = df['DELMET'].sum()

# The ground truth is the same for every label column, so read it once
y_true = df['DELMET']

# Compute the metrics for every label column
for label_col in label_columns:
    y_pred = df[label_col]

    # Performance metrics; zero_division=0 reports 0 instead of warning when a
    # score is undefined (e.g. no positive predictions)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Confusion matrix. labels=[0, 1] forces a 2x2 matrix even when only one
    # class occurs in both y_true and y_pred; without it .ravel() would yield a
    # single value and the four-way unpacking would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Number of items predicted as positive
    detected_num = y_pred.sum()

    # Print the scores of this label column
    print(f"Label: {label_col}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Detected Positives: {detected_num}")
    print(f"True Positives: {tp}")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print("\n")

    # Store the scores in the results DataFrame (order must match its columns)
    results.loc[label_col.replace('label_', 'Label_')] = [accuracy, f1, precision, recall, detected_num, delmet_ones_count,
                                                          tp, tn, fp, fn]

# Save the results to a new CSV file
results.to_csv('../outputs/output_0/csv/results/(secret)seperate_multiple_time_results/(secret)first_results.csv')

The code below is an adapted version of the code above; it just adds two more evaluation columns:

NVAJ_total (how many noun, verb and adjective tokens there are in the manual annotation)

NVAJ_detected (how many of the noun, verb and adjective tokens in the manual annotation are detected by the model)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd

# Load the data
df = pd.read_csv('../outputs/output_0/csv/sorted/(lexiNVAJ)first_time_output_sorted.csv', encoding="ISO-8859-1")

# Option A (active): select non-deliberate metaphors (conventional metaphors)
# by dropping every row whose DELMET value is 2.
df = df[df['DELMET'] != 2]
# Option B (disabled): for prompts that only aim at detecting metaphors, use
# the line below instead of Option A.
#df['DELMET'] = df['DELMET'].replace(2, 1)

# Keep only the rows whose wordcat is not one of the excluded tags, i.e. drop
# everything that does not belong to noun, verb or adjective
excluded_tags = ['CJ', 'EX', 'AV', 'AT', 'PN', 'PR', 'DP', 'DT', 'TO', 'XX', 'CR', 'OR', 'UN', 'ZZ']
wordcat_filtered = df[~df['wordcat'].isin(excluded_tags)]

# Automatically detect all the columns starting with 'label_'
label_columns = [col for col in df.columns if col.startswith('label_')]

# Initialise a DataFrame that stores one row of scores per label column.
# NOTE: the 'NVAJ_detected' header used to be written with a leading space
# (' NVAJ_detected'); regenerate older result files before averaging them
# together with files produced by this version.
results = pd.DataFrame(index=[col.replace('label_', 'Label_') for col in label_columns],
                       columns=['Accuracy', 'F1', 'Precision', 'Recall', 'NVAJ_detected', 'NVAJ_total', 'Detected_num', 'Total_Num',
                                'True_Positive', 'True_Negative', 'False_Positive', 'False_Negative'])

# Number of positive items (DELMET == 1) in the manual annotation
delmet_ones_count = df['DELMET'].sum()

# Values that do not depend on the label column are computed once
y_true = df['DELMET']
NVAJ_total = len(wordcat_filtered)
nvaj_positive_rows = wordcat_filtered[wordcat_filtered['DELMET'] == 1]

# Compute the metrics for every label column
for label_col in label_columns:
    y_pred = df[label_col]

    # Performance metrics; zero_division=0 reports 0 instead of warning when a
    # score is undefined (e.g. no positive predictions)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Confusion matrix. labels=[0, 1] forces a 2x2 matrix even when only one
    # class occurs in both y_true and y_pred; without it .ravel() would yield a
    # single value and the four-way unpacking would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Number of items predicted as positive
    detected_num = y_pred.sum()

    # wordcat related statistic: sum of this label column over the
    # noun/verb/adjective rows that are annotated with DELMET == 1
    NVAJ_detected = nvaj_positive_rows[label_col].sum()

    # Print the scores of this label column
    print(f"Label: {label_col}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Detected Positives: {detected_num}")
    print(f"True Positives: {tp}")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print("\n")

    # Store the scores in the results DataFrame (order must match its columns)
    results.loc[label_col.replace('label_', 'Label_')] = [accuracy, f1, precision, recall, NVAJ_detected, NVAJ_total, detected_num, delmet_ones_count,
                                                          tp, tn, fp, fn]

# Save the results to a new CSV file
results.to_csv('../outputs/output_0/csv/results/(lexiNVAJ)seperate_multiple_time_results/(lexiNVAJ)first_results.csv')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from glob import glob
import sys

def process_files(fn_pattern):
    """Print a confusion matrix and classification report for every prediction column.

    Every CSV file matching ``fn_pattern`` is read with ISO-8859-1 encoding.
    Rows whose ``DELMET`` value is 2 are dropped, and each column whose name
    starts with ``label_`` is compared against the remaining ``DELMET`` values.

    Args:
        fn_pattern: glob pattern selecting the CSV files to evaluate.

    Returns:
        None. All results are printed.
    """
    for file_path in glob(fn_pattern):
        df = pd.read_csv(file_path, encoding="ISO-8859-1")

        # Drop the rows whose ground-truth (DELMET) value is 2
        df = df[df['DELMET'] != 2]

        # Find all the columns starting with "label_"
        pred_cols = [col for col in df.columns if col.startswith('label_')]

        # The ground-truth labels are the same for every prediction column
        gt_l = list(df['DELMET'])

        for pred_col in pred_cols:
            pred_l = list(df[pred_col])  # predicted labels

            # labels=[0, 1] forces a 2x2 matrix even when only one class occurs
            # in both lists; without it the four-way unpacking would fail.
            tn, fp, fn, tp = confusion_matrix(gt_l, pred_l, labels=[0, 1]).ravel()
            report = classification_report(gt_l, pred_l)

            # Print the confusion-matrix counts and the report
            print(f"File: {file_path}, Label Column: {pred_col}")
            print(f"True Positives: {tp}, True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}")
            print(report)
            print("\n")

#if __name__ == "__main__":
    #pattern = "(conNVAJ)*_output_sorted.csv"  # change this pattern to match your files
    #main(pattern)
    # Set the file path pattern
file_pattern = '../outputs/output_0/csv/sorted/(conventional)*_output_sorted.csv'  # replace with your own file path

# Call the function to process the files
process_files(file_pattern)

In [None]:
from sklearn.metrics import classification_report
import pandas as pd
from glob import glob
import sys


def main(fn):
    """Print one classification report over all files matching ``fn``.

    The ``GT`` and ``PRED`` columns of every matching CSV file are
    concatenated, and a single report is computed over the combined labels.

    Args:
        fn: glob pattern selecting the prediction files.
    """
    gt_l = []
    pred_l = []
    for f in glob(fn):
        df = pd.read_csv(f)
        gt_l += list(df['GT'])  # Adapt to your own column names
        pred_l += list(df['PRED'])  # Idem
    # The report is built once, after the labels of all files are collected
    report = classification_report(gt_l, pred_l)
    print(report)


if __name__ == "__main__":
    main(sys.argv[1])  # a glob pattern referring to all prediction files for one metaphor type, like "conventional_*.csv"

Since the model's output varies between runs of the same prompt, the experiment was repeated three times. The following code calculates the average of the three results.

In [None]:
# Result tables of the repeated runs that are going to be averaged
file_names = [
    '../outputs/output_0/csv/results/(lexiNVAJ)seperate_multiple_time_results/(lexiNVAJ)third_results.csv',
    '../outputs/output_0/csv/results/(lexiNVAJ)seperate_multiple_time_results/(lexiNVAJ)second_results.csv',
    '../outputs/output_0/csv/results/(lexiNVAJ)seperate_multiple_time_results/(lexiNVAJ)first_results.csv',
]

# Read every table, using its first column as the index
dataframes = []
for file_name in file_names:
    dataframes.append(pd.read_csv(file_name, index_col=0))

# Column names of the first table that start with 'Label_'
label_columns = list(filter(lambda col: col.startswith('Label_'), dataframes[0].columns))

# Stack the tables and average the rows that share the same index value
stacked_runs = pd.concat(dataframes)
mean_df = stacked_runs.groupby(level=0).mean()

# Write the averaged table to a CSV file
mean_df.to_csv('../outputs/output_0/csv/results/overall_results/(lexiNVAJ)average_results.csv')

For further evaluation. Details to be updated

In [None]:
# `stats` is not imported by any other cell shown here, so import it locally
from scipy import stats

# For every label (prompt) and every metric column, run a Shapiro-Wilk test on
# the values from the repeated runs and choose the follow-up test accordingly.
test_results = {}

for label in dataframes[0].index:  # the index is the same in all the DataFrames
    test_results[label] = {}
    for column in dataframes[0].columns:  # the column names are the same in all the DataFrames
        # Values of this label/metric, one per repeated run
        group_values = [df.loc[label, column] for df in dataframes]

        # Shapiro-Wilk normality test on the per-run values
        shapiro_test = stats.shapiro(group_values)
        # NOTE(review): the follow-up tests below compare the whole `column` of
        # every run (all labels), not only this label's values, so their result
        # is identical for every label; only the choice of test varies.
        if shapiro_test.pvalue > 0.05:
            # Normality not rejected: one-way ANOVA
            f_value, p_value = stats.f_oneway(*[df[column] for df in dataframes])
            test_results[label][column] = ('ANOVA', f_value, p_value)
        else:
            # Normality rejected: Kruskal-Wallis
            h_value, p_value = stats.kruskal(*[df[column] for df in dataframes])
            test_results[label][column] = ('Kruskal-Wallis', h_value, p_value)

# Output
test_results

In [None]:
import pandas as pd

def _significance_rows(results_by_label):
    """Yield one [label, metric, test, statistic, p-value] row per metric, skipping 'Support'."""
    for label, metrics in results_by_label.items():
        for metric, outcome in metrics.items():
            if metric == 'Support':  # the 'Support' metric is excluded
                continue
            test_name, statistic, p_value = outcome
            yield [label, metric, test_name, statistic, p_value]


# Flatten the nested test results into a table and write it to a CSV file
rows = list(_significance_rows(test_results))

df_no_support = pd.DataFrame(rows, columns=['Label', 'Metric', 'Test', 'Statistic', 'P-Value'])
df_no_support.to_csv('../outputs/output_0/csv/results/overall_results/[metaphor]multiple_time_result_significance_comparison.csv')

Evaluation part to be finished:
 
1. Post-check: verify whether the automatically processed output has tokenization problems that make its length differ from the ground truth in the corpus.

2. Filter out the deliberate metaphorical words.

3. Calculate the scores. For example:

# NOTE(review): the column name is spelled 'maunal annotation' here; confirm
# that it matches the header of the CSV file.
ground_truth = csv_data['maunal annotation']
model_output = csv_data['model_output']

def calculate_metrics(ground_truth, model_output):
    """Return the scores of `model_output` measured against `ground_truth`.

    Args:
        ground_truth: iterable of manually annotated labels.
        model_output: iterable of labels predicted by the model, same length.

    Returns:
        dict with the keys 'F1 Score', 'Accuracy', 'Recall', 'Precision' and
        'Support', where 'Support' is the number of ground-truth items equal
        to 1.
    """
    # Support is counted directly instead of being read from
    # classification_report(...)['1'], which raises KeyError when class 1 does
    # not occur or when the labels are not integers (e.g. the key is '1.0').
    support = sum(1 for label in ground_truth if label == 1)

    # zero_division=0 matches the evaluation cells of this notebook: undefined
    # scores are reported as 0 without a warning.
    return {
        'F1 Score': f1_score(ground_truth, model_output, zero_division=0),
        'Accuracy': accuracy_score(ground_truth, model_output),
        'Recall': recall_score(ground_truth, model_output, zero_division=0),
        'Precision': precision_score(ground_truth, model_output, zero_division=0),
        'Support': support
    }

metrics_results = calculate_metrics(ground_truth, model_output)

4. Store the scores in a csv.

5. Evaluation: use a permutation test to check whether there is a significant difference (a) between the scores of the same prompt run multiple times and (b) between the average scores of different prompts; also report confidence intervals for the metrics (accuracy, F1, ...).