# Get and process outputs

This notebook contains all the steps, from getting the raw outputs from the API to processing those outputs into evaluation scores (F1, accuracy, precision, recall, support).

In [2]:
#import needed libraries
import csv
import os
import re

import openai
import pandas as pd
from scipy import stats
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold

This step loops over the sentences for every prompt and collects the model outputs through the ChatGPT API.

In [None]:
# For N-shot prompting: automatically divide the sentences into 10 folds (details to be added).
# Placeholder sentence list -- replace it with the real sentences before running.
# NOTE(review): the literal `...` below is a placeholder element, not real data.
sentences = ["Sentence 1", "Sentence 2", "Sentence 3", ...] 

# Shuffled 10-fold splitter; the fixed random_state keeps the split reproducible
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# Walk through the folds and materialise the train/test sentences of each one
for train_index, test_index in kf.split(sentences):
    train_sentences = [sentences[position] for position in train_index]
    test_sentences = [sentences[position] for position in test_index]
    # The sentence order is shuffled, so keep this printed output: it gives the new order that the
    # manual annotation must follow later (same word order for manual and model).
    print("Train:", train_sentences)
    print("Test:", test_sentences)

In [3]:
# Read the OpenAI API key from the environment instead of hardcoding it in the notebook.
# SECURITY: the key that was previously written here in plain text must be treated as
# leaked -- revoke it and create a new one.
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = api_key

# Configuration for this run
PROMPT_COLUMN = '0 shot - conventional metaphor '  # the trailing space is part of the CSV header
PROMPT_ROWS = [0, 4, 9, 20, 21]  # positional rows of the prompts to run
RAW_OUTPUT_DIR = '../outputs/output_0/raw/secret_held_out_raw/third_time_output'

# Read the sentence groups that will be inserted into the prompts
sentence_df = pd.read_csv('../data/corpus/sen4prompting.csv')

# Read the prompt CSV, keeping only the 0-shot prompt column
prompts_df = pd.read_csv('../data/prompt_set/25_Prompts_Cov_met_sen.csv', usecols=[PROMPT_COLUMN])

# Make sure the output directory exists so the first write does not fail
os.makedirs(RAW_OUTPUT_DIR, exist_ok=True)

# Loop over every selected prompt
for prompt_index, prompt_row in prompts_df.iloc[PROMPT_ROWS].iterrows():
    prompt_template = prompt_row[PROMPT_COLUMN]
    # Loop over every sentence group
    for sentence_index, sentence_row in sentence_df.iterrows():
        sentence = sentence_row['query_sentence']
        # Insert the sentence group into the prompt's {text} placeholder
        modified_prompt = prompt_template.replace("{text}", sentence)

        # Call the OpenAI API to get a response for this prompt / sentence group pair
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": modified_prompt}],
            # NOTE(review): the original comment called 0.7 the default temperature -- confirm against the API docs
            temperature=0.7,
            max_tokens=4000
        )
        # Extract the generated text; fall back to a marker string if no choice was returned
        generated_text = response['choices'][0]['message']['content'] if response['choices'] else 'No response'

        # Filename convention "promptIndex_sentenceGroupIndex.txt" (both 1-based), e.g. "1_1.txt"
        filename = f"{RAW_OUTPUT_DIR}/{prompt_index + 1}_{sentence_index + 1}.txt"

        # Write the generated text to its own text file
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(generated_text)

        # Print information to track progress
        print(f"Written to {filename}")

print("All results have been written to text files.")

Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/1_1.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/1_2.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/5_1.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/5_2.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/10_1.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/10_2.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/21_1.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/21_2.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/22_1.txt
Written to ../outputs/output_0/raw/secret_held_out_raw/third_time_output/22_2.txt
All results have been written to text files.


After getting the raw model outputs, I check them and roughly sort them into the format needed for processing (one `word` or `word:label` per row). Details can be found in "../outputs/Output_Sorting_Guidelines.docx"

The next step further processes all the separate output files into a single format: every word is assigned a serial number, and the first word of each sentence always gets serial number 1. Sentences are separated by a blank row (the blank rows are already present in the output from the API). Later, the blank rows are removed and the txt files are turned into a three-column csv (word order, word, label).

Content in the txt looks like:

1. Hello
2. World


1. Time
2. flies:1

In [37]:
def process_text_file(file_path):
    """Renumber the rows of one sorted output file in place.

    Any leading "<digits>. " serial number is stripped from each row and a fresh
    one is written instead. Numbering restarts at 1 after every blank row; blank
    rows themselves are kept (as a bare newline) so sentence boundaries survive.

    Args:
        file_path: Path of the UTF-8 text file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    renumbered = []
    position = 0  # serial number of the most recently written row in the current sentence
    for raw in raw_lines:
        # Drop an existing serial number (digits, a period, then whitespace) if present
        content = re.sub(r'^\d+\.\s+', '', raw)

        if not content.strip():
            # Blank row: keep it as a sentence separator and restart the numbering
            renumbered.append('\n')
            position = 0
            continue

        position += 1
        renumbered.append(f"{position}. {content}")

    # Overwrite the file with the renumbered rows
    with open(file_path, 'w', encoding='utf-8') as target:
        target.writelines(renumbered)

def process_all_files_in_directory(directory_path):
    """Run process_text_file on every ``.txt`` file directly inside a directory.

    Args:
        directory_path: Directory whose text files are renumbered in place.
    """
    txt_names = (name for name in os.listdir(directory_path) if name.endswith('.txt'))
    for name in txt_names:
        file_path = os.path.join(directory_path, name)
        process_text_file(file_path)
        print(f"Processed {file_path}")

# Directory holding the manually sorted output files; every .txt file in it is
# renumbered in place (the files are overwritten, no copies are made).
directory_path = '../outputs/output_0/sorted/secret_held_out_sorted/third_time_output'
process_all_files_in_directory(directory_path)

Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\10_1.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\10_2.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\1_1.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\1_2.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\21_1.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\21_2.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\22_1.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\22_2.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\5_1.txt
Processed ../outputs/output_0/sorted/secret_held_out_sorted/third_time_output\5_2.txt


Since the sentence order is random, we need to build the gold-standard manual word list based on the new sentence order, so that we can compare the manual corpus with the model output.

1.Based on the new sentence order, get the "word" and "label" columns (and the "wordcat" column for NVAJ prompt comparison) from the manual corpus in the new order and write them into a new csv.

2.Merge the model output word lists from the txt files for every prompt.

3.Split the model output word lists into three column "order", "word_i", "label_i" (i refers to the prompt number), and write into the csv.

In [26]:
import os
import pandas as pd

# Function to merge all the outputs into a single list for every prompt
def merge_files_by_prefix(directory, prefix, n_files=10):
    """Concatenate the non-blank rows of ``<prefix>_1.txt`` ... ``<prefix>_<n_files>.txt``.

    Files are read in numeric order; files that do not exist are skipped. Each
    row is stripped of surrounding whitespace and blank rows are dropped.

    Args:
        directory: Directory containing the per-sentence-group output files.
        prefix: Prompt number used as the file name prefix (e.g. ``"10"``).
        n_files: Highest file index to look for (default 10, one file per fold).

    Returns:
        A list of stripped, non-empty rows from all files that were found.
    """
    all_lines = []
    for i in range(1, n_files + 1):
        file_path = os.path.join(directory, f"{prefix}_{i}.txt")
        if not os.path.isfile(file_path):
            continue
        # The files are written as UTF-8, so read them as UTF-8 as well instead of
        # relying on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            all_lines.extend(stripped for stripped in map(str.strip, file) if stripped)
    return all_lines

# Collect the distinct prompt prefixes of the .txt files in a directory
def get_unique_prefixes(directory):
    """Return the set of file-name prefixes (text before the first ``_``) of all ``.txt`` files.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A set of prefix strings, e.g. ``{"1", "10"}`` for ``1_1.txt`` and ``10_2.txt``.
    """
    return {
        name.split('_')[0]
        for name in os.listdir(directory)
        if name.endswith(".txt")
    }

# Prompt numbers that have at least one output file in the sorted directory
unique_prefixes = get_unique_prefixes(directory_path)

# Map every prompt number to the merged list of rows from all of its files
merged_texts = {
    prefix: merge_files_by_prefix(directory_path, prefix)
    for prefix in unique_prefixes
}

In [20]:
# Sanity check: show which prompt prefixes were found in the sorted output directory
unique_prefixes

{'1', '10', '21', '22', '5'}

In [21]:
# Sanity check: print the merged row list of prompt '10'
print(merged_texts['10'])

['1. Inflation:1', '2. has', '3. reached:1', '4. the', '5. North', '6. Pole', '7. as', '8. a', '9. Santa', '10. shortage', '11. looms:1', '1. Over', '2. the', '3. last', '4. year', '5. as', '6. vaccines', '7. and', '8. COVID', '9. treatments', '10. have', '11. become', '12. more', '13. available', '14. and', '15. society', '16. has', '17. gotten', '18. closer', '19. to', '20. normal', '21. both', '22. Hire', '23. Santa', '24. and', '25. the', '26. International', '27. Brotherhood', '28. of', '29. Real-Bearded', '30. Santas', '31. have', '32. worked', '33. to', '34. rebuild:1', '35. the', '36. Santa', '37. workforce', '1. We', '2. here', '3. at', '4. Hire', '5. Santa', '6. have', '7. been', '8. working', '9. very', '10. hard', '11. to', '12. replenish', '13. the', '14. number', '15. of', '16. Santas', '17. Allen', '18. says', '1. I', "2. 've", '3. traveled', '4. the', '5. country', '6. this', '7. last', '8. year', '9. going', '10. to', '11. Santa', '12. schools', '13. speaking', '14. at

In [22]:
# Sanity check: display the whole prefix -> merged rows dictionary
merged_texts

{'10': ['1. Inflation:1',
  '2. has',
  '3. reached:1',
  '4. the',
  '5. North',
  '6. Pole',
  '7. as',
  '8. a',
  '9. Santa',
  '10. shortage',
  '11. looms:1',
  '1. Over',
  '2. the',
  '3. last',
  '4. year',
  '5. as',
  '6. vaccines',
  '7. and',
  '8. COVID',
  '9. treatments',
  '10. have',
  '11. become',
  '12. more',
  '13. available',
  '14. and',
  '15. society',
  '16. has',
  '17. gotten',
  '18. closer',
  '19. to',
  '20. normal',
  '21. both',
  '22. Hire',
  '23. Santa',
  '24. and',
  '25. the',
  '26. International',
  '27. Brotherhood',
  '28. of',
  '29. Real-Bearded',
  '30. Santas',
  '31. have',
  '32. worked',
  '33. to',
  '34. rebuild:1',
  '35. the',
  '36. Santa',
  '37. workforce',
  '1. We',
  '2. here',
  '3. at',
  '4. Hire',
  '5. Santa',
  '6. have',
  '7. been',
  '8. working',
  '9. very',
  '10. hard',
  '11. to',
  '12. replenish',
  '13. the',
  '14. number',
  '15. of',
  '16. Santas',
  '17. Allen',
  '18. says',
  '1. I',
  "2. 've",
  '3

In [10]:
# This cell loads the inputs needed to extract the word and label columns (and the "wordcat" column
# for NVAJ prompt comparison) from the manual corpus in the new sentence order.
# Load the files
corpus_df = pd.read_csv('../data/corpus/Corpus_identification_con_met.csv')#Original manual annotation corpus
prompting_df = pd.read_csv('../data/corpus/sen4prompting.csv')#Sentences after random permutation

# Extract the "query_sentence" column into a list (one entry per row of sen4prompting.csv)
query_sentences = prompting_df['query_sentence'].tolist()

# Split sentence groups into single sentences and strip their numbering
def clean_and_split_sentences(sentences):
    """Flatten newline-separated sentence groups into a list of clean sentences.

    Each group is split on ``\\n``; a leading "<digits>." marker (with optional
    whitespace after it) is removed from every piece, the piece is stripped, and
    empty pieces are discarded.

    Args:
        sentences: Iterable of strings, each holding one or more sentences
            separated by newlines.

    Returns:
        A list of non-empty sentences in their original order.
    """
    numbering = re.compile(r'^\d+\.\s*')
    stripped_pieces = (
        numbering.sub('', piece).strip()
        for group in sentences
        for piece in group.split('\n')
    )
    return [text for text in stripped_pieces if text]

sentence_list = clean_and_split_sentences(query_sentences)

# Columns copied from the manual corpus; add 'wordcat' (and/or 'context') here if they are needed
MATCH_COLUMNS = ['word', 'DELMET']

# For every sentence (in the new order) collect the corpus rows whose 'context' equals it exactly.
# The per-sentence slices are concatenated once at the end instead of growing a DataFrame row by row.
matched_parts = []
unmatched_sentences = []
for sentence in sentence_list:
    matches = corpus_df.loc[corpus_df['context'] == sentence, MATCH_COLUMNS]
    if matches.empty:
        unmatched_sentences.append(sentence)
    else:
        matched_parts.append(matches)

if matched_parts:
    matched_df = pd.concat(matched_parts, ignore_index=True)
    # Store the manual label as an integer
    matched_df['DELMET'] = matched_df['DELMET'].astype(int)
else:
    matched_df = pd.DataFrame(columns=MATCH_COLUMNS)

# A sentence without an exact match contributes no words, which would silently shift the
# alignment with the model output -- report it instead of skipping it quietly.
if unmatched_sentences:
    print(f"Warning: {len(unmatched_sentences)} sentence(s) had no exact match in the corpus 'context' column")

# Display the matched DataFrame
print(matched_df)
# Write into csv file
matched_df.to_csv("../outputs/output_0/csv/raw/(secret)first_time_output.csv", index=False)

              word DELMET
0              And      0
1            there      0
2               is      0
3           always      0
4              the      0
...            ...    ...
4163  distribution      0
4164       network      1
4165            of      0
4166         bogus      0
4167        prints      0

[4168 rows x 2 columns]


In [28]:
# This cell of code combines the manual annotation with the model output
# Turn the merged row list of one prompt into DataFrame columns
def process_list(lst, list_number):
    """Parse rows of the form ``"<order>. <word>[:<label>]"`` into a DataFrame.

    Only the first period separates the order from the rest and only the last
    colon separates the word from its label, so words that themselves contain
    '.' or ':' (e.g. "U.S.", "3.5") are kept intact. Rows without a label get
    the label ``'0'``.

    Args:
        lst: List of row strings for one prompt.
        list_number: Prompt number, used as the suffix of the output columns.

    Returns:
        DataFrame with columns ``order``, ``word_<list_number>`` and
        ``label_<list_number>`` (labels are strings).

    Raises:
        ValueError: If a row does not start with an integer order.
    """
    parsed_data = []
    for item in lst:
        # Split on the FIRST period only: everything after it belongs to the word/label part
        order_text, _, remainder = item.partition('.')
        order = int(order_text)
        # Split on the LAST colon only: the label is the trailing part
        word_label = remainder.strip().rsplit(':', 1)
        word = word_label[0].strip()
        label = word_label[1].strip() if len(word_label) > 1 else '0'
        parsed_data.append((order, word, label))

    # Build the frame directly with the prompt-specific column names
    return pd.DataFrame(parsed_data, columns=['order', f'word_{list_number}', f'label_{list_number}'])

# CSV that already holds the manual annotation; the model columns are added to it
csv_filename = '../outputs/output_0/csv/raw/(secret)third_time_output.csv'
existing_df = pd.read_csv(csv_filename)

# Build the word_i / label_i columns of every prompt, in ascending prompt-number order so the
# column order does not depend on dictionary/set iteration order
model_frames = []
for list_number in sorted(merged_texts, key=lambda prefix: (len(prefix), prefix)):
    prompt_df = process_list(merged_texts[list_number], list_number)
    prompt_df = prompt_df[['word_' + str(list_number), 'label_' + str(list_number)]]
    if len(prompt_df) != len(existing_df):
        # A different length means the model word list is not aligned with the manual one
        print(f"Warning: prompt {list_number} has {len(prompt_df)} words, manual list has {len(existing_df)}")
    model_frames.append(prompt_df)

# Drop columns left over from a previous run of this cell so re-running it does not create duplicates
new_column_names = [col for frame in model_frames for col in frame.columns]
existing_df = existing_df.drop(columns=[col for col in new_column_names if col in existing_df.columns])

# Add all model columns next to the manual annotation in one concat
existing_df = pd.concat([existing_df] + model_frames, axis=1)

# Update the csv file
existing_df.to_csv(csv_filename, index=False)

When going through the model outputs, some of them may have a lot of missing words in the word list, and thus not align to the manual word list. For example,

Manual:

smash:1

the 

highest:1 

record 
    
--------------

Model:

smash:1

record

To automatically align the model words with the manual word list, the next cell of code will be used in this situation.

In [64]:
# This cell automatically aligns model outputs that have many missing words with the manual word list
def align_model_output(df, suffix):
    """Greedily align the model columns of one prompt with the manual ``word`` column.

    Model words are taken in order; each one is matched to the first identical
    manual word at or after the position following the previous match. Matched
    words and labels are written to ``aligned_word_<suffix>`` and
    ``aligned_label_<suffix>`` on the row of the manual word (rows without a
    match stay ``pd.NA``). Empty (NaN) model rows are padding and are skipped.

    Args:
        df: DataFrame with columns ``word``, ``word_<suffix>`` and
            ``label_<suffix>``; it is modified in place.
        suffix: Prompt number used in the column names, e.g. ``'1'``.

    Returns:
        DataFrame with columns ``unaligned_word_<suffix>`` and
        ``unaligned_label_<suffix>`` listing the model words that found no match.
    """
    manual_words = df['word'].tolist()
    aligned_words = [pd.NA] * len(manual_words)
    aligned_labels = [pd.NA] * len(manual_words)
    unaligned_words = []
    unaligned_labels = []
    # Position in the manual list from which the next search starts
    search_start_index = 0

    for model_word, model_label in zip(df[f'word_{suffix}'], df[f'label_{suffix}']):
        if pd.isna(model_word):
            # Padding row (model list shorter than manual list), not a real word
            continue
        for i in range(search_start_index, len(manual_words)):
            if model_word == manual_words[i]:
                aligned_words[i] = model_word
                aligned_labels[i] = model_label
                # Continue the next search right after the current match
                search_start_index = i + 1
                break
        else:
            # No match at or after the current search position
            unaligned_words.append(model_word)
            unaligned_labels.append(model_label)

    df[f'aligned_word_{suffix}'] = aligned_words
    df[f'aligned_label_{suffix}'] = aligned_labels
    return pd.DataFrame({
        f'unaligned_word_{suffix}': unaligned_words,
        f'unaligned_label_{suffix}': unaligned_labels
    })

# Load csv file
df = pd.read_csv('../outputs/output_0/csv/raw/(conNVAJ)third_time_output - 副本.csv') 

# Align the output of prompt 1. If more model outputs have this problem, call the function again
# with their prompt number, e.g. unaligned_df_21 = align_model_output(df, '21'), and save that
# frame to its own file, e.g. '../outputs/output_0/csv/raw/unaligned_file_21.csv'.
unaligned_df = align_model_output(df, '1')

# Save the aligned DataFrame to csv
df.to_csv('../outputs/output_0/csv/raw/aligned_file.csv', index=False) 

# Save the unaligned words to csv
unaligned_df.to_csv('../outputs/output_0/csv/raw/unaligned_file.csv', index=False) 

In this step the evaluation scores of the model outputs are calculated, which include

Accuracy

F1 score

Precision

Recall

Support (how many words or word pairs there are altogether)

Detected_num (How many conventional metaphors  model has detected)

Total_Num (How many conventional metaphors are there in manual annotation)

True_detected_num (Among the model detected conventional metaphors, how many of them are right)

True_Nonconv_num (Among the model detected non-conventional metaphors, how many of them are right)

Wrong_detected_num (Among the model detected conventional metaphors, how many of them are wrong)

Wrong_Nonconv_num (Among the model detected non-conventional metaphors, how many of them are wrong)

In [62]:
# Load the sorted model output csv file
df = pd.read_csv('../outputs/output_0/csv/sorted/(metaphor)first_time_output_sorted.csv', encoding="ISO-8859-1")

# Select non-deliberate metaphors (conventional metaphors): drop the rows labelled 2
df = df[df['DELMET'] != 2]
# For the prompts that only aim at detecting metaphors, use the line below instead of the line above
#df['DELMET'] = df['DELMET'].replace(2, 1)

# Automatically detect all model label columns (one per prompt)
label_columns = [col for col in df.columns if col.startswith('label_')]

# Keep only the gold label and the model label columns
columns = ['DELMET'] + label_columns
df = df[columns]

# One result row per prompt
results = pd.DataFrame(index=[col.replace('label_', 'Label_') for col in label_columns],
                       columns=['Accuracy', 'F1', 'Precision', 'Recall', 'Support', 'Detected_num', 'Total_Num',
                                'True_detected_num', 'True_Nonconv_num', 'Wrong_detected_num', 'Wrong_Nonconv_num'])

# Number of conventional metaphors in the manual annotation
delmet_ones_count = df['DELMET'].sum()
# Support = number of evaluated rows; it is the same for every prompt
support = len(df)

# Use the manual label as gold standard and calculate the metrics for each label column.
# NOTE(review): average='weighted' averages over both classes weighted by their support, so the
# weighted recall equals the accuracy; switch to average='binary' if positive-class scores are wanted.
for label_col in label_columns:
    accuracy = accuracy_score(df['DELMET'], df[label_col])
    f1 = f1_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    precision = precision_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    recall = recall_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    detected_num = df[label_col].sum()

    # Calculate TN, FP, FN, TP. labels=[0, 1] forces a 2x2 matrix so the unpacking also works when
    # a prompt predicts only one class; values other than 0/1 are not counted in the matrix.
    tn, fp, fn, tp = confusion_matrix(df['DELMET'], df[label_col], labels=[0, 1]).ravel()

    # Add results to DataFrame
    results.loc[label_col.replace('label_', 'Label_')] = [accuracy, f1, precision, recall, support, detected_num, delmet_ones_count,
                                                          tp, tn, fp, fn]

# Save the results to a new csv file
results.to_csv('../outputs/output_0/csv/results/(metaphor)seperate_multiple_time_results/(metaphor)first_results.csv')

The code below is an adapted version of the code above; it just adds two more evaluation columns:

NVAJ_total (How many noun, verb and adjective tokens there are in the manual annotation)

NVAJ_detected (How many noun, verb and adjective tokens in the manual annotation are detected by the model)

In [73]:
# Load the sorted model output csv file
df = pd.read_csv('../outputs/output_0/csv/sorted/(metNVAJ)third_time_output_sorted.csv', encoding="ISO-8859-1")

# Select non-deliberate metaphors (conventional metaphors): drop the rows labelled 2
df = df[df['DELMET'] != 2]
# For the prompts that only aim at detecting metaphors, use the line below instead of the line above
#df['DELMET'] = df['DELMET'].replace(2, 1)

# Automatically detect all model label columns (one per prompt)
label_columns = [col for col in df.columns if col.startswith('label_')]

# Keep only the gold label, the word category and the model label columns
columns = ['DELMET', 'wordcat'] + label_columns
df = df[columns]

# Keep only the rows whose word category is not one of the excluded (non noun/verb/adjective) tags
excluded_tags = ['CJ', 'EX', 'AV', 'AT', 'PN', 'PR', 'DP', 'DT', 'TO', 'XX', 'CR', 'OR', 'UN', 'ZZ']
wordcat_filtered = df[~df['wordcat'].isin(excluded_tags)]

# One result row per prompt.
# NOTE(review): the header ' NVAJ_detected' starts with a space. It is left unchanged so that new
# result files stay compatible with the ones already written; rename it everywhere at once if desired.
results = pd.DataFrame(index=[col.replace('label_', 'Label_') for col in label_columns],
                       columns=['Accuracy', 'F1', 'Precision', 'Recall', 'Support', ' NVAJ_detected', 'NVAJ_total', 'Detected_num', 'Total_Num',
                                'True_detected_num', 'True_Nonconv_num', 'Wrong_detected_num', 'Wrong_Nonconv_num'])

# Number of conventional metaphors in the manual annotation
delmet_ones_count = df['DELMET'].sum()
# Support = number of evaluated rows; it is the same for every prompt
support = len(df)
# Number of noun/verb/adjective tokens; it does not depend on the prompt
NVAJ_total = len(wordcat_filtered)
# Noun/verb/adjective tokens that are conventional metaphors in the manual annotation
nvaj_metaphors = wordcat_filtered[wordcat_filtered['DELMET'] == 1]

# Use the manual label as gold standard and calculate the metrics for each label column.
# NOTE(review): average='weighted' averages over both classes weighted by their support, so the
# weighted recall equals the accuracy; switch to average='binary' if positive-class scores are wanted.
for label_col in label_columns:
    accuracy = accuracy_score(df['DELMET'], df[label_col])
    f1 = f1_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    precision = precision_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    recall = recall_score(df['DELMET'], df[label_col], average='weighted', zero_division=0)
    detected_num = df[label_col].sum()

    # Calculate TN, FP, FN, TP. labels=[0, 1] forces a 2x2 matrix so the unpacking also works when
    # a prompt predicts only one class; values other than 0/1 are not counted in the matrix.
    tn, fp, fn, tp = confusion_matrix(df['DELMET'], df[label_col], labels=[0, 1]).ravel()

    # Sum of the model labels over the manually annotated noun/verb/adjective metaphors
    NVAJ_detected = nvaj_metaphors[label_col].sum()

    # Add results to DataFrame
    results.loc[label_col.replace('label_', 'Label_')] = [accuracy, f1, precision, recall, support, NVAJ_detected, NVAJ_total, detected_num, delmet_ones_count,
                                                          tp, tn, fp, fn]


# Save the results to a new csv file
results.to_csv('../outputs/output_0/csv/results/(metNVAJ)seperate_multiple_time_results/(metNVAJ)third_results.csv')

Since the model has deviations in each output result under the same prompt, the experiment was repeated three times. The following code is used to calculate the average of the three results.

In [88]:
# Load the result files of all repeated runs
RUN_RESULTS_DIR = '../outputs/output_0/csv/results/(lexiNVAJ)seperate_multiple_time_results'
file_names = [f'{RUN_RESULTS_DIR}/(lexiNVAJ){run}_results.csv' for run in ('third', 'second', 'first')]
# The first csv column holds the prompt labels (Label_i) and becomes the index
dataframes = [pd.read_csv(file_name, index_col=0) for file_name in file_names]

# Average every metric over the runs, separately for each prompt label (rows with the same index value)
mean_df = pd.concat(dataframes).groupby(level=0).mean()

# Save average results to CSV file
mean_df.to_csv('../outputs/output_0/csv/results/overall_results/(lexiNVAJ)average_results.csv')

For further evaluation. Details to be updated

In [32]:
# For every prompt label and metric: run a Shapiro-Wilk test on the values from the repeated runs
# and, depending on its outcome, compare the runs with ANOVA or Kruskal-Wallis.
# NOTE(review): the ANOVA / Kruskal-Wallis call compares the whole metric column of each run
# (all prompts pooled), so its statistic and p-value are identical for every label; only the
# choice between the two tests depends on the label. Confirm that this is the intended comparison.
test_results = {}
# The comparison of the runs only depends on (metric, test), so compute it once and reuse it
group_test_cache = {}

for label in dataframes[0].index:  # The index of all the DataFrames is the same
    test_results[label] = {}
    for column in dataframes[0].columns:  # The column names of all the DataFrames are the same
        # Values of this prompt and metric, one per repeated run
        group_values = [run_df.loc[label, column] for run_df in dataframes]

        # Shapiro-Wilk: p > 0.05 is treated as "normally distributed" -> ANOVA, otherwise Kruskal-Wallis
        if stats.shapiro(group_values).pvalue > 0.05:
            test_name, test_function = 'ANOVA', stats.f_oneway
        else:
            test_name, test_function = 'Kruskal-Wallis', stats.kruskal

        cache_key = (column, test_name)
        if cache_key not in group_test_cache:
            statistic, p_value = test_function(*[run_df[column] for run_df in dataframes])
            group_test_cache[cache_key] = (statistic, p_value)
        test_results[label][column] = (test_name, *group_test_cache[cache_key])

# output
test_results



{'Label_5': {'Accuracy': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'F1': ('ANOVA', 0.08427946763333088, 0.920243155451183),
  'Precision': ('ANOVA', 1.6029187942100094, 0.27686230167671444),
  'Recall': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'Support': ('ANOVA', nan, nan)},
 'Label_22': {'Accuracy': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'F1': ('ANOVA', 0.08427946763333088, 0.920243155451183),
  'Precision': ('ANOVA', 1.6029187942100094, 0.27686230167671444),
  'Recall': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'Support': ('ANOVA', nan, nan)},
 'Label_10': {'Accuracy': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'F1': ('ANOVA', 0.08427946763333088, 0.920243155451183),
  'Precision': ('ANOVA', 1.6029187942100094, 0.27686230167671444),
  'Recall': ('ANOVA', 1.2230414746543845, 0.3584988147400658),
  'Support': ('ANOVA', nan, nan)}}

In [33]:
import pandas as pd

# Flatten the nested test results into one row per (label, metric), leaving out the 'Support' metric
rows = [
    [label, metric, *values]  # values is the (test name, statistic, p-value) tuple
    for label, metrics in test_results.items()
    for metric, values in metrics.items()
    if metric != 'Support'
]

# Table of the significance tests, written together with its row index
df_no_support = pd.DataFrame(rows, columns=['Label', 'Metric', 'Test', 'Statistic', 'P-Value'])
df_no_support.to_csv('../outputs/output_0/csv/results/overall_results/[metaphor]multiple_time_result_significance_comparison.csv')

Evaluation part to be finished:
 
1. Post check: if the auto processing output has tokenization problem so the length is different from ground truth in the corpus.

2. Filter out the deliberate metaphorical words.

3. Calculate the scores.For example,

ground_truth = csv_data['maunal annotation']
model_output = csv_data['model_output']

def calculate_metrics(ground_truth, model_output):

    return {
        'F1 Score': f1_score(ground_truth, model_output),
        'Accuracy': accuracy_score(ground_truth, model_output),
        'Recall': recall_score(ground_truth, model_output),
        'Precision': precision_score(ground_truth, model_output),
        'Support': classification_report(ground_truth, model_output, output_dict=True)['1']['support']
    }

metrics_results = calculate_metrics(ground_truth, model_output)

4. Store the scores in a csv.

5. Evaluation: permutation test: whether there is significant difference in comparison of the scores of the same prompts running multiple times, and comparison of the average scores of different prompts; Confidence interval of (accuracy, F1...).