In [2]:
#import needed libraries
import pandas as pd
import openai
import os
import csv
import re
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from scipy import stats

In [1]:
def convert_to_iob(word_list):
    """Convert ``:1``-suffixed tokens into tab-separated IOB-tagged tokens.

    A token ending in ``:1`` has that suffix removed and is emitted as
    ``<token><TAB>B-ENT`` when it opens a run of marked tokens, or as
    ``<token><TAB>I-ENT`` when it directly follows another marked token.
    Every other token is passed through unchanged (no tag is appended).

    Args:
        word_list: Sequence of token strings.

    Returns:
        A new list with one string per input token, in the original order.
    """
    tagged = []
    inside_run = False  # True while the previous token carried the ':1' marker
    for token in word_list:
        if not token.endswith(':1'):
            tagged.append(token)
            inside_run = False
            continue
        position = "I" if inside_run else "B"
        # Drop the two-character ':1' marker before attaching the tag.
        tagged.append(f"{token[:-2]}\t{position}-ENT")
        inside_run = True
    return tagged

def process_csv(input_file, output_file, text_col, encoding=None):
    """Copy a CSV file, rewriting one column's word list with IOB tags.

    Each value of ``text_col`` is treated as a newline-separated list of
    tokens, passed through ``convert_to_iob`` and joined back with ``\\n``.
    All other columns and the column order are written out unchanged.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (overwritten if present).
        text_col: Name of the column holding the newline-separated tokens.
        encoding: Text encoding used for both files. ``None`` (the default)
            keeps the platform default, as before.

    Raises:
        KeyError: If ``text_col`` is not a column of ``input_file``. The check
            runs before the output file is opened, so a bad column name no
            longer leaves a header-only output file behind.
    """
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields (the word lists) are parsed correctly on every platform.
    with open(input_file, 'r', newline='', encoding=encoding) as infile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames or []  # keep the original column names
        if text_col not in fieldnames:
            raise KeyError(
                f"column {text_col!r} not found in {input_file!r}; "
                f"available columns: {fieldnames}"
            )

        with open(output_file, 'w', newline='', encoding=encoding) as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                # A short row yields None for missing cells; treat it as empty.
                cell = (row[text_col] or '').strip()
                # With newline='' the reader no longer translates '\r\n', so
                # normalise line endings here before splitting into tokens.
                cell = cell.replace('\r\n', '\n').replace('\r', '\n')
                row[text_col] = '\n'.join(convert_to_iob(cell.split('\n')))
                writer.writerow(row)

# Example run: source CSV, destination CSV, and the name of the column that
# holds the newline-separated word list.
input_file_path = '../data/corpus/example4prompting.csv'
output_file_path = 'example_IBO.csv'
text_col = 'word_list'

# Rewrite the word-list column with IOB tags and save the result.
process_csv(input_file=input_file_path, output_file=output_file_path, text_col=text_col)

# User-facing message (Chinese): "IOB tagging result saved to <path>".
print(f"IOB标注结果已保存到 {output_file_path}")


IOB标注结果已保存到 example_IBO.csv


In [14]:
import pandas as pd

# Load the IOB-tagged example file ('example_IBO.csv' is the path that the
# process_csv example run writes to).
df = pd.read_csv('example_IBO.csv')

# Pick the rows of one split/group and return their sentences and word lists
def select_and_combine(df, split_label, group_label):
    """Return the sentences and labels of the rows matching a split and group.

    Args:
        df: DataFrame with 'split', 'group', 'context' and 'word_list' columns.
        split_label: Value the 'split' column must equal.
        group_label: Value the 'group' column must equal.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists: the 'context'
        values and the 'word_list' values of the selected rows, in row order.
        Both lists are empty when no row matches.
    """
    row_mask = (df['split'] == split_label) & (df['group'] == group_label)
    subset = df[row_mask]

    # The two columns are returned side by side; nothing is concatenated.
    sentences = subset['context'].tolist()
    labels = subset['word_list'].tolist()
    return sentences, labels

# Example: take the rows of the 'train' split whose group is 1. The resulting
# `sentences` / `labels` lists are reused by the prompt-building cells below.
sentences, labels = select_and_combine(df, split_label='train', group_label=1)

# Show each sentence, then its tagged word list, then a blank separator line
for sentence, label in zip(sentences, labels):
    print(sentence, label, "", sep="\n")

The hotel lies in five acres of fields and garden and the Slingos who own it make the most of their beautiful surroundings by running painting courses for about £200 a week including breakfast and dinner or £90 for a long weekend
The
hotel
lies	B-ENT
in
five
acres
of
fields
and
garden
and
the
Slingos
who
own
it
make	B-ENT
the
most
of
their
beautiful
surroundings
by
running	B-ENT
painting
courses	B-ENT
for
about	B-ENT
£200
a
week
including
breakfast
and
dinner
or
£90
for
a
long	B-ENT
weekend



In [16]:
# Load the prompt templates
prompts_df = pd.read_csv('25_prompts_nshot.csv')

# Load the query sentences
sen4prompting_df = pd.read_csv('../data/corpus/sen4prompting.csv')

# grouped_prompts[p][q] is the full prompt text for template p and query q
grouped_prompts = []

for template in prompts_df['1-n shot']:
    # Few-shot part: the template filled in once per (sentence, label) example,
    # each copy followed by a blank line.
    primary_output = "".join(
        template.replace("{text}", sentence).replace("{label}", label) + "\n\n"
        for sentence, label in zip(sentences, labels)
    )

    # Query part: the same template with {text} set to the query sentence and
    # {label} left empty, appended after the few-shot part.
    grouped_prompts.append([
        primary_output + template.replace("{text}", query_sentence).replace("{label}", "") + "\n\n"
        for query_sentence in sen4prompting_df['query_sentence']
    ])

# Print the first sub-group of the first prompt ...
# (printed heading, Chinese: "Content of the first sub-group:")
print("输出第一个子组的内容:")
print(grouped_prompts[0][0])

# ... and the second sub-group of the prompt at index 23
# (printed heading, Chinese: "Content of the second sub-group:")
print("\n输出第二个子组的内容:")
print(grouped_prompts[23][1])


输出第一个子组的内容:
You will perform a conventional metaphor detection task.  List every word from every sentence from a set of unrelated sentences, one word per row. Mark conventional metaphors with with an IOB tagging schema. Use an empty row to separate word lists from different sentences. 
Sentences:The hotel lies in five acres of fields and garden and the Slingos who own it make the most of their beautiful surroundings by running painting courses for about £200 a week including breakfast and dinner or £90 for a long weekend
Labels: The
hotel
lies	B-ENT
in
five
acres
of
fields
and
garden
and
the
Slingos
who
own
it
make	B-ENT
the
most
of
their
beautiful
surroundings
by
running	B-ENT
painting
courses	B-ENT
for
about	B-ENT
£200
a
week
including
breakfast
and
dinner
or
£90
for
a
long	B-ENT
weekend

You will perform a conventional metaphor detection task.  List every word from every sentence from a set of unrelated sentences, one word per row. Mark conventional metaphors with with an IOB taggin

In [17]:
# Read the query sentences
sentence_df = pd.read_csv('../data/corpus/sen4prompting.csv', usecols=['query_sentence'], encoding='latin1')

# Read the prompt templates
prompts_df = pd.read_csv('25_prompts_nshot.csv', usecols=['1-n shot'])

# Preview the assembled prompts for the templates from row position 23 onward
for prompt_index, prompt_row in prompts_df[23:].iterrows():
    template = prompt_row['1-n shot']

    # Few-shot part built from `sentences` and `labels`, which must already
    # exist in the kernel (they are produced by the select_and_combine cell).
    primary_output = "".join(
        template.replace("{text}", sentence).replace("{label}", label) + "\n\n"
        for sentence, label in zip(sentences, labels)
    )

    # One prompt per query sentence: {text} filled in, {label} left empty
    for query_sentence in sentence_df['query_sentence']:
        combined_output = primary_output + template.replace("{text}", query_sentence).replace("{label}", "") + "\n\n"
        print(combined_output)

Task: Detect conventional metaphors in the provided sentences.

Guidelines:

Check: Determine if there is a difference between the literal and contextual meanings of the words.
Confirm: Verify if the metaphorical meaning is recognized in dictionaries.
Detect: If confirmed, the word is considered a conventional metaphor.

Process:

Read: Comprehend the message of each input sentence.
Find: Search for words that are used metaphorically in each input sentence.
Check: Consult a dictionary to confirm the conventional metaphorical use.
List: Enumerate all words from each sentence.
Mark: Apply an IOB tagging schema to mark any conventional metaphorical word.

Output:

Present every word from each sentence, one word per row, with conventional metaphors labelled based on an IOB tagging schema.
Use an empty row to separate word lists from different sentences. 
Provide the results only for the "Mark" step.
Input Sentences: The hotel lies in five acres of fields and garden and the Slingos who own 

In [18]:
import os

import pandas as pd
import openai

# Read the OpenAI API key from the environment; a key must never be stored in
# the notebook. NOTE(review): the key that used to be hard-coded here has been
# exposed in source and should be revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
openai.api_key = api_key

# Read the query sentences
sentence_df = pd.read_csv('../data/corpus/sen4prompting.csv', usecols=['query_sentence'], encoding='latin1')

# Read the few-shot prompt templates
prompts_df = pd.read_csv('25_prompts_nshot.csv', usecols=['1-n shot'])

# Make sure the destination exists so the first write cannot fail after an
# API call has already been paid for.
output_dir = "outputs/output_1/raw"
os.makedirs(output_dir, exist_ok=True)

# Only the templates at these row positions are sent to the model
for prompt_index, prompt_row in prompts_df.iloc[[3, 20, 17, 19, 16]].iterrows():
    template = prompt_row['1-n shot']

    # Few-shot part built from `sentences` and `labels`, which must already
    # exist in the kernel (they are produced by the select_and_combine cell).
    primary_output = ""
    for sentence, label in zip(sentences, labels):
        primary_output += template.replace("{text}", sentence).replace("{label}", label) + "\n\n"

    # One request per query sentence
    for sentence_index, sentence_row in sentence_df.iterrows():
        # Fill in {text} and leave {label} empty for the model to complete
        combined_output = primary_output + template.replace("{text}", sentence_row['query_sentence']).replace("{label}", "") + "\n\n"

        # Ask the model for a completion
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": combined_output}],
            temperature=0.7,  # sampling temperature
            max_tokens=4000
        )

        # Extract the generated text
        generated_text = response['choices'][0]['message']['content'] if response['choices'] else 'No response'

        # File naming rule "promptIndex_sentenceGroupIndex.txt", e.g. "1_1.txt"
        # (both indices are shifted to start at 1)
        filename = f"{output_dir}/{prompt_index + 1}_{sentence_index + 1}.txt"

        # Write the generated text to its own file
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(generated_text)

        # Progress message
        print(f"Written to {filename}")

print("All results have been written to text files.")


Written to outputs/output_1/raw/4_1.txt
Written to outputs/output_1/raw/4_2.txt
Written to outputs/output_1/raw/4_3.txt
Written to outputs/output_1/raw/4_4.txt
Written to outputs/output_1/raw/4_5.txt
Written to outputs/output_1/raw/4_6.txt
Written to outputs/output_1/raw/4_7.txt
Written to outputs/output_1/raw/4_8.txt
Written to outputs/output_1/raw/4_9.txt
Written to outputs/output_1/raw/4_10.txt
Written to outputs/output_1/raw/21_1.txt
Written to outputs/output_1/raw/21_2.txt
Written to outputs/output_1/raw/21_3.txt
Written to outputs/output_1/raw/21_4.txt
Written to outputs/output_1/raw/21_5.txt
Written to outputs/output_1/raw/21_6.txt
Written to outputs/output_1/raw/21_7.txt
Written to outputs/output_1/raw/21_8.txt
Written to outputs/output_1/raw/21_9.txt
Written to outputs/output_1/raw/21_10.txt
Written to outputs/output_1/raw/18_1.txt
Written to outputs/output_1/raw/18_2.txt
Written to outputs/output_1/raw/18_3.txt
Written to outputs/output_1/raw/18_4.txt
Written to outputs/outpu

In [20]:
import os

# Read the OpenAI API key from the environment; a key must never be stored in
# the notebook. NOTE(review): the key that used to be hard-coded here has been
# exposed in source and should be revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
openai.api_key = api_key

# Read the query sentences
sentence_df = pd.read_csv('../data/corpus/sen4prompting.csv', usecols=['query_sentence'])

# Read the zero-shot prompt templates (the column name ends with a space)
prompts_df = pd.read_csv('25_prompts_0shot.csv', usecols=['0 shot - conventional metaphor '])

# Make sure the destination exists so the first write cannot fail after an
# API call has already been paid for.
output_dir = "outputs/output_0/raw"
os.makedirs(output_dir, exist_ok=True)

# Only the templates at these row positions are sent to the model
for prompt_index, prompt_row in prompts_df.iloc[[3, 20, 17, 19, 16]].iterrows():
    template = prompt_row['0 shot - conventional metaphor ']

    # One request per query sentence
    for sentence_index, sentence_row in sentence_df.iterrows():
        sentence = sentence_row['query_sentence']
        # Insert the query sentence into the template's {text} placeholder
        modified_prompt = template.replace("{text}", sentence)

        # Call the OpenAI API to get a response for this prompt
        response = openai.ChatCompletion.create(
            model="gpt-4-1106-preview",
            messages=[{"role": "user", "content": modified_prompt}],
            temperature=0.7,  # sampling temperature
            max_tokens=4000
        )
        # Extract the generated text
        generated_text = response['choices'][0]['message']['content'] if response['choices'] else 'No response'

        # Filename convention "promptIndex_sentenceGroupIndex.txt", e.g. "1_1.txt"
        # (both indices are shifted to start at 1)
        filename = f"{output_dir}/{prompt_index + 1}_{sentence_index + 1}.txt"

        # Write the generated text to a text file
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(generated_text)

        # Print information to track progress
        print(f"Written to {filename}")

print("All results have been written to text files.")

Written to outputs/output_0/raw/4_1.txt
Written to outputs/output_0/raw/4_2.txt
Written to outputs/output_0/raw/4_3.txt
Written to outputs/output_0/raw/4_4.txt
Written to outputs/output_0/raw/4_5.txt
Written to outputs/output_0/raw/4_6.txt
Written to outputs/output_0/raw/4_7.txt
Written to outputs/output_0/raw/4_8.txt
Written to outputs/output_0/raw/4_9.txt
Written to outputs/output_0/raw/4_10.txt
Written to outputs/output_0/raw/21_1.txt
Written to outputs/output_0/raw/21_2.txt
Written to outputs/output_0/raw/21_3.txt
Written to outputs/output_0/raw/21_4.txt
Written to outputs/output_0/raw/21_5.txt
Written to outputs/output_0/raw/21_6.txt
Written to outputs/output_0/raw/21_7.txt
Written to outputs/output_0/raw/21_8.txt
Written to outputs/output_0/raw/21_9.txt
Written to outputs/output_0/raw/21_10.txt
Written to outputs/output_0/raw/18_1.txt
Written to outputs/output_0/raw/18_2.txt
Written to outputs/output_0/raw/18_3.txt
Written to outputs/output_0/raw/18_4.txt
Written to outputs/outpu

In [34]:
def process_text_file(file_path):
    """Renumber the lines of a UTF-8 text file in place.

    A leading "<digits>. " serial number is stripped from every line. Each
    non-blank line then gets a fresh serial number starting at 1; a blank line
    is written back as a bare newline and restarts the numbering.

    Args:
        file_path: Path of the file to read and overwrite.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        original_lines = source.readlines()

    renumbered = []
    counter = 0  # serial number of the most recent non-blank line in this group
    for raw_line in original_lines:
        # Strip only a serial number made of digits, a period and whitespace
        content = re.sub(r'^\d+\.\s+', '', raw_line)

        if not content.strip():
            # Blank line: keep it as a separator and restart the numbering
            renumbered.append('\n')
            counter = 0
            continue

        counter += 1
        renumbered.append(f"{counter}. {content}")

    # Overwrite the file with the renumbered lines
    with open(file_path, 'w', encoding='utf-8') as target:
        target.writelines(renumbered)

def process_all_files_in_directory(directory_path):
    """Run ``process_text_file`` on every ``.txt`` file directly in a directory.

    Args:
        directory_path: Directory to scan (not recursive). A progress line is
            printed after each file is processed.
    """
    txt_names = [name for name in os.listdir(directory_path) if name.endswith('.txt')]
    for name in txt_names:
        file_path = os.path.join(directory_path, name)
        process_text_file(file_path)
        print(f"Processed {file_path}")
# Directory whose .txt files are renumbered in place; `directory_path` is also
# read by the merging cell further down.
directory_path = 'outputs/output_10/sorted'
process_all_files_in_directory(directory_path)

Processed outputs/output_10/sorted\17_1.txt
Processed outputs/output_10/sorted\17_10.txt
Processed outputs/output_10/sorted\17_2.txt
Processed outputs/output_10/sorted\17_3.txt
Processed outputs/output_10/sorted\17_4.txt
Processed outputs/output_10/sorted\17_5.txt
Processed outputs/output_10/sorted\17_6.txt
Processed outputs/output_10/sorted\17_7.txt
Processed outputs/output_10/sorted\17_8.txt
Processed outputs/output_10/sorted\17_9.txt
Processed outputs/output_10/sorted\18_1.txt
Processed outputs/output_10/sorted\18_10.txt
Processed outputs/output_10/sorted\18_2.txt
Processed outputs/output_10/sorted\18_3.txt
Processed outputs/output_10/sorted\18_4.txt
Processed outputs/output_10/sorted\18_5.txt
Processed outputs/output_10/sorted\18_6.txt
Processed outputs/output_10/sorted\18_7.txt
Processed outputs/output_10/sorted\18_8.txt
Processed outputs/output_10/sorted\18_9.txt
Processed outputs/output_10/sorted\20_1.txt
Processed outputs/output_10/sorted\20_10.txt
Processed outputs/output_10/s

In [35]:
import os
import pandas as pd

# Merge the per-fold output files of one prompt into a single list of lines
def merge_files_by_prefix(directory, prefix, num_files=10):
    """Concatenate the non-blank lines of ``<prefix>_1.txt`` .. ``<prefix>_<num_files>.txt``.

    Args:
        directory: Directory containing the files.
        prefix: File-name prefix (the part before the underscore).
        num_files: Highest file index to look for; files are read in the order
            1..num_files. Defaults to 10, the value that used to be hard-coded.

    Returns:
        A list of stripped, non-blank lines in file order. Files that do not
        exist are skipped, so the list is empty if none are found.
    """
    all_lines = []
    for i in range(1, num_files + 1):
        file_path = os.path.join(directory, f"{prefix}_{i}.txt")
        if not os.path.isfile(file_path):
            continue
        # The files are written as UTF-8 elsewhere in this notebook; reading
        # them with the platform default encoding garbles non-ASCII characters
        # (e.g. '£') on systems whose default is not UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            all_lines.extend(line.strip() for line in file if line.strip())
    return all_lines

# Collect the distinct file-name prefixes found in a directory
def get_unique_prefixes(directory):
    """Return the set of prefixes of the ``.txt`` files in ``directory``.

    The prefix of a file is the part of its name before the first underscore;
    a name without an underscore is its own prefix.
    """
    return {
        name.split('_')[0]
        for name in os.listdir(directory)
        if name.endswith(".txt")
    }

unique_prefixes = get_unique_prefixes(directory_path)

# Iterate the prefixes in a fixed order (numeric prefixes by value, anything
# else afterwards alphabetically). Iterating the set directly gives an order
# that can change between kernel sessions, which made the column order of the
# CSV built from `merged_texts` differ from run to run.
ordered_prefixes = sorted(
    unique_prefixes,
    key=lambda p: (not p.isdecimal(), int(p) if p.isdecimal() else 0, p),
)

# Map each prefix to the merged list of lines from its files
merged_texts = {}
for prefix in ordered_prefixes:
    merged_texts[prefix] = merge_files_by_prefix(directory_path, prefix)

In [37]:
# This cell combines the manual annotation with the model output.
# Turn the merged line list of one prompt into columns for the CSV file.
def process_list(lst, list_number):
    """Parse numbered "order. word [label]" lines into a DataFrame.

    Each item is split at its FIRST period only, so words that themselves
    contain a period (e.g. "£2.50", "Mr.") are kept whole together with their
    label. The text after the period is split on whitespace: the first token is
    the word, the second (if present) is the label, otherwise the label is 'O'.

    Args:
        lst: Iterable of strings such as "3. lies<TAB>B-ENT" or "4. in".
        list_number: Identifier used as the suffix of the output column names.

    Returns:
        DataFrame with columns 'order', 'word_<list_number>' and
        'label_<list_number>', one row per input item.

    Raises:
        ValueError: If the text before the first period is not an integer.
    """
    parsed_data = []
    for item in lst:
        number_text, _, remainder = item.partition('.')
        order = int(number_text)
        # split() with no argument collapses repeated spaces/tabs, so a doubled
        # separator no longer produces an empty label.
        tokens = remainder.split()
        word = tokens[0] if tokens else ''
        label = tokens[1] if len(tokens) > 1 else 'O'
        parsed_data.append((order, word, label))

    # Build the frame directly with the final column names
    return pd.DataFrame(
        parsed_data,
        columns=['order', f'word_{list_number}', f'label_{list_number}'],
    )

# CSV file that holds the manual annotation and receives the model columns
csv_filename = 'outputs/output_10/csv/10_raw_output.csv'
existing_df = pd.read_csv(csv_filename)

# Build the word_<n> / label_<n> column pair for every prompt
new_frames = []
for list_number, lst in merged_texts.items():
    pair_columns = [f'word_{list_number}', f'label_{list_number}']
    new_frames.append(process_list(lst, list_number)[pair_columns])

# The file is read and written in place, so a second run would otherwise
# append a duplicate copy of every column. Drop any stale copies first.
new_column_names = [col for frame in new_frames for col in frame.columns]
existing_df = existing_df.drop(columns=new_column_names, errors='ignore')

# Attach all new columns in one concat (rows are aligned on the index)
existing_df = pd.concat([existing_df, *new_frames], axis=1)

# Update the csv file
existing_df.to_csv(csv_filename, index=False)

In [50]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd

# Load the sorted labels
df = pd.read_csv('outputs/output_5/csv/5_calculate_output.csv')

# Select non-deliberate metaphors (conventional metaphors): drop rows whose
# DELMET value is 2
df = df[df['DELMET'] != 2]
# For the prompts that only aim at detecting metaphors, use the line below
# instead of the filter above
#df['DELMET'] = df['DELMET'].replace(2, 1)

# Automatically detect all the columns starting with 'label_'
label_columns = [col for col in df.columns if col.startswith('label_')]

# Result table: one row per label column, one column per metric
results = pd.DataFrame(index=[col.replace('label_', 'Label_') for col in label_columns],
                       columns=['Accuracy', 'F1', 'Precision', 'Recall', 'Detected_num', 'Total_Num',
                                'True_Positive', 'True_Negative', 'False_Positive', 'False_Negative'])

# Sum of the manual DELMET annotation over the remaining rows (stored as Total_Num)
delmet_ones_count = df['DELMET'].sum()

# Compute the metrics of every label column against the manual annotation
for label_col in label_columns:
    y_true = df['DELMET']
    y_pred = df[label_col]

    # Performance metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Confusion matrix. labels=[0, 1] forces a 2x2 matrix so the four-way
    # unpacking also works when a column contains only one of the two classes
    # (e.g. a prompt that never predicts a metaphor).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Number of rows predicted as positive
    detected_num = y_pred.sum()

    # Print the results of this label column
    print(f"prompt: {label_col}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Detected Positives: {detected_num}")
    print(f"True Positives: {tp}")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print("\n")

    # Store the results in the table
    results.loc[label_col.replace('label_', 'Label_')] = [accuracy, f1, precision, recall, detected_num, delmet_ones_count,
                                                          tp, tn, fp, fn]

# Save the result table to a new csv file
results.to_csv('outputs/output_5/csv/5_results.csv')

prompt: label_18
Accuracy: 0.8512
F1 Score: 0.1987
Precision: 0.5139
Recall: 0.1231
Detected Positives: 144
True Positives: 74
True Negatives: 3341
False Positives: 70
False Negatives: 527


prompt: label_4
Accuracy: 0.8447
F1 Score: 0.2280
Precision: 0.4466
Recall: 0.1531
Detected Positives: 206
True Positives: 92
True Negatives: 3297
False Positives: 114
False Negatives: 509


prompt: label_21
Accuracy: 0.8509
F1 Score: 0.2411
Precision: 0.5080
Recall: 0.1581
Detected Positives: 187
True Positives: 95
True Negatives: 3319
False Positives: 92
False Negatives: 506


prompt: label_17
Accuracy: 0.8412
F1 Score: 0.2408
Precision: 0.4244
Recall: 0.1681
Detected Positives: 238
True Positives: 101
True Negatives: 3274
False Positives: 137
False Negatives: 500


prompt: label_20
Accuracy: 0.8455
F1 Score: 0.2383
Precision: 0.4554
Recall: 0.1614
Detected Positives: 213
True Positives: 97
True Negatives: 3295
False Positives: 116
False Negatives: 504


