#### A brave new ADVERSARIAL FILTERING for our paper

In [1]:
# Setting
import os
import torch

# Expose only GPU 0 to this process; this is set before CUDA availability is queried below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Switch to the sibling `model` directory; the relative paths used later
# (e.g. "../config.json") are resolved from there.
os.chdir(os.path.join(os.getcwd(), os.pardir, 'model'))
print(f"Current working directory: {os.getcwd()}")

# Library
import pandas as pd
import numpy as np
import ast
import re
import os
import json
from tqdm import tqdm
from openai import OpenAI
from langchain.prompts.chat import ChatPromptTemplate
from utils_filtering import *
from maverick import Maverick

# Augmentation Index: position in `csv_files` at which the augmentation loop starts.
case_idx = 0

# Config: location of the per-case CSV files and of the JSON file holding the OpenAI key.
folder_path = "../data/LitBank_Case/"
config_path = "../config.json"
with open(config_path, mode='r') as f:
    config = json.load(f)
open_api_key = config['openai_api']

Using device: cuda
Current working directory: /home/gayeon39/gayeon/[연구]Coreference Resolution/coref-main/model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Model
# Use the first visible GPU when CUDA is available and fall back to CPU otherwise,
# mirroring the availability check done in the setup cell instead of hard-coding "cuda:0".
model = Maverick(
    hf_name_or_path="sapienzanlp/maverick-mes-litbank",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
# OpenAI client used by the augmentation loop to request adjective edits.
client = OpenAI(api_key=open_api_key)

sapienzanlp/maverick-mes-litbank loading




In [5]:
# Load Data
# os.listdir returns entries in arbitrary order, so sort the names to make the
# file list (and therefore the meaning of `case_idx`) deterministic even when
# this cell is re-run on its own.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith(".csv"))
len(csv_files)

80

In [4]:
def extract_number(filename):
    """Return the integer that directly precedes ``.csv`` after an underscore.

    Used as a sort key: names without a trailing ``_<digits>.csv`` yield
    ``float('inf')`` so that they are ordered after all numbered files.
    """
    found = re.search(r'_(\d+)\.csv$', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

# Order the case files by their numeric suffix, then load the first one as a preview.
csv_files = sorted(csv_files, key=extract_number)
df_case = pd.read_csv(folder_path + csv_files[0])

# These columns are stored as Python-literal strings in the CSV; parse them back into objects.
df_case = df_case.assign(**{
    column: [ast.literal_eval(data) for data in df_case[column]]
    for column in ('extracted_sentence', 'text', 'coref', 'adjusted_offsets')
})

In [22]:
# Round-1 prompt: asks the LLM to insert one adjective before coreference terms.
# The answer is parsed with the regex "Updated Coreference Words\s*:\s*(\[\[.*?\]\])",
# so the Output Format section must describe a nested list of words (not a sentence).
prompt_text = """You will be given a sentence in OntoNotes format along with a coreference cluster and its offsets. Your task is to add **only one adjective** that aligns with the given coreference term. The adjective must be placed **immediately before** the term within the sentence.

### Guidelines:
1. Identify the words in the sentence that correspond to each offset.
2. Updated Coreference Offsets should be calculated step by step.
3. For each remaining term (starting from the second), add **only one adjective** **immediately before** the term if it adds meaningful context.
4. **Never add articles ('the', 'a')**, only one adjective.
5. Ensure the adjective does not change the sentence's original meaning.
6. **Avoid repeating the same word multiple times in sequence** (e.g., avoid adding 'large' twice in a row like 'large large').
7. Use adjectives that are contextually relevant and meaningful. Avoid using too general adjectives like 'good', 'bad', 'nice', or nonsensical combinations.
8. Adjectives should enrich the meaning or add useful information without making the description redundant or awkward.
9. **If no suitable adjective can be added without disrupting the meaning or creating redundancy, do not add an adjective at all.** The coreference term should remain unchanged in such cases.
10. **NEVER VIOLATE THE OUTPUT TEMPLATE**

### Input:
- Sentence: {ontonotes_sentence}
- Coreference Offsets: {offsets}
- Coreference Words: {words}

### Output Format:
1. Updated Coreference Words : The list of coreference terms (one inner list of words per offset, in the same order as the input) with adjectives added.

### Example:
Input:
- Sentence: ['Barack', 'Obama', 'is', 'traveling', 'to', 'Rome', '.', 'The', 'city', 'is', 'sunny', 'and', 'the', 'president', 'plans', 'to', 'visit', 'its', 'most', 'important', 'attractions']
- Coreference Offsets: [[5, 5], [7, 8], [17, 17]]
- Coreference Words: [['Rome'], ['The', 'city'], ['its']]

**Correct Output**:
1. Updated Coreference Words : [['Rome'], ['The', 'picturesque', 'city'], ['its']]

**Explanation**:
- 'picturesque' was added to 'city' to enrich the description without altering the intended meaning.
- No adjective was added to 'Rome' or 'its' as it was unnecessary.

Now, process the following input.
"""

In [23]:
# Retry variant of the round-1 prompt, sent when the previous answer could not be
# parsed or validated. It is the round-1 prompt prefixed with a one-line notice;
# deriving it from `prompt_text` keeps the two templates from drifting apart.
request_prompt_text = (
    "This prompt is that the output form of the previous request was not correct, please do it again according to the request below.\n"
    + prompt_text
)

In [24]:
# Follow-up prompt (rounds >= 2): asks the LLM to swap the previously inserted
# adjective for a harder one. The answer is parsed with the regex
# "Further Updated Coreference Words\s*:\s*(\[\[.*?\]\])".
next_prompt_text = '''You will be given a sentence in modified OntoNotes format along with coreference information. Your task is to replace any easily identifiable adjectives (such as `beautiful`) in the **Updated Coreference Words** with more challenging and sophisticated words that make the target reference less obvious. Follow these rules:

### Guidelines:
1. Identify the words in the sentence that correspond to each offset.
2. Updated Coreference Offsets should be calculated step by step.
3. For each term in the **Updated Coreference Words** that contains an added adjective, replace that adjective with **only one adjective** in the same position, **immediately before** the term.
4. **Never add articles ('the', 'a')**, only one adjective.
5. Ensure the adjective does not change the sentence's original meaning.
6. **Avoid repeating the same word multiple times in sequence** (e.g., avoid adding 'large' twice in a row like 'large large').
7. Use adjectives that are contextually relevant and meaningful. Avoid using too general adjectives like 'good', 'bad', 'nice', or nonsensical combinations.
8. Adjectives should enrich the meaning or add useful information without making the description redundant or awkward.
9. **If no more suitable adjective can be found without disrupting the meaning or creating redundancy, keep the current adjective.** Every other word of the coreference term should remain unchanged.
10. **NEVER VIOLATE THE OUTPUT TEMPLATE**

### Input:
- Modified Sentence: {modified_ontonotes_sentence}
- Original Coreference Words: {original_coreference_words}
- Updated Coreference Words: {updated_coreference_words}
- Updated Coreference Offsets: {updated_coreference_offsets}

### Output Format:
1. Further Updated Coreference Words : List of coreference terms showing any adjective changes made.

### Example:
Input:
- Modified Sentence: ['Barack', 'Obama', 'is', 'traveling', 'to', 'Rome', '.', 'The', 'beautiful', 'city', 'is', 'sunny', 'and', 'the', 'president', 'plans', 'to', 'visit', 'its', 'most', 'important', 'attractions']
- Original Coreference Words: [['Rome'], ['The', 'city'], ['its']]
- Updated Coreference Words: [['Rome'], ['The', 'beautiful', 'city'], ['its']]
- Updated Coreference Offsets: [[5, 5], [7, 9], [18, 18]]

**Correct Output**:
1. Further Updated Coreference Words : [['Rome'], ['The', 'ornate', 'city'], ['its']]

### Explain the Output Example
1. Further Updated Coreference Words : 'beautiful' was replaced with 'ornate', which is an appropriate adjective for the meaning of the sentence.

Now, process the following input.
'''

In [25]:
# Retry variant of the follow-up prompt, sent when the previous answer could not be
# parsed or validated. It is the follow-up prompt prefixed with a one-line notice;
# deriving it from `next_prompt_text` keeps the two templates from drifting apart.
request_next_prompt_text = (
    "This prompt is that the output form of the previous request was not correct, please do it again according to the request below.\n"
    + next_prompt_text
)

In [29]:
def model_inference(sentence):
    """Predict coreference clusters for one sentence with the loaded model.

    Calls ``model.predict`` and passes the ``'clusters_token_offsets'`` entry of
    its result through ``clusters_token_offsets_to_list``, returning that value.
    """
    prediction = model.predict(sentence)
    return clusters_token_offsets_to_list(prediction['clusters_token_offsets'])

# ---------------------------------------------------------------------------
# Adversarial filtering loop.
#
# For every case file: rows whose gold cluster the model fails to predict are
# moved to `New_LitBank_df`; rows the model still solves are sent to the LLM to
# have an adjective added (round 1) or replaced (later rounds) and are checked
# again in the next round. The result is written to one CSV per case file.
# ---------------------------------------------------------------------------

# Safety limits.
MAX_STEPS = 100    # rounds per case file before the remaining rows are flushed as they are
MAX_RETRIES = 10   # LLM attempts per row and round before the row is left unchanged

_LIST_COLUMNS = ['extracted_sentence', 'text', 'coref', 'adjusted_offsets']
_FIRST_PATTERN = r"Updated Coreference Words\s*:\s*(\[\[.*?\]\])"
_NEXT_PATTERN = r"Further Updated Coreference Words\s*:\s*(\[\[.*?\]\])"


def _load_case(csv_name):
    """Read one case CSV and parse its literal-encoded columns into Python objects."""
    frame = pd.read_csv(folder_path + csv_name)
    for column in _LIST_COLUMNS:
        frame[column] = [ast.literal_eval(cell) for cell in frame[column]]
    # NOTE(review): ontonote_to_list comes from utils_filtering; presumably it
    # flattens the OntoNotes structure into a token list -- confirm.
    frame['extracted_sentence'] = frame['extracted_sentence'].apply(ontonote_to_list)
    return frame


def _parse_word_lists(output_text, pattern):
    """Return the nested list captured by `pattern`, or None if absent or malformed."""
    found = re.search(pattern, output_text or "", re.DOTALL)
    if found is None:
        return None
    try:
        parsed = ast.literal_eval(found.group(1))
    except (ValueError, SyntaxError, TypeError):
        # The LLM produced something that looks like a list but is not a valid literal.
        return None
    return parsed if isinstance(parsed, list) else None


def _request_words(first_prompt, retry_prompt, pattern, prompt_kwargs,
                   original_words, expected_len, fallback):
    """Ask the LLM for updated coreference words, re-asking on invalid answers.

    The first attempt uses `first_prompt`; every further attempt uses
    `retry_prompt`. An answer is accepted when it parses, differs from
    `original_words` and has `expected_len` entries. After MAX_RETRIES
    rejected attempts `fallback` is returned so the run cannot hang on a row.
    """
    template = first_prompt
    for _ in range(MAX_RETRIES):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user",
                       "content": template.format(**prompt_kwargs)}])
        words = _parse_word_lists(response.choices[0].message.content, pattern)
        if words is not None and words != original_words and len(words) == expected_len:
            return words
        template = retry_prompt
    return fallback


def _split_by_prediction(frame, sentence_col, offsets_col):
    """Run the model on `sentence_col` and split rows by whether it finds the gold cluster.

    Returns `(missed, solved)`: rows labelled 0 (gold offsets not among the
    predicted clusters) and rows labelled 1, the latter with a fresh index.
    """
    frame['inference_offsets'] = frame[sentence_col].apply(model_inference)
    frame['labels'] = [
        1 if gold in predicted else 0
        for gold, predicted in zip(frame[offsets_col], frame['inference_offsets'])
    ]
    missed = frame[frame['labels'] == 0]
    solved = frame[frame['labels'] == 1].reset_index(drop=True)
    return missed, solved


def _apply_updates(frame, sentence_col, offsets_col, new_words):
    """Store `new_words` and derive the updated sentence and offsets columns from them."""
    frame['update_text'] = new_words
    if frame.empty:
        frame['update_sentence'] = []
        frame['update_coref'] = []
        return frame
    # Both results are computed from the current columns before either is overwritten.
    new_sentences = frame.apply(
        lambda row: replace_words_by_dynamic_indices(row[sentence_col], row[offsets_col], row['update_text']),
        axis=1)
    new_offsets = frame.apply(
        lambda row: offset_modify(row[offsets_col], row['update_text']),
        axis=1)
    frame['update_sentence'] = new_sentences
    frame['update_coref'] = new_offsets
    return frame


# Build the four templates once instead of on every iteration.
first_prompt = ChatPromptTemplate.from_template(prompt_text)
first_retry_prompt = ChatPromptTemplate.from_template(request_prompt_text)
next_prompt = ChatPromptTemplate.from_template(next_prompt_text)
next_retry_prompt = ChatPromptTemplate.from_template(request_next_prompt_text)

progress_bar = tqdm(total=(len(csv_files) - case_idx))
while case_idx < len(csv_files):
    step = 0
    coref_case = csv_files[case_idx]
    df_case = _load_case(coref_case)

    New_LitBank_df = pd.DataFrame()

    while True:
        # Terminate
        step += 1
        if df_case.empty:
            # If no row ever reached the LLM, the update columns do not exist yet;
            # add them so every output file has the same columns.
            for col in ['update_sentence', 'update_coref', 'update_text']:
                if col not in New_LitBank_df.columns:
                    New_LitBank_df[col] = np.nan
            break

        # Prevent endless augmentation: flush whatever is still unsolved after MAX_STEPS rounds.
        if step > MAX_STEPS:
            New_LitBank_df = pd.concat([New_LitBank_df, df_case]).reset_index(drop=True)
            break

        # Round 1 works on the original columns, later rounds on the updated ones.
        first_round = (step == 1)
        sentence_col = 'extracted_sentence' if first_round else 'update_sentence'
        offsets_col = 'adjusted_offsets' if first_round else 'update_coref'

        # Model inference; rows the model gets wrong go to the new dataset.
        missed, df_case = _split_by_prediction(df_case, sentence_col, offsets_col)
        New_LitBank_df = pd.concat([New_LitBank_df, missed]).reset_index(drop=True)

        update_text_list = []
        for data in df_case.itertuples():
            if first_round:
                words = _request_words(
                    first_prompt, first_retry_prompt, _FIRST_PATTERN,
                    dict(ontonotes_sentence=data.extracted_sentence,
                         offsets=data.adjusted_offsets,
                         words=data.text),
                    original_words=data.text,
                    expected_len=len(data.adjusted_offsets),
                    fallback=data.text)
            else:
                words = _request_words(
                    next_prompt, next_retry_prompt, _NEXT_PATTERN,
                    dict(modified_ontonotes_sentence=data.update_sentence,
                         # The prompt expects the original *words* here, not the offsets.
                         original_coreference_words=data.text,
                         updated_coreference_words=data.update_text,
                         updated_coreference_offsets=data.update_coref),
                    original_words=data.text,
                    expected_len=len(data.update_coref),
                    fallback=data.update_text)
            update_text_list.append(words)

        df_case = _apply_updates(df_case, sentence_col, offsets_col, update_text_list)

    New_LitBank_df.to_csv(f'../maverick_augmentation/datasets/Experiment/New_LitBank/New_litbank_case_{coref_case}')
    progress_bar.update(1)
    case_idx += 1

  0%|          | 0/12089 [10:31:57<?, ?it/s]


In [None]:
# Close the tqdm progress bar that the augmentation loop cell created.
progress_bar.close()