In [None]:
import os
# Set before the remaining imports on purpose; keep this ordering.
# NOTE(review): presumably limits CUDA-backed libraries imported afterwards to
# these GPU ids — confirm it is still needed for this notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,5,6"
# run_prompt is called in the experiment loop as
# run_prompt(instruction, SYSTEM_PROMPT, model) and unpacked as (answer, cost).
from chatgpt import run_prompt
import spacy
import pandas as pd
# Shared spaCy pipeline; the helper functions below read token.tag_ from it.
nlp = spacy.load("en_core_web_sm")
# Model family selected for this run.
MODEL = "GPT-4"
# Concrete model identifier passed to run_prompt; None when MODEL names
# neither of the two known families.
model = {
    "GPT-3.5": "gpt-3.5-turbo-1106",
    "GPT-4": "gpt-4-0125-preview",
}.get(MODEL)
# System prompt sent with every request (empty string here).
SYSTEM_PROMPT = ""
# Tab-separated file whose first line is the header. The cells below read these
# columns from each row: sentence, verb, verb_i, direct_object, direct_object_i.
dataset = pd.read_csv("../data/stitched.tsv", sep="\t", header=0)


In [None]:
def get_model_answer(prompts):
    """Return the stripped text of the first completion for each prompt.

    Calls ``llm.generate(prompts, sampling_params)`` and, for every returned
    item, takes ``outputs[0]`` and strips surrounding whitespace from its
    ``text`` attribute.

    NOTE(review): ``llm`` and ``sampling_params`` are not defined in any
    visible cell, so on a fresh kernel this raises NameError — confirm whether
    this helper is still used.
    """
    generations = llm.generate(prompts, sampling_params)
    return [vars(generation.outputs[0])["text"].strip() for generation in generations]

In [None]:
def replace_verb_with_throw(sentence_object):
    """Replace the sentence's verb with the matching inflection of "throw".

    The verb position comes from ``sentence_object['verb_i']``. The token's
    fine-grained tag (``token.tag_``) is read from the spaCy parse produced by
    the module-level ``nlp`` pipeline, and the whitespace-separated word at the
    same index is overwritten with the form of "throw" for that tag.

    Args:
        sentence_object: row-like mapping with a ``'sentence'`` string and a
            ``'verb_i'`` value convertible to ``int``. It is modified in
            place: ``'sentence'`` is overwritten with the rewritten text.

    Returns:
        The same ``sentence_object`` after modification, or ``None`` (after
        printing a message) when ``verb_i`` is out of range or the tag is not
        one of VB, VBD, VBG, VBN, VBP, VBZ.
    """
    throw_form_by_tag = {
        'VBG': 'throwing',
        'VBD': 'threw',
        'VBZ': 'throws',
        'VBP': 'throw',
        'VB': 'throw',
        'VBN': 'thrown',
    }

    sentence_split = sentence_object['sentence'].split()
    doc = nlp(sentence_object['sentence'])
    verb_index = int(sentence_object['verb_i'])

    # NOTE(review): the same index addresses both the spaCy tokens and the
    # whitespace-split words, which only agree if the sentence is already
    # tokenised with spaces — confirm against the dataset.
    # Negative indices are rejected too: they never matched a token before.
    if not (0 <= verb_index < len(doc) and verb_index < len(sentence_split)):
        print(f"Error: verb index {verb_index} out of range")
        return None

    original_verb_tag = doc[verb_index].tag_
    new_verb_form = throw_form_by_tag.get(original_verb_tag)
    if new_verb_form is None:
        print(f"Error: verb tag {original_verb_tag} not recognized")
        return None

    sentence_split[verb_index] = new_verb_form
    sentence_object['sentence'] = ' '.join(sentence_split)
    return sentence_object

def get_instruction(sentence_object, sentence_string):
    """Build the yes/no prompt asking whether the direct object is moving.

    The direct object's fine-grained tag (``token.tag_``) is read from the
    spaCy parse of ``sentence_object['sentence']`` at index
    ``sentence_object['direct_object_i']`` and decides which form of "to be"
    is used. Nouns are quoted as they appear in the sentence; pronouns use
    ``sentence_object['direct_object']``, with reflexives mapped to their
    subject form (e.g. "himself" -> "he").

    Args:
        sentence_object: row-like mapping with ``'sentence'``,
            ``'direct_object'`` and ``'direct_object_i'`` entries.
        sentence_string: the sentence text to quote inside the prompt; it may
            differ from ``sentence_object['sentence']``.

    Returns:
        The prompt string, or ``None`` (after printing a diagnostic) when the
        index is out of range, the tag is not a noun or personal pronoun tag,
        or the pronoun is not one of the handled forms.
    """
    # be-form per noun tag: plural tags take "are", singular tags take "is".
    be_form_by_noun_tag = {'NNS': 'are', 'NN': 'is', 'NNPS': 'are', 'NNP': 'is'}
    # Pronouns that are used in the prompt exactly as given.
    be_form_by_pronoun = {
        'it': 'is', 'he': 'is', 'she': 'is',
        'they': 'are', 'we': 'are', 'you': 'are',
        'i': 'am',
    }
    # Reflexive pronoun -> (be-form, subject pronoun used in the prompt).
    reflexive_pronouns = {
        'himself': ('is', 'he'),
        'herself': ('is', 'she'),
        'itself': ('is', 'it'),
        'themselves': ('are', 'they'),
        'myself': ('am', 'I'),
        'yourself': ('are', 'you'),
    }

    sentence_split = sentence_object['sentence'].split()
    doc = nlp(sentence_object['sentence'])
    direct_object_lemmatised = sentence_object["direct_object"]
    dobj_index = int(sentence_object['direct_object_i'])

    # The experiment loop does not guard this call, so a bad index must not
    # raise; report it and return None like the other unsupported cases.
    if not (0 <= dobj_index < len(doc) and dobj_index < len(sentence_split)):
        print(f"Error: direct object index {dobj_index} out of range")
        return None

    direct_object_unlemmatised = sentence_split[dobj_index]
    # Tag of the direct object token (not of the verb).
    original_dobj_tag = doc[dobj_index].tag_

    if original_dobj_tag in be_form_by_noun_tag:
        be_form = be_form_by_noun_tag[original_dobj_tag]
        direct_object = direct_object_unlemmatised
    elif original_dobj_tag == 'PRP':
        pronoun = direct_object_lemmatised.lower()
        if pronoun in be_form_by_pronoun:
            be_form = be_form_by_pronoun[pronoun]
            direct_object = direct_object_lemmatised
        elif pronoun in reflexive_pronouns:
            be_form, direct_object = reflexive_pronouns[pronoun]
        else:
            print(f"Error: pronoun {direct_object_lemmatised} not recognized")
            return None
    else:
        print(f"{direct_object_unlemmatised} {original_dobj_tag} ")
        return None

    # NOTE(review): the "?." in the prompt looks like a typo; it is kept
    # byte-identical so prompts stay comparable with earlier runs.
    default_instruction = f"In the sentence '{sentence_string}', {be_form} {direct_object} moving, yes or no?.\nAnswer:"
    return default_instruction

def parse_system_answer(system_answer):
    """Classify a free-text model reply as yes, no, or unparseable.

    The reply is stripped and lower-cased, then matched against a "yes" prefix
    or any of the known affirmative phrases; failing that, against a "no"
    prefix or any of the known negative phrases. Affirmative matches take
    precedence when both kinds are present.

    Args:
        system_answer: the raw reply text.

    Returns:
        ``True`` for yes, ``False`` for no, or the string ``"invalid"`` (after
        printing the normalised reply) when nothing matches.
    """
    yes_markers = (
        "yes,",
        "the answer is yes",
        "there is an implication that something is moving",
        "it seems that something is moving",
        'the answer is "yes."',
    )
    no_markers = (
        "no,",
        "no explicit indication",
        "the answer is no",
        "there is not an explicit mention of something moving",
        "it is not explicitly mentioned whether something is moving or not",
        'the answer is "no."',
    )

    # Strip first so a leading newline/space does not defeat the prefix checks.
    answer = system_answer.strip().lower()

    if answer.startswith("yes") or any(marker in answer for marker in yes_markers):
        return True
    # Every marker is tested with `in answer`; a bare string literal in the
    # condition would always be truthy and make "invalid" unreachable.
    if answer.startswith("no") or any(marker in answer for marker in no_markers):
        return False

    print(answer)
    return "invalid"

In [None]:
from tqdm import tqdm

# Accumulated cost reported by run_prompt over all requests.
total_cost = 0
# Rows grouped by (answer for original sentence, answer for "throw" sentence).
results = {"both_yes": [], "both_no": [], "from_yes_to_no": [], "from_no_to_yes": []}
# Rows that never reach `results`, tallied by reason so that dropped data is
# visible instead of being silently swallowed.
skipped = {}

# iterrows() is a generator without a length, so pass total= for a real progress bar.
for sentence_i, sentence in tqdm(dataset.iterrows(), total=len(dataset)):
    try:
        original_sentence = sentence["sentence"]
        # replace_verb_with_throw overwrites sentence["sentence"] in place, so
        # both get_instruction calls below parse the rewritten sentence; only
        # the quoted sentence_string differs between them.
        replaced_sentence = replace_verb_with_throw(sentence)["sentence"]
    except Exception as e:
        # A None return (unsupported verb) surfaces here as TypeError; keeping
        # the exception type in the tally makes unexpected failures stand out.
        reason = f"verb_replacement_failed ({type(e).__name__})"
        skipped[reason] = skipped.get(reason, 0) + 1
        continue

    original_instruction = get_instruction(sentence, original_sentence)
    replaced_instruction = get_instruction(sentence, replaced_sentence)
    if not original_instruction or not replaced_instruction:
        skipped["no_instruction"] = skipped.get("no_instruction", 0) + 1
        continue

    result_original, cost_original = run_prompt(original_instruction, SYSTEM_PROMPT, model)
    result_replaced, cost_replaced = run_prompt(replaced_instruction, SYSTEM_PROMPT, model)
    total_cost += cost_original + cost_replaced

    parsed_result_original = parse_system_answer(result_original)
    parsed_result_replaced = parse_system_answer(result_replaced)
    if parsed_result_original == "invalid" or parsed_result_replaced == "invalid":
        skipped["invalid_answer"] = skipped.get("invalid_answer", 0) + 1
        continue

    if parsed_result_original and parsed_result_replaced:
        results["both_yes"].append(sentence)
    elif not parsed_result_original and not parsed_result_replaced:
        results["both_no"].append(sentence)
    elif parsed_result_original and not parsed_result_replaced:
        results["from_yes_to_no"].append(sentence)
    elif not parsed_result_original and parsed_result_replaced:
        results["from_no_to_yes"].append(sentence)

print(f"Total cost: {total_cost}")
print(f"Skipped rows: {skipped}")
# Outcome class -> label used in the summary counts at the end of this cell.
outcome_labels = {
    "both_yes": "Both yes",
    "both_no": "Both no",
    "from_yes_to_no": "From yes to no",
    "from_no_to_yes": "From no to yes",
}

# Per class: number of rows, then how often each verb occurs in that class.
for result_class in outcome_labels:
    class_rows = results[result_class]
    print(f"Class: {result_class}, count: {len(class_rows)}")
    verb_counts = pd.Series([row['verb'] for row in class_rows]).value_counts()
    verb_line = "".join(f"{verb}: {count}, " for verb, count in verb_counts.items())
    print(verb_line, end="")
    print("\n")

# Summary: number of rows in each of the four outcome classes.
for result_class, label in outcome_labels.items():
    print(f"{label}: {len(results[result_class])}")
    

In [None]:
# Share of each outcome among the rows that produced two parseable answers.
total_num = sum(len(results[result_class]) for result_class in ["both_yes", "both_no", "from_yes_to_no", "from_no_to_yes"])
if total_num == 0:
    # Every row was skipped or answered invalidly; avoid dividing by zero.
    print("No valid results to compute percentages from")
else:
    print("both yes percentage {0:.2f}".format(len(results["both_yes"])/total_num*100))

    print("from no to yes percentage {0:.2f}".format(len(results["from_no_to_yes"])/total_num*100))
    # NOTE(review): "invalid" here is the sum of both_no and from_yes_to_no,
    # i.e. the rows where the "throw" sentence was answered "no" — confirm
    # that this is the intended meaning of the label.
    print("invalid percentage {0:.2f}".format((len(results["both_no"])+len(results["from_yes_to_no"]))/total_num*100))
    