In [None]:
import os
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, LlamaForCausalLM, LlamaTokenizer
from sklearn.metrics import classification_report
from peft import PeftModel

from Eval_utils import *

# EconNLI test split; the evaluation cells read its "cause", "effect" and "label" columns.
TEST_SET_PATH = "../Dataset/EconNLI_test.csv"
df_test = pd.read_csv(TEST_SET_PATH)

### LLAMA

In [None]:
def _parse_yes_no(answer):
    """Map a free-text model answer to a binary label.

    Returns 1 when the answer starts with the whole word "yes", 0 when it
    starts with the whole word "no" (case-insensitive; leading whitespace and
    punctuation are ignored), and None for anything else.  Whole-word matching
    keeps replies such as "Not sure" or "None" from being counted as "No".
    """
    match = re.match(r"\W*(yes|no)\b", answer, flags=re.IGNORECASE)
    if match is None:
        return None
    return 1 if match.group(1).lower() == "yes" else 0


def get_zero_shot_results_from_llama(df_test, model, tokenizer):
    """Run the zero-shot Yes/No causal-inference prompt over every test row.

    Args:
        df_test: DataFrame providing the columns 'cause', 'effect' and 'label'.
        model: Passed through unchanged to ``prompt_llama_like_model``.
        tokenizer: Passed through unchanged to ``prompt_llama_like_model``.

    Returns:
        A ``(y_true, y_pred)`` pair of equally long lists.  Only rows whose
        answer could be parsed as Yes (1) or No (0) are included; the number
        of skipped rows is printed so the reported metrics can be read in
        context.
    """
    y_true = []
    y_pred = []
    n_unparsed = 0
    for position, (_, row) in enumerate(tqdm(df_test.iterrows(), total=len(df_test))):
        prompt = f"Conduct inference on economic events. We provide a premise and a hypothesis, both of them are economical events. Infer \
        whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. \
                premise: {row['cause']}, hypothesis: {row['effect']}, answer:"
        model_output = prompt_llama_like_model(prompt,model,tokenizer,max_new_tokens = 10)
        # The helper presumably echoes the prompt, so the reply is the text
        # after the first "answer:" marker.  If the marker is absent, treat the
        # whole output as the reply instead of raising IndexError.
        segments = model_output.split("answer:")
        model_answer = (segments[1] if len(segments) > 1 else segments[0]).strip()
        # Preview the first ten rows by position, independent of the index labels.
        if position < 10:
            print(prompt)
            print(model_answer)
        predicted_label = _parse_yes_no(model_answer)
        if predicted_label is None:
            n_unparsed += 1
            continue
        y_pred.append(predicted_label)
        y_true.append(row['label'])
    if n_unparsed:
        print(f"Skipped {n_unparsed} of {len(df_test)} rows whose answer did not start with 'Yes' or 'No'.")
    return y_true, y_pred

# Uncomment the loading block below for the model you want to evaluate

# LLAMA2-7B-chat
# model_name = "../llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# LLAMA2-13B-chat
# model_name = "../llama/Llama-2-13b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# FINMA
# model_name = "ChanceFocus/finma-7b-nlp"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB"}, 
#     torch_dtype=torch.float16
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name,unk_token ="<s>")
# tokenizer.pad_token = tokenizer.eos_token

# #Alpaca
# model_name = "../llama/alpaca-7b/"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB",1:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# Run the zero-shot evaluation with the currently loaded model.
# NOTE: `model`, `tokenizer` and `model_name` are only defined once one of the
# commented-out loading blocks above has been uncommented and executed.
y_true, y_pred = get_zero_shot_results_from_llama(df_test, model, tokenizer)

# Build the report once so the console and the results file show the same text.
report = classification_report(y_true=y_true,y_pred=y_pred,digits=4)
print(report)

# open(..., "a") creates the file but not its parent directory.
os.makedirs("results", exist_ok=True)
with open("results/LLM_results.txt","a") as f:
    f.write(model_name+", Zero-shot \n")
    f.write(report)
    f.write("\n")


### ChatGPT/GPT4

In [None]:
prompt = "Conduct inference on economic events. We provide a premise and a hypothesis, both of them are economical events. Infer \
 whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. \
         premise: {}, hypothesis: {}, answer:"

# Helper used to query each evaluated model.
PROMPT_FUNCTIONS = {
    "ChatGPT": prompt_chatgpt_with_backoff,
    "GPT4": prompt_gpt4_with_backoff,
}

# A reply counts only when its first word is exactly "yes" or "no"
# (case-insensitive, leading whitespace/punctuation ignored), so replies such
# as "Not sure" or "None" are no longer scored as "No".
ANSWER_PATTERN = re.compile(r"\W*(yes|no)\b", re.IGNORECASE)

# open(..., "a") creates the file but not its parent directory.
os.makedirs("results", exist_ok=True)

for MODEL_NAME in ["ChatGPT","GPT4"]:
    ask_model = PROMPT_FUNCTIONS[MODEL_NAME]

    y_pred = []
    y_true = []
    # Rows whose reply could not be parsed as Yes/No; they are excluded from the metrics.
    err_list = []

    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        # Fixed one-second pause between requests (presumably to respect API rate limits).
        time.sleep(1)
        premise = row[1]["cause"]
        hypothesis = row[1]["effect"]
        label = row[1]["label"]
        filled_prompt = prompt.format(premise, hypothesis)
        prediction = ask_model(filled_prompt)

        match = ANSWER_PATTERN.match(prediction)
        if match is None:
            err_list.append({"premise": premise, "hypothesis": hypothesis, "label": label, "prediction": prediction})
            continue

        predicted_label = 1 if match.group(1).lower() == "yes" else 0
        y_pred.append(predicted_label)
        y_true.append(label)

        # Print every misclassified example for manual inspection.
        if (predicted_label == 1 and label == 0) or (predicted_label == 0 and label == 1):
            print("premise: {}, hypothesis: {}, label: {}".format(premise, hypothesis, label))
            print("prompt: {}".format(filled_prompt))
            print("prediction: {}".format(prediction))
            print("=========================================================")

    if err_list:
        print("{}: skipped {} of {} rows whose reply did not start with 'Yes' or 'No'.".format(MODEL_NAME, len(err_list), len(df_test)))

    # Build the report once so the console and the results file show the same text.
    report = classification_report(y_true=y_true,y_pred=y_pred,digits=4)
    print(report)
    with open("results/LLM_results.txt", "a") as f:
        f.write(MODEL_NAME+"\n")
        f.write(report)
        f.write("\n")
        