In [None]:
import os
import re
import time

import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, LlamaForCausalLM
from sklearn.metrics import classification_report
from peft import PeftModel

from Eval_utils import *

# Fixed seed so the shuffles below -- and later pandas `.sample()` calls that are made
# without an explicit random_state and therefore draw from NumPy's global RNG -- give
# the same result on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

df_train = pd.read_csv("../Dataset/EconNLI_train.csv")
df_test = pd.read_csv("../Dataset/EconNLI_test.csv")

# Shuffle both splits reproducibly. reset_index(drop=True) gives a fresh 0..n-1 index;
# the evaluation loops use it (`row[0] < 10`) to print the first ten shuffled examples.
df_train = df_train.sample(frac=1, random_state=SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Expose the training column "ChatGPT_label" as "label" (rename is a no-op if it is absent).
# NOTE(review): the test frame is not renamed, so it presumably already has a "label"
# column -- confirm against the CSV.
df_train = df_train.rename(columns={"ChatGPT_label": "label"})

## ICL

### LLAMA

In [None]:
def get_ICL_results_from_llama(df_train, df_test, model, tokenizer):
    """Evaluate a LLaMA-style model on the test frame with two in-context examples.

    For every test row one positive (label == 1) and one negative (label == 0)
    training example are sampled and placed in the prompt as demonstrations,
    followed by the test premise/hypothesis. The model's short completion is
    parsed into 1 ("yes") or 0 ("no").

    Args:
        df_train: frame with "cause", "effect" and "label" columns; source of
            the demonstrations.
        df_test: frame with "cause", "effect" and "label" columns; rows to score.
        model, tokenizer: passed through unchanged to `prompt_llama_like_model`.

    Returns:
        (y_true, y_pred): two equally long lists. Rows whose completion does not
        start with a yes/no token are skipped in BOTH lists, so the lists may be
        shorter than `df_test`.
    """
    answer_marker = "answer:"
    y_true = []
    y_pred = []

    # Loop-invariant: the pools the demonstrations are drawn from.
    positive_pool = df_train[df_train["label"] == 1]
    negative_pool = df_train[df_train["label"] == 0]

    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        # sample ICL examples
        icl_pos = positive_pool.sample(1)
        icl_neg = negative_pool.sample(1)

        premise = row[1]["cause"]
        hypothesis = row[1]["effect"]
        label = row[1]["label"]

        prompt = (
            "Conduct inference on economic events. We provide a premise and a hypothesis, both of them are economical events. Infer "
            "whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. "
            f"Here are some examples: premise: {icl_pos['cause'].iloc[0]}, hypothesis: {icl_pos['effect'].iloc[0]}, answer:Yes \n "
            f"premise: {icl_neg['cause'].iloc[0]}, hypothesis: {icl_neg['effect'].iloc[0]}, answer:No \n "
            f"Conduct inference on the following premise and hypothesis: premise: {premise}, hypothesis: {hypothesis}, answer:"
        )

        model_answer = prompt_llama_like_model(prompt, model, tokenizer, max_new_tokens=3)

        # The prompt itself contains several "answer:" markers (the two
        # demonstrations plus the final cue). When the returned text echoes the
        # prompt, the model's own answer is the piece that follows the LAST prompt
        # marker -- not the first, which is the positive demonstration's "Yes".
        n_prompt_markers = prompt.count(answer_marker)
        pieces = model_answer.split(answer_marker)
        if len(pieces) > n_prompt_markers:
            prediction = pieces[n_prompt_markers].strip()
        else:
            # Output does not echo the full prompt: use the trailing piece.
            prediction = pieces[-1].strip()

        if row[0] < 10:
            print("prompt: ", prompt)
            print("prediction: {}".format(prediction))
            print("=========================================================")

        first_token = prediction.strip('\n').split(" ")[0].lower()
        if 'yes' in first_token:
            y_pred.append(1)
            y_true.append(label)
        elif 'no' in first_token:
            y_pred.append(0)
            y_true.append(label)
        # Anything else is unparseable and is left out of both lists.

    return y_true, y_pred

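# Uncomment exactly one of the model blocks below; it defines `model_name`, `model` and `tokenizer`, which the evaluation code after these blocks requires.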

# LLAMA2-7B-chat
# model_name = "../llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# LLAMA2-13B-chat
# model_name = "../llama/Llama-2-13b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# FINMA
# model_name = "ChanceFocus/finma-7b-nlp"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB"}, 
#     torch_dtype=torch.float16
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name,unk_token ="<s>")
# tokenizer.pad_token = tokenizer.eos_token

# #Alpaca
# model_name = "../llama/alpaca-7b/"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB",1:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# Score the selected model with in-context examples.
y_true, y_pred = get_ICL_results_from_llama(df_train, df_test, model, tokenizer)

# Build the report once and reuse it for the notebook output and the log file.
icl_report = classification_report(y_true=y_true, y_pred=y_pred, digits=4)
print(icl_report)

# Make sure the output directory exists so a finished run is not lost to a
# FileNotFoundError when the results are appended.
os.makedirs("results", exist_ok=True)
with open("results/LLM_results.txt", "a") as f:
    f.write(model_name + ", ICL \n")
    f.write(icl_report)
    f.write("\n")


### GPT

In [None]:

# In-context evaluation of the OpenAI models: for every test row, one positive and
# one negative training example are sampled as demonstrations. Rows whose reply does
# not start with a yes/no token are left out of both y_true and y_pred.
for MODEL_NAME in ["ChatGPT", "GPT4"]:

    y_pred = []
    y_true = []

    # Loop-invariant: the pools the demonstrations are drawn from.
    positive_pool = df_train[df_train["label"] == 1]
    negative_pool = df_train[df_train["label"] == 0]

    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        # sample ICL examples
        icl_pos = positive_pool.sample(1)
        icl_neg = negative_pool.sample(1)

        premise = row[1]["cause"]
        hypothesis = row[1]["effect"]
        label = row[1]["label"]

        prompt = (
            "Conduct inference on economic events. We provide a premise and a hypothesis, both of them are economical events. Infer "
            "whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. "
            f"Here are some examples: premise: {icl_pos['cause'].iloc[0]}, hypothesis: {icl_pos['effect'].iloc[0]}, answer:Yes; "
            f"premise: {icl_neg['cause'].iloc[0]}, hypothesis: {icl_neg['effect'].iloc[0]}, answer:No; "
            f"Conduct inference on the following premise and hypothesis: premise: {premise}, hypothesis: {hypothesis}, answer:"
        )

        # The f-string above is already fully interpolated. It must NOT be passed
        # through str.format again: any '{' or '}' in the data would raise or be
        # re-interpreted as a replacement field.
        if MODEL_NAME == "ChatGPT":
            prediction = prompt_chatgpt_with_backoff(prompt)
        elif MODEL_NAME == "GPT4":
            prediction = prompt_gpt4_with_backoff(prompt)

        if row[0] < 10:
            print("prompt: ", prompt)
            print("prediction: {}".format(prediction))
            print("=========================================================")

        first_token = prediction.strip('\n').split(" ")[0].lower()
        if 'yes' in first_token:
            y_pred.append(1)
            y_true.append(label)
        elif 'no' in first_token:
            y_pred.append(0)
            y_true.append(label)

    # Build the report once and reuse it for the notebook output and the log file.
    icl_report = classification_report(y_true=y_true, y_pred=y_pred, digits=4)
    print(icl_report)

    # Make sure the output directory exists before appending the results.
    os.makedirs("results", exist_ok=True)
    with open("results/LLM_results.txt", "a") as f:
        f.write(MODEL_NAME + ", ICL \n")
        f.write(icl_report)
        f.write("\n")
        
        

## COT

In [8]:
# Few-shot chain-of-thought template.
#
# Layout: two worked "### Question: ... ### Answer: ..." examples (the first ends in
# "Yes.", the second in "No."), followed by a third question whose premise and
# hypothesis are the two positional `{}` placeholders, filled with
# `cot_prompt.format(premise, hypothesis)`.
#
# The template contains three "### Answer:" markers; code that splits a model output
# which echoes the prompt on this marker has to account for them.
#
# A trailing backslash inside the literal joins the next source line WITHOUT a
# newline or space (e.g. "hypothesis," is immediately followed by "both"); real line
# breaks in the prompt come from the explicit "\n" escapes and from source lines that
# do not end in a backslash.
cot_prompt = '''
### Question: Conduct inference on economic events. We provide a premise and a hypothesis,\
both of them are economic events. Infer whether the premise can cause the hypothesis to happen. \
Write the reasoning chain on the first line, and summarize the answer as 'Yes' or 'No' in the second line. \
premise: demand increases, hypothesis: price increases. \n \
### Answer: When demand for a product or service increases, more people want to buy it. \
This creates a situation where there are more buyers than available supply, \
which leads to an increase in competition among buyers. \
As a result, sellers can raise their prices because they know that buyers are willing to pay more to get the product or service they want.\n\
Yes.\n\

### Question: Conduct inference on economic events. We provide a premise and a hypothesis,\
both of them are economic events. Infer whether the premise can cause the hypothesis to happen. \
Write the reasoning chain on the first line, and summarize the answer as 'Yes' or 'No' in the second line. \
premise: government borrowing creates higher demand for credit in the financial markets,\
hypothesis: interest rates decreases across the market. \n \
### Answer:When the government borrows money, it creates higher demand for credit in the financial markets. \
This is because the government is competing with other borrowers for available funds, which can drive up interest rates. \
Therefore, it is unlikely that government borrowing would cause interest rates to decrease across the market. \n \
No. \n\

### Question: Conduct inference on economic events. We provide a premise and a hypothesis,\
both of them are economic events. Infer whether the premise can cause the hypothesis to happen. \
Write the reasoning chain on the first line, and summarize the answer as 'Yes' or 'No' in the second line. \
premise:{}, hypothesis: {} \n \

### Answer:
'''

### LLAMA

In [None]:
def get_COT_results_from_llama(cot_prompt, df_test, model, tokenizer):
    """Evaluate a LLaMA-style model on the test frame with a chain-of-thought prompt.

    Args:
        cot_prompt: template with two positional `{}` placeholders, filled with
            the row's premise ("cause") and hypothesis ("effect").
        df_test: frame with "cause", "effect" and "label" columns.
        model, tokenizer: passed through unchanged to `prompt_llama_like_model`.

    Returns:
        (y_true, y_pred): two equally long lists. Each test row contributes at
        most ONE entry; rows whose answer contains no standalone "Yes"/"No" are
        skipped in both lists.
    """
    answer_marker = "### Answer:"
    # Standalone, capitalised verdict words only, so that "Not", "Nobody",
    # "Nominal", ... inside the reasoning chain are not mistaken for a "No".
    verdict_re = re.compile(r"\b(Yes|No)\b")

    y_true = []
    y_pred = []

    for row in tqdm(df_test.iterrows(), total=len(df_test)):

        premise = row[1]["cause"]
        hypothesis = row[1]["effect"]
        label = row[1]["label"]

        filled_prompt = cot_prompt.format(premise, hypothesis)
        model_answer = prompt_llama_like_model(filled_prompt, model, tokenizer, max_new_tokens=200)

        # The output echoes the prompt, which already contains several
        # "### Answer:" markers; the model's own answer is the piece after the
        # last prompt marker. Counting the markers (instead of a hard-coded index)
        # keeps this correct if the number of few-shot examples changes.
        n_prompt_markers = filled_prompt.count(answer_marker)
        pieces = model_answer.split(answer_marker)
        if len(pieces) > n_prompt_markers:
            completion = pieces[n_prompt_markers]
        else:
            # Output does not echo the full prompt: use the trailing piece.
            completion = pieces[-1]
        # Drop anything the model generated after starting a new question.
        prediction = completion.split("### Question:")[0].strip()

        if row[0] < 10:
            print("model_output: {}".format(model_answer))
            print("prediction: {}".format(prediction))
            print("=========================================================")

        verdicts = verdict_re.findall(prediction)
        if not verdicts:
            # Unparseable answer: leave the row out of both lists.
            continue

        # The template asks for the reasoning first and the verdict after it, so
        # the last verdict word is taken as the answer. Exactly one entry is
        # appended per row, even when both "Yes" and "No" occur in the text.
        y_pred.append(1 if verdicts[-1] == "Yes" else 0)
        y_true.append(label)

    return y_true, y_pred

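# Uncomment exactly one of the model blocks below; it defines `model_name`, `model` and `tokenizer`, which the evaluation code after these blocks requires.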

# LLAMA2-7B-chat
# model_name = "../llama/Llama-2-7b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# LLAMA2-13B-chat
# model_name = "../llama/Llama-2-13b-chat-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={2:"24GB",3:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# FINMA
# model_name = "ChanceFocus/finma-7b-nlp"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB"}, 
#     torch_dtype=torch.float16
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name,unk_token ="<s>")
# tokenizer.pad_token = tokenizer.eos_token

# #Alpaca
# model_name = "../llama/alpaca-7b/"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#     device_map="auto",
#     max_memory={0:"24GB",1:"24GB"}, 
#     torch_dtype=torch.float16,
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token


# Score the selected model with the chain-of-thought prompt.
y_true, y_pred = get_COT_results_from_llama(cot_prompt, df_test, model, tokenizer)

# Build the report once and reuse it for the notebook output and the log file.
cot_report = classification_report(y_true=y_true, y_pred=y_pred, digits=4)
print(cot_report)

# Make sure the output directory exists so a finished run is not lost to a
# FileNotFoundError when the results are appended.
os.makedirs("results", exist_ok=True)
with open("results/LLM_results.txt", "a") as f:
    f.write(model_name + ", COT \n")
    f.write(cot_report)
    f.write("\n")


### ChatGPT/GPT4

In [None]:
# Chain-of-thought evaluation of the OpenAI models. The verdict is read from the last
# non-empty line of the reply; replies without a standalone "yes"/"no" there are left
# out of both y_true and y_pred.
for MODEL_NAME in ["ChatGPT", "GPT4"]:

    y_pred = []
    y_true = []

    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        # Pause one second between requests (presumably to stay under API rate limits).
        time.sleep(1)

        premise = row[1]["cause"]
        hypothesis = row[1]["effect"]
        label = row[1]["label"]

        filled_prompt = cot_prompt.format(premise, hypothesis)
        if MODEL_NAME == "ChatGPT":
            prediction = prompt_chatgpt_with_backoff(filled_prompt)
        elif MODEL_NAME == "GPT4":
            prediction = prompt_gpt4_with_backoff(filled_prompt)

        if row[0] < 10:
            print(filled_prompt)
            print(prediction)
            print("======")

        # Strip first so a trailing newline does not turn the "last line" into an
        # empty string, and match whole words so that e.g. "economic" (which
        # contains "no") is not read as a "No" verdict.
        last_line = prediction.strip().split('\n')[-1].lower()
        if re.search(r"\byes\b", last_line):
            pred = 1
        elif re.search(r"\bno\b", last_line):
            pred = 0
        else:
            # Unparseable reply: reset explicitly so the misclassification check
            # below never sees the previous row's prediction.
            pred = None

        if pred is None:
            continue

        y_pred.append(pred)
        y_true.append(label)

        # Show misclassified examples for manual inspection.
        if (pred == 1 and label == 0) or (pred == 0 and label == 1):
            print("premise: {}, hypothesis: {}, label: {}".format(premise, hypothesis, label))
            print("prediction: {}".format(prediction))
            print("=========================================================")

    # Build the report once and reuse it for the notebook output and the log file.
    cot_report = classification_report(y_true=y_true, y_pred=y_pred, digits=4)
    print(cot_report)

    # Make sure the output directory exists before appending the results.
    os.makedirs("results", exist_ok=True)
    with open("results/LLM_results.txt", "a") as f:
        f.write(MODEL_NAME + ", COT \n")
        f.write(cot_report)
        f.write("\n")
        