In [None]:
import pandas as pd
import numpy as np
import os
import torch
from torch import nn
import datasets
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSequenceClassification,AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score, classification_report
from peft import LoraConfig
from Eval_utils import *

# Fixed seed so the shuffle below -- and therefore every positional
# train/validation split later taken from df_train -- is the same after a
# kernel restart.
SEED = 42

df_train = pd.read_csv("../Dataset/EconNLI_train.csv")
df_test = pd.read_csv("../Dataset/EconNLI_test.csv")
# Shuffle the rows and renumber the index from 0.
df_train = df_train.sample(frac=1, random_state=SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SEED).reset_index(drop=True)
df_train = df_train.rename(columns={"ChatGPT_label":"label"}) # we use ChatGPT's label for SFT

## BERT-like Models

In [4]:
def preprocess_function(examples):
    """Tokenize the ``cause`` and ``effect`` fields of ``examples`` together.

    Uses the module-level ``tokenizer``. ``cause`` is passed as the first
    input and ``effect`` as the second; truncation is enabled with a maximum
    length of 512.
    """
    first_segment = examples["cause"]
    second_segment = examples["effect"]
    encoded = tokenizer(
        first_segment,
        second_segment,
        truncation=True,
        max_length=512,
    )
    return encoded


def compute_metrics(eval_preds):
    """Turn ``(logits, labels)`` into a dictionary of classification metrics.

    Predictions are the argmax of the logits over the last axis. Precision,
    recall and F1 are computed with ``average=None`` and converted to plain
    lists; micro-averaged F1 and accuracy are returned as scalars.
    """
    scores, gold = eval_preds
    predicted = np.argmax(scores, axis=-1)
    # Keyword arguments shared by every scorer call below.
    shared = {"y_true": gold, "y_pred": predicted}
    return {
        "precision": precision_score(average=None, **shared).tolist(),
        "recall": recall_score(average=None, **shared).tolist(),
        "f1": f1_score(average=None, **shared).tolist(),
        "micro_f1": f1_score(average='micro', **shared),
        "accuracy": accuracy_score(**shared),
    }


def train_and_evaluate_model(df_train, df_test, model, tokenizer,model_output_path):
    """Fine-tune ``model`` on ``df_train`` and evaluate it on ``df_test``.

    The first 90% of ``df_train`` (in its current row order) is used for
    training and the remaining 10% for per-epoch validation.

    Args:
        df_train: DataFrame with ``cause``, ``effect`` and ``label`` columns.
        df_test: DataFrame with the same columns, used for the final evaluation.
        model: sequence-classification model handed to ``Trainer``.
        tokenizer: tokenizer used to encode all three splits and handed to
            ``Trainer``.
        model_output_path: directory passed to ``TrainingArguments`` as
            ``output_dir``.

    Returns:
        ``(trainer, res)`` where ``res`` is the result of
        ``trainer.evaluate`` on the tokenized test set.
    """
    def _tokenize(examples):
        # Same encoding as the notebook-level ``preprocess_function`` but bound
        # to the ``tokenizer`` argument, so the data is always encoded with the
        # tokenizer that matches ``model`` rather than whatever the global
        # ``tokenizer`` happens to be.
        return tokenizer(examples["cause"], examples["effect"], truncation=True, max_length=512)

    # Positional 90/10 split; plain ``iloc`` slicing yields the same two frames
    # as ``np.split(df_train, [n_train])`` without routing a DataFrame through
    # NumPy's array-splitting helper.
    n_train = int(.9*len(df_train))
    df_val = df_train.iloc[n_train:]
    df_train = df_train.iloc[:n_train]

    dataset_train = datasets.Dataset.from_pandas(df_train)
    dataset_val = datasets.Dataset.from_pandas(df_val)
    dataset_test = datasets.Dataset.from_pandas(df_test)
    tokenized_train = dataset_train.map(_tokenize)
    tokenized_val = dataset_val.map(_tokenize)
    tokenized_test = dataset_test.map(_tokenize)
    training_args = TrainingArguments(
        output_dir=model_output_path,
        learning_rate=2e-5,
        per_device_train_batch_size=20,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        do_train = True,
        do_eval = True,
        save_strategy = 'epoch',
        save_total_limit=1,
        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy = 'epoch',
        load_best_model_at_end=True,
        overwrite_output_dir = True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    print("Results on test set:")
    res = trainer.evaluate(eval_dataset=tokenized_test)
    print(res)
    return trainer, res


In [None]:
# The results file is opened in append mode below; make sure its folder exists.
os.makedirs("results", exist_ok=True)

for MODEL_NAME_OR_PATH in ["bert-base-uncased", "roberta-base", "yiyanghkust/finbert-pretrain", "SALT-NLP/FLANG-BERT","SALT-NLP/FLANG-ELECTRA"]:

    MODEL_OUTPUT_PATH = "models/FT_PLMs_"+MODEL_NAME_OR_PATH
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

    # The tokenized test set depends only on the tokenizer, so build it once
    # per model instead of once per run.
    dataset_test = datasets.Dataset.from_pandas(df_test)
    tokenized_test = dataset_test.map(preprocess_function)

    for i in range(3):
        # Reload the pretrained weights for every run. Reusing one model object
        # across runs would make runs 1 and 2 continue training the weights
        # already fine-tuned by the previous run, so the runs would not be
        # independent repetitions.
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_OR_PATH,num_labels=2)

        trainer,pred = train_and_evaluate_model(df_train, df_test, model, tokenizer, MODEL_OUTPUT_PATH)

        res = trainer.predict(tokenized_test)

        # Build the report once and reuse it for both the console and the file.
        report = classification_report(y_pred = np.argmax(res[0],axis=1), y_true = res[1],digits=4)
        print(report)

        with open('results/FT_PLMs_results.txt', 'a') as f:
            f.write(MODEL_NAME_OR_PATH+", run "+str(i)+"\n")
            f.write(report)
            f.write('\n')

## LLAMA

In [None]:
# 90/10 positional split of the (already shuffled) training frame for SFT.
# The split is stored under new names instead of overwriting `df_train`, so
# re-running this cell does not shrink the training set by another 10%.
n_sft_train = int(.9*len(df_train))
df_train_sft = df_train.iloc[:n_sft_train]
df_val = df_train.iloc[n_sft_train:]
ds_train = datasets.Dataset.from_pandas(df_train_sft)
ds_val = datasets.Dataset.from_pandas(df_val)
ds_test = datasets.Dataset.from_pandas(df_test)

def formatting_prompts_func(example):
    """Build one SFT training text per row of a batch.

    Args:
        example: mapping of column name to a list of values; the ``cause``,
            ``effect`` and ``label`` columns are read.

    Returns:
        A list of strings, one per row: the question prompt, the answer
        ("Yes" when the label equals 1, otherwise "No"), a trailing space and
        the module-level ``tokenizer``'s EOS token.
    """
    output_texts = []
    # Iterate over the columns that are actually used, rather than sizing the
    # loop from an unrelated column.
    for cause, effect, label in zip(example['cause'], example['effect'], example['label']):
        answer = "Yes" if label==1 else "No"
        # NOTE: there is no space after "hypothesis," in the first sentence.
        # The wording is kept exactly as-is so that it stays identical to the
        # prompt built in get_zero_shot_results_from_llama.
        text = (
            "### Question: Conduct inference on economic events. We provide a premise and a hypothesis,"
            "both of them are economical events. Infer "
            "whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. "
            f"premise: {cause}, hypothesis: {effect}. "
            f"\n ### Answer: {answer} "
        ) + tokenizer.eos_token
        output_texts.append(text)
    return output_texts


def get_zero_shot_results_from_llama(df_test, model, tokenizer):
    """Prompt ``model`` with every row of ``df_test`` and parse Yes/No answers.

    Args:
        df_test: DataFrame with ``cause``, ``effect`` and ``label`` columns.
        model: model forwarded to ``prompt_llama_like_model``.
        tokenizer: tokenizer forwarded to ``prompt_llama_like_model``.

    Returns:
        ``(y_true, y_pred)``: two equally long lists. A row contributes only
        when the first word of the answer contains "yes" (prediction 1) or
        "no" (prediction 0); all other rows are left out of both lists, and
        the number of such rows is printed at the end.
    """
    answer_marker = "\n ### Answer:"
    y_true = []
    y_pred = []
    n_skipped = 0
    for idx, row in tqdm(df_test.iterrows(), total=len(df_test)):
        # NOTE: there is no space after "hypothesis," in the first sentence.
        # The wording is kept exactly as-is so that it stays identical to the
        # prompt built in formatting_prompts_func.
        prompt = (
            "### Question: Conduct inference on economic events. We provide a premise and a hypothesis,"
            "both of them are economical events. Infer "
            "whether the premise can cause the happening of the hypothesis. Only answer 'Yes' or 'No'. "
            f"premise: {row['cause']}, hypothesis: {row['effect']}. "
            "\n ### Answer:"
        )
        generated = prompt_llama_like_model(prompt,model,tokenizer,max_new_tokens = 3)
        # The returned text is expected to contain the prompt followed by the
        # completion. If the marker is missing, count the row as unparseable
        # instead of raising IndexError and aborting the whole evaluation.
        pieces = generated.split(answer_marker)
        if len(pieces) < 2:
            n_skipped += 1
            continue
        model_answer = pieces[1].strip()
        if idx<10:
            # Show the first few prompt/answer pairs as a sanity check.
            print(prompt)
            print(model_answer)
        first_word = model_answer.strip('\n').split(" ")[0].lower()
        if "yes" in first_word:
            y_pred.append(1)
            y_true.append(row['label'])
        elif "no" in first_word:
            y_pred.append(0)
            y_true.append(row['label'])
        else:
            n_skipped += 1
    if n_skipped:
        # Make the silent filtering visible: metrics computed from the returned
        # lists cover only the rows that produced a parseable answer.
        print(f"Skipped {n_skipped} of {len(df_test)} examples without a parseable Yes/No answer")
    return y_true, y_pred



# The results file is opened in append mode below; make sure its folder exists.
os.makedirs("results", exist_ok=True)

for model_name in ["../llama/Llama-2-7b-chat-hf", "../llama/Llama-2-13b-chat-hf"]:
    # Last path component (e.g. "Llama-2-7b-chat-hf"), used to give every
    # model its own checkpoint directory.
    model_tag = os.path.basename(os.path.normpath(model_name))
    for run in range(3):
        # Drop the references to the previous run's objects *before* loading
        # the next checkpoint, so two full models need not be resident at once.
        y_true, y_pred = None, None
        trainer = None
        model = None
        torch.cuda.empty_cache()

        model = LlamaForCausalLM.from_pretrained(model_name,
            device_map="auto",
            max_memory={0:"24GB",1:"24GB",2:"24GB",3:"24GB"},
            torch_dtype=torch.float16,
            )
        tokenizer = LlamaTokenizer.from_pretrained(model_name,)
        tokenizer.pad_token = tokenizer.eos_token

        # Only tokens after this marker contribute to the loss.
        response_template = "\n ### Answer:"
        collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)


        training_args = TrainingArguments(
            # Previously the literal text "model_name" was used here, so the
            # 7b and 13b runs wrote their checkpoints to the same directory.
            output_dir='models/'+model_tag+'_'+str(run),
            per_device_train_batch_size=2,
            gradient_accumulation_steps=12,
            learning_rate=0.00005,
            logging_steps=10,
            remove_unused_columns=False,
        )

        peft_config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        trainer = SFTTrainer(
            model=model,
            args=training_args,
            max_seq_length=512,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            peft_config=peft_config,
            formatting_func=formatting_prompts_func,
            data_collator=collator,

        )
        trainer.train()

        # Training leaves the module in train mode; switch to eval mode so
        # dropout (including the LoRA dropout configured above) is disabled
        # while generating the test answers.
        model.eval()
        y_true, y_pred = get_zero_shot_results_from_llama(df_test, model, tokenizer)

        # Build the report once and reuse it for both the console and the file.
        report = classification_report(y_true=y_true,y_pred=y_pred,digits=4)
        print(report)

        with open("results/FT_PLMs_results.txt","a") as f:
            f.write(model_name+", SFT, run "+str(run)+" \n")
            f.write(report)
            f.write("\n")
