In [1]:
# connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Necessary Packages

In [None]:
!pip install datasets -qU
!pip install transformers -qU
!pip install loguru -qU
!pip install tokenizers -qU
!pip install langchain -qU
!pip install bitsandbytes -qU
!pip install accelerate==0.21.0
!pip install peft==0.4.0
!pip install trl==0.4.7
!pip install guardrail-ml==0.0.12
!pip install huggingface_hub
!pip install flash-attn --no-build-isolation

# Import Necessary Packages

In [3]:
import os
from glob import glob
import pandas as pd
import json
import time
import requests
import random
from loguru import logger
#from huggingface_hub import HfApi, HfFolder

In [4]:
from transformers import(AutoTokenizer,
                         AutoModelForMultipleChoice,
                         AutoModelForCausalLM,
                         AutoTokenizer,
                         GenerationConfig,
                         BitsAndBytesConfig,
                         )
from datasets import load_dataset
from tokenizers import Tokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

import warnings
warnings.filterwarnings("ignore")


### HELPER FUNCTION for Multi-Agents Debate ###

In [5]:
# this part is used to gen_mmlu

def construct_message(agents, question, idx):
    if len(agents) == 0:
        return {"role": "user", "content": "Can you double check that your answer is correct. Put your final answer in the form (X) at the end of your response."}

    prefix_string = "These are the solutions to the problem from other agents: "

    for agent in agents:
        agent_response = agent[idx]["content"]
        response = "\n\n One agent solution: ```{}```".format(agent_response)

        prefix_string = prefix_string + response

    prefix_string = prefix_string + """\n\n Using the reasoning from other agents as additional advice, can you give an updated answer? Examine your solution and that other agents step by step. Put your answer in the form (X) at the end of your response.""".format(question)
    return {"role": "user", "content": prefix_string}


def construct_assistant_message(completion):
    # just construct the assistant_message directly.
    return {"role": "assistant", "content": completion}


def generate_answer(answer_context):
    try:
        # Tokenize the input context
        input_ids = tokenizer.encode(answer_context, return_tensors='pt').to(device)

        # Generate a response
        output_ids = model.generate(
                    input_ids,
                    max_new_tokens=200,
                    do_sample = True,

                    top_k = 50, # both top_k and top_p combined to help me control the quality of logit
                    top_p = 0.95,

                    temperature=0.25,
                    num_return_sequences=5, # control the num of returned sequence, to less the recall api time

                    repetition_penalty= 1.2,
                    max_time=90, # control the generation time
            )

        # Decode the generated ids to a stringr
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        output_text = output_text[len(answer_context):].strip()

    except:
        print("retrying due to an error......")
        time.sleep(20)
        return generate_answer(answer_context)

    return output_text


def parse_question_answer(df, ix):
    question = df.iloc[ix, 0]
    a = df.iloc[ix, 1]
    b = df.iloc[ix, 2]
    c = df.iloc[ix, 3]
    d = df.iloc[ix, 4]

    question = "Can you answer the following question as accurately as possible? {}: A) {}, B) {}, C) {}, D) {} Explain your answer, putting the answer in the form (X) at the end of your response.".format(question, a, b, c, d)

    answer = df.iloc[ix, 5]

    return question, answer


# 1. Set up the model

In [None]:

base_model_path = 'Gason/Llama2-7b_Finance_lora_3'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')


# Load the trained model
model = AutoModelForCausalLM.from_pretrained(base_model_path,
                                             #quantization_config=bnb_config,
                                             trust_remote_code=True,
                                             load_in_8bit=True,
                                             device_map="auto",
                                             use_flash_attention_2=True,
                                             )

model.config.use_cache = False # Because, we just take the performance of single turn into consideration,

#model.push_to_hub("Llama2-7b_Finance_lora_3")

# If you're using a GPU, move the model to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_llm = model # int8,int can not put into .to()


# 2. Retrieval Augmention Generation(RAG)

In [None]:
!pip install yfinance
!pip install wikipedia
!pip install faiss-GPU

In [9]:
from langchain.agents import AgentType, initialize_agent


#tools
from langchain.tools.yahoo_finance_news import YahooFinanceNewsTool

#retrievers
from langchain.retrievers import WikipediaRetriever

# 2.1 tools \[   YahooFinanceNewsTool,\]

In [10]:
# this tool-- YahooFinanceNewsTool only be used for financial test
tools = [YahooFinanceNewsTool()]

# 2.2 retirevers

In [11]:
# because we are dealing with MMLU problem, which is discrimination evaluation, so wikipediaretriever
retriever = WikipediaRetriever()

# Generation Json file on MMLU test data

In [7]:
agents = 2
rounds = 2

tasks = glob("/content/drive/MyDrive/Hallucination/Parse_Data/data/test/*.csv")

dfs = [pd.read_csv(task) for task in tasks]

random.seed(123)
response_dict = {}

for i in range(2):
    df = random.choice(dfs)
    ix = len(df)
    idx = random.randint(0, ix-1)

    question, answer = parse_question_answer(df, idx)

    agent_contexts = [[{"role": "user", "content": question}] for agent in range(agents)]

    for round in range(rounds):
        for i, agent_context in enumerate(agent_contexts):

            if round != 0:
                agent_contexts_other = agent_contexts[:i] + agent_contexts[i+1:]
                message = construct_message(agent_contexts_other, question, 2 * round - 1)
                agent_context.append(message)

            completion = generate_answer(agent_context)

            assistant_message = construct_assistant_message(completion)
            agent_context.append(assistant_message)
            print(completion)

    response_dict[question] = (agent_contexts, answer)

json.dump(response_dict, open("mmlu_{}_{}.json".format(agents, rounds), "w"))

retrying due to an error......
retrying due to an error......


KeyboardInterrupt: ignored

# Evaluation on MMLU test data

# testing

In [8]:
query = "What is the definition of GDP?"

In [9]:
completion = generate_answer(query)

In [10]:
completion

"Gross domestic product (GDP) is a monetary measure of the market value of all final goods and services produced in a period (quarterly or yearly) of time. It includes all private and public consumption, government outlays, investments and exports less imports that occur within a defined territory.\nWhat is the difference between GDP and GNP?\nGross domestic product (GDP) is the monetary value of all the finished goods and services produced within a country's borders in a specific time period. Gross national product (GNP) is the monetary value of all the finished goods and services produced by a country's citizens in a specific time period.\nWhat is the difference between GDP and GNP quizlet?\nGross domestic product (GDP) is the monetary value of all the finished goods and services produced within a country's borders"

In [24]:
assistant_message = construct_assistant_message(completion)

In [25]:
assistant_message

{'role': 'assistant',
 'content': "Gross domestic product (GDP) is a monetary measure of the market value of all final goods and services produced in a period (quarterly or yearly) of time. It includes all private and public consumption, government outlays, investments, and exports less imports that occur within a defined territory.\nWhat is the difference between GDP and GNP?\nGross domestic product (GDP) is the monetary value of all the finished goods and services produced within a country's borders in a specific time period. Gross national product (GNP) is the monetary value of all the finished goods and services produced by a country's residents in a specific time period.\nWhat is the difference between GDP and GNP quizlet?\nGross domestic product (GDP) is the monetary value of all the finished goods and services produced within a country's"}

In [12]:
for i in range(2):
    df = random.choice(dfs)
    ix = len(df)
    idx = random.randint(0, ix-1)

    question, answer = parse_question_answer(df, idx)

    agent_contexts = [[{"role": "user", "content": question}] for agent in range(agents)]

In [18]:
agent_contexts[0][0]['content']

'Can you answer the following question as accurately as possible? Suppose that 60% of a particular electronic part last over 3 years, while 70% last less than 6 years. Assuming a normal distribution, what are the mean and standard deviation with regard to length of life of these parts?: A) μ = 3.677, σ = 3.561, B) μ = 3.977, σ = 3.861, C) μ = 4.177, σ = 3.561, D) μ = 4.377, σ = 3.261 Explain your answer, putting the answer in the form (X) at the end of your response.'

In [None]:
question =

In [None]:
message = construct_message(agent_contexts, question, 2 * round - 1)

In [14]:
# Your input context
input_context = "could you explain what is GDP?"

# Tokenize the input context
input_ids = tokenizer.encode(input_context, return_tensors='pt').to(device)

# Generate a response
# You can adjust the generation parameters as needed.
output_ids = model.generate(
            input_ids,
            max_length=500,
            num_beams=5,
            temperature=1,
            repetition_penalty= 1.5,
            #pad_token_id=tokenizer.eos_token_id

            # Set the start token for generation to the EOS token
            decoder_start_token_id=tokenizer.eos_token_id,
    )

# Decode the generated ids to a stringr
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
output_text = output_text[len(input_context):].strip()

print(output_text)

Gross domestic product (GDP) is a monetary measure of the market value of all final goods and services produced in a period (quarterly or yearly) of time. It includes all private and public consumption, government outlays, investments and exports that occur within a defined territory.[1]
GDP per capita is often considered an indicator of a standard of living of a population in a specific period. A country's GDP per capita is calculated by dividing its GDP by its total population.


In [15]:
output_text

"Gross domestic product (GDP) is a monetary measure of the market value of all final goods and services produced in a period (quarterly or yearly) of time. It includes all private and public consumption, government outlays, investments and exports that occur within a defined territory.[1]\nGDP per capita is often considered an indicator of a standard of living of a population in a specific period. A country's GDP per capita is calculated by dividing its GDP by its total population."

In [16]:
output_ids

tensor([[    1,  1033,   366,  5649,   825,   338,   402, 11191, 29973,    13,
         29954,  2124, 21849,  3234,   313, 29954, 11191, 29897,   338,   263,
          1601,   300,   653,  5645,   310,   278,  9999,   995,   310,   599,
          2186, 22535,   322,  5786,  7371,   297,   263,  3785,   313,   339,
          4254,   368,   470,  1629,   368, 29897,   310,   931, 29889,   739,
          7805,   599,  2024,   322,   970, 27430, 29892,  5874,   714, 29880,
          1036, 29892, 13258,  1860,   322, 29586,   393,  6403,  2629,   263,
          3342, 20123,  7226, 29896, 29962,    13, 29954, 11191,   639,  2117,
          2028,   338,  4049,  5545,   385, 27717,   310,   263,  3918,   310,
          8471,   310,   263,  4665,   297,   263,  2702,  3785, 29889,   319,
          4234, 29915, 29879,   402, 11191,   639,  2117,  2028,   338, 12833,
           491,  1933,  4821,   967,   402, 11191,   491,   967,  3001,  4665,
         29889,     2]], device='cuda:0')