In [1]:
# Import libraries
import os

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain import PromptTemplate
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.prompts.chat import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pinecone

from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
# Load env variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')

# Initialize chatbot
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo',
    temperature=0
)

In [None]:
# Initialize chatbot
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo-0301',
    temperature=0
)

In [None]:
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content="I'd like to understand Parameter Efficient Fine-Tuning.")
]

In [None]:
res = chat(messages)
print(res.content)

In [None]:
# add latest AI response to messages
messages.append(res)

# now create a new user prompt
prompt = HumanMessage(
    content="Can you describe several PEFT methods?"
)
# add to messages
messages.append(prompt)

# send to chat-gpt
res = chat(messages)

print(res.content)

# New Prompt Templates for ChatOpenAI

In [None]:
# setup first system message
messages = [
    SystemMessage(content=(
        'You are a helpful assistant. You keep responses to no more than '
        '100 characters long (including whitespace), and sign off every '
        'message with a random name.'
    )),
    HumanMessage(content="Hi AI, how are you? Can you explain Parameter Efficient Fine-Tuning (PEFT)?")
]

In [None]:
res = chat(messages)

# Too long
print(f"Length: {len(res.content)}\n{res.content}")

In [None]:
human_template = HumanMessagePromptTemplate.from_template(
    '{input} Can you keep the response to no more than 100 characters '+
    '(including whitespace), and sign off with a random name.'
)

# create the human message
chat_prompt = ChatPromptTemplate.from_messages([human_template])

# format with some input
chat_prompt_value = chat_prompt.format_prompt(
    input="Hi AI, how are you? Can you explain Parameter Efficient Fine-Tuning (PEFT)?"
)

chat_prompt_value

In [None]:
chat_prompt_value.to_messages()

In [None]:
chat_prompt_value.to_string()

In [None]:
messages = [
    SystemMessage(content=(
        'You are a helpful assistant. You keep responses to no more than '
        '100 characters long (including whitespace), and sign off every '
        'message with a random name.'
    )),
    chat_prompt.format_prompt(
        input="Hi AI, how are you? Can you explain Parameter Efficient Fine-Tuning (PEFT)?"
    ).to_messages()[0]
]

res = chat(messages)

print(f"Length: {len(res.content)}\n{res.content}")

In [None]:
system_template = SystemMessagePromptTemplate.from_template(
    'You are a helpful assistant. You keep responses to no more than '
    '{character_limit} characters long (including whitespace), and sign '
    'off every message with "- {sign_off}'
)
human_template = HumanMessagePromptTemplate.from_template("{input}")
ai_template = AIMessagePromptTemplate.from_template("{response} - {sign_off}")

# create the list of messages
chat_prompt = ChatPromptTemplate.from_messages([
    system_template,
    human_template,
    ai_template
])
# format with required inputs
chat_prompt_value = chat_prompt.format_prompt(
    character_limit="100", sign_off="Your trustworthy AI",
    input="Can you explain Parameter Efficient Fine-Tuning (PEFT)?",
    response="PEFT is a method to fine-tune a pre-trained model with fewer parameters."
)

chat_prompt_value

In [None]:
messages = chat_prompt_value.to_messages()

messages.append(
    HumanMessage(content="How many parameters?")
)

res = chat(messages)

print(f"Length: {len(res.content)}\n{res.content}")

In [None]:
# this is a faster way of building the prompt via a PromptTemplate
human_template = HumanMessagePromptTemplate.from_template(
    '{input} Answer in less than {character_limit} characters (including whitespace).'
)

# create the human message
human_prompt = ChatPromptTemplate.from_messages([human_template])

# format with some input
human_prompt_value = human_prompt.format_prompt(
    input="When should I use PEFT?",
    character_limit="100"
)

human_prompt_value

In [None]:
# drop the last message
messages.pop(-1)

In [None]:
messages.extend(human_prompt_value.to_messages())
messages

In [None]:
res = chat(messages)

print(f"Length: {len(res.content)}\n{res.content}")

# Similarity Search

In [3]:
# Use embedding model "text-embedding-ada-002" from openAI to create vector embeddings
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002",
                               disallowed_special=())

In [4]:
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

In [5]:
text_field = "text"
index_name = "rag-ml"

index = pinecone.Index(index_name)

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)



In [6]:
query = "I'd like to understand Parameter Efficient Fine-Tuning. Can you describe several PEFT methods?"

vectorstore.similarity_search(query, k=3)

[Document(page_content='Although ICL provides a viable alternative to full fine-tuning,\nit operates at inference time, and it neither allows learning nor\nupdating any parameters, which may prevent capturing more fine-\ngrained information about the task. It can result in a potential loss\nof effectiveness. In this context, Parameter-Efficient Fine-Tuning\n(PEFT) techniques have emerged as promising solutions to ren-\nder the fine-tuning cost at the lowest while allowing the model\nto learn task-specific parameters. Prior works [ 11,64,65] in code\nintelligence have demonstrated the capability of PEFT techniques,\nand often shown their superiority over full fine-tuning across a\nwide range of tasks. However, these studies focus on small lan-\nguage models ( <0.25B parameters) such as CodeBERT [ 15] and\nCodeT5 [ 66] and overlooked the applicability of PEFT techniques toarXiv:2308.10462v1  [cs.SE]  21 Aug 2023\nConference’17, July 2017, Washington, DC, USA Weyssow et al.\nLLMs (≥1B par

In [None]:
def get_metadata(query: str):

    results = vectorstore.similarity_search(query, k=3)

    metadata = [{"title": x.metadata["title"], 
                "source": x.metadata["source"]} for x in result]

    return metadata

In [None]:
def augment_prompt(query: str):
    
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=3)
    
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    
    return augmented_prompt

In [None]:
print(augment_prompt(query))

In [None]:
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?")
]

# create a new user prompt
prompt = HumanMessage(
    content=augment_prompt(query)
)
# add to messages
messages.append(prompt)

res = chat(messages)

print(res.content)
print(get_metadata(query))

In [None]:
# prompt without RAG
prompt = HumanMessage(
    content="What can you tell me about LoRA training?"
)

res = chat(messages + [prompt])
print(res.content)

In [None]:
# prompt with RAG
prompt = HumanMessage(
    content=augment_prompt(
        "What can you tell me about LoRA training?"
    )
)

res = chat(messages + [prompt])
print(res.content)

# Insights for Naive RAG

- "Naive RAG": simplest way of implementing RAG -> assumes question in every query (sometimes bot doesn't need to access KB to answer)
- Ability to cite sources
- Faster than using agents
- Can filter number of tokens sent to LLM (with similarity threshold)
- Token usage/cost is higher due to extended context
- Too much context will degrade prompt

Next steps
- --> Agent RAG
- --> Guardrails RAG

- RAG Agent: Wrapper around LLM -> can have thoughts, internal dialogue (can reply immediately if no external knowledge is required, or access KB through )
- Agent has access to external tools (eg. retrieval tool)
- Agent decides when it has to use a specific tool
- Slower (in langchain 3x LLM generations)

- Guardrails: in the middle of Naive RAG and RAG Agent

# Generative Question-Answering

In [7]:
# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(fetch_k=20, k=5, return_source_documents=True)
)

In [8]:
print(qa.run(query))

Parameter-Efficient Fine-Tuning (PEFT) aims to optimize the fine-tuning process by efficiently utilizing computing resources and reducing the number of parameters that need to be updated. Here are several PEFT methods:

1. Prefix Tuning: This method appends a collection of prefixes to autoregressive language models. It can also incorporate prefixes for both encoder and decoder components. Prefix Tuning has been proposed as a way to improve performance while reducing the number of parameters that need to be updated.

2. Low-Rank adaptation (LoRA): LoRA introduces trainable rank decomposition matrices into each layer of the pre-trained language model. By using low-rank matrices, LoRA reduces the number of parameters that need to be updated during fine-tuning.

3. Adapters: Adapters involve inserting lightweight modules into each layer of pre-trained models. These modules, called adapters, are the only parameters that are updated during fine-tuning. Adapters have been extended across nume

In [12]:
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(fetch_k=20, k=1, return_source_documents=True)
)

In [13]:
res = qa_with_sources(query)

print(res["question"])
print(res["answer"])
print(res["sources"])

I'd like to understand Parameter Efficient Fine-Tuning. Can you describe several PEFT methods?
Several PEFT methods include Prefix Tuning, Low-Rank adaptation (LoRA), and the insertion of adapter layers in pre-trained large language models. Prefix Tuning appends a collection of prefixes to autoregressive language models, LoRA introduces trainable rank decomposition matrices into each layer, and adapters involve inserting lightweight modules into each layer of pre-trained models. Chen et al. (2023) experiment with different design spaces for PEFT, while Wang et al. (2022) focus on single-task fine-tuning using a mixture of adapters. Ponti et al. (2022) introduce Polytropon, which involves learning adapters specific to each task. These methods aim to optimize the fine-tuning process by efficiently utilizing computing resources and reducing the number of parameters that need to be updated. They allow for the deployment of large language models in a more accessible and practical manner. Ho

In [14]:
res["sources"]

''

# Conversational Agent with tool (RetrievalQA)

In [16]:
# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history', # refers to conversational agent component
    k=5,
    return_messages=True
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff", # place ("stuff") retrieved items item RetrievalQA (no summarization)
    retriever=vectorstore.as_retriever(fetch_k=15, k=5, return_source_documents=True)
)

In [None]:
query = "Hi, how are you?"

# *** Only RetrievalQA - NOT Conversational Agent ***
print(qa.run(query))

query = "can you tell me some facts Parameter Efficient Fine-Tuning?"

print(qa.run(query))

In [17]:
from langchain.agents import Tool

# Add retrievalQA tool to agent
tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'use this tool when answering general knowledge queries to get '
            'more information about the topic'
        )
    )
]

In [18]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)

In [19]:
# With Conversational Agent
query = "Hi, how are you?"

print(agent(query))

query = "can you tell me some facts Parameter Efficient Fine-Tuning?"

print(agent(query))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you today?"
}[0m

[1m> Finished chain.[0m
{'input': 'Hi, how are you?', 'chat_history': [], 'output': "I'm an AI, so I don't have feelings, but I'm here to help you. How can I assist you today?"}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Knowledge Base",
    "action_input": "Parameter Efficient Fine-Tuning"
}[0m
Observation: [36;1m[1;3mParameter-efficient fine-tuning is a technique in natural language processing that aims to optimize the fine-tuning process by updating only a small number of additional parameters while keeping most of the pre-trained parameters frozen. This approach is particularly useful when working with limited labeled data for a specific task, as it saves computational time and resources. It also makes the deployment of large languag

In [None]:
print(agent("what is 2 * 7?"))

In [None]:
print(agent("can you tell me some facts Parameter Efficient Fine-Tuning?"))

In [None]:
print(agent("can you summarize these facts in two short sentences"))

In [None]:
pinecone.delete_index(index_name)