In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Install required packages


In [None]:
# !pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

### Dependencies

In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-01-31 10:34:13.302474: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-31 10:34:13.372835: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-31 10:34:13.372871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-31 10:34:13.382924: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-31 10:34:13.4

### Load quantized Mistral 7B

In [3]:
#################################################################
# Tokenizer
#################################################################

# Hugging Face Hub id of the checkpoint; the same id is used further down
# when the model weights are loaded.
model_name='mistralai/Mistral-7B-Instruct-v0.2'

# NOTE(review): trust_remote_code=True permits executing code shipped in the
# model repository -- presumably unnecessary for this checkpoint; confirm
# and drop if so.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################
# Plain values kept separate from the config object so they are easy to tune.

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
# (stored as the attribute name on the torch module; resolved via getattr below)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# Turn the dtype name into the actual torch dtype object (torch.float16 here).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the object handed to from_pretrained().
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# Purely informational: when float16 compute is selected together with 4-bit
# loading, print a hint if the GPU's major compute capability is >= 8.
# Nothing here changes compute_dtype or bnb_config.
# NOTE(review): get_device_capability() presumably needs a visible CUDA
# device -- confirm behaviour on CPU-only hosts.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained, quantized model
#################################################################
# Loads the causal-LM checkpoint named by model_name, applying the
# quantization settings collected in bnb_config.
mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Your GPU supports bfloat16: accelerate training with bf16=True


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/25/f2/25f242d117fa40b7cc0b5e85e97135c923bc5665bde4204e7fabadb99a561eab/63654d601820b88b1fa8b4a98df5714f700fbc5b3df2cc4ecbabdced35096d31?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27model-00001-of-00003.safetensors%3B+filename%3D%22model-00001-of-00003.safetensors%22%3B&Expires=1706931755&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjkzMTc1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzI1L2YyLzI1ZjI0MmQxMTdmYTQwYjdjYzBiNWU4NWU5NzEzNWM5MjNiYzU2NjViZGU0MjA0ZTdmYWJhZGI5OWE1NjFlYWIvNjM2NTRkNjAxODIwYjg4YjFmYThiNGE5OGRmNTcxNGY3MDBmYmM1YjNkZjJjYzRlY2JhYmRjZWQzNTA5NmQzMT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=QwFFHD92A-FkCY-mrZ6g1OeYq8cXgio7ap7ovVy-Kxgo4ifgXacVnGvIAPudsnEj-6JFl7I70orIeUUFQEDHf6aQZAZy4qsYHmBguTgYPWEDRCd81J6zxv185ZoNSwfskzpkpph9i0tFZ7LQEb5R

### Count number of trainable parameters

In [None]:
def print_number_of_trainable_model_parameters(model):
    """
    Build a short report of how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the report as
    a string and leaves printing to the caller.

    Parameters:
    model: Any object whose ``named_parameters()`` yields ``(name, param)``
        pairs, where each ``param`` exposes ``numel()`` and ``requires_grad``.

    Returns:
    str: Three newline-separated lines giving the trainable parameter count,
        the total parameter count, and the trainable share as a percentage
        with two decimals. A model without any parameters reports 0.00%.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model with no parameters would otherwise raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {trainable_percentage:.2f}%"
    )

# The helper only returns the report string, so print it explicitly.
print(print_number_of_trainable_model_parameters(mistral_model))

### Build Mistral text generation pipelines

In [None]:
# Two text-generation pipelines share the same model and tokenizer and differ
# only in decoding strategy.
#
# `temperature` only takes effect when sampling is enabled (do_sample=True);
# without it generation is greedy and the temperature value is ignored.
# The decoding mode is therefore spelled out explicitly for each pipeline.
#
# NOTE(review): return_full_text=True is kept unchanged; presumably the
# LangChain wrapper relies on the prompt being present in the output --
# confirm before changing it.

# Query rewriting should be deterministic, so decode greedily.
standalone_query_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)
standalone_query_generation_llm = HuggingFacePipeline(pipeline=standalone_query_generation_pipeline)

# Answer generation samples at a low temperature; do_sample=True is what
# makes temperature=0.2 actually apply. Outputs may vary between runs.
response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

### Load and chunk documents. Load chunked documents into FAISS index 

In [None]:
# Shell commands: install Playwright's browser binaries and their OS-level
# dependencies. Presumably required by the Chromium-based loader used to
# fetch the article -- confirm.
!playwright install 
!playwright install-deps 

In [None]:
import nest_asyncio
# Patch asyncio so nested event loops are allowed. Presumably needed because
# the notebook kernel already runs a loop and the loader below is
# asynchronous -- confirm.
nest_asyncio.apply()

# Articles to index
articles = ["https://www.fantasypros.com/2023/12/fantasy-football-panic-meter-patrick-mahomes-austin-ekeler-stefon-diggs-travis-etienne/",]

# Scrapes the articles listed above; `docs` holds whatever the loader returns
# for them and is consumed by the HTML-to-text step that follows.
loader = AsyncChromiumLoader(articles)
docs = loader.load()


In [None]:
# Converts HTML to plain text
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Chunk text
# chunk_size is set to 800 with no overlap between consecutive chunks.
# NOTE(review): the splitter's separator is left at its default, so actual
# chunk boundaries depend on where that separator occurs -- confirm the
# resulting chunk sizes are as intended.
text_splitter = CharacterTextSplitter(chunk_size=800,
                                    chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Load chunked documents into the FAISS index, embedding each chunk with the
# sentence-transformers model named below.
db = FAISS.from_documents(chunked_documents,
                        HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

# The number of documents to return is a search parameter and must be passed
# through search_kwargs; a bare `k=` keyword on as_retriever() is not the
# documented way to set it and does not reliably limit the results.
retriever = db.as_retriever(search_kwargs={"k": 1})


### Create PromptTemplate and LLMChain

In [None]:
from langchain.schema import format_document
from langchain_core.messages import get_buffer_string
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate

from operator import itemgetter

In [None]:
# Prompt that rewrites a follow-up question into a standalone question.
# It takes two template variables: {chat_history} (the conversation so far,
# as text) and {question} (the latest user input). Two worked examples show
# the model what to do with and without prior history.
# The wording is read verbatim by the LLM, so spelling mistakes in the
# instructions and examples have been corrected.
_template = """
[INST]
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. This query will be used to retrieve documents with additional context.

Let me share a couple of examples that will be important.

If you do not see any chat history, you MUST return the "Follow Up Input" as is:

```
Chat History:
Follow Up Input: How is Lawrence doing?
Standalone Question:
How is Lawrence doing?
```

If this is the second question onwards, you should properly rephrase the question like this:

```
Chat History:
Human: How is Lawrence doing?
AI:
Lawrence is injured and out for the season.

Follow Up Input: What was his injury?
Standalone Question:
What was Lawrence's injury?
```

Now, with those examples, here is the actual chat history and input question.

Chat History:
{chat_history}

Follow Up Input: {question}
Standalone question:
[your response here]
[/INST]
"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [None]:
# Prompt for the final answer. It takes two template variables: {context},
# filled with the combined text of the retrieved documents, and {question},
# the question to answer. The model is told to rely only on that context.
template = """
[INST] 
Answer the question based only on the following context:
{context}

Question: {question}
[/INST] 
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

In [None]:
# Per-document prompt whose template consists solely of the {page_content}
# placeholder; used as the default when documents are combined into context.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    """
    Render each document with a prompt and join the results into one string.

    Parameters:
    docs: Iterable of documents to render.
    document_prompt: Prompt passed to ``format_document`` for every document.
    document_separator (str): Text placed between consecutive rendered documents.

    Returns:
    str: The rendered documents joined by ``document_separator``.
    """
    return document_separator.join(
        format_document(document, document_prompt) for document in docs
    )

In [None]:
# Instantiate ConversationBufferMemory
# input_key / output_key name the fields of each turn that get stored:
# the user's "question" and the chain's "answer".
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "chat_history" key to the input object, taken from the
# "history" entry of what memory.load_memory_variables returns
# (presumably the memory's default key -- confirm).
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history")
)

# Now we calculate the standalone question
# The chat history messages are flattened to text with get_buffer_string
# before being fed, together with the question, to the condense prompt.
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | standalone_query_generation_llm,
}

# Now we retrieve the documents
# Note: from here on "question" is the rewritten standalone question,
# not the user's original wording.
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"],
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# And finally, we do the part that returns the answers
# Besides the answer, the output also carries the question and the combined
# context (the same callable used for the prompt's "context" input).
answer = {
    "answer": final_inputs | ANSWER_PROMPT | response_generation_llm,
    "question": itemgetter("question"),
    "context": final_inputs["context"]
}

# And now we put it all together!
# Order: load memory -> rewrite question -> retrieve documents -> answer.
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [None]:
def call_conversational_rag(question, chain, memory):
    """
    Calls a conversational RAG (Retrieval-Augmented Generation) chain to generate an answer to a given question.

    The question is sent to the chain, and the question-answer pair is then stored in memory
    so that later calls can use it as conversational context.

    Parameters:
    question (str): The question to be answered.
    chain: Object exposing ``invoke(inputs)``; called with ``{"question": question}`` and expected
        to return a mapping that contains an ``"answer"`` entry.
    memory: Object exposing ``save_context(inputs, outputs)``; receives the input mapping and
        ``{"answer": ...}`` after each call.

    Returns:
    dict: Whatever ``chain.invoke`` returned, unchanged; it includes the generated answer under ``"answer"``.
    """

    # Prepare the input for the RAG chain
    inputs = {"question": question}

    # Invoke the RAG chain to get an answer
    result = chain.invoke(inputs)

    # Save the current question and its answer to memory for future context.
    # This must be the `inputs` mapping built above, not the builtin `input`.
    memory.save_context(inputs, {"answer": result["answer"]})

    # Return the result
    return result


In [None]:
# First call shown in this notebook. The condense prompt tells the model to
# return the input unchanged when there is no chat history.
question = "how is maholmes doing"
call_conversational_rag(question, final_chain, memory)


In [None]:
# Follow-up containing a pronoun ("him"): the standalone-question step has to
# resolve it from the stored chat history before retrieval.
question = "Who are some good alternatives to him?"
call_conversational_rag(question, final_chain, memory)

In [None]:
# "both" is only meaningful given the earlier turns, so this again depends on
# the chat history being rewritten into a standalone question.
question = "How many PPG are both averaging?"
call_conversational_rag(question, final_chain, memory)

In [None]:
# Asks about the conversation itself, as a check that earlier turns are
# available to the chain through memory.
question = "Who did I originally ask about?"
call_conversational_rag(question, final_chain, memory)