In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer,AutoConfig,BitsAndBytesConfig
import torch
import nest_asyncio
from langchain_community.vectorstores import Chroma

from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
import transformers
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough,RunnableLambda
from langchain.memory import ConversationBufferMemory
from langchain.memory.buffer import get_buffer_string
from operator import itemgetter
from langchain.schema import format_document
from langchain.memory import ConversationBufferMemory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
config = AutoConfig.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant = False
)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=quantization_config,)

2024-02-16 15:54:40.531974: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading checkpoint shards:  67%|██████▋   | 2/3 [01:06<00:33, 33.70s/it]

In [None]:
nest_asyncio.apply()
articles = ["https://www.fantasypros.com/2023/11/rival-fantasy-nfl-week-10/",
            "https://www.fantasypros.com/2023/11/5-stats-to-know-before-setting-your-fantasy-lineup-week-10/",
            "https://www.fantasypros.com/2023/11/nfl-week-10-sleeper-picks-player-predictions-2023/",
            "https://www.fantasypros.com/2023/11/nfl-dfs-week-10-stacking-advice-picks-2023-fantasy-football/",
            "https://www.fantasypros.com/2023/11/players-to-buy-low-sell-high-trade-advice-2023-fantasy-football/"]
loader = AsyncChromiumLoader(articles)

In [None]:
docs = loader.load()
htmlTransformer = Html2TextTransformer()
docs = htmlTransformer.transform_documents(docs)

In [None]:
text_splitter = CharacterTextSplitter(chunk_size=100, 
                                      chunk_overlap=0)
docs_chunk = text_splitter.split_documents(docs)

Created a chunk of size 130, which is longer than the specified 100
Created a chunk of size 4294, which is longer than the specified 100
Created a chunk of size 131, which is longer than the specified 100
Created a chunk of size 230, which is longer than the specified 100
Created a chunk of size 500, which is longer than the specified 100
Created a chunk of size 207, which is longer than the specified 100
Created a chunk of size 365, which is longer than the specified 100
Created a chunk of size 312, which is longer than the specified 100
Created a chunk of size 515, which is longer than the specified 100
Created a chunk of size 584, which is longer than the specified 100
Created a chunk of size 1119, which is longer than the specified 100
Created a chunk of size 257, which is longer than the specified 100
Created a chunk of size 103, which is longer than the specified 100
Created a chunk of size 136, which is longer than the specified 100
Created a chunk of size 230, which is longer t

In [None]:
embd_func = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
vectorstore = Chroma.from_documents(documents=docs_chunk,embedding=embd_func)

In [None]:
retriever = vectorstore.as_retriever()

In [None]:


sane_pipeline= transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.0,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

sane_pipeline = HuggingFacePipeline(pipeline=sane_pipeline)


insane_pipeline= transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.0,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

insane_pipeline = HuggingFacePipeline(pipeline=insane_pipeline)

In [None]:
input_template = """
[INST] 
Given the following conversation and a follow up question, 
rephrase the follow up question to be a standalone question, in its original language, 
that can be used to query a FAISS index. This query will be used to retrieve documents with additional context.

Let me share a couple examples.

If you do not see any chat history, you MUST return the "Follow Up Input" as is:
```
Chat History:
Follow Up Input: How is Lawrence doing?
Standalone Question:
How is Lawrence doing?
```

If this is the second question onwards, you should properly rephrase the question like this:
```
Chat History:
Human: How is Lawrence doing?
AI: 
Lawrence is injured and out for the season.
Follow Up Input: What was his injury?
Standalone Question:
What was Lawrence's injury?
```

Now, with those examples, here is the actual chat history and input question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
[your response here]
[/INST] 
"""
input_prompt = PromptTemplate.from_template(input_template)

template = """
[INST] 
Answer the question based only on the following context:
{context}

Question: {standalone_question}
[/INST] 
"""
output_prompt = PromptTemplate.from_template(template)

default_prompt = PromptTemplate.from_template(template="{page_content}")

In [None]:
def doc_joiner(docs,default_prompt=default_prompt):
    res = [format_document(doc) for doc in docs] 
    return "\n\n".join(res)

In [None]:
memory = ConversationBufferMemory(
 return_messages=True, output_key="answer", input_key="question"
)

In [None]:
loaded_memory = RunnablePassthrough.assign(
 chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)

standalone_question = {
    "standalone_question":{
        "question":lambda x:x["question"],
        "chat_history":lambda x:get_buffer_string(x["chat_history"])
    }
    | input_prompt
    | sane_pipeline,
}

retrieved_documents = {
    "docs":itemgetter("standalone_question") | retriever,
    "standalone_question":lambda x:x["standalone_question"]
}

final_inputs = {
    "context":lambda x: doc_joiner(x["docs"]),
    "standalone_question":itemgetter("standalone_question")
}

answer = {
    "answer": final_inputs | output_prompt | insane_pipeline,
    "standalone_question":itemgetter("standalone_question"),
    "context":final_inputs["context"]
}

final_chain = loaded_memory | standalone_question | retrieved_documents | answer