In [1]:

  !pip install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  datasets==2.14.0 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  langchain==0.0.240 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0\
  InstructorEmbedding\
  chromadb



In [2]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-3.14.0-py3-none-any.whl (269 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/269.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/269.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.8/269.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.14.0


In [2]:
import os
from getpass import getpass

import transformers
from torch import cuda, bfloat16
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [4]:
# Build a directory loader over the path '' that matches files with the
# "./*.pdf" glob and hands each match to PyPDFLoader, then load everything.
loader = DirectoryLoader(
    '',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [5]:
# Number of Document objects returned by loader.load().
len(documents)

15

In [6]:
# Chunk the loaded documents. "\n" is the only separator supplied; chunk size
# (1000) and overlap (200) are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_documents(documents)

In [7]:
# Number of chunks produced by text_splitter.split_documents(documents).
len(texts)

52

In [8]:
# Embedding model used to vectorise the chunks.
# The device is chosen at runtime instead of hard-coding "cuda", so this cell
# does not fail on a machine without a GPU; this mirrors the
# `cuda.is_available()` fallback used when the LLM is loaded further down.
# HuggingFaceInstructEmbeddings and `cuda` come from the imports cell at the
# top of the notebook, so the import is not repeated here.
embedding_device = "cuda" if cuda.is_available() else "cpu"

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": embedding_device},
)

Downloading (…)c7233/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)9fb15c7233/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)b15c7233/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)c7233/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

Downloading (…)15c7233/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
# Build the Chroma vector store from the chunks, embedding them with the
# Instructor model configured above.
embedding = instructor_embeddings
vectordb = Chroma.from_documents(embedding=embedding, documents=texts)

In [10]:
# Expose the vector store as a retriever, passing k=3 as its search setting.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

In [11]:
# Sanity-check retrieval with a sample question before wiring in the LLM.
sample_query = "What is multihead attention?"
docs = retriever.get_relevant_documents(sample_query)

In [12]:
# Number of documents the retriever returned for the sample query.
len(docs)

3

In [13]:
# Show the first retrieved document to eyeball its relevance.
print(docs[0])

page_content='output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead( Q, K, V ) = Concat(head 1, ...,head h)WO\nwhere head i= Attention( QWQ\ni, KWK\ni, V WV\ni)\nWhere the projections are parameter matrices WQ\ni∈Rdmodel×dk,WK\ni∈Rdmodel×dk,WV\ni∈Rdmodel×dv\nandWO∈Rhdv×dmodel.\nIn this work we employ h= 8 parallel attention layers, or heads. For each of these we use\ndk=dv=dmodel/h= 64 . Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:\n•In "encoder-decoder attention" layers, the queries come from the previous decode

In [14]:
# Load the Llama 2 chat model with 4-bit quantisation.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Device string used only for the status message printed at the end of this
# cell; weight placement itself is delegated to `device_map='auto'`.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantisation with double quantisation, computing in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# The access token must never be committed with the notebook. Read it from the
# environment, and fall back to an interactive prompt that does not echo it.
# NOTE: a literal token used to be hard-coded on this line; treat that token
# as leaked and revoke it in the Hugging Face account settings.
hf_auth = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or getpass("Hugging Face access token: ")
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth)

# `trust_remote_code` is deliberately not enabled: the Llama architecture is
# implemented in transformers itself, so no code from the model repository
# needs to be executed.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()  # inference only
print(f"Model loaded on {device}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Model loaded on cuda:0


In [15]:
# Tokenizer matching the model checkpoint, fetched with the same access token.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [16]:
# Generation settings forwarded to the text-generation pipeline.
# NOTE(review): temperature=0.0 is presumably meant to make decoding
# deterministic, but `do_sample` is not set here; confirm how the pinned
# transformers version treats this combination.
generation_settings = dict(
    return_full_text=True,
    temperature=0.0,
    max_new_tokens=512,
    repetition_penalty=1.1,
)

# Wrap the quantised model and its tokenizer in a text-generation pipeline.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

In [17]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

In [18]:
# Retrieval-augmented QA chain using the 'stuff' chain type and the chain's
# default prompt (no custom prompt is passed here).
rag_pipeline = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type='stuff',
    llm=llm,
)

In [19]:
# Calling the chain directly returns a dict with the 'query' and the 'result'.
rag_pipeline('how does multihead attention mechanism works?')

{'query': 'how does multihead attention mechanism works?',
 'result': ' Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this. The Transformer uses multi-head attention in three different ways: •In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. •The encoder contains self-attention layers. In a self-attention layer all of the keys, values, and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder. •Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder u

In [20]:
from langchain.prompts import PromptTemplate

In [21]:
# Custom QA prompt: the model is told to answer from the supplied context and
# to say it does not know rather than invent an answer.
# NOTE(review): "accuretly" is a misspelling of "accurately". It is left
# untouched here because the prompt text is runtime input to the model.
prompt_template = """Use the following pieces of context to answer the question at the end accuretly. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer :"""

# The template has two placeholders: {context} and {question}.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [22]:
# Extra keyword arguments for the chain: supply the custom prompt.
chain_type_kwargs = dict(prompt=PROMPT)

In [23]:
# Rebuild the QA chain, this time passing the custom prompt through
# chain_type_kwargs. This rebinds `rag_pipeline`.
rag_pipeline = RetrievalQA.from_chain_type(
    chain_type_kwargs=chain_type_kwargs,
    retriever=retriever,
    chain_type='stuff',
    llm=llm,
)

In [24]:
# `.run` returns only the answer string.
rag_pipeline.run('what is transformers?')

' Transformers are a type of neural network architecture that is used for sequence transduction tasks, such as machine translation, text summarization, and language modeling. They are called "transformers" because they use self-attention mechanisms to "transform" the input sequence into a continuous representation, rather than using traditional recurrent neural networks (RNNs) or convolutional neural networks (CNNs). Transformers were introduced in a paper by Vaswani et al. in 2017 and have since become a popular choice for many sequence transduction tasks.'

In [25]:
# Question unrelated to the attention/Transformer content seen in the retrieved
# chunks; checks the "say you don't know" instruction in the custom prompt.
rag_pipeline.run('how is openheimer movie?')

" I don't know."

In [26]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


In [32]:
# NOTE(review): BasePromptTemplate is not referenced in the cells visible
# here; confirm whether this import is still needed.
from langchain.schema.prompt_template import BasePromptTemplate

# Conversation memory stored under the "chat_history" key, with
# return_messages enabled.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Conversational retrieval chain built from the same LLM and retriever, with
# the memory attached so follow-up questions can use earlier turns.
chat = ConversationalRetrievalChain.from_llm(
    llm,
    memory=memory,
    retriever=retriever,
)

In [33]:
# First conversational turn; the cell displays the chain's 'answer' field.
first_turn = {"question": "What is multihead attension mechanism ?"}
result = chat(first_turn)
result['answer']

' Multi-head attention is a variation of the standard attention mechanism that allows the model to jointly attend to information from different representation subspaces at different positions. It is achieved by concatenating the outputs of multiple attention heads and then projecting them back to the original space. The number of attention heads used in the Transformer is 8, and each head has a size of 64. This allows the model to capture longer-range dependencies and better handle input sequences of varying lengths.'

In [34]:
# Follow-up turn: "it" has no referent in the question itself, so this
# exercises the chat history held in memory.
follow_up_turn = {"question": "How is self attention different from it?"}
result = chat(follow_up_turn)
result['answer']

'  Self-attention and multi-head attention are both used in the Transformer architecture, but they serve different purposes. Self-attention allows each position in the encoder or decoder to attend to all positions in the previous layer, while multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. Multi-head attention is used in the Transformer to reduce the computational cost of attention by averaging the attention weights across multiple heads, which are then concatenated and projected. The number of heads used in the Transformer is 8, and each head has a size of 64.'