# Imports

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.vectorstores import Chroma

import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline

from langchain.chains import RetrievalQA

In [9]:
loader = DirectoryLoader('./Source Docs/', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [11]:
texts[1]

Document(page_content="Page 1 of 24https://www.destinypedia.com/Lore:Ghost_Stories\nSign up Login\nNavigationGames\nOther media\nGameplay\nThe universe\nLore database\nCommunity InfoDestinypedia\nWiki help\nRelated sites\nBungie.netRecent changes Random pageSpecial pages What links hereDid you know?...that the Red Legion were feared across the galaxy as destroyers of worlds?...that the Dreadnaught was partially constructed out of a part of Akka's corpse?...that Zhalo Supercell was originally named after the hammer of the Finnish thunder god Ukko?...that while Mercury was shown to only be partially converted into a machine world during theDark Age in Season of Dawn, it was previously stated in Destiny that it had been converted in amatter of days?...that Ghost was originally voiced by Peter Dinklage, but his lines were re-recorded by Nolan Northand completely replaced with Destiny patch 2.0.0?Lore Discussion View source HistoryLore:Ghost StoriesFrom Destinypedia, the Destiny wiki\nDisco

In [12]:
# use huggingface instructor embeddings
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cpu"})


Downloading (…)7f436/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 190kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 270/270 [00:00<00:00, 173kB/s]
Downloading (…)/2_Dense/config.json: 100%|██████████| 116/116 [00:00<00:00, 181kB/s]
Downloading pytorch_model.bin: 100%|██████████| 3.15M/3.15M [00:01<00:00, 2.39MB/s]
Downloading (…)0daf57f436/README.md: 100%|██████████| 66.3k/66.3k [00:00<00:00, 27.6MB/s]
Downloading (…)af57f436/config.json: 100%|██████████| 1.52k/1.52k [00:00<00:00, 3.16MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 207kB/s]
Downloading pytorch_model.bin: 100%|██████████| 4.96G/4.96G [04:02<00:00, 20.5MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 20.7kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 780kB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 848kB/s]
Downloading (…)7f436/tokenizer.json: 100%|██████████| 2

load INSTRUCTOR_Transformer
max_seq_length  512


In [27]:
# Generate and store embeddings in Chroma
persist_directory = 'db'
vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=instructor_embeddings,
                                 persist_directory=persist_directory)

In [28]:
vectordb.persist()

In [32]:
retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("Who is Clovis Bray?")
docs

[Document(page_content='25/5/2023, 5:29 PMLore:The Man They Call Cayde - Destinypedia, the Destiny wiki\nPage 1 of 16https://www.destinypedia.com/Lore:The_Man_They_Call_Cayde\nSign up Login\nNavigationGames\nOther media\nGameplay\nThe universe\nLore database\nCommunity InfoDestinypedia\nWiki help\nRelated sites\nBungie.netRecent changes Random pageSpecial pages What links hereDid you know?...that the Ahamkara are shapeshifters?...that Oryx is responsible for the Taken invasion of the Dreaming City, and the taking of Riven?...that the Sword Heavy Weapon class was ﬁrst introduced in The Taken King?...that while Fallen Dregs can be seen piloting Pikes, no Cabal was ever seen piloting an Interceptorin Destiny until Destiny 2?...that the Hunter Ana Bray was thought to be dead until she reappeared on Mars to investigate herpast?Lore Discussion View source HistoryLore:The Man They Call CaydeFrom Destinypedia, the Destiny wiki\nDiscord\n chat', metadata={'source': 'Source Docs/Lore:The Man The

In [35]:
tokenizer = LlamaTokenizer.from_pretrained("TheBloke/wizardLM-7B-HF")

model = LlamaForCausalLM.from_pretrained("TheBloke/wizardLM-7B-HF",
                                              load_in_8bit=True,
                                              device_map='auto',
                                              torch_dtype=torch.float16,
                                              low_cpu_mem_usage=True
                                              )

Downloading (…)lve/main/config.json: 100%|██████████| 555/555 [00:00<00:00, 91.7kB/s]
Downloading (…)model.bin.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 7.39MB/s]
Downloading (…)l-00001-of-00002.bin: 100%|██████████| 9.98G/9.98G [08:01<00:00, 20.7MB/s]
Downloading (…)l-00002-of-00002.bin: 100%|██████████| 3.50G/3.50G [02:54<00:00, 20.0MB/s]
Downloading shards: 100%|██████████| 2/2 [10:58<00:00, 329.43s/it]


NameError: name 'init_empty_weights' is not defined

In [None]:
pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
qa_chain = RetrievalQA.from_chain_type(llm=local_llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [None]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "Who is Clovis?"
llm_response = qa_chain(query)
process_llm_response(llm_response)