In [1]:
#Libraries

# !pip install pandoc langchain gradio chromadb tiktoken clean-text
# !pip install "unstructured[local-inference]"
# !pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
# !pip install layoutparser pypdf unidecode

import os
import glob
import pandoc
from io import StringIO
import gradio as gr
import re
import time

from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import UnstructuredEPubLoader
from langchain.document_loaders import PyPDFLoader


from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain import LlamaCpp

from html.parser import HTMLParser
import chromadb    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
extensions = ['txt', 'md', 'pdf', 'doc', 'docx']


In [18]:
model_path = "../models/wizard-vicuna-13B.ggmlv3.q4_1.bin"

local_llm = LlamaCpp(
    model_path=model_path,
    temperature=0,
    max_tokens=512,
    n_ctx=2048
    )

llama.cpp: loading model from ../models/wizard-vicuna-13B.ggmlv3.q4_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 3 (mostly Q4_1)
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.09 MB
llama_model_load_internal: mem required  = 9807.48 MB (+ 1608.00 MB per state)
.
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
llama_init_from_file: kv self size  = 1600.00 MB


In [4]:
from llama_cpp import Embedding
tokenizer = Embedding(model=model_path)

In [5]:
from langchain.prompts import PromptTemplate
QUESTION_PROMPT_TEMPLATE = """### Instruction:
Use the following portion of a long document to see if any of the text is relevant to answer the question.

Question: {question}

### Input:
{context}

### Response:"""

QUESTION_PROMPT = PromptTemplate(template=QUESTION_PROMPT_TEMPLATE, input_variables=["question","context"])

In [6]:
COMBINE_PROMPT_TEMPLATE = """### Instruction:
Given the following extracted parts of a long document and a question, create a final answer. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

Question:{question}

### Input:
{summaries}

### Response:"""
COMBINE_PROMPT = PromptTemplate(template=COMBINE_PROMPT_TEMPLATE, input_variables=["question","summaries"])

In [7]:
all_docs = []

def examinelibrary():
    # Define the extensions we're looking for
    # extensions = ['pdf']
    all_docs.clear()
    # Initialize a counter
    counter = 0
    # Scan the directory
    for extension in extensions:
        counter += len(glob.glob("../books/*." + extension))
            
    # Initialize library as dict and load documents into it
    library = {}
    for extension in extensions:
        for file_name in glob.glob("../books/*." + extension):
            content = ""
            if extension in ['txt', 'md']:
                loader = TextLoader(file_name)
            elif extension in ['doc', 'docx']:
                loader = UnstructuredWordDocumentLoader(file_name) 
            elif extension == 'pdf':
                loader = PyPDFLoader(file_name)
            elif extension == 'epub':  
                loader = UnstructuredEPubLoader(file_name) 
        # Load the contents of the file        
        documents = loader.load()
        # Add the file and its contents to the library
        library[file_name] = documents
        
    # Display the count of files detected   
    return(library, counter)

In [8]:
# Split the texts in the library into chunks
def processtext(library):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = []
    for file_name in library:
          texts += text_splitter.split_documents(library[file_name])
    return texts

In [13]:
# Produce a librarian (database of embedded texts)
def prodlibrarian():
    library,counter = examinelibrary()
    texts = processtext(library)
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")
    db = Chroma.from_documents(texts, embeddings)
    return (db, counter)


In [14]:
# Init on first run
db,counter = prodlibrarian()


# Call prodlibrarian and display progress
def scan(progress=gr.Progress()):
    progress(0.2, desc="Examining /books folder ...")
    time.sleep(1)
    progress(0.4, desc="Counting documents ...")
    time.sleep(1.5)
    progress(0.6, desc="Generating embeddings ...")
    time.sleep(1.5)
    global db
    db, counter = prodlibrarian()
    return "Librarian found and embedded " + str(counter) + " documents."

convert /Users/msloof/Projects/MachineLearning/BasedLibrarian/books/mediations-marcus-aurelius.doc -> /private/var/folders/j6/swqsdhv14kgdvmnvtstt6x1w0000gp/T/tmpojimvi3s/mediations-marcus-aurelius.docx using filter : MS Word 2007 XML
convert /Users/msloof/Projects/MachineLearning/BasedLibrarian/books/mediations-marcus-aurelius.doc -> /private/var/folders/j6/swqsdhv14kgdvmnvtstt6x1w0000gp/T/tmpbj6yzf2o/mediations-marcus-aurelius.docx using filter : MS Word 2007 XML


Created a chunk of size 1107, which is longer than the specified 1000
Created a chunk of size 1946, which is longer than the specified 1000
Created a chunk of size 1436, which is longer than the specified 1000
Created a chunk of size 1182, which is longer than the specified 1000
Created a chunk of size 1065, which is longer than the specified 1000
Created a chunk of size 1759, which is longer than the specified 1000
Created a chunk of size 1023, which is longer than the specified 1000
Created a chunk of size 1063, which is longer than the specified 1000
Created a chunk of size 1128, which is longer than the specified 1000
Created a chunk of size 1350, which is longer than the specified 1000
Created a chunk of size 1138, which is longer than the specified 1000
Created a chunk of size 2798, which is longer than the specified 1000
Created a chunk of size 1070, which is longer than the specified 1000
Created a chunk of size 1183, which is longer than the specified 1000
Created a chunk of s

In [15]:
def check_metadata_page(docs):
    for i in range(len(docs)):
        if 'page' not in docs[i].metadata.keys():
            docs[i].metadata['page'] = "Not applicable"
    return docs

In [22]:
# Initialize language model and qa chain
# llm = OpenAI(temperature=0, openai_api_base=OPENAI_API_BASE, model="gpt4-x-vicuna-13B-GPTQ")
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from time import perf_counter

# chain_type_kwargs = {"prompt": prompt}
chain = load_qa_with_sources_chain(local_llm, chain_type="map_reduce", question_prompt=QUESTION_PROMPT, combine_prompt=COMBINE_PROMPT,verbose=True)

# Main query code.
def ask(query, search_filter=None, progress=gr.Progress()):
    start_time = perf_counter()

    progress(0.1, desc="Scanning embedded documents for matches ...")
  
    #generate docs which are texts relevant to the query
    docs = db.similarity_search(query, k=2, filter=search_filter) 

    docs = check_metadata_page(docs)

    progress(0.2, desc="Assembling request...")
    progress(0.4, desc="Appending citations and metadata ...")
    #some muckwork to log sources
    x = 0
    citations = ""
    for x in range(len(docs)): 
        # citations += docs[x].metadata['source']  + "\n"
        citations += docs[x].metadata['source'] + " in page: " + str(docs[x].metadata['page'])  + "\n"
    
    progress(0.5, desc="Talking to LLM for answers ...")

    #calls llm with the query and relevant docs in hand and returns both the response and the sources    
    librarianoutput = chain({"input_documents":docs, "question": query})
    output = "Answer: \n" + librarianoutput["output_text"] + "\n\nI found this in: \n" + citations

    query_time = str(round(perf_counter() - start_time ,2)) + " seconds"
    return output

#Gradio UI
with gr.Blocks() as app:


    with gr.Row():
        gr.Markdown("# Welcome to your Natassistant!")
        scan_btn = gr.Button("Scan the library again.")
        
    query = gr.Textbox(label="What can I help you find?")
    output = gr.Textbox(label="Response:")
    ask_btn = gr.Button("Ask Librarian")
    # performance_box = gr.Textbox(label=f"Time to complete query:",)
    selected_docs = gr.CheckboxGroup(all_docs, label="Documents used as input for the model")

    ask_btn.click(fn=ask, inputs=query, outputs=output)
    scan_btn.click(fn=scan, outputs=output)
    # gr.Markdown("*...a private library is not an ego-boosting appendage but a research tool. The library should contain as much of what you do not know ... You will accumulate more knowledge and more books as you grow older, and the growing number of unread books on the shelves will look at you menacingly. Indeed, the more you know, the larger the rows of unread books. Let us call this collection of unread books an antilibrary.* \n - Nassim Nicholas Taleb, The Black Swan")
app.queue(concurrency_count=1).launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.






[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m### Instruction:
Use the following portion of a long document to see if any of the text is relevant to answer the question.

Question: intfloat/e5-base-v2

### Input:
50 100 150 200 250 300 350
 10  100  1000  10000  100000  1e+06 0 0.1 0.2 0.3 0.4 0.5 0.6Perplexity
Fraction of covered 5-grams
LM training data size in million tokens+.022/x2+.035/x2+.038/x2+.026/x2
target KN PP
ldcnews KN PP
webnews KN PP
target C5
+ldcnews C5
+webnews C5
+web C5
Figure 4:Perple xities with Kneser -NeySmoothing
(KN PP)andfraction ofcovered 5-grams (C5).
7.3Perplexityandn-GramCoverage
Astandard measure forlanguage model quality is
perple xity.Itismeasured ontestdataT=w|T|
1:
PP(T)=e−1
|T||T| 
i=1logp(wi|wi−1
i−n+1)
(7)
This istheinverse oftheaverage conditional prob-
ability ofanextword; lower perple xities arebet-
ter.Figure 4showsperple xities formodels with
Kneser 

Llama.generate: prefix-match hit
