In [1]:
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores.chroma import Chroma
from langchain.llms import OpenAI
from langchain import PromptTemplate
import torch
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain
import numpy as np
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
import time 
import re
from langchain.retrievers import BM25Retriever, EnsembleRetriever

## Setting things up

In [None]:
#!pip install llama-cpp-python

In [18]:
#set up model path
path = "/Users/Kenneth/PycharmProjects/pubMedNLP/kedronlp/data/06_models/llama-2-7b-chat.Q4_K_M.gguf"

In [8]:
#making chromadb use the right embedding function 
class PubMedBert:
    def __init__(self, device):
        self.device = device
        self.model = SentenceTransformer(
            "pritamdeka/S-PubMedBert-MS-MARCO", device=self.device
        )
        self.model.max_seq_length = 512

    def encode(self, doc_batch):
        batch_size = len(doc_batch)
        embeddings = self.model.encode(
            doc_batch, device=self.device, batch_size=batch_size
        )
        return np.stack(embeddings, axis=0).tolist()


class PubMedEmbeddingFunction(chromadb.EmbeddingFunction):
    def __init__(self, model):
        self.model = model
        
    def embed_query(self, input):
        return self.model.encode(input)

    def __call__(self, input):
        return self.model.encode(input)


In [6]:
#custom langchain function to get a vector store object
def get_langchain_chroma(device, persist_dir="../chroma_store"):
    model = PubMedBert(device=device)
    embed_fn = PubMedEmbeddingFunction(model=model)
    client = chromadb.PersistentClient(path=persist_dir)
    langchain_chroma = Chroma(
        client=client,
        collection_name="pubmed_embeddings",
        embedding_function=embed_fn,
        collection_metadata={"hnsw:space": "cosine"},
    )
    return langchain_chroma

In [9]:
#set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
vectordb = get_langchain_chroma(device=device)
#check if chroma got documents
print(vectordb._collection.count())

cpu
187838


In [16]:
#preparing the prompt structure that is passed to the LLM
template = """Answer the question as short as possible and only based on the following context:
{context}

Question: {question}
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [16]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [19]:
#instantiate the llm 
llm = LlamaCpp(
    model_path=path,
    temperature=0,
    max_tokens=1000,
    n_ctx=2048,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/Kenneth/PycharmProjects/pubMedNLP/kedronlp/data/06_models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.at

llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_build_graph: non-view tensors processed: 676/676
llama_new_context_with_model: compute buffer total size = 5.75 MiB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


## Test 

In [65]:
start_time = time.time()
question = "What is Dysarthria?"
docs = vectordb.similarity_search(question,k=2)
context = [doc.page_content for doc in docs] #alternative test: context = "Dysarthria is a motor speech disorder which can be classified according to the underlying neuropathology and is associated with disturbances of respiration, laryngeal function, airflow direction, and articulation resulting in difficulties of speech quality and intelligibility. There are six major types of dysarthria: flaccid dysarthria associated with lower motor neuron impairment, spastic dysarthria associated with damaged upper motor neurons linked to the motor areas of the cerebral cortex, ataxic dysarthria primarily caused by cerebellar dysfunction, and hyperkinetic dysarthria and hypokinetic dysarthria, which are related to a disorder of the extrapyramidal system. The sixth is generally termed a mixed dysarthria and is associated with damage in more than one area, resulting in speech characteristics of at least two groups."
# Extract the abstract from each string
pattern = re.compile(r"Abstract: (.+?)(?=\n)")
abstracts = [re.search(pattern, string).group(1) for string in context]
input_context = ''.join(abstracts)
input_dict =  {"context": input_context, "question": question}
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [66]:
response = llm_chain.run(input_dict)
end_time = time.time()
execution_time = end_time - start_time

Llama.generate: prefix-match hit


Answer: Dysarthria is an acquired speech disorder caused by neurological injury that affects muscle control of speech production, resulting in weak, imprecise, slow, or uncoordinated movements of the articulatory and respiratory muscles.


llama_print_timings:        load time =   10542.72 ms
llama_print_timings:      sample time =       5.53 ms /    58 runs   (    0.10 ms per token, 10495.84 tokens per second)
llama_print_timings: prompt eval time =   71391.00 ms /   111 tokens (  643.16 ms per token,     1.55 tokens per second)
llama_print_timings:        eval time =   37238.13 ms /    57 runs   (  653.30 ms per token,     1.53 tokens per second)
llama_print_timings:       total time =  108809.94 ms


In [67]:
print(f"Execution time: {execution_time} seconds")

Execution time: 109.0794267654419 seconds


Note: this test was with 2 full abstracts. In general: The longer the input, the longer the response time. 

# Ensemble retriever test

In [3]:
%pip install --upgrade --quiet  rank_bm25

Note: you may need to restart the kernel to use updated packages.


In [15]:
retriever = BM25Retriever.from_documents(vectordb.get())

In [14]:
print(vectordb.get())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
#Access and print the first entry
first_entry_key, first_entry_value = next(iter(hi.items()))
print(f"First Entry: {first_entry_key}: {first_entry_value}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
vectordb.get(where={"ids": 1})

[1m{[0m[32m'ids'[0m: [1m[[0m[1m][0m, [32m'embeddings'[0m: [3;35mNone[0m, [32m'metadatas'[0m: [1m[[0m[1m][0m, [32m'documents'[0m: [1m[[0m[1m][0m, [32m'uris'[0m: [3;35mNone[0m, [32m'data'[0m: [3;35mNone[0m[1m}[0m

In [34]:
document_list = []
doc_list = vectordb.get().get("documents", [])

    
    

In [38]:
from langchain.schema import Document
lang_docs = [Document(page_content=doc) for doc in vectordb.get().get("documents", [])]

In [48]:
bm25_retriever = BM25Retriever.from_documents(lang_docs)
bm25_retriever.k = 2
similarity_retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [49]:
bm25_retriever.get_relevant_documents("curing for dysarthria")


[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'Title: Shorter Sentence Length Maximizes Intelligibility and Speech Motor Performance in Persons With Dysarthria Due to Amyotrophic Lateral Sclerosis.\nAuthors: Kristen M Allison, Yana Yunusova, Jordan R Green\nAffiliations: NA\nQualifier: NA\nMajor Qualifier: NA\nDescriptor: Aged, Amyotrophic Lateral Sclerosis, Communication, Dysarthria, Female, Humans, Male, Middle Aged, Severity of Illness Index, Speech, Speech Acoustics, Speech Intelligibility, Speech Production Measurement, Speech Therapy, Time Factors\nMajor Descriptor: Speech Intelligibility\nAbstract: Purpose The purpose of this study was to investigate the effect of sentence length on intelligibility and measures of speech motor performance in persons with amyotrophic lateral sclerosis [0m[32m([0m[32mALS[0m[32m)[0m[32m and to determine how these effects were influenced by dysarthria severity levels. Method One hundred thirty-one persons wit

In [44]:
similarity_retriever.get_relevant_documents("curing for dysarthria")


[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m"Title[0m[32m: ReaDySpeech for people with dysarthria after stroke: protocol for a feasibility randomised controlled trial.\nAuthors: Claire Mitchell, Audrey Bowen, Sarah Tyson, Paul Conroy\nAffiliations: NA\nQualifier: NA\nMajor Qualifier: NA\nDescriptor: NA\nMajor Descriptor: NA\nAbstract: Dysarthria, a disordered speech production resulting from neuro-muscular impairment, is a common symptom after stroke. It causes significant problems for patients' speech intelligibility, communication, psychological well-being, social engagement and stroke recovery. Rehabilitation for dysarthria is variable in quality, intensity and duration, which may be, in part, due to the lack of good quality evidence. An online therapy programme, ReaDySpeech, has the potential to improve quality, intensity and duration of speech rehabilitation and was considered in a proof-of-concept study to be acceptable to speech and language 

In [46]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, similarity_retriever], weights=[0.5, 0.5]
)

In [47]:
ensemble_retriever.get_relevant_documents("curing for dysarthria")


[1m[[0m
    [1;35mDocument[0m[1m([0m
        [33mpage_content[0m=[32m'Title: Shorter Sentence Length Maximizes Intelligibility and Speech Motor Performance in Persons With Dysarthria Due to Amyotrophic Lateral Sclerosis.\nAuthors: Kristen M Allison, Yana Yunusova, Jordan R Green\nAffiliations: NA\nQualifier: NA\nMajor Qualifier: NA\nDescriptor: Aged, Amyotrophic Lateral Sclerosis, Communication, Dysarthria, Female, Humans, Male, Middle Aged, Severity of Illness Index, Speech, Speech Acoustics, Speech Intelligibility, Speech Production Measurement, Speech Therapy, Time Factors\nMajor Descriptor: Speech Intelligibility\nAbstract: Purpose The purpose of this study was to investigate the effect of sentence length on intelligibility and measures of speech motor performance in persons with amyotrophic lateral sclerosis [0m[32m([0m[32mALS[0m[32m)[0m[32m and to determine how these effects were influenced by dysarthria severity levels. Method One hundred thirty-one persons wit

# Test self query retriever