In [1]:
from langchain.document_loaders import DirectoryLoader
# retrieve all files from the Datapath and convert that into langchain.schema.Document

# split the documents into smaller, relevant chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# as each document will be too long for the context window of the LLM, we will split that into k=3

from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings # to create the embedding of the database and 
from langchain.vectorstores.chroma import Chroma # for storing the vectors 
from langchain.chat_models import ChatOpenAI # LLM 

# We won't be using OpenAI LLM, Falcon7B instead 
from langchain.prompts import ChatPromptTemplate # After the RAG, we will send the retrieved and query embed to the LLM
from langchain.evaluation import load_evaluator # for evaluating the model 

from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings  # Use this instead of openAI 
from langchain.chains import RetrievalQA

# system 
import os
import sys 
import shutil # for removing the dir 

# for parsing the query to the retrieved docs 
import argparse

# 
from dataclasses import dataclass # simplifies the creation of classes to store the data 

# to load the keys 
from dotenv import load_dotenv

# transformers stuff 
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import AutoConfig # to load the model configurations 

# initialize empty weights for a model 
from accelerate import init_empty_weights   # for distributed training for using multiple GPUs
from accelerate import infer_auto_device_map # to infer the mapping of devices for distrubuted training. # for multiple GPUs 

import transformers
import torch
from torch import cuda, bfloat16
load_dotenv()

True

In [3]:
# Hugging Face Hub id of the checkpoint whose configuration is loaded below.
model = "tiiuae/falcon-7b"

# Load only the configuration for this checkpoint.
config = AutoConfig.from_pretrained(model) # fetches the config and returns a FalconConfig object (see the cell output)

In [4]:
# Display the loaded configuration as the cell output.
config

FalconConfig {
  "_name_or_path": "tiiuae/falcon-7b",
  "alibi": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-7b--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-7b--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-7b--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-7b--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-7b--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "hidden_dropout": 0.0,
  "hidden_size": 4544,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "falcon",
  "multi

In [7]:
# Build the model architecture from `config` inside init_empty_weights(), so its
# structure can be inspected without loading the real checkpoint weights.
# NOTE: `model` is rebound here from the checkpoint-name string to a model object.
with init_empty_weights(): 
    # LLM skeleton built from the configuration only
    model = AutoModelForCausalLM.from_config(config=config) # model creation - empty 

# The empty model is then analysed to decide where each layer/parameter should live.

model.tie_weights()  # weight tying between the input and the output embedding 

# Let accelerate assign each module to a device (GPU index, 'cpu' or 'disk'; see the output).
device_map = infer_auto_device_map(model)

device_map # displays the module-name -> device mapping 

OrderedDict([('transformer.word_embeddings', 0),
             ('lm_head', 0),
             ('transformer.h.0.self_attention.rotary_emb', 0),
             ('transformer.h.0.self_attention.query_key_value', 'cpu'),
             ('transformer.h.0.self_attention.dense', 'cpu'),
             ('transformer.h.0.self_attention.attention_dropout', 'cpu'),
             ('transformer.h.0.mlp', 'cpu'),
             ('transformer.h.0.input_layernorm', 'cpu'),
             ('transformer.h.1', 'cpu'),
             ('transformer.h.2', 'cpu'),
             ('transformer.h.3', 'cpu'),
             ('transformer.h.4', 'cpu'),
             ('transformer.h.5', 'cpu'),
             ('transformer.h.6', 'cpu'),
             ('transformer.h.7.self_attention', 'cpu'),
             ('transformer.h.7.input_layernorm', 'disk'),
             ('transformer.h.8', 'disk'),
             ('transformer.h.9', 'disk'),
             ('transformer.h.10', 'disk'),
             ('transformer.h.11', 'disk'),
             ('tran

In [8]:
# Automatically determine the device map from the empty model: fill the GPU(s)
# first, then CPU RAM, then disk.
# `no_split_module_classes` must name the decoder-layer class of the model that
# is actually loaded. The model here is Falcon, so "OPTDecoderLayer" (copied
# from an OPT example) matched nothing and a single transformer layer could
# still be split across devices.
device_map = infer_auto_device_map(
    model=model,
    no_split_module_classes=["FalconDecoderLayer"],
)

In [4]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

In [2]:
# Checkpoint to load; the larger 40B variant is kept here as a commented-out alternative.
#model = "tiiuae/falcon-40b"

model = "tiiuae/falcon-7b"

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model)


# Text-generation pipeline; the model is loaded from the checkpoint name above.
falcon_pipeline = transformers.pipeline(
    "text-generation", # task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,  # bfloat16: 16-bit float format developed by Google Brain
    # NOTE(review): offload_folder is normally used together with a device_map —
    # TODO confirm it has the intended effect when passed here without one.
    offload_folder="offload",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


: 

In [None]:
# Generate one sampled continuation of the prompt with the Falcon pipeline.
# Fix: the pipeline object built in this notebook is named `falcon_pipeline`;
# the bare name `pipeline` is never defined and raised a NameError.
sequences = falcon_pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,  # upper bound on total length (prompt + generated tokens)
    do_sample=True,  # sample instead of greedy decoding
    top_k=10,  # sample only among the 10 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
# Each returned item is a dict holding the generated text.
for seq in sequences:
    print(f"Result: {seq['generated_text']}")



In [None]:
# Directory where the Chroma vector store is persisted (and from which it is reloaded).
CHROMA_PATH = "chroma"
# Directory scanned for the source documents to index.
DATA_PATH = "./Dummy_Medical_report/Dummy medicine/"   # path of the source documents

In [None]:
def load_documents():
    """Load every ``*.md`` file found directly under ``DATA_PATH``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

In [None]:
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a short summary.

    Chunks are at most 300 characters long (measured with ``len``), overlap by
    100 characters, and record their start index in the chunk metadata.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunk documents produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)

    # Report how many chunks the original documents produced.
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Preview one chunk as a sanity check. Indexing chunks[10] unconditionally
    # raised IndexError whenever fewer than 11 chunks were produced, so fall
    # back to the last available chunk and skip the preview when there is none.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

Chroma is a vector database: documents are stored and looked up by their embedding vectors.

In [None]:
# Evaluator that scores two strings by the distance between their embeddings.
evaluator = load_evaluator("pairwise_embedding_distance")

# Fix: the method is `evaluate_string_pairs` (the spelling compare_embeddings
# uses); `evaluate_strings_pairs` does not exist and raised AttributeError.
x = evaluator.evaluate_string_pairs(prediction="apple", prediction_b="orange")

Query for relevant data 

In [None]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is deleted first, so the store
    always reflects exactly the chunks passed in.

    Args:
        chunks: The chunk documents to embed and index.
    """
    # Start from a clean slate: drop a previously persisted store, if any.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks with OpenAI embeddings and index them in Chroma.
    # TODO (original author's note): storage should move to the cloud / MongoDB.
    vector_store = Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(),
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()

    # Confirm how many chunks were written and where.
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")

In [None]:
def generate_data_store():
    """Run the indexing pipeline: load the documents, split them, store the chunks."""
    save_to_chroma(split_text(load_documents()))

In [None]:
# Build (or rebuild) the vector store from the documents on disk.
generate_data_store()

In [None]:
# Prompt template used to optimize the answer produced from the RAG context.
# `{context}` and `{question}` are the two placeholders filled in by main().
# NOTE(review): the constant name is misspelled ("PROMTP"); it is kept because
# main() refers to it under this name.

PROMTP_TEMPLATE = """

Answer the question based only on the following context:

{context}

---

Answer the question based on the above context:

{question}
"""

Answer the question based only on the following context in a medical point of view:




Context:
{

 "name": "augmentin 625 duo tablet",

 "substitute0": "Penciclav 500 mg/125 mg Tablet",

 "substitute1": "Moxikind-CV 625 Tablet",

 "substitute2": "Moxiforce-CV 625 Tablet",

 "substitute3": "Fightox 625 Tablet",

 "substitute4": "Novamox CV 625mg Tablet",

 "sideEffect0": "Vomiting",

 "sideEffect1": "Nausea",

 "sideEffect2": "Diarrhea",

 "sideEffect3": "Treatment of Bacterial infections",

 "Habit Forming": "ANTI INFECTIVES",

}



---



Answer the question based on the above context:



"
can you give me some details about augmentin 625 duo tablet?
""

In [None]:
def main(argv=None):
    """Answer a query using the persisted Chroma store and a chat model.

    Parses the query text, retrieves the three most relevant chunks, fills the
    prompt template with them and prints the model's response plus its sources.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in which
            case argparse reads ``sys.argv[1:]`` exactly as before. Passing a
            list, e.g. ``main(["my question"])``, allows calling this from a
            notebook cell, where ``sys.argv`` holds the kernel's own arguments.

    Returns:
        None. Results are printed; returns early when nothing relevant is found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args(argv)

    # Fix: the argument is registered as `query_text`; reading
    # `args.query_texts` raised AttributeError.
    query_text = args.query_text

    # The embedding function must match the one used to build the database
    # (save_to_chroma uses OpenAIEmbeddings).
    embedding_function = OpenAIEmbeddings()

    # NOTE: an unused HuggingFaceEmbeddings(model_name=None, ...) instance was
    # removed here; it was never passed to Chroma and named no model.
    # TODO: if the store is rebuilt with HuggingFace embeddings, switch both
    # save_to_chroma and this function together.
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_function,
    )

    # Top-3 matches as a list of (Document, relevance score) tuples.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)

    # Give up when nothing was found or the first match scores below 0.7.
    if len(results) == 0 or results[0][1] < 0.7:
        print("unable to find matching results")
        return

    # Merge the retrieved chunks into a single context string.
    context_text = "\n\n--\n\n".join(doc.page_content for doc, _score in results)

    # Fill the prompt template with the context and the question.
    prompt_template = ChatPromptTemplate.from_template(PROMTP_TEMPLATE)
    prompt = prompt_template.format(
        context=context_text,
        question=query_text,
    )

    print(prompt)

    # TODO: the notebook intends to use a local model; this still calls ChatOpenAI.
    llm = ChatOpenAI()
    response_text = llm.predict(prompt)

    # Report which source each retrieved chunk came from.
    sources = [doc.metadata.get("source", None) for doc, _score in results]

    formatted_response = f"Response: {response_text}\n Sources: {sources}"
    print(formatted_response)


In [None]:
def compare_embeddings(text1, text2):
    """Print the embedding of ``text1`` and the embedding distance between both texts.

    Args:
        text1: First text; its embedding vector is printed.
        text2: Second text, compared against ``text1``.
    """
    # Show the raw embedding vector of the first text.
    text1_vector = OpenAIEmbeddings().embed_query(text1)
    print(f"Vector for {text1} : {text1_vector}")

    # Score the pair with the pairwise embedding-distance evaluator.
    distance_evaluator = load_evaluator("pairwise_embedding_distance")
    result = distance_evaluator.evaluate_string_pairs(
        prediction=text1,
        prediction_b=text2,
    )
    print(f"Comparing ({text1}, {text2}): {result}")

