In [None]:
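# First run only: uncomment the next line to install the notebook's dependencies
# (%pip installs into the environment of the running kernel).
# %pip install faiss-cpu langchain langchain_community sentence-transformers bitsandbytes langchain_huggingface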

In [2]:
import torch
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)


In [None]:
# Spreadsheet export that serves as the knowledge base; the path is relative
# to the notebook's working directory.
csv_path = "Animal disease spreadsheet - Sheet1.csv"

# Read the CSV (decoded as UTF-8) into LangChain documents.
loader = CSVLoader(
    file_path=csv_path,
    encoding="utf-8",
)
documents = loader.load()

In [None]:
# Cut the loaded documents into chunks of at most 1000 characters; neighbouring
# chunks share 200 characters so context is not lost at the chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

split_docs = text_splitter.split_documents(documents)

In [None]:
# Sentence-embedding model used both to index the chunks and to embed queries.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Embed every chunk and build the FAISS index in memory.
vector_store = FAISS.from_documents(
    split_docs,
    embedding_model
)

# Persist the index next to the notebook. A relative folder is used so the
# notebook runs on any machine and so the save location matches the folder the
# reload line below reads from.
# NOTE: `vector_store` must stay bound to the FAISS object -- rebinding it to a
# file name (as an earlier revision did) breaks both `save_local` here and
# `as_retriever` later in the notebook.
faiss_index_dir = "FAISS"
vector_store.save_local(faiss_index_dir)

# After the index has been created once, the embedding step can be skipped by
# commenting out the `from_documents` / `save_local` lines above and
# uncommenting the line below.
# NOTE(security): loading requires allow_dangerous_deserialization=True, so
# only load an index folder that you created yourself.
# vector_store = FAISS.load_local(folder_path=faiss_index_dir, embeddings=embedding_model, allow_dangerous_deserialization=True)


In [None]:
# for cuda model
model_name = "tiiuae/Falcon3-7B-Instruct"

# Tokenizer matching the checkpoint above; reused later for generation and
# for counting answer tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit configuration: NF4 quantization with double quantization, computing
# in bfloat16.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 8-bit configuration
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

### Modified Model Loading with Quantization Options
def load_quantized_model(model_name, use_quantization='8bit'):
    """Load a causal language model, optionally quantized with bitsandbytes.

    Args:
        model_name: Checkpoint identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        use_quantization: ``'4bit'`` selects ``bnb_config_4bit``, ``'8bit'``
            selects ``bnb_config_8bit``; any other value loads the model
            without a quantization config.

    Returns:
        The model object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    selected_config = (
        bnb_config_4bit if use_quantization == '4bit'
        else bnb_config_8bit if use_quantization == '8bit'
        else None
    )

    load_kwargs = {
        "device_map": "auto",
        "quantization_config": selected_config,
        # Without a quantization config the dtype is left to "auto"; with one,
        # no explicit dtype is requested.
        "torch_dtype": None if selected_config else "auto",
    }
    return AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# 'none' matches neither '4bit' nor '8bit', so the model is loaded without
# a quantization config.
model = load_quantized_model(model_name, use_quantization='none')


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Generation settings. Optional sampling knobs that are currently left at
# their defaults: temperature=0.7, top_p=0.9, repetition_penalty=1.1.
generation_kwargs = {
    "max_new_tokens": 500,
    "do_sample": True,
    # Return only the newly generated text, not the prompt.
    "return_full_text": False,
}

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)

Device set to use cpu


In [None]:
# Prompt for the RAG chain. The template exposes two placeholders, {context}
# and {question}, and instructs the model to answer only veterinary /
# animal-health questions and to reply with a fixed "I don't know" sentence
# otherwise. The text inside the triple quotes (including its trailing spaces
# and the "\n" escapes on the last line) is sent to the model as written.
rag_prompt = PromptTemplate.from_template(
    """Answer the question only if it is explicitly about veterinary diseases or animal health. Follow these steps:  
1. Check Scope:
   - If the question is unrelated to veterinary topics, say: "I don't know. My expertise is limited to veterinary diseases and animal health."  
   - Do not use the context for non-veterinary questions.
2. Veterinary Answers:  
   - If veterinary-related, answer directly and concisely using the provided context.  
   - Do not add self-generated questions, hypothetical scenarios, or unrelated topics.
   - Only supplement with veterinary knowledge if the context is insufficient. \nContext: {context} \nQuestion: {question} \nAnswer:"""
)


In [None]:
from langchain_huggingface import HuggingFacePipeline

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Retriever over the FAISS store, requesting 2 chunks per query.
top2_retriever = vector_store.as_retriever(search_kwargs={"k": 2})

# "stuff" chain: the retrieved chunks are supplied to `rag_prompt`, and the
# source documents are returned alongside the answer.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=top2_retriever,
    chain_type_kwargs={"prompt": rag_prompt},
    return_source_documents=True,
)

In [None]:
import time

def ask(query):
    """Run one question through the RAG chain and print the answer with metrics.

    Prints the question, the generated answer, the elapsed time, the number of
    tokens in the answer, the resulting tokens/second, and a 150-character
    preview of up to two retrieved source documents.

    The timing covers the whole ``rag_chain.invoke`` call, so the reported
    tokens/second is an end-to-end figure for the chain, not the speed of the
    generation step alone.

    Args:
        query: The question text, passed to ``rag_chain`` under the "query" key.

    Returns:
        None. All results are printed.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    result = rag_chain.invoke({"query": query})
    time_taken = time.perf_counter() - start_time

    generated_text = result["result"]

    # Re-tokenize the answer to count its tokens. Special tokens are excluded
    # so that only the answer text is counted, and plain Python lists are
    # enough for a length -- no tensors need to be built.
    token_ids = tokenizer(generated_text, add_special_tokens=False)["input_ids"]
    tokens = len(token_ids)

    # Guard the division in case the clock reports no elapsed time.
    tokens_per_second = tokens / time_taken if time_taken > 0 else 0.0

    print("Question:", query)
    print("\nAnswer:", generated_text)
    print("\nMetrics:")
    print(f"- Time taken: {time_taken:.2f} seconds")
    print(f"- Tokens generated: {tokens}")
    print(f"- Tokens/second: {tokens_per_second:.2f}")

    print("\nSources:")
    for doc in result["source_documents"][:2]:  # Show top 2 sources
        print(f"- {doc.page_content[:150]}...")
    print("\n" + "="*50 + "\n")


In [None]:
# Example questions run through the chain, one after the other.
for sample_query in (
    "Blue Tongue and its preventions",
    "Give info about anthrax",
):
    ask(sample_query)