In [None]:
import os

In [None]:
!pip install -U langchain-community
!pip install langchain-openai
!pip install langchain-chroma
!pip install langchain-huggingface

In [None]:
from pathlib import Path
from langchain.document_loaders import DirectoryLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Persistent directory for the RAG database; passed to Chroma as
# `persist_directory` when the vector store is reset and (re)created.
db_name = "guru_db"

### Local Secrets

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (override=True is passed to dotenv).
load_dotenv(override=True)

# Make sure both keys are present in the process environment; a key that is
# still missing after loading gets the placeholder value.
for _secret_name in ('OPENAI_API_KEY', 'HF_TOKEN'):
    os.environ.setdefault(_secret_name, 'your-key-if-not-using-env')

### Colab Secrets

In [None]:
# Copy the API keys from Colab's secret store into the environment.
# The import is guarded so that running every cell outside Google Colab
# (where `google.colab` does not exist) skips this step instead of crashing.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running inside Google Colab

if userdata is not None:
    for _secret_name in ('OPENAI_API_KEY', 'HF_TOKEN'):
        os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
def add_metadata(doc, doc_type):
    """Record `doc_type` under the "doc_type" key of `doc.metadata`.

    The document is modified in place and the same object is returned, so the
    call can be used inside a comprehension.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

In [None]:
data_dir = Path("guru_linkedin")
documents = []
# Sort the matches: glob order is not guaranteed, and a stable order keeps the
# documents (and everything derived from them) reproducible between runs.
for file in sorted(data_dir.glob("*.csv")):
    loader = CSVLoader(file)
    docs = loader.load()
    # Tag every loaded document with the CSV's base name as its doc_type.
    documents.extend(add_metadata(doc, file.stem) for doc in docs)

# Fail here with a clear message rather than later, when an empty document
# list reaches the splitter / vector store.
if not documents:
    raise FileNotFoundError(
        f"No documents were loaded from '{data_dir}': "
        "check that the folder exists and contains *.csv files."
    )


In [None]:
# Number of documents loaded from the CSV files (displayed as the cell output).
len(documents)

In [None]:
# Character-based splitter configured with chunk_size=1000 and chunk_overlap=200,
# applied to every loaded document.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Report how many chunks were produced and which doc_type tags are present.
print(f"Total number of chunks: {len(chunks)}")
doc_types_found = {doc.metadata['doc_type'] for doc in documents}
print(f"Document types found: {doc_types_found}")

## Retrieval-Augmented Generation (RAG)

### Embedding
Supports either open-source (*sentence-transformers/all-MiniLM-L6-v2*) or OpenAI embeddings.

In [None]:
# Embedding backend switch: True uses the local HuggingFace
# sentence-transformers model, False uses OpenAI embeddings.
LOCAL_EMBEDDINGS = False

In [None]:
# Build the embedding function selected by LOCAL_EMBEDDINGS.
if LOCAL_EMBEDDINGS:
    print("Using Local Embedding")
    # Imported here because torch is only needed on the local-embedding path.
    import torch

    # Use the GPU when one is available and fall back to the CPU otherwise,
    # instead of failing on a hard-coded 'cuda' device.
    embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': embedding_device},
    )
else:
    print("Using OpenAI Embedding")
    embeddings = OpenAIEmbeddings()

In [None]:
# If a persisted database from a previous run exists, open it and delete its
# collection so the vector store is rebuilt from scratch.
if os.path.exists(db_name):
    _stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    _stale_store.delete_collection()

In [None]:
# Embed the chunks and store them in a Chroma vector store persisted under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Inspect the stored vectors: how many there are and how many dimensions each has.
collection = vectorstore._collection
count = collection.count()

# A single record is enough to read off the embedding size.
_first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = _first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
import numpy as np

# Fetch every record from the collection together with its embedding,
# document and metadata.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: `documents` is rebound here - from this point on it holds the
# collection's 'documents' field, not the list built while loading the CSVs.
documents, metadatas = result['documents'], result['metadatas']

In [None]:
# Quick view of the data: each record's doc_type followed by its stored
# document, with an empty line between records.
for idx, text in enumerate(documents):
    print(f"{metadatas[idx]['doc_type']} {text}")
    print("")

In [None]:
name = "Guru Deep Singh"
# Persona prompt, assembled from adjacent string fragments; each fragment
# carries its own trailing space so the result is one single-line string.
system_prompt = (
    f"You are acting as {name}. You are answering questions on {name}'s website, "
    f"particularly questions related to {name}'s career, background, skills and experience. "
    f"Your responsibility is to represent {name} for interactions on the website as faithfully as possible. "
    f"You are given a summary of {name}'s background and LinkedIn profile which you can use to answer questions. "
    "Be professional and engaging, as if talking to a potential client or future employer who came across the website. "
    "You are allowed to provide all information given to you as Context including URLs, email,etc."
)


In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts import PromptTemplate
# NOTE: no ConversationBufferMemory is created in this cell. The `memory`
# object handed to the chain is built in the retriever/memory setup cell
# below, so a second instance here would be discarded before it was ever used.

# QA prompt: the persona prompt, then the grounding rules with the retrieved
# {context}, then the user's {question}.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    (
        "system",
        # Only the first fragment is an f-string, so the persona comes from
        # `name` while {context} in the later fragments stays a template
        # variable to be filled in when the prompt is formatted.
        f"ALWAYS answer as {name}. Never say you are a Large Language Model. "
        "Use ONLY the following context to answer. "
        "If the answer isn't in the context, say \"I don't know.\"\n\n{context}",
    ),
    # Chat history is not injected into this prompt (placeholder left disabled):
    # MessagesPlaceholder("chat_history"),
    ("human", "{question}"),
])

# Condense-question prompt: takes the chat history as messages plus the
# latest {question} and asks for a standalone rewrite of that question.
condense_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "Rewrite the user's last question so it stands alone. Do NOT answer."),
    MessagesPlaceholder("chat_history"),
    ("human", "{question}"),
])

In [None]:
# The callback allows us to see the data fed by retriever
# This is useful to tune the number retrieved data from the DB
from langchain_core.callbacks import StdOutCallbackHandler

# Conversation memory for the chat: kept under the 'chat_history' key and
# returned as message objects.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

# The retriever is an abstraction over the vector store used during RAG.
# k=20 may be too few or too many - tune it for your own data.
retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

## Open Source AND Frontier LLM

In [None]:
# True -> build the chain on the local HuggingFace chat model (`chat_llm`);
# False -> build it on the OpenAI model (`llm`).
LOCAL_LLM = False
# Hugging Face model id loaded for the local tokenizer and model.
BASE_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# True -> 4-bit quantization config; False -> 8-bit quantization config.
USE_4_BIT_QUANT = False

In [None]:
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token taken from the environment,
# also storing it as a git credential.
hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

In [None]:
!pip install bitsandbytes

#### Open source LLM
Do not execute the cells below if LOCAL_LLM = False

In [None]:
# imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Tokenizer for the base model; the EOS token doubles as the padding token
# and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Choose the bitsandbytes quantization settings from USE_4_BIT_QUANT.
if USE_4_BIT_QUANT:
    print("Using 4 bit Quant")
    _quant_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
else:
    print("Using 8 bit Quant")
    _quant_kwargs = dict(
        load_in_8bit=True,
        # Optional knobs for 8-bit:
        # llm_int8_enable_fp32_cpu_offload=True,  # offload outliers to CPU if VRAM is tight
        # llm_int8_threshold=6.0,                  # outlier threshold (default 6.0)
        # llm_int8_skip_modules=["lm_head"],       # don't quantize specific modules
    )
quant_config = BitsAndBytesConfig(**_quant_kwargs)

# Load the quantized model with device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto",
)


In [None]:
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace
# Sampling / decoding settings for the local text-generation pipeline.
_generation_settings = dict(
    max_new_tokens=500,
    temperature=0.4,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.05,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False,
)

gen_pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    **_generation_settings,
)

# Wrap the transformers pipeline for LangChain, then expose it as a chat model.
_hf_pipeline_llm = HuggingFacePipeline(pipeline=gen_pipe)
chat_llm = ChatHuggingFace(llm=_hf_pipeline_llm)

#### Frontier LLM
Do not execute the cells below if LOCAL_LLM = True

In [None]:
# OpenAI chat model used when LOCAL_LLM is False.
MODEL = "gpt-4o-mini"
llm = ChatOpenAI(
    temperature=0.7,
    model_name=MODEL,
    max_tokens=500,
)

### Creating the LangChain chain from either the open-source or the frontier LLM

In [None]:
# Pick the chat model for the selected backend. A conditional expression only
# evaluates the chosen name, so the cells of the other backend need not have
# been executed.
active_llm = chat_llm if LOCAL_LLM else llm  # open-source vs. OpenAI

# Single chain definition shared by both backends (previously duplicated in
# two branches that differed only in the `llm` argument).
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=active_llm,
    retriever=retriever,
    memory=memory,
    # condense_question_prompt=condense_prompt,
    get_chat_history=lambda h: h,          # keep history as list[BaseMessage]
    combine_docs_chain_kwargs={"prompt": qa_prompt},
    callbacks=[StdOutCallbackHandler()],
)

In [None]:
# Smoke test: send one question through the chain and print its answer.
query = "How can I contact you?"
_chain_input = {"question": query}
result = conversation_chain.invoke(_chain_input)
answer = result["answer"]
print("\nAnswer:", answer)

### Few Shot Cloning TTS

In [None]:
!pip install -U coqui-tts

In [None]:
import torch
from TTS.api import TTS

# Run on the GPU when one is available and fall back to the CPU otherwise,
# instead of always requesting a GPU.
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(tts_device)

# Reference recording of the target speaker, for voice cloning.
PATH_TO_TARGET_SPEAKER = "5_trimmed.wav"

In [None]:
# Show the speaker names exposed by the loaded TTS model.
print(tts.speakers)

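To use a built-in voice instead of a cloned one, replace the *speaker_wav* argument with the speaker-name argument (*speaker* in the Python API, `--speaker_idx` on the command line) and pass one of the built-in speaker names, for example: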

- Male Indian English: Kumar Dahl

- Male US: Andrew Chipper or Damien Black

- Male Canadian (North American / Canadian-ish): Aaron Dreschner or Craig Gutsy

### Gradio UI
![Gradio Based RAG](../images/llm_rag.png)

In [None]:
import numpy as np
import re

# Matches http(s) URLs, "www." links, and phone-number-like digit runs
# (optional leading "+", digits mixed with spaces, dots, dashes or parentheses).
pattern = re.compile(
    r'(?:https?://\S+|www\.\S+|\+?\d[\d\s().-]{6,}\d)',
    re.IGNORECASE,
)


def replace_links_and_phones(text: str) -> str:
    """Return `text` with every link or phone number replaced by the word "here".

    Used to prepare an answer for TTS, so URLs and numbers are not read aloud.
    """
    spoken_text = pattern.sub("here", text)
    return spoken_text

def chat(question, history):
    """Get a response from the LLM with RAG.

    Sends `question` through the conversation chain and returns its "answer".
    `history` is accepted but not used by this function.
    """
    response = conversation_chain.invoke({"question": question})
    return response["answer"]

def synth_speech(text, speaker="Kumar Dahl", speaker_wav=None, language="en"):
    """Synthesize `text` with the TTS model and return it as an audio tuple.

    Args:
        text: The text to speak.
        speaker: Name of a built-in voice. Ignored when `speaker_wav` is given.
        speaker_wav: Optional path to a reference recording whose voice should
            be cloned (e.g. PATH_TO_TARGET_SPEAKER).
        language: Language code passed to the TTS model.

    Returns:
        A `(sample_rate, waveform)` tuple, with the waveform converted to a
        float32 numpy array.
    """
    # TTS.tts() selects a built-in voice through its `speaker` argument; the
    # previously used `speaker_id` keyword is not one of its parameters.
    if speaker_wav:
        voice_kwargs = {"speaker_wav": speaker_wav}
    else:
        voice_kwargs = {"speaker": speaker}

    # get waveform (float32 in [-1, 1])
    wav = tts.tts(text=text, language=language, **voice_kwargs)
    sr = tts.synthesizer.output_sample_rate  # e.g., 24000 for xtts_v2
    # ensure dtype for gradio
    wav = np.asarray(wav, dtype=np.float32)
    return (sr, wav)

def chat_tts(message, history):
    """Response function for Gradio: answer `message` and speak the answer.

    Returns the chat history extended with the new user/assistant messages,
    plus the synthesized audio of the answer with links and phone numbers
    replaced for TTS.
    """
    answer = conversation_chain.invoke({"question": message})["answer"]
    print(answer)

    # Build a new history list rather than mutating the one passed in.
    updated_history = list(history or [])
    updated_history.append({"role": "user", "content": message})
    updated_history.append({"role": "assistant", "content": answer})

    speech = synth_speech(replace_links_and_phones(answer))
    return updated_history, speech

In [None]:
import gradio as gr
# Gradio UI: audio player on top, chat window below, then the input row.
with gr.Blocks() as demo:
    audio = gr.Audio(label="Audio", autoplay=True)
    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(height=500, type="messages")
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text to chat",
            )
            btn = gr.Button("Send")
            # Clicking "Send" and pressing Enter in the textbox both run the
            # same handler with the same inputs and outputs.
            for _register in (btn.click, txt.submit):
                _register(chat_tts, inputs=[txt, chatbot], outputs=[chatbot, audio])
demo.launch(debug=True)