In [1]:
                    ## requirements.txt
# ipython==8.28.0
# langchain==0.1.13
# sentence-transformers==3.1.1
# PyPDF2==3.0.1
# edge-tts==7.0.2
# pymilvus==2.5.10
# ollama==0.4.3
# python-docx==0.8.11
# SpeechRecognition==3.14.3
# docx==0.2.4
# asyncio==3.4.3
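# lxml==5.3.0
#
# NOTE(review): the code below also imports nest_asyncio (last cell), and
# presumably needs PyAudio (for sr.Microphone) and pypdf (for langchain's
# PyPDFLoader); none of these are pinned above -- confirm and add versions.
# NOTE(review): `docx` and `python-docx` both provide a top-level `docx`
# module, and `asyncio` ships with the Python 3 standard library -- confirm
# that those two pins are really intended.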

In [2]:
import warnings
import os
import time
import speech_recognition as sr
import edge_tts
import asyncio
from IPython.display import Audio, display, clear_output
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from docx import Document
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate

# Suppress warnings
warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Configuration
# Every tunable value used by the notebook lives in this cell.

# --- Document chunking (sizes are measured with len(), i.e. in characters) ---
CHUNK_SIZE = 500     # chunk_size given to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 100  # chunk_overlap given to RecursiveCharacterTextSplitter

# --- Embeddings and vector store ---
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer
MILVUS_HOST = "localhost"             # host used by connections.connect
MILVUS_PORT = "19530"                 # port used by connections.connect
COLLECTION_NAME = "voice_rag"         # collection dropped and recreated on every run

# --- Language model (served by Ollama) ---
OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_MODEL = "gemma2:2b"

# --- Voice input (speech_recognition) ---
LISTEN_TIMEOUT = 8            # `timeout` argument of recognizer.listen
PHRASE_TIME_LIMIT = 15        # `phrase_time_limit` argument of recognizer.listen
LISTEN_PAUSE_THRESHOLD = 0.8  # assigned to recognizer.pause_threshold

# --- Voice output (Edge TTS) ---
DEFAULT_VOICE = "en-US-GuyNeural"  # default `voice` of stream_output

In [4]:
# Initialize speech recognition
# One Recognizer and one Microphone are created at module level and shared by
# every call to get_voice_command().
recognizer = sr.Recognizer()
# Taken from the configuration cell (LISTEN_PAUSE_THRESHOLD).
# NOTE(review): presumably the seconds of silence that end a phrase -- confirm
# against the SpeechRecognition documentation.
recognizer.pause_threshold = LISTEN_PAUSE_THRESHOLD
# Uses the default input device (no device index is passed).
microphone = sr.Microphone()

In [5]:
async def stream_output(text, voice=DEFAULT_VOICE, char_delay=0.02):
    """Type ``text`` out character by character while its speech is synthesised.

    The Edge TTS request is started as a background task, the text is printed
    with a small delay between characters, and once the audio file has been
    written it is displayed as an autoplaying audio widget.

    Args:
        text: The text to print and to convert to speech.
        voice: Edge TTS voice name; defaults to ``DEFAULT_VOICE``.
        char_delay: Seconds to wait after printing each character.

    Returns:
        The path of the MP3 file that was written ("temp_output.mp3"). The same
        file name is reused, and therefore overwritten, on every call.

    Raises:
        Whatever ``edge_tts`` raises if the speech request fails; the exception
        surfaces when the background task is awaited.
    """
    # Start voice generation in background
    communicate = edge_tts.Communicate(text=text, voice=voice)
    filename = "temp_output.mp3"
    voice_task = asyncio.create_task(communicate.save(filename))

    # Stream text output
    print("\nAnswer: ", end="", flush=True)
    for char in text:
        print(char, end="", flush=True)
        # Must be asyncio.sleep, not time.sleep: a blocking sleep would freeze
        # the event loop and keep voice_task from running until the whole text
        # had been printed, defeating the point of starting it in background.
        await asyncio.sleep(char_delay)

    # Wait for voice generation to complete
    await voice_task

    # Play audio
    display(Audio(filename, autoplay=True))
    return filename

In [6]:
def get_voice_command():
    """Record one spoken question from the microphone and transcribe it.

    The ambient-noise calibration runs first and the "speak now" prompt is
    printed only afterwards, so the user's first words are not sampled as
    background noise. Transcription (a network call to Google's recogniser)
    happens after the microphone has been released.

    Returns:
        The recognised text, or ``None`` when nothing was heard in time, the
        speech could not be understood, the recognition service failed, or any
        other error occurred (including failure to open the microphone). A
        message describing the problem is printed in every ``None`` case.
    """
    try:
        with microphone as source:
            # Calibrate on (hopefully) silent audio BEFORE asking the user to speak.
            recognizer.adjust_for_ambient_noise(source, duration=1)
            print(f"\nListening (max {PHRASE_TIME_LIMIT} seconds)... Speak your question now.")
            audio = recognizer.listen(
                source,
                timeout=LISTEN_TIMEOUT,
                phrase_time_limit=PHRASE_TIME_LIMIT
            )
        # The microphone is closed here; no need to hold it during the network call.
        question = recognizer.recognize_google(audio)
        print(f"You asked: {question}")
        return question
    except sr.WaitTimeoutError:
        print("No speech detected. Please try again.")
        return None
    except sr.UnknownValueError:
        print("Sorry, I didn't understand that. Please speak clearly.")
        return None
    except sr.RequestError as e:
        print(f"Speech recognition service error: {e}")
        return None
    except Exception as e:
        # Also covers errors raised while opening or calibrating the microphone.
        print(f"Unexpected error: {e}")
        return None

In [7]:
def load_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Any exception raised while loading is reported with a printed message and
    turned into a ``None`` return value.
    """
    try:
        page_texts = []
        for page in PyPDFLoader(pdf_path).load():
            page_texts.append(page.page_content)
        return "\n".join(page_texts)
    except Exception as err:
        print(f"Error loading PDF: {err}")
        return None

In [8]:
def load_docx(docx_path):
    """Return the non-blank paragraphs of a DOCX file, joined with newlines.

    Paragraphs whose text is empty or whitespace-only are skipped. Any
    exception raised while loading is reported with a printed message and
    turned into a ``None`` return value.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        non_blank = (p.text for p in paragraphs if p.text.strip())
        return "\n".join(non_blank)
    except Exception as err:
        print(f"Error loading DOCX: {err}")
        return None

In [9]:
def load_document(file_path):
    """Read a document's text, choosing the loader from the file extension.

    The extension check is case-insensitive. Returns the loader's result for
    ``.pdf`` and ``.docx`` files, and ``None`` (after printing a message) when
    the path does not exist or the extension is not supported.
    """
    # Guard: nothing to do for a path that is not on disk.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return load_pdf(file_path)
    if lowered.endswith('.docx'):
        return load_docx(file_path)

    print("Unsupported file format. Please provide PDF or DOCX.")
    return None

In [10]:
def split_text(text):
    """Break ``text`` into overlapping chunks for embedding.

    Chunk size and overlap come from ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` and
    are measured with ``len``. Falsy input (``None`` or an empty string)
    yields an empty list.
    """
    if text:
        return RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len,
        ).split_text(text)
    return []

In [11]:
def init_milvus():
    """Open the Milvus connection configured by MILVUS_HOST / MILVUS_PORT.

    Returns ``True`` when the connection call succeeds and ``False`` (after
    printing the error) when it raises.
    """
    connected = False
    try:
        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
        print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")
        connected = True
    except Exception as err:
        print(f"Failed to connect to Milvus: {err}")
    return connected

In [12]:
def setup_milvus_collection(embedding_dim):
    """Recreate the COLLECTION_NAME collection and index its vector field.

    An existing collection with the same name is dropped first, so each run
    starts empty. The schema has an auto-generated INT64 primary key, a
    VARCHAR ``content`` field and a FLOAT_VECTOR ``embedding`` field of
    ``embedding_dim`` dimensions, indexed with IVF_FLAT under the L2 metric.

    Returns the new collection, or ``None`` (after printing the error) if any
    step raises.
    """
    try:
        # Start from a clean slate: discard whatever a previous run stored.
        if utility.has_collection(COLLECTION_NAME):
            utility.drop_collection(COLLECTION_NAME)

        id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
        content_field = FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=65535)
        vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)

        schema = CollectionSchema(
            [id_field, content_field, vector_field],
            "Voice RAG Collection",
        )
        collection = Collection(COLLECTION_NAME, schema)

        collection.create_index(
            "embedding",
            {
                "index_type": "IVF_FLAT",
                "metric_type": "L2",
                "params": {"nlist": 128},
            },
        )
        return collection
    except Exception as err:
        print(f"Failed to setup Milvus collection: {err}")
        return None

In [13]:
async def main():
    """Run the voice question-answering session over one document.

    Steps: ask for a document path, load and chunk the document, connect to
    Milvus, embed the chunks, store them in a freshly created collection,
    create the Ollama LLM, then loop: listen for a spoken question, retrieve
    the three nearest chunks, ask the LLM, and print/speak the answer.

    Each setup step that fails prints a message and returns early. The loop
    ends when the recognised question is exactly "exit", "quit" or "stop"
    (ignoring case and surrounding whitespace). Returns ``None``.
    """
    # Initialize Voice Assistant
    print("Voice system initialized")

    # Document processing
    file_path = input("\nEnter path to your document (PDF or DOCX): ").strip()
    if not os.path.exists(file_path):
        print("File not found. Please check the path.")
        return

    print("\nProcessing document...")
    text = load_document(file_path)
    if not text:
        return

    chunks = split_text(text)
    print(f"Document split into {len(chunks)} chunks")
    if not chunks:
        # Non-empty text can still yield no chunks (e.g. whitespace only).
        # Embedding an empty list would then fail below at embeddings.shape[1].
        print("No text content found in the document. Nothing to index.")
        return

    # Initialize Milvus
    if not init_milvus():
        return

    # Initialize embedding model
    try:
        st_model = SentenceTransformer(EMBEDDING_MODEL)
        print("Embedding model loaded successfully")
    except Exception as e:
        print(f"Failed to load embedding model: {e}")
        return

    # Generate embeddings
    print("Generating embeddings...")
    try:
        embeddings = st_model.encode(chunks, show_progress_bar=True)
    except Exception as e:
        print(f"Failed to generate embeddings: {e}")
        return

    # Setup Milvus collection
    collection = setup_milvus_collection(embeddings.shape[1])
    if not collection:
        return

    # Store data (column order matches the schema: content, then embedding;
    # the primary key is auto-generated)
    print("Storing data in Milvus...")
    try:
        collection.insert([chunks, embeddings.tolist()])
        collection.load()
    except Exception as e:
        print(f"Failed to store data in Milvus: {e}")
        return

    # Initialize LLM
    try:
        llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)
        print("LLM initialized successfully")
    except Exception as e:
        print(f"Failed to initialize LLM: {e}")
        return

    # Prompt template
    prompt_template = """Answer this question based on the context below.
    Keep your answer concise and to the point.
    
    Context: {context}
    
    Question: {question}
    
    Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # Main interaction loop
    print("\nSystem ready! You can now ask questions about the document.")
    await stream_output("System ready. You can now ask questions about the document.")

    while True:
        question = get_voice_command()
        if not question:
            # Nothing usable was heard; listen again.
            continue

        if question.strip().lower() in ['exit', 'quit', 'stop']:
            print("Exiting...")
            await stream_output("Goodbye!")
            break

        # Search Milvus
        try:
            query_embedding = st_model.encode([question])
            results = collection.search(
                data=query_embedding,
                anns_field="embedding",
                param={"metric_type": "L2", "params": {"nprobe": 10}},
                limit=3,
                output_fields=["content"]
            )

            # Prepare context (results[0] holds the hits for the single query)
            context = "\n".join([hit.entity.get("content") for hit in results[0]])

            # Generate answer
            answer = llm(prompt.format(context=context, question=question)).strip()

            # Stream output with voice
            await stream_output(answer)

        except Exception as e:
            error_msg = f"Error processing your question: {str(e)}"
            print(error_msg)
            try:
                await stream_output("Sorry, I encountered an error processing your question.")
            except Exception as tts_error:
                # If speech output itself is what failed, the apology would
                # raise again and end the session; keep the loop alive instead.
                print(f"Voice output unavailable: {tts_error}")

In [None]:
# For Jupyter notebook compatibility
# Entry point: executing this cell starts the interactive voice session and
# does not finish until main() returns.
if __name__ == "__main__":
    # Imported locally, so it is only needed when this cell is executed.
    import nest_asyncio
    # Must run before asyncio.run().
    # NOTE(review): presumably this patches asyncio so asyncio.run() works
    # inside the notebook's already-running event loop -- confirm against the
    # nest_asyncio documentation.
    nest_asyncio.apply()
    asyncio.run(main())

Voice system initialized



Enter path to your document (PDF or DOCX):  test_data/CAG-test_data.docx



Processing document...
Document split into 10 chunks
Connected to Milvus at localhost:19530
Embedding model loaded successfully
Generating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Storing data in Milvus...
LLM initialized successfully

System ready! You can now ask questions about the document.

Answer: System ready. You can now ask questions about the document.


Listening (max 15 seconds)... Speak your question now.
Sorry, I didn't understand that. Please speak clearly.

Listening (max 15 seconds)... Speak your question now.
You asked: comment about in 150 words

Answer: The automobile industry is undergoing a remarkable transformation driven by innovations that promise a sustainable future. From electric vehicles and autonomous driving to improved connectivity and alternative fuels, these advancements address existing challenges like range limitations and charging infrastructure while creating a greener transportation ecosystem.  This forward-thinking approach will result in smarter, cleaner, and more efficient cars, shaping the way we move and interact with our world.


Listening (max 15 seconds)... Speak your question now.
No speech detected. Please try again.

Listening (max 15 seconds)... Speak your question now.
No speech detected. Please try again.

Listening (max 15 seconds)... Speak your question now.
Sorry, I didn't understand that. Please speak clearly.

Listening (max 15 seconds)... Speak your question now.
You asked: tell me a good title for the document

Answer: Here are a few title options that capture the essence of the text:

* **The Future of Driving: Innovations in the Automobile Industry**
* **Automobiles on the Horizon:  Unprecedented Innovation and Sustainability** 
* **Beyond Gas-Powered Cars: The Rise of Smart & Eco-Friendly Transportation**


Choose the title that best suits your intended audience and the overall tone you want to convey.


Listening (max 15 seconds)... Speak your question now.
No speech detected. Please try again.

Listening (max 15 seconds)... Speak your question now.
No speech detected. Please try again.

Listening (max 15 seconds)... Speak your question now.
Sorry, I didn't understand that. Please speak clearly.

Listening (max 15 seconds)... Speak your question now.
Sorry, I didn't understand that. Please speak clearly.

Listening (max 15 seconds)... Speak your question now.
