# ML

## SECTION 1: Install Dependencies

In [None]:
!pip install langchain langchain-community chromadb pypdf langchain-google-genai gdown



## SECTION 2: Import Libraries

In [None]:
"""
Imports all necessary libraries for file handling, Google Drive integration, document processing,
embeddings, vector storage, LLM setup, and RAG pipeline creation.
"""
import os
import gdown
from google.colab import drive
from getpass import getpass
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

## SECTION 3: Set Up Environment

In [None]:
"""
Configures the environment by mounting Google Drive for persistent storage, defining file paths,
and downloading the CTSE lecture notes PDF. Initializes a cache for query responses.
"""

'\nConfigures the environment by mounting Google Drive for persistent storage, defining file paths,\nand downloading the CTSE lecture notes PDF. Initializes a cache for query responses.\n'

In [None]:
# Mount Google Drive at /content/drive. The Chroma database directory used later
# lives under this mount point, so it is stored on Drive rather than on the
# Colab VM's local disk.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Notebook-wide configuration: the in-memory answer cache and the two file locations.
CACHE = dict()  # maps a normalized query string to its cached response
CHROMA_PATH = "/content/drive/MyDrive/chroma_db_ml_gemini"  # Chroma persistence directory on Drive
DATA_PATH = "/content/ML - Merged.pdf"  # local destination of the downloaded lecture-notes PDF

In [None]:
# Fetch the lecture-notes PDF from Google Drive (by file id) and save it at DATA_PATH.
# The call is left as the cell's last expression so its return value is displayed.
file_id = "1tu26YxQNFO1GbY-QFiJXGv8yV9OsvWaF"
download_url = "https://drive.google.com/uc?id=" + file_id
gdown.download(download_url, DATA_PATH, quiet=True)

'/content/ML - Merged.pdf'

In [None]:
# Report whether the PDF is actually present on disk after the download step.
if os.path.exists(DATA_PATH):
    print(f"Lecture notes downloaded to {DATA_PATH}")
else:
    print(f"Error: Failed to download lecture notes to {DATA_PATH}")

Lecture notes downloaded to /content/ML - Merged.pdf


In [None]:
# Set up Google API key for Gemini.
# The key is read with getpass so it is not echoed into the notebook output,
# then stored in the GOOGLE_API_KEY environment variable and handed to the
# google.generativeai SDK.
# NOTE(review): the embedding and chat model objects created later are not given
# a key explicitly, so they presumably pick it up from GOOGLE_API_KEY; confirm.
print("Enter your Google API key for Gemini:")
api_key = getpass("API Key: ")
os.environ["GOOGLE_API_KEY"] = api_key
genai.configure(api_key=api_key)

Enter your Google API key for Gemini:
API Key: ··········


## SECTION 4: Load and Split Documents

In [None]:
"""
Loads the lecture notes PDF and splits it into manageable chunks for embedding. Uses a larger
chunk size and overlap to preserve context, improving retrieval accuracy.
"""

'\nLoads the lecture notes PDF and splits it into manageable chunks for embedding. Uses a larger\nchunk size and overlap to preserve context, improving retrieval accuracy.\n'

In [None]:
# Load the PDF at DATA_PATH using PyPDFLoader.
# `documents` is the list returned by the loader; the next cell reports its
# length as the number of document pages.
print("Loading lecture notes...")
loader = PyPDFLoader(DATA_PATH)
documents = loader.load()

Loading lecture notes...


In [None]:
# Sanity check: the loader must have produced at least one page.
if documents:
    print(f"Loaded {len(documents)} document pages.")
else:
    print("Error: No documents loaded from the PDF.")

Loaded 757 document pages.


In [None]:
# Break the loaded pages into overlapping chunks sized for embedding:
# chunks of up to 1000 characters, with 200 characters shared between neighbours
# so context is not lost at chunk boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
docs = splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks.")

Split into 692 chunks.


## SECTION 5: Create Embeddings and Vector Store

In [None]:
"""
Generates embeddings using Google's embedding-001 and stores them in a Chroma vector database,
persisted to Google Drive. Configures a retriever to fetch relevant document chunks.
"""

"\nGenerates embeddings using Google's embedding-001 and stores them in a Chroma vector database,\npersisted to Google Drive. Configures a retriever to fetch relevant document chunks.\n"

In [None]:
# Initialize the embedding model.
# `embeddings` is used below both to build the Chroma store from the chunks and
# to reopen a persisted store, so the same model name must be used across runs
# that share one persisted database.
embedding_model_name = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)
print(f"Initialized embeddings: {embedding_model_name}")

Initialized embeddings: models/embedding-001


In [None]:
# Load the persisted Chroma vector store if one exists; otherwise embed the
# chunks in `docs` and create it under CHROMA_PATH.
#
# A directory that exists but is empty (e.g. left behind by an interrupted first
# run) is treated as "no store yet": reopening it would most likely yield an
# empty collection, and the retriever would then return no context.
if os.path.isdir(CHROMA_PATH) and os.listdir(CHROMA_PATH):
    print(f"Loading vector store from {CHROMA_PATH}")
    vectorstore = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embeddings
    )
    # Report what actually happened instead of always claiming "created".
    print("Vector store loaded.")
else:
    print(f"Creating vector store in {CHROMA_PATH}")
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )
    print("Vector store created.")

Creating vector store in /content/drive/MyDrive/chroma_db_ml_gemini
Vector store created.


In [None]:
# Configure retriever to fetch top 5 relevant chunks
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
print("Retriever configured.")

Retriever configured.


## SECTION 6: Initialize Language Model

In [None]:
"""
Sets up the google/gemini-2.0-flash model.
"""

'\nSets up the google/gemini-2.0-flash model.\n'

In [None]:
# Build the Gemini chat model that generates the final answers.
# Sampling settings: temperature 0.7, nucleus sampling at top_p 0.95,
# and responses capped at 1024 output tokens.
model = "gemini-2.0-flash"
print(f"Initializing model: {model}")
llm = ChatGoogleGenerativeAI(model=model, temperature=0.7, top_p=0.95, max_output_tokens=1024)
print("LLM initialized.")

Initializing model: gemini-2.0-flash
LLM initialized.


## SECTION 7: Configure RAG Chain

In [None]:
"""
Sets up the RAG pipeline with a custom prompt to ensure answers are based solely on the provided
context. Combines the retriever and LLM for retrieval-augmented generation.
"""

'\nSets up the RAG pipeline with a custom prompt to ensure answers are based solely on the provided\ncontext. Combines the retriever and LLM for retrieval-augmented generation.\n'

In [None]:
# Define custom prompt for RAG.
# Template variables:
#   {context} - filled with the retrieved lecture-note chunks
#               (presumably by the stuff-documents chain; confirm against its docs)
#   {input}   - the user's question, supplied as the "input" key when the chain is invoked
# The fallback sentence below is the exact text the chat loop compares cached
# answers against, so the two must be kept in sync if either is edited.
prompt = ChatPromptTemplate.from_template(
    "Context from Lecture Notes:\n"
    "{context}\n\n"
    "Based on the above context, provide an answer to the following question.\n"
    "Summarize relevant information (e.g., bullet points, definitions) and answer ONLY the question asked.\n"
    "If the context does not contain the answer, respond with: "
    "\"I cannot answer this question based on the provided notes.\"\n\n"
    "Question: {input}"
)

In [None]:
# Create document chain to process retrieved documents.
# Combines the LLM with the custom prompt; the retrieval chain built next feeds
# it the chunks returned by the retriever.
document_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
# Create RAG chain combining retriever and document chain.
# The chat loop invokes it with {"input": <question>} and reads the "answer"
# and "context" keys from the result.
rag_chain = create_retrieval_chain(retriever, document_chain)
print("RAG chain created. Ready to answer questions.")

RAG chain created. Ready to answer questions.


## SECTION 8: Interactive Chat Loop

In [None]:
"""
Implements an interactive chat loop for user queries. Supports caching for efficiency, verbose mode
for source document display, and a Markdown-inspired output format for clarity.
"""

'\nImplements an interactive chat loop for user queries. Supports caching for efficiency, verbose mode\nfor source document display, and a Markdown-inspired output format for clarity.\n'

In [None]:
# Exact sentence the prompt instructs the model to return when the notes do not
# contain the answer. Cached answers equal to it are retried instead of replayed.
FALLBACK_ANSWER = "I cannot answer this question based on the provided notes."


def _print_sources(source_docs, heading):
    """Print `heading`, then a numbered preview (first 300 characters) of each source chunk."""
    print(heading)
    for i, doc in enumerate(source_docs, 1):
        print(f"- Source {i} (Page: {doc.metadata.get('page', 'N/A')}):")
        print(f"{doc.page_content[:300]}{'...' if len(doc.page_content) > 300 else ''}")
        print("" + "-" * 100)


def _print_response(question, answer, source_docs, verbose, cached=False):
    """Print one question/answer exchange.

    Args:
        question: The normalized question text.
        answer: The answer text to display.
        source_docs: Retrieved documents backing the answer (shown only if `verbose`).
        verbose: When True, also print a preview of every source document.
        cached: When True, label the answer and sources as served from CACHE.
    """
    print(f"\nQuestion: {question}\n")
    print("Answer (Cached):" if cached else "Answer:")
    print(f"{answer}\n")
    if verbose:
        heading = "📚 Source Documents (Cached):" if cached else "📚 Source Documents:\n"
        _print_sources(source_docs, heading)
    print("\n💡 Ask another question or type 'exit' to quit!\n")


# Print chatbot introduction
print("\n======================================")
print("    🤖 CTSE Lecture Notes Chatbot    ")
print("======================================")
print("Enter your question or 'exit' to quit.")
print("Append '--verbose' to see source documents.\n")

# Main loop for user interaction
while True:
    try:
        query = input("❓ Question: ")
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted cell: leave the loop cleanly
        # instead of ending the cell with a traceback.
        print("\n👋 Exiting. Goodbye!")
        break

    # Handle exit command
    if query.strip().lower() == 'exit':
        print("\n👋 Exiting. Goodbye!")
        break

    # Handle verbose mode: the flag may appear anywhere in the line and is
    # stripped out before the question is sent to the chain.
    verbose = '--verbose' in query
    if verbose:
        query = query.replace('--verbose', '').strip()

    # Collapse runs of whitespace so equivalent questions share one cache key
    query = ' '.join(query.split())
    if not query:
        print("\n⚠️ Error: Please enter a valid question.")
        continue

    # Log processing
    print("\n⏳ Processing...")

    try:
        # Serve from cache unless the cached answer is the fallback response.
        # The comparison ignores surrounding whitespace so a trailing newline in
        # the model output does not defeat the retry.
        cached_entry = CACHE.get(query)
        if cached_entry is not None and cached_entry['answer'].strip() != FALLBACK_ANSWER:
            _print_response(query, cached_entry['answer'], cached_entry['context'],
                            verbose, cached=True)
            continue

        # Cache miss, or a cached fallback worth retrying: invoke the chain once.
        result = rag_chain.invoke({"input": query})

        # Cache response
        CACHE[query] = {
            'answer': result['answer'],
            'context': result['context']
        }

        _print_response(query, result['answer'], result['context'], verbose)

    except Exception as e:
        # Keep the chat session alive on per-question failures (e.g. API errors).
        print(f"\n⚠️ Error: {e}")
        print("Please check your input, ensure the PDF is accessible, and verify your Google API key.")


    🤖 CTSE Lecture Notes Chatbot    
Enter your question or 'exit' to quit.
Append '--verbose' to see source documents.

❓ Question: explain ensemble methods --verbose

⏳ Processing...

Question: explain ensemble methods

Answer:
*   **Definition:** Ensemble methods use a combination of multiple models (classifiers/predictors) to create an improved composite model.
*   **Goal:** To create an improved composite model M* from a series of k learned models (M1, M2, M3, ..., Mk).
*   **Process:**
    *   Individual models make predictions.
    *   These predictions are combined (e.g., through voting).
    *   The combined result is the final prediction of the ensemble.
*   **Examples:** Multiple decision trees, Bagging, Boosting, Boolean operator-based ensemble methods, ML based ensemble methods, Stack ensemble.
*   **Motivation:** Often needed to solve real-world problems.
*   **Prediction Mechanism:**
    *   Each classifier's vote is assigned a score/weight based on its performance (low