In [3]:
# Import required modules
import os
import tempfile
from datetime import datetime

import gradio as gr
import torch
import wikipedia
import yt_dlp
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import TavilySearchResults
from langchain.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [4]:
# Load Quantized LLM (4-bit Mistral)
# Hub identifier used for both the tokenizer and the model weights below.
model_id = "mistralai/Mistral-7B-v0.1"
# Request 4-bit loading; no other quantization options are set here.
quant_config = BitsAndBytesConfig(load_in_4bit=True)

# `tokenizer` and `model` are module-level globals that generate_response() reads directly.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto"  # device placement is left to the loader rather than pinned here
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Load Embedding Model
# Module-level embedding model; load_documents() hands it to FAISS.from_documents.
# NOTE(review): the recorded cell output echoes this line, which looks like a
# deprecation warning for this import path — confirm against the installed langchain.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load PDFs into FAISS Vector Store
def load_documents(pdf_path, chunk_size=500, chunk_overlap=50):
    """Build a FAISS vector store from the text of one PDF.

    Args:
        pdf_path: Path of the PDF passed to ``PyMuPDFLoader``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        The store produced by ``FAISS.from_documents`` over the split chunks,
        embedded with the module-level ``embedding_model``.

    Raises:
        ValueError: If loading and splitting the PDF produces no chunks at all
            (for example a PDF without extractable text), since an index
            cannot be built from nothing.
    """
    documents = PyMuPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    texts = splitter.split_documents(documents)

    # Fail with an actionable message instead of an opaque error from deep
    # inside the vector-store constructor.
    if not texts:
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    return FAISS.from_documents(texts, embedding_model)

# Web Search Tool
# Module-level search tool; retrieve_sources() calls search_tool.run(query) on it.
# NOTE(review): presumably needs Tavily API credentials available in the
# environment — confirm before running this cell on a fresh kernel.
search_tool = TavilySearchResults()

# Wikipedia Retrieval
def search_wikipedia(query):
    """Return a two-sentence Wikipedia summary for ``query``.

    The lookup is best-effort: any ordinary failure raised by
    ``wikipedia.summary`` is converted into a fixed fallback string so the
    caller can always embed the result in a prompt.

    Args:
        query: Search phrase passed to ``wikipedia.summary``.

    Returns:
        The summary text, or ``"No relevant Wikipedia information found."``
        when the query is blank or the lookup fails.
    """
    fallback = "No relevant Wikipedia information found."

    # Nothing to look up; skip the remote call entirely.
    if not query or not str(query).strip():
        return fallback

    try:
        return wikipedia.summary(query, sentences=2)
    except Exception:
        # Deliberately broad (best-effort source), but unlike a bare `except:`
        # this lets KeyboardInterrupt/SystemExit propagate so the kernel can
        # still be interrupted during a slow request.
        return fallback

# YouTube Transcription Retrieval
def get_youtube_transcript(url):
    """Fetch text for a YouTube video via yt_dlp metadata extraction.

    Note that the text returned is the ``"description"`` field of the
    extracted metadata; no caption/subtitle track is requested here.

    Args:
        url: Video URL. A falsy value (``None`` or ``""``) skips the lookup.

    Returns:
        ``""`` when no URL is given, the description text when present,
        ``"No transcript available."`` when the description is empty or
        missing, or an ``"Error retrieving transcript: ..."`` message when
        extraction raises.
    """
    # Guard clause: nothing to do without a URL.
    if not url:
        return ""

    options = {
        "quiet": True,
        "format": "bestaudio/best",
        "cookiefile": "/content/cookies.txt"  # Use the exported cookies
    }

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            # Metadata only; no media is downloaded.
            metadata = downloader.extract_info(url, download=False)
            description = metadata.get("description", "")
    except Exception as err:
        return "Error retrieving transcript: " + str(err)

    if description:
        return description
    return "No transcript available."


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [6]:
# Conversation Memory
# One module-level buffer shared by everything in this notebook: update_memory()
# appends to it, and the inference/Gradio code reads it back through
# memory.load_memory_variables({})["history"].
memory = ConversationBufferMemory(return_messages=True)

def update_memory(query, response):
    """Record one question/answer exchange in the shared conversation memory."""
    user_turn = {"input": query}
    assistant_turn = {"output": response}
    memory.save_context(user_turn, assistant_turn)


  memory = ConversationBufferMemory(return_messages=True)


In [7]:
# Retrieve Top Sources
def retrieve_sources(query, vector_db=None, youtube_url=None):
    """Collect context for ``query`` from every available source as one string.

    Sections are emitted in a fixed order: documents (only when ``vector_db``
    is given), web search, Wikipedia, then the YouTube text (only when
    ``youtube_url`` is truthy).

    Args:
        query: The user's question, sent to every source.
        vector_db: Optional store exposing ``similarity_search(query, k=...)``.
        youtube_url: Optional video URL forwarded to ``get_youtube_transcript``.

    Returns:
        The labelled sections concatenated into a single string.
    """
    sections = []

    # Document search if PDF is loaded
    if vector_db is not None:
        hits = vector_db.similarity_search(query, k=10)
        sections.append(f"Documents: {[hit.page_content for hit in hits]}\n")

    # Web search, then Wikipedia, always run
    sections.append(f"Web: {search_tool.run(query)}\n")
    sections.append(f"Wikipedia: {search_wikipedia(query)}\n")

    # YouTube transcript if URL provided (no trailing newline on this section)
    if youtube_url:
        sections.append(f"YouTube Transcript: {get_youtube_transcript(youtube_url)}")

    return "".join(sections)

# Generate Response with LLM
def generate_response(query, vector_db=None, youtube_url=None):
    """Answer ``query`` with the LLM, grounded in freshly retrieved sources.

    Retrieves sources via ``retrieve_sources``, embeds them in an instruction
    prompt, generates up to 200 new tokens with the module-level ``model``,
    decodes only the newly generated tokens, and records the exchange through
    ``update_memory``.

    Args:
        query: The user's question.
        vector_db: Optional vector store forwarded to ``retrieve_sources``.
        youtube_url: Optional video URL forwarded to ``retrieve_sources``.

    Returns:
        The decoded model output with the prompt tokens removed.
    """
    # Retrieve sources
    sources = retrieve_sources(query, vector_db=vector_db, youtube_url=youtube_url)

    # Construct the prompt
    prompt = f"""
    You are a helpful and intelligent assistant that must answer questions using only the information provided in the sources below.

    --------------------
    Sources:
    {sources}
    --------------------

    Using only the above sources, provide a clear, direct, and concise answer to the following question. If the sources do not contain sufficient information to answer the question, respond with "Insufficient information provided."

    Question: {query}
    """

    # Move the inputs to wherever the model was placed. The model is loaded
    # with device_map="auto", so hard-coding "cuda" fails on hosts without a
    # GPU and can disagree with the placement the loader chose.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Length of the prompt in tokens, used below to slice off the echo.
    input_length = inputs['input_ids'].shape[1]

    # Pass the padding id explicitly: without it, generate() falls back to the
    # EOS id and logs a notice on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate the response
    output = model.generate(**inputs, max_new_tokens=200, pad_token_id=pad_token_id)

    # Extract only the generated tokens (exclude the prompt)
    generated_ids = output[0, input_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Update memory with query and clean answer
    update_memory(query, response)

    return response


In [44]:
# Inference function
def inference(
    pdf_path=None,
    query="What is Low Rank Adaptation in the context of machine learning models?",
    youtube_url=None,
):
    """Run one end-to-end demo query and print every intermediate result.

    Prints the retrieved sources, the generated response, and the accumulated
    conversation memory. The sources are retrieved once here for display and
    again inside ``generate_response``, so each external source is queried
    twice per call.

    Args:
        pdf_path: Optional path of a PDF to index for document search. ``None``
            (the default) means no document search.
        query: Question to ask. Defaults to the original sample question.
        youtube_url: Optional video URL to include as a source.

    Returns:
        None. All results are written to stdout.
    """
    vector_db = None

    # A real None marks "no PDF"; the old string sentinel "None" would have
    # loaded any file that happened to be called "None" in the working directory.
    if pdf_path and os.path.exists(pdf_path):
        print("Loading PDF...")
        vector_db = load_documents(pdf_path)
        print("PDF loaded successfully!")
    else:
        print("No PDF found, proceeding without document search.")

    print("\nQuery:", query)
    print("YouTube URL:", youtube_url if youtube_url else "None")

    # source retrieval (display only; generate_response retrieves again)
    print("\nRetrieving sources...")
    sources = retrieve_sources(query, vector_db=vector_db, youtube_url=youtube_url)
    print("Sources retrieved:")
    print(sources)

    # response generation
    print("\nGenerating response...")
    response = generate_response(query, vector_db=vector_db, youtube_url=youtube_url)
    print("\nResponse:")
    print(response)

    # Check memory
    print("\nConversation Memory:")
    memory_content = memory.load_memory_variables({})
    for msg in memory_content["history"]:
        if msg.type == "human":
            print(f"Q: {msg.content}")
        elif msg.type == "ai":
            print(f"A: {msg.content}\n")

# Run the demo when this code is executed directly; the recorded cell output
# below shows the guard is satisfied when the cell runs in the notebook.
if __name__ == "__main__":
    inference()


No PDF found, proceeding without document search.

Query: What is Low Rank Adaptation in the context of machine learning models?
YouTube URL: None

Retrieving sources...
Sources retrieved:
Web: [{'title': 'What is LoRA? | Low-rank adaptation - Cloudflare', 'url': 'https://www.cloudflare.com/learning/ai/what-is-lora/', 'content': 'Low-rank adaptation, or LoRA, is a less expensive, more efficient method for adapting large machine learning models to specific uses. Learn how LoRA works.', 'score': 0.9438932}, {'title': 'Low Rank Adaptation: A Technical deep dive - ML6', 'url': 'https://www.ml6.eu/blogpost/low-rank-adaptation-a-technical-deep-dive', 'content': 'Enter LoRA (Low Rank Adaptation) — a groundbreaking and efficient fine-tuning technique that harnesses the power of these advanced models for custom tasks and datasets without straining resources or incurring excessive costs. LoRA is an efficient finetuning technique proposed by Microsoft researchers to adapt large models to specific

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Response:
--------------------

    Answer:

    Low Rank Adaptation (LoRA) is a technique used in machine learning models to reduce the number of trainable parameters while maintaining the model's performance. It involves adding low-rank matrices to the model's parameters, which reduces the number of trainable parameters and the GPU memory requirement for the training process.

    LoRA leverages the concept of lower-rank matrices to make the model training process extremely efficient and fast. By leveraging smaller matrices, which are called low-rank matrices, LoRA adapts these models effectively.

    LoRA adds low-rank matrices to the frozen original machine learning model. This significantly reduces the trainable parameters of the model and the GPU memory requirement for the training process, which is another significant challenge when it comes to fine-tuning or training large models.

    To implement LoRA fine tuning with HuggingFace using Python and PyTorch,

Conversation Memo

In [8]:
# Gradio Web Deployment
def _materialize_pdf(pdf):
    """Turn whatever the upload widget delivered into ``(path, is_temporary)``."""
    # Legacy dict payloads: prefer embedded bytes, otherwise a path-like entry.
    if isinstance(pdf, dict):
        data = pdf.get("data")
        if isinstance(data, (bytes, bytearray)):
            pdf = data
        else:
            pdf = pdf.get("name") or pdf.get("path")

    # Raw bytes (what a File component with type="binary" passes) go to a
    # temp file so the path-based PDF loader can read them. A generated name
    # is used rather than any uploader-supplied filename.
    if isinstance(pdf, (bytes, bytearray)):
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            handle.write(pdf)
        return handle.name, True

    if isinstance(pdf, str):
        return pdf, False

    # File-like wrappers expose their on-disk location through `.name`.
    name = getattr(pdf, "name", None)
    return (name if isinstance(name, str) else None), False


def _format_chat_history(history):
    """Render stored messages as ``User:`` / ``Assistant:`` lines."""
    labels = {"human": "User", "ai": "Assistant"}
    lines = []
    for msg in history:
        # Check for common attributes in stored messages.
        if hasattr(msg, "role"):
            speaker = msg.role
        elif hasattr(msg, "type"):
            speaker = msg.type
        else:
            lines.append(f"{msg}\n")
            continue
        # Messages with any other role/type are skipped.
        if speaker in labels:
            lines.append(f"{labels[speaker]}: {msg.content}\n")
    return "".join(lines)


def gradio_interface(query, youtube_url, pdf):
    """Gradio callback: answer ``query`` and return the running chat history.

    If a PDF is uploaded it is indexed into a FAISS store first so the answer
    can draw on it. The upload may arrive as raw bytes, a filesystem path, a
    dict carrying ``"data"`` bytes, or a file-like object with a ``.name``
    path; bytes are written to a temporary file that is removed once indexing
    finishes.

    Args:
        query: The user's question.
        youtube_url: Optional video URL (an empty string means none).
        pdf: The uploaded PDF in any of the forms above, or ``None``.

    Returns:
        ``(response, chat_history)``: the model's answer and the full history
        held in the module-level ``memory``, one line per message.
    """
    vector_db = None

    # If a PDF is provided, load it into FAISS
    if pdf is not None:
        pdf_path, is_temporary = _materialize_pdf(pdf)
        if pdf_path is not None and os.path.exists(pdf_path):
            try:
                vector_db = load_documents(pdf_path)
            finally:
                # Only files created here are removed; caller-owned paths are left alone.
                if is_temporary:
                    os.remove(pdf_path)

    # Generate response using the provided inputs
    response = generate_response(query, vector_db=vector_db, youtube_url=youtube_url)

    # Retrieve conversation history (shared across every caller of this app)
    history = memory.load_memory_variables({})["history"]
    chat_history = _format_chat_history(history)

    return response, chat_history

# Build Gradio interface
# Components are created in display order inside the Blocks context, so the
# statement order below defines the page layout.
with gr.Blocks() as app:
    gr.Markdown("# Advanced RAG Chatbot with PDF, YouTube & Web Search")
    # Question and optional YouTube URL sit side by side in one row.
    with gr.Row():
         query_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...", lines=2)
         youtube_input = gr.Textbox(label="Enter YouTube URL (optional)", placeholder="YouTube URL here...", lines=1)

    # NOTE(review): with type="binary" the upload is expected to reach the
    # callback as raw bytes — confirm gradio_interface handles that form.
    pdf_input = gr.File(label="Upload PDF (optional)", file_count="single", type="binary")
    generate_button = gr.Button("Generate Response")
    response_output = gr.Textbox(label="Response", lines=10)
    chat_history_output = gr.Textbox(label="Chat History", lines=10)

    # Wire the button: the three inputs map positionally onto
    # gradio_interface(query, youtube_url, pdf); its two return values fill
    # the response and chat-history boxes.
    generate_button.click(
        fn=gradio_interface,
        inputs=[query_input, youtube_input, pdf_input],
        outputs=[response_output, chat_history_output]
    )

# launch the interface
# Per the recorded output, running this in Colab auto-enables sharing and
# serves the app on a public URL.
app.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cd27c0c9405c7074c4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


