In [1]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.7.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [4]:
# !pip install openai-whisper
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu117
!pip install pymongo[srv]

Looking in indexes: https://download.pytorch.org/whl/cu117
Collecting pymongo[srv]
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [6]:
# !pip install langchain_community
!pip install transformers



In [7]:
import os
os.environ["HF_TOKEN"] = "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"

In [8]:
!pip install huggingface_hub



In [9]:
from huggingface_hub import login
import os

# Authenticate using the token from the environment variable
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [1]:
import gradio as gr
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from urllib.parse import quote_plus
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import DirectoryLoader
from transformers import pipeline
import whisper
import torch
import json
import os

# MongoDB setup
username = quote_plus("valmik0000000")
password = quote_plus("valmik@mongo7")  # Replace with your actual password

uri = f"mongodb+srv://{username}:{password}@valmikcluster0.hdqee.mongodb.net/?retryWrites=true&w=majority&appName=ValmikCluster0"
client = MongoClient(uri, server_api=ServerApi('1'), tls=True)

try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(f"Error: {e}")

# Feedback file
FEEDBACK_FILE = "feedback.json"

# Load or initialize feedback data
if os.path.exists(FEEDBACK_FILE):
    with open(FEEDBACK_FILE, "r") as f:
        feedback_data = json.load(f)
else:
    feedback_data = {}

# Save feedback to file
def save_feedback_to_file():
    with open(FEEDBACK_FILE, "w") as f:
        json.dump(feedback_data, f, indent=4)

# Database and collection setup for QA
dbName = "ML_Fiesta"
collectionName = "translations"
collection = client[dbName][collectionName]

# Define the embedding model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Check if the data already exists in the vector store
if collection.estimated_document_count() == 0:
    print("No data found in the vector store. Loading documents and creating the vector store.")
    loader = DirectoryLoader(
        r"/content/drive/MyDrive/ML_Fiesta_Mongo_DB_Vectorization/translations",
        glob="./*.txt",
        show_progress=True
    )
    data = loader.load()
    vectorStore = MongoDBAtlasVectorSearch.from_documents(data, collection=collection, embedding=embeddings)
else:
    print("Data already exists in the vector store. Connecting to existing vector store.")
    vectorStore = MongoDBAtlasVectorSearch(collection=collection, embedding=embeddings)

# Define the question-answering pipeline
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad", device="cuda" if torch.cuda.is_available() else "cpu")

# Load Whisper model for audio transcription
whisper_model = whisper.load_model("base", device="cuda" if torch.cuda.is_available() else "cpu")

# Global variables to store the last query and answer
last_query = ""
last_answer = ""

def transcribe_audio(audio_file):
    """
    Transcribes the given audio file using Whisper.
    """
    result = whisper_model.transcribe(audio_file)
    return result['text']

def query_data(query) -> str:
    """
    Querying data loaded in MongoDB and returning an answer.
    """
    # Check for feedback corrections in the feedback file
    if query in feedback_data:
        print("Using corrected answer from feedback file.")
        return feedback_data[query]  # Use the corrected answer

    # If no corrections, use the vector store and QA pipeline
    retriever = vectorStore.as_retriever(search_kwargs={"K": 61})
    retrieved_docs = retriever.get_relevant_documents(query)
    context = " ".join([doc.page_content for doc in retrieved_docs])
    response = qa_pipeline(question=query, context=context)
    return response['answer']

def handle_query(audio):
    """
    Handles user queries from audio input and returns results.
    """
    global last_query, last_answer
    # Transcribe the audio input
    query = transcribe_audio(audio)
    last_query = query  # Store the query for later feedback
    answer = query_data(query)
    last_answer = answer  # Store the answer for later correction

    # Return the question and the answer
    return f"Question: {query}\nAnswer: {answer}"

# def submit_feedback(feedback_text):
#     """
#     Handles feedback submission and updates the answer if the feedback indicates a wrong answer.
#     """
#     global last_query, last_answer
#     if last_query == "":
#         return "No query has been submitted yet. Please submit a query first."

#     if feedback_text.lower() in ['wrong', 'incorrect', 'not correct']:
#         # Prompt the user for the correct answer
#         corrected_answer = input(f"Please provide the correct answer for the question '{last_query}': ")
#         # Store the feedback and corrected answer in the feedback file
#         feedback_data[last_query] = corrected_answer
#         save_feedback_to_file()
#         print("Feedback stored successfully! Answer has been corrected.")
#         return f"The answer has been corrected. The new answer is: {corrected_answer}"
#     else:
#         # Store feedback without corrections
#         feedback_data[last_query] = last_answer
#         save_feedback_to_file()
#         print("Feedback stored successfully!")
#         return f"Feedback for the query '{last_query}' has been stored."
def submit_feedback(feedback_text):
    """
    Handles feedback submission and updates the answer if the feedback indicates a wrong answer.
    """
    global last_query, last_answer
    if last_query == "":
        return "No query has been submitted yet. Please submit a query first."

    if feedback_text.lower() in ['wrong', 'incorrect', 'not correct']:
        # Prompt the user for the correct answer
        corrected_answer = input(f"Please provide the correct answer for the question '{last_query}': ")
        if corrected_answer.strip():  # Ensure the corrected answer is not empty
            # Store the feedback and corrected answer in the feedback file
            feedback_data[last_query] = corrected_answer
            save_feedback_to_file()
            print("Feedback stored successfully! Answer has been corrected.")
            return f"The answer has been corrected. The new answer is: {corrected_answer}"
        else:
            return "No corrected answer provided. Feedback not stored."
    else:
        # Store feedback without corrections
        feedback_data[last_query] = last_answer
        save_feedback_to_file()
        print("Feedback stored successfully!")
        return f"Feedback for the query '{last_query}' has been stored."


# Gradio Blocks Setup
with gr.Blocks() as query_block:
    gr.Markdown("# Voice-based QA System with Feedback")
    gr.Markdown("Ask questions using your voice in Kannada or English, get answers, and provide feedback. If an answer is incorrect, you can provide the correct answer, and it will be stored.")

    with gr.Row():
        with gr.Column():
            query_input = gr.Audio(type="filepath", label="Upload your audio query")
            query_button = gr.Button("Submit Query")
            query_output = gr.Textbox(label="Response")

        with gr.Column():
            feedback_input = gr.Textbox(placeholder="Provide feedback on the answer here", label="Feedback")
            feedback_button = gr.Button("Submit Feedback")
            feedback_output = gr.Textbox(label="Feedback Result")

    query_button.click(fn=handle_query, inputs=query_input, outputs=query_output)
    feedback_button.click(fn=submit_feedback, inputs=feedback_input, outputs=feedback_output)

# Launch the Gradio interface
query_block.launch(debug=True)

Pinged your deployment. You successfully connected to MongoDB!


  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Data already exists in the vector store. Connecting to existing vector store.


  vectorStore = MongoDBAtlasVectorSearch(collection=collection, embedding=embeddings)
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  checkpoint = torch.load(fp, map_location=device)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://36f65c5c2c5bac3cd4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  retrieved_docs = retriever.get_relevant_documents(query)


Feedback stored successfully! Answer has been corrected.




Using corrected answer from feedback file.


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 624, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 2015, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1562, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://36f65c5c2c5bac3cd4.gradio.live


