In [1]:
!pip install gradio
!pip install -U openai-whisper
!pip install sentence_transformers
!pip install transformers

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.7.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

# QA models for answering questions given as audio input
## Whisper was used to transcribe the audio

In [None]:
import torch
import whisper
import librosa
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Whisper "medium" checkpoint: turns the spoken question into text.
model_m = whisper.load_model("medium", device=device)

# Extractive question-answering pipeline. It is given device index 0 when
# running on CUDA and -1 otherwise.
qa_pipeline = pipeline(
    'question-answering',
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
    device=0 if device == "cuda" else -1,
)

# Sentence-embedding model used to rank transcript segments against the question.
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

def transcribe_question(input_question_file):
    """Return Whisper's English transcription of the audio file at the given path.

    The file is decoded by librosa at a 16 kHz sampling rate, converted to a
    float32 tensor on ``device`` and passed to ``model_m.transcribe``. Half
    precision is requested only when CUDA is available.
    """
    # librosa also returns the sampling rate; it is pinned by ``sr`` so it is ignored.
    samples, _ = librosa.load(input_question_file, sr=16000)
    audio_tensor = torch.tensor(samples, dtype=torch.float32)
    audio_tensor = audio_tensor.to(device)
    transcription = model_m.transcribe(
        audio_tensor,
        language="en",
        fp16=torch.cuda.is_available(),
    )
    return transcription["text"]

def generate_answer(question_text, context):
    """Extract an answer to ``question_text`` from ``context``.

    The question and context are handed to ``qa_pipeline`` as a single dict,
    and the ``"answer"`` field of its result is returned.
    """
    qa_input = {"question": question_text, "context": context}
    prediction = qa_pipeline(qa_input)
    return prediction["answer"]

def semantic_search(query, segments):
    """Return the segment that best matches ``query``.

    Args:
        query: Question text to search for.
        segments: Sequence of candidate text segments.

    Returns:
        The element of ``segments`` whose embedding has the highest
        dot-product score against the query embedding.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    # Fail with a clear message instead of an obscure error from the encoder
    # or from argmax on an empty tensor.
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # 'multi-qa-mpnet-base-dot-v1' is trained for dot-product scoring and its
    # embeddings are not normalised, so rank with dot_score rather than cosine.
    scores = util.dot_score(query_embedding, segment_embeddings)[0]

    # Index of the highest-scoring segment.
    most_similar_idx = torch.argmax(scores).item()
    return segments[most_similar_idx]

def speech_based_qa_pipeline(input_question_file, transcription_files):
    """Answer a spoken question from a set of transcript files.

    The question audio is transcribed, every transcript file is read as its
    own segment (whitespace-stripped), the best-matching segment is chosen by
    ``semantic_search`` and ``generate_answer`` extracts the answer from it.
    The question and the answer are printed; the answer is returned.
    """
    # Step 1: speech -> text.
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Step 2: one segment per transcript file (the files are not merged).
    def _read_stripped(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    segments = [_read_stripped(path) for path in transcription_files]

    # Steps 3 and 4: pick the most relevant segment, then answer from it.
    final_answer = generate_answer(
        question_text,
        semantic_search(question_text, segments),
    )
    print("Final Answer:", final_answer)

    return final_answer


  checkpoint = torch.load(fp, map_location=device)
Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import os
import json
import torch
import whisper
import librosa
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Pick the compute device once: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Speech-to-text model (Whisper "medium") placed on the chosen device.
model_m = whisper.load_model("medium", device=device)

# Question-answering pipeline; it receives device index 0 on CUDA and -1 otherwise.
qa_pipeline = pipeline(
    'question-answering',
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
    device=0 if device == "cuda" else -1,
)

# Embedding model that semantic_search uses to compare the question with segments.
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# JSON file in which user-supplied corrections are persisted.
feedback_log_path = "feedback_log.json"

# Start with an empty log and replace it with the saved one when the file exists.
feedback_log = {}
if os.path.exists(feedback_log_path):
    with open(feedback_log_path, 'r', encoding='utf-8') as f:
        feedback_log = json.load(f)

def transcribe_question(input_question_file):
    """Transcribe the spoken question stored at ``input_question_file``.

    librosa loads the file resampled to 16 kHz; the samples become a float32
    tensor on ``device`` and are transcribed as English by ``model_m``. The
    ``"text"`` field of Whisper's result is returned.
    """
    waveform, _ = librosa.load(input_question_file, sr=16000)
    waveform_tensor = torch.tensor(waveform, dtype=torch.float32).to(device)
    # fp16 is only requested when CUDA is available.
    use_half_precision = torch.cuda.is_available()
    whisper_output = model_m.transcribe(
        waveform_tensor, language="en", fp16=use_half_precision
    )
    return whisper_output["text"]

def generate_answer(question_text, context):
    """Return the answer span ``qa_pipeline`` extracts from ``context``.

    The pipeline is called with one dict holding the ``"question"`` and
    ``"context"`` entries; only the ``"answer"`` field of its output is returned.
    """
    request = dict(question=question_text, context=context)
    return qa_pipeline(request)["answer"]

def semantic_search(query, segments):
    """Return the segment that best matches ``query``.

    Args:
        query: Question text to search for.
        segments: Sequence of candidate text segments.

    Returns:
        The element of ``segments`` whose embedding has the highest
        dot-product score against the query embedding.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    # Fail with a clear message instead of an obscure error from the encoder
    # or from argmax on an empty tensor.
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # 'multi-qa-mpnet-base-dot-v1' is trained for dot-product scoring and its
    # embeddings are not normalised, so rank with dot_score rather than cosine.
    scores = util.dot_score(query_embedding, segment_embeddings)[0]

    # Index of the highest-scoring segment.
    most_similar_idx = torch.argmax(scores).item()
    return segments[most_similar_idx]

def save_feedback(question_text, correct_answer):
    """Record a corrected answer and rewrite the feedback file.

    The in-memory ``feedback_log`` gets ``correct_answer`` stored under
    ``question_text``; the whole log is then written to ``feedback_log_path``
    as indented, non-ASCII-escaped JSON and a confirmation line is printed.
    """
    feedback_log.update({question_text: correct_answer})
    with open(feedback_log_path, 'w', encoding='utf-8') as log_file:
        log_file.write(json.dumps(feedback_log, ensure_ascii=False, indent=4))
    print(f"Feedback saved for question '{question_text}'.")

def check_feedback(question_text):
    """Return the correction stored for exactly this question text, or None."""
    if question_text in feedback_log:
        return feedback_log[question_text]
    return None

def speech_based_qa_pipeline(input_question_file, transcription_files):
    """Answer a spoken question, reusing or collecting user corrections.

    Args:
        input_question_file: Path of the audio file containing the question.
        transcription_files: Paths of the transcript text files to search.

    Returns:
        A previously stored correction for the question if one exists;
        otherwise the user's new correction when they reject the generated
        answer and supply a non-blank one; otherwise the generated answer.

    Notes:
        Prompts on stdin via ``input``, so this is meant for interactive use.
    """
    # Step 1: Transcribe the question
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Reuse a stored correction. The lookup is an exact match on the
    # transcribed text, and blank entries are deliberately ignored.
    corrected_answer = check_feedback(question_text)
    if corrected_answer:
        print("Corrected Answer from Feedback Log:", corrected_answer)
        return corrected_answer

    # Step 2: Read each transcript file as its own segment
    segments = []
    for file_path in transcription_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            segments.append(f.read().strip())

    # Step 3: Perform semantic search to find the most relevant segment
    relevant_segment = semantic_search(question_text, segments)

    # Step 4: Extract the answer from the relevant segment
    final_answer = generate_answer(question_text, relevant_segment)
    print("Generated Answer:", final_answer)

    # Step 5: Request feedback from the user. Both "no" and "n" reject the
    # answer; anything else is taken as confirmation.
    user_feedback = input("Is the answer correct? (yes/no): ").strip().lower()
    if user_feedback not in ("no", "n"):
        print("Answer confirmed as correct.")
        return final_answer

    corrected_answer = input("Please provide the correct answer: ").strip()
    if not corrected_answer:
        # A blank correction carries no information: do not persist it and
        # do not return it in place of the generated answer.
        print("No correction provided; keeping the generated answer.")
        return final_answer

    save_feedback(question_text, corrected_answer)
    return corrected_answer

KeyboardInterrupt: 

In [2]:
import getpass
import os

import google.generativeai as genai
import librosa
import torch
import whisper
from sentence_transformers import SentenceTransformer, util

# Set the device to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configure the GenAI client without embedding the key in the notebook.
# The key is read from the API_KEY environment variable; when that is unset
# the user is prompted for it (input is not echoed) and the value is stored
# in the environment for the rest of the session.
# NOTE(security): any key that was ever committed in this cell must be
# treated as compromised and revoked.
if not os.environ.get("API_KEY"):
    os.environ["API_KEY"] = getpass.getpass("Enter your Google GenAI API key: ").strip()
if not os.environ["API_KEY"]:
    raise RuntimeError("No GenAI API key provided; set the API_KEY environment variable.")
genai.configure(api_key=os.environ["API_KEY"])

# Load Whisper ASR model on the selected device
model_m = whisper.load_model("medium", device=device)

# Load Sentence Transformer model for semantic search
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')


def transcribe_question(input_question_file):
    """
    Transcribe the question contained in an audio file using Whisper.

    The audio is loaded with librosa at 16 kHz, moved to ``device`` as a
    float32 tensor and transcribed as English; the resulting text is returned.
    """
    target_rate = 16000
    audio_samples = librosa.load(input_question_file, sr=target_rate)[0]
    audio_on_device = torch.tensor(audio_samples, dtype=torch.float32).to(device)
    # Half precision is requested only when CUDA is available.
    outcome = model_m.transcribe(
        audio_on_device,
        language="en",
        fp16=torch.cuda.is_available(),
    )
    return outcome["text"]


def generate_answer_with_genai(question_text, context, model_name="gemini-1.5-flash"):
    """
    Use GenAI to generate an answer to the question based on the provided context.

    Args:
        question_text: The (possibly imperfectly transcribed) question.
        context: Text passage the answer should be based on.
        model_name: Name of the generative model to use.

    Returns:
        The generated answer with surrounding whitespace removed, or ``None``
        if anything goes wrong while building the prompt or calling the model
        (the error is printed).
    """
    try:
        # Instructions come first and the prompt ends with "Answer:" so the
        # model's completion is the answer itself.
        prompt = (
            "Answer the following question based on the provided context.\n"
            "Give only the answer, with no explanation. The question may contain "
            "spelling or grammar mistakes, so interpret it in the most relevant way. "
            "If the context does not contain an exact answer, give the closest answer "
            "you can; never reply that the passage does not contain the answer.\n\n"
            f"Question: {question_text}\n"
            f"Context: {context}\n"
            "Answer:"
        )
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)

        # Extract the generated answer from the response
        return response.text.strip()
    except Exception as e:
        print(f"Error generating answer with GenAI: {e}")
        return None


def semantic_search(query, segments):
    """
    Perform semantic search to find the most relevant context segment for the query.

    Args:
        query: Question text to search for.
        segments: Sequence of candidate text segments.

    Returns:
        The element of ``segments`` whose embedding has the highest
        dot-product score against the query embedding.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    # Fail with a clear message instead of an obscure error from the encoder
    # or from argmax on an empty tensor.
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    # Embed the query and the segments
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # 'multi-qa-mpnet-base-dot-v1' is trained for dot-product scoring and its
    # embeddings are not normalised, so rank with dot_score rather than cosine.
    scores = util.dot_score(query_embedding, segment_embeddings)[0]

    # Get the index of the highest-scoring segment
    most_similar_idx = torch.argmax(scores).item()
    return segments[most_similar_idx]


def speech_based_qa_pipeline(input_question_file, transcription_files):
    """
    Run the full speech-based QA pipeline and return the answer.

    - Transcribe the question from audio (and print it).
    - Read every transcript file as its own whitespace-stripped segment.
    - Pick the most relevant segment with semantic search.
    - Ask GenAI for the answer using that segment as context (and print it).
    """
    # Step 1: speech -> text
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Step 2: one segment per transcript file (the files are not merged)
    def _load_segment(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    segments = list(map(_load_segment, transcription_files))

    # Step 3: choose the context, Step 4: generate the answer from it
    best_context = semantic_search(question_text, segments)
    final_answer = generate_answer_with_genai(question_text, best_context)
    print("Final Answer:", final_answer)

    return final_answer


  from tqdm.autonotebook import tqdm, trange
100%|█████████████████████████████████████| 1.42G/1.42G [00:31<00:00, 48.4MiB/s]
  checkpoint = torch.load(fp, map_location=device)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
import gradio as gr
import os

# Transcript text files handed to speech_based_qa_pipeline as the material to
# answer from (currently a single file: refined text transcribed from audio).
transcription_files = [r"/content/final_combined_output.txt"]

def validate_audio(audio_file):
    """
    Validates the audio file and confirms if it is ready for processing.

    Args:
        audio_file: Path to the recorded or uploaded audio file (may be
            ``None`` or empty when nothing has been provided).

    Returns:
        A status message: a success message when the path points to an
        existing, non-empty regular file; otherwise an error message.
    """
    invalid_message = "Error: No valid audio file detected. Please record or upload a valid audio file."
    if not audio_file:
        return invalid_message
    try:
        # A directory or a zero-byte file cannot contain audio, so both are
        # rejected alongside missing paths.
        if not os.path.isfile(audio_file) or os.path.getsize(audio_file) == 0:
            return invalid_message
        return "Audio processed successfully! You can now submit your question."
    except Exception as e:
        # e.g. the file disappears between the two checks.
        return f"An error occurred during validation: {str(e)}"


def process_audio_question(audio_file):
    """
    Processes the audio file and provides an answer.

    Args:
        audio_file: Path to the recorded or uploaded audio file.

    Returns:
        The answer produced by ``speech_based_qa_pipeline`` for the configured
        ``transcription_files``, or an error message when the file is missing,
        the pipeline raises, or the pipeline produces no answer.
    """
    if not audio_file or not os.path.isfile(audio_file):
        return "Error: No valid audio file detected. Please record or upload a valid audio file."
    try:
        answer = speech_based_qa_pipeline(audio_file, transcription_files)
    except Exception as e:
        return f"An error occurred during processing: {str(e)}"
    if answer is None:
        # The pipeline returns None when answer generation failed; show a
        # message instead of an empty answer box.
        return "An error occurred during processing: no answer could be generated."
    return answer


# Gradio UI: an audio input, a status box, an answer box and a submit button.
with gr.Blocks() as iface:
    gr.Markdown("# Sandalwood Stories QA System")
    gr.Markdown("Record or upload your voice asking a question, and the system will provide an answer based on Sandalwood stories.")

    with gr.Row():
        # type="filepath": the handlers below treat the value as a path on disk
        # (they check it with os.path before using it).
        audio_input = gr.Audio(type="filepath", label="Question (Record or Upload Audio)")
        # Read-only boxes for the validation status and the final answer.
        validate_output = gr.Textbox(label="Status", interactive=False)
        output_text = gr.Textbox(label="Answer", interactive=False)

    submit_button = gr.Button("Get Answer")

    # Handle audio input
    def handle_audio_input(audio_file):
        """
        Validate the audio file and return the status message to display.

        Thin wrapper that forwards ``audio_file`` to ``validate_audio``.
        """
        return validate_audio(audio_file)

    # Connect audio input to validation: whenever the audio value changes,
    # the status box is refreshed with the validation message.
    audio_input.change(
        handle_audio_input,
        inputs=audio_input,
        outputs=validate_output
    )

    # Submission step: clicking the button runs the QA pipeline on the current
    # audio and writes the result into the answer box.
    submit_button.click(
        process_audio_question,
        inputs=audio_input,
        outputs=output_text
    )

# debug=True keeps the cell running so errors and logs stay visible (in Colab
# the cell runs until interrupted); share=True exposes the app through a
# temporary public gradio.live URL.
iface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f679f77c88ed78002b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


User Question:  What are the unique qualities of sandalwood growing in Karnataka that make it valuable in global market?
Final Answer: India's sandalwood, prized for its chemical composition (up to 95% santalol, traces of santalin and santineel), achieves superior quality due to Karnataka's unique soil, water, and climate.
User Question:  Why does this not work?
Final Answer: The provided text offers numerous examples of agricultural projects and initiatives, many of which encountered challenges.  There is no single "this" to refer to, making it impossible to answer "Why does this not work?" without more context.  Please specify which project or initiative you are referring to.
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f679f77c88ed78002b.gradio.live


