In [None]:
!pip install gradio
!pip install -U openai-whisper
!pip install sentence_transformers
!pip install transformers



In [None]:
import torch
import whisper
import librosa
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Pick the compute device once; every model below is placed according to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Whisper "medium" checkpoint used for speech-to-text, loaded onto `device`.
model_m = whisper.load_model("medium", device=device)

# Extractive question-answering pipeline (BERT-large fine-tuned on SQuAD).
# The pipeline is given a device index here: 0 when running on CUDA, -1 otherwise.
qa_pipeline = pipeline(
    "question-answering",
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
    device=-1 if device != "cuda" else 0,
)

# Sentence-embedding model used to rank transcript segments against the question.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

def transcribe_question(input_question_file):
    """Transcribe a spoken English question from an audio file using Whisper.

    Args:
        input_question_file: Path to the audio file containing the question.

    Returns:
        str: The transcribed question with surrounding whitespace removed.

    Raises:
        ValueError: If no audio path is supplied (``None`` or an empty string).
    """
    # Fail early with a clear message instead of an opaque error from the audio
    # loader when the caller has no recording to pass on.
    if input_question_file is None or input_question_file == "":
        raise ValueError("No question audio was provided.")

    # Load the audio and have librosa resample it to 16 kHz.
    question_audio, _ = librosa.load(input_question_file, sr=16000)
    whisper_audio = torch.tensor(question_audio, dtype=torch.float32).to(device)

    # Half precision is only requested when CUDA is available.
    result = model_m.transcribe(whisper_audio, language="en", fp16=torch.cuda.is_available())

    # The raw transcription text can carry a leading space; strip it so callers
    # receive a clean question string.
    return result["text"].strip()

def generate_answer(question_text, context):
    """Extract the answer to ``question_text`` from ``context``.

    Args:
        question_text: The question to answer.
        context: The passage the answer span is extracted from.

    Returns:
        The value stored under the ``"answer"`` key of the pipeline result.
    """
    # Use the documented keyword-argument call form; passing a single
    # SQuAD-style dict positionally is a legacy form of the QA pipeline API.
    answer = qa_pipeline(question=question_text, context=context)
    return answer["answer"]

def semantic_search(query, segments):
    """Return the segment that is most similar to ``query``.

    The query and every segment are embedded with ``semantic_model`` and
    compared by cosine similarity; the highest-scoring segment wins.

    Args:
        query: The text to search for.
        segments: Non-empty sequence of candidate text segments.

    Returns:
        The element of ``segments`` with the highest cosine similarity to
        ``query``.

    Raises:
        ValueError: If ``segments`` is empty.
    """
    # Without candidates there is nothing to rank; report that explicitly
    # rather than failing later inside the tensor operations.
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment.")

    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against each segment.
    similarities = util.pytorch_cos_sim(query_embedding, segment_embeddings)[0]

    most_similar_idx = torch.argmax(similarities).item()
    return segments[most_similar_idx]

def speech_based_qa_pipeline(input_question_file, transcription_files):
    """Answer a spoken question from a set of transcript files.

    The question audio is transcribed, every transcript file is read as one
    whitespace-stripped segment, the segment closest to the question is
    selected, and the answer is extracted from that segment. The question and
    the answer are printed along the way.

    Args:
        input_question_file: Path to the audio file holding the question.
        transcription_files: Iterable of paths to UTF-8 transcript text files.

    Returns:
        The answer produced by ``generate_answer`` for the selected segment.
    """

    def _read_segment(path):
        # One file -> one segment, with leading/trailing whitespace removed.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    # 1. Speech -> text for the question.
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # 2. Each transcript file stays a separate candidate segment.
    segments = [_read_segment(path) for path in transcription_files]

    # 3. Choose the segment most relevant to the question.
    best_segment = semantic_search(question_text, segments)

    # 4. Extract the answer from the chosen segment.
    final_answer = generate_answer(question_text, best_segment)
    print("Final Answer:", final_answer)
    return final_answer


Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import gradio as gr

# Transcript text files searched for an answer; speech_based_qa_pipeline reads
# each listed file as one separate segment.
# NOTE(review): this is an absolute "/content/..." path and the file name is
# spelled "transription" - confirm both match the file actually present.
transcription_files = [
    r"/content/refined_transription.txt",
    # Add paths to other transcriptions here
]

# Gradio callback. Despite its name this is not a stub: it runs the complete
# speech QA pipeline (transcription, semantic search, answer extraction).
def dummy_function(audio_file):
    """Answer the spoken question stored in ``audio_file``.

    Args:
        audio_file: Path to the question audio, or ``None``/empty when the
            form was submitted without a recording.

    Returns:
        The answer returned by ``speech_based_qa_pipeline`` for the
        module-level ``transcription_files``.

    Raises:
        gr.Error: If no audio was supplied, so the UI can show a readable
            message instead of a generic failure.
    """
    if not audio_file:
        raise gr.Error("Please record or upload a question before submitting.")
    return speech_based_qa_pipeline(audio_file, transcription_files)

# Build the Gradio UI around the question-answering callback.
iface = gr.Interface(
    fn=dummy_function,  # runs the full speech QA pipeline for each submission
    inputs=gr.Audio(type="filepath"),  # audio is handed to the callback as a file path
    outputs="text",
    # Title and description describe what the app actually returns: an answer
    # extracted from the transcripts, not the raw transcription of the recording.
    title="Speech-Based Question Answering",
    description="Record or upload a spoken question and see the answer found in the transcripts.",
)

# debug=True keeps this cell running so errors and logs stay visible in the notebook.
iface.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0804cd1e2c19bf2cc0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


User Question:  What are the unique qualities of sandalwood growing in Karnataka that make it valuable in global market?
Final Answer: essential oils and heartwood
Final Answer: essential oils and heartwood
