In [6]:
!pip install gradio
!pip install -U openai-whisper
!pip install sentence_transformers
!pip install transformers



In [None]:
# Function to refine text by removing repeated sentences and normalising sentence capitalisation
def refine_text_general(text):
    """Return `text` with duplicate sentences removed and each sentence capitalised.

    The text is split on '. ', each piece is trimmed, its first character is
    upper-cased, and only the first occurrence of each resulting sentence is kept.
    The kept sentences are re-joined with '. ' into a single paragraph.

    Args:
        text: Raw text to clean up.

    Returns:
        The refined text. An empty string is returned when `text` contains no
        sentences. A closing '.' is appended unless the text already ends with
        '.', '!' or '?'.
    """
    refined_sentences = []
    seen_sentences = set()

    for raw_sentence in text.split('. '):
        # Drop surrounding whitespace and any trailing period(s): the final sentence
        # usually still carries its own '.', which would otherwise produce '..' after
        # re-joining and would stop "foo." and "foo" from being recognised as duplicates.
        sentence = raw_sentence.strip().rstrip('.').rstrip()
        if not sentence:
            continue

        # Capitalise BEFORE the duplicate check so the value tested for membership
        # is the same value that gets stored in `seen_sentences`.
        sentence = sentence[0].upper() + sentence[1:]
        if sentence in seen_sentences:
            continue

        seen_sentences.add(sentence)
        refined_sentences.append(sentence)

    if not refined_sentences:
        return ''

    refined_text = '. '.join(refined_sentences)
    # Close the paragraph with a period unless it already ends with terminal punctuation
    if refined_text[-1] not in '.!?':
        refined_text += '.'
    return refined_text

# Load the content from the input text file
input_file_path = '/content/all_transcription.txt'  # Replace with your input file path
# Read as UTF-8 explicitly so the result does not depend on the platform's default
# encoding (the other cells in this notebook also read transcriptions as UTF-8)
with open(input_file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Apply the refinement function
refined_text_general = refine_text_general(raw_text)

# Save the refined text to a new file, again as UTF-8
output_file_path = '/content/refined_transcription_general.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write(refined_text_general)

print("Refinement complete. The refined text has been saved to:", output_file_path)

Refinement complete. The refined text has been saved to: /content/refined_transcription_general.txt


In [22]:
import torch
import whisper
import librosa
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Choose the compute device: CUDA when torch reports a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Whisper "medium" speech-recognition model, loaded onto the chosen device
model_m = whisper.load_model("medium", device=device)

# Question-answering pipeline; it receives device index 0 when running on CUDA and -1 otherwise
qa_pipeline = pipeline(
    'question-answering',
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
    device=-1 if device != "cuda" else 0,
)

# Sentence-embedding model used to rank transcription segments against the question
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

def transcribe_question(input_question_file):
    """Transcribe the spoken question in `input_question_file` and return its text."""
    # librosa resamples to 16 kHz while loading; the returned sample rate is not needed
    samples, _ = librosa.load(input_question_file, sr=16000)
    audio_tensor = torch.tensor(samples, dtype=torch.float32).to(device)

    # Request half precision only when CUDA is available
    transcription = model_m.transcribe(
        audio_tensor,
        language="en",
        fp16=torch.cuda.is_available(),
    )
    return transcription["text"]

def generate_answer(question_text, context):
    """Run the question-answering pipeline on `context` and return the answer string."""
    qa_input = {"question": question_text, "context": context}
    prediction = qa_pipeline(qa_input)
    return prediction["answer"]

def semantic_search(query, segments):
    """Return the segment whose embedding is most similar to the query's embedding.

    Args:
        query: The question text to search with.
        segments: Non-empty list of candidate text segments.

    Returns:
        The element of `segments` with the highest cosine similarity to `query`.

    Raises:
        ValueError: If `segments` is empty. Without this check the failure would
            surface later, inside the embedding/argmax code, with an unclear message.
    """
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    # Embed the query and all segments with the shared sentence-embedding model
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # Cosine similarity of the query against every segment (row 0 = the single query)
    # NOTE(review): the model name ends in "-dot-v1", which suggests it is tuned for
    # dot-product scoring — confirm cosine similarity is the intended score function.
    similarities = util.pytorch_cos_sim(query_embedding, segment_embeddings)[0]

    # Pick the best-scoring segment
    most_similar_idx = torch.argmax(similarities).item()
    return segments[most_similar_idx]

def speech_based_qa_pipeline(input_question_file, transcription_files):
    """Answer a spoken question from a set of transcription files.

    Args:
        input_question_file: Path to the audio file containing the question.
        transcription_files: Iterable of paths to UTF-8 text files; each non-empty
            file becomes one candidate context segment.

    Returns:
        The answer string extracted by the QA pipeline.

    Raises:
        ValueError: If no transcription file yields non-empty text.
    """
    # Step 1: Load each transcription file as its own segment (they are NOT concatenated).
    # This runs before transcription so bad input fails fast, before the costly Whisper call.
    segments = []
    for file_path in transcription_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        # An empty segment carries no context to answer from, so leave it out
        if content:
            segments.append(content)
    if not segments:
        raise ValueError("No non-empty transcription files were provided.")

    # Step 2: Transcribe the question
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Step 3: Perform semantic search to find the most relevant segment
    relevant_segment = semantic_search(question_text, segments)

    # Step 4: Generate answer using the Hugging Face Transformers pipeline with the relevant context
    final_answer = generate_answer(question_text, relevant_segment)
    print("Final Answer:", final_answer)

    return final_answer


  checkpoint = torch.load(fp, map_location=device)
Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import os
import json
import torch
import whisper
import librosa
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Run on CUDA when torch can see a GPU; fall back to the CPU otherwise
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Speech-recognition model (Whisper "medium") placed on the selected device
model_m = whisper.load_model("medium", device=device)

# Question-answering pipeline: device index 0 on CUDA, -1 otherwise
qa_pipeline = pipeline(
    'question-answering',
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
    device=-1 if device != "cuda" else 0,
)

# Embedding model used by the semantic search over transcription segments
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# JSON file that stores user-corrected answers, keyed by question text
feedback_log_path = "feedback_log.json"

# Start from an empty log and replace it with the saved one when the file is present
feedback_log = {}
if os.path.exists(feedback_log_path):
    with open(feedback_log_path, 'r', encoding='utf-8') as log_file:
        feedback_log = json.load(log_file)

def transcribe_question(input_question_file):
    """Return the text Whisper recognises in the question audio file."""
    # Load the audio resampled to 16 kHz; the sample rate returned by librosa is unused
    waveform, _ = librosa.load(input_question_file, sr=16000)
    waveform_tensor = torch.tensor(waveform, dtype=torch.float32)
    waveform_tensor = waveform_tensor.to(device)

    # fp16 is enabled only when CUDA is available
    use_fp16 = torch.cuda.is_available()
    outcome = model_m.transcribe(waveform_tensor, language="en", fp16=use_fp16)
    return outcome["text"]

def generate_answer(question_text, context):
    """Ask the QA pipeline for the answer to `question_text` within `context`."""
    request = dict(question=question_text, context=context)
    return qa_pipeline(request)["answer"]

def semantic_search(query, segments):
    """Return the segment whose embedding is most similar to the query's embedding.

    Args:
        query: The question text to search with.
        segments: Non-empty list of candidate text segments.

    Returns:
        The element of `segments` with the highest cosine similarity to `query`.

    Raises:
        ValueError: If `segments` is empty. Without this check the failure would
            surface later, inside the embedding/argmax code, with an unclear message.
    """
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    # Embed the query and all segments with the shared sentence-embedding model
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # Cosine similarity of the query against every segment (row 0 = the single query)
    # NOTE(review): the model name ends in "-dot-v1", which suggests it is tuned for
    # dot-product scoring — confirm cosine similarity is the intended score function.
    similarities = util.pytorch_cos_sim(query_embedding, segment_embeddings)[0]

    # Pick the best-scoring segment
    most_similar_idx = torch.argmax(similarities).item()
    return segments[most_similar_idx]

def save_feedback(question_text, correct_answer):
    """Remember `correct_answer` for `question_text` and rewrite the feedback log file."""
    # Update the in-memory log first, then persist the whole log as JSON
    feedback_log[question_text] = correct_answer
    with open(feedback_log_path, 'w', encoding='utf-8') as log_file:
        json.dump(feedback_log, log_file, indent=4, ensure_ascii=False)
    print(f"Feedback saved for question '{question_text}'.")

def check_feedback(question_text):
    """Return the stored correction for exactly this question text, or None if there is none."""
    return feedback_log.get(question_text)

def speech_based_qa_pipeline(input_question_file, transcription_files):
    """Answer a spoken question, preferring a previously stored user correction.

    Args:
        input_question_file: Path to the audio file containing the question.
        transcription_files: Iterable of paths to UTF-8 text files; each non-empty
            file becomes one candidate context segment.

    Returns:
        The stored correction if one exists for the transcribed question, otherwise
        the generated answer, or the user's correction when they reject the answer
        and supply a non-empty replacement.

    Raises:
        ValueError: If no transcription file yields non-empty text.
    """
    # Step 1: Transcribe the question
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Reuse a stored correction when this exact question text was corrected before
    # (the lookup is an exact-match dictionary lookup, not a similarity search)
    corrected_answer = check_feedback(question_text)
    if corrected_answer:
        print("Corrected Answer from Feedback Log:", corrected_answer)
        return corrected_answer  # Return the feedback if available

    # Step 2: Load each transcription file as its own segment (they are NOT concatenated)
    segments = []
    for file_path in transcription_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        # An empty segment carries no context to answer from, so leave it out
        if content:
            segments.append(content)
    if not segments:
        raise ValueError("No non-empty transcription files were provided.")

    # Step 3: Perform semantic search to find the most relevant segment
    relevant_segment = semantic_search(question_text, segments)

    # Step 4: Generate answer using the Hugging Face Transformers pipeline with the relevant context
    final_answer = generate_answer(question_text, relevant_segment)
    print("Generated Answer:", final_answer)

    # Step 5: Request feedback from user
    # NOTE(review): input() blocks until someone types on the console; confirm this is
    # acceptable when the pipeline is driven from a UI callback.
    user_feedback = input("Is the answer correct? (yes/no): ").strip().lower()
    if user_feedback == "no":
        corrected_answer = input("Please provide the correct answer: ").strip()
        if corrected_answer:
            save_feedback(question_text, corrected_answer)
            return corrected_answer
        # An empty correction would be stored but never used (the lookup above ignores
        # falsy values) and would be returned as a blank answer, so keep the generated one.
        print("No correction provided; keeping the generated answer.")
        return final_answer
    else:
        print("Answer confirmed as correct.")
        return final_answer

KeyboardInterrupt: 

In [None]:
import torch
import whisper
import librosa
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
import os

# Set the device to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up GenAI API key.
# SECURITY: never hardcode the key in the notebook. It is read from the API_KEY
# environment variable, or prompted for (without echo) when the variable is unset.
# The key that was previously committed here must be treated as leaked and revoked.
from getpass import getpass

if not os.environ.get("API_KEY"):
    os.environ["API_KEY"] = getpass("Enter your GenAI API key: ")
genai.configure(api_key=os.environ["API_KEY"])

# Load Whisper ASR model on the selected device
model_m = whisper.load_model("medium", device=device)

# Load Sentence Transformer model for semantic search
semantic_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')


def transcribe_question(input_question_file):
    """
    Run Whisper on the audio file holding the question and return the recognised text.
    """
    # Read the audio at 16 kHz; only the samples are used, not the returned rate
    audio_samples = librosa.load(input_question_file, sr=16000)[0]
    whisper_input = torch.tensor(audio_samples, dtype=torch.float32).to(device)

    # Transcribe in English, using fp16 only when CUDA is available
    transcribe_options = {"language": "en", "fp16": torch.cuda.is_available()}
    whisper_result = model_m.transcribe(whisper_input, **transcribe_options)
    return whisper_result["text"]


def generate_answer_with_genai(question_text, context, model_name="gemini-1.5-flash"):
    """
    Use GenAI to generate an answer to the question based on the provided context.

    Args:
        question_text: The (possibly imperfectly transcribed) question.
        context: The text passage the answer should be drawn from.
        model_name: Name of the generative model to use.

    Returns:
        The generated answer with surrounding whitespace removed, or None if the
        request fails for any reason (the error is printed).
    """
    # Instructions go first and "Answer:" goes last, so the model sees a well-formed
    # prompt. Previously the instructions were glued directly after "Answer:" with no
    # separator and contained duplicated/garbled wording.
    prompt = (
        "Answer the following question based on the provided context.\n"
        "Give only the answer, with no explanation. The question may contain "
        "spelling or grammar mistakes, so interpret it in the most relevant way. "
        "If the context does not contain an exact answer, give the closest answer "
        "you can; do not reply that the passage does not contain the answer.\n\n"
        f"Question: {question_text}\n"
        f"Context: {context}\n"
        "Answer:"
    )
    try:
        # Use GenAI to generate the answer
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)

        # Extract the generated answer from the response
        return response.text.strip()
    except Exception as e:
        # Best-effort by design: report the failure and let the caller handle None
        print(f"Error generating answer with GenAI: {e}")
        return None


def semantic_search(query, segments):
    """
    Perform semantic search to find the most relevant context segment for the query.

    Args:
        query: The question text to search with.
        segments: Non-empty list of candidate text segments.

    Returns:
        The element of `segments` with the highest cosine similarity to `query`.

    Raises:
        ValueError: If `segments` is empty. Without this check the failure would
            surface later, inside the embedding/argmax code, with an unclear message.
    """
    if len(segments) == 0:
        raise ValueError("semantic_search requires at least one segment to search.")

    # Embed the query and all segments with the shared sentence-embedding model
    query_embedding = semantic_model.encode(query, convert_to_tensor=True)
    segment_embeddings = semantic_model.encode(segments, convert_to_tensor=True)

    # Cosine similarity of the query against every segment (row 0 = the single query)
    # NOTE(review): the model name ends in "-dot-v1", which suggests it is tuned for
    # dot-product scoring — confirm cosine similarity is the intended score function.
    similarities = util.pytorch_cos_sim(query_embedding, segment_embeddings)[0]

    # Pick the best-scoring segment
    most_similar_idx = torch.argmax(similarities).item()
    return segments[most_similar_idx]


def speech_based_qa_pipeline(input_question_file, transcription_files):
    """
    Perform the full speech-based QA pipeline:
    - Load the transcription files as context segments.
    - Transcribe the question from audio.
    - Perform semantic search on context segments.
    - Use GenAI for QA.

    Args:
        input_question_file: Path to the audio file containing the question.
        transcription_files: Iterable of paths to UTF-8 text files; each non-empty
            file becomes one candidate context segment.

    Returns:
        The answer produced by `generate_answer_with_genai` (None if generation failed).

    Raises:
        ValueError: If no transcription file yields non-empty text.
    """
    # Step 1: Load each transcription file as its own segment (they are NOT concatenated).
    # This runs before transcription so bad input fails fast, before the costly Whisper call.
    segments = []
    for file_path in transcription_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        # An empty segment carries no context to answer from, so leave it out
        if content:
            segments.append(content)
    if not segments:
        raise ValueError("No non-empty transcription files were provided.")

    # Step 2: Transcribe the question
    question_text = transcribe_question(input_question_file)
    print("User Question:", question_text)

    # Step 3: Perform semantic search to find the most relevant segment
    relevant_segment = semantic_search(question_text, segments)

    # Step 4: Generate answer using GenAI with the relevant context
    final_answer = generate_answer_with_genai(question_text, relevant_segment)
    print("Final Answer:", final_answer)

    return final_answer


In [23]:
import gradio as gr
import os
import time

# Transcription text files used as context when answering a question.
# They are passed to speech_based_qa_pipeline by the UI callback below.
transcription_files = [
    r"/content/final_combined_output.txt",
    # Add paths to other transcriptions here
]

# Helper function to check if the audio file is fully saved
def wait_for_audio_file(audio_file_path, max_wait_time=5, interval=0.5):
    """Poll until `audio_file_path` exists and is non-empty, or until the timeout.

    Args:
        audio_file_path: Path of the file to wait for.
        max_wait_time: Maximum number of seconds to keep polling.
        interval: Seconds to sleep between checks.

    Returns:
        True as soon as the file exists with a size greater than zero; False if
        that has not happened once `max_wait_time` seconds have elapsed.
    """
    # A monotonic deadline measures real elapsed time (including time spent in the
    # filesystem calls) and guarantees termination even when `interval` is 0.
    deadline = time.monotonic() + max_wait_time
    while True:
        # Always check at least once, so max_wait_time=0 means "check now, don't wait"
        try:
            if os.path.getsize(audio_file_path) > 0:
                return True
        except OSError:
            # File is missing (or vanished between checks); keep waiting
            pass

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline
        time.sleep(min(interval, remaining))

# UI callback: runs the QA pipeline on the submitted audio and returns text for display
def dummy_function(audio_file):
    """Return the pipeline's answer for `audio_file`, or an error message string.

    A missing audio file and any exception raised while processing are both
    reported as text rather than propagated.
    """
    if audio_file is not None:
        try:
            # Process the saved audio file
            return speech_based_qa_pipeline(audio_file, transcription_files)
        except Exception as exc:
            return f"An error occurred during processing: {str(exc)}"
    return "Error: No audio file detected. Please record or upload a valid audio file."

# Set up the Gradio interface.
# The submit callback runs the question-answering pipeline and returns its answer, so
# the headings and labels describe question answering rather than plain transcription.
with gr.Blocks() as iface:
    gr.Markdown("# Voice Question Answering")
    gr.Markdown("Record or upload a spoken question, and the system will answer it from the transcriptions.")

    with gr.Row():
        # type="filepath" makes Gradio pass the callback a path to the saved audio file
        audio_input = gr.Audio(type="filepath", label="Record or Upload Audio")
        output_text = gr.Textbox(label="Answer")

    submit_button = gr.Button("Submit")
    submit_button.click(dummy_function, inputs=audio_input, outputs=output_text)

# debug=True keeps the cell running to surface logs; share=True exposes a public URL
iface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1e7a263ae08053558d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


User Question:  What are the main differences between Indian Sandalwood and Sandalwood varieties found in Australia and Hawaii?




Final Answer: 95% santalol and traces of santalin and santineel
User Question:  How many districts across Karnataka are involved in Srikanthada Garden?
Final Answer: nine
User Question:  What is the commuting distance mentioned in the conversation for software engineers or professionals from a city center?
Final Answer: 60 km
User Question:  Which district is specially mentioned for the efforts of creating a clean environment in the Kuntada Garden project and what is the name of the village in what?
Final Answer: Bijapur
User Question:  in which taluk is Yerevarthukalulla village located as per Sri Kantada Garden Project.
Final Answer: Srirabha
User Question:  Who is credited with providing advice that influences the clean environment efforts in Jalanayana district?
Final Answer: agronomists
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1e7a263ae08053558d.gradio.live


