<a href="https://colab.research.google.com/github/KamoLovesCode/dr/blob/main/Talk_To_Me.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai-whisper transformers soundfile numpy torchaudio IPython

In [None]:
import whisper

model = whisper.load_model("base")

In [None]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

# JavaScript source for an in-notebook recorder. It defines `sleep` and an async
# `record` function; `record()` resolves with the base64 payload of the clip.
RECORD = """
// Resolves after `time` milliseconds.
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))

// Shows a Record/Stop widget, captures microphone audio as WebM/Opus, offers
// the clip as a download and resolves with its base64-encoded contents.
const record = async () => {
  const options = {  mimeType: 'audio/webm; codecs=opus' }
  // NOTE(review): assumes the notebook output frame exposes '#output-area';
  // falls back to <body> when it does not.
  const container = document.querySelector('#output-area') || document.body
  const div = document.createElement('div')
  const live = document.createElement('button')
  live.appendChild(document.createTextNode('🎤 Record'))
  div.appendChild(live)

  const rec = document.createElement('button')
  rec.appendChild(document.createTextNode('⏹ Stop'))

  const msg = document.createElement('div')
  msg.appendChild(document.createTextNode('Press 🎤 to start recording'))
  div.appendChild(msg)
  container.appendChild(div)

  // Do not touch the microphone until the user actually presses Record.
  await new Promise(resolve => { live.onclick = resolve })

  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream, options)
  let  data = []
  recorder.ondataavailable = e => data.push(e.data)
  // The final chunk is delivered before 'stop' fires, so waiting for the event
  // is reliable where a fixed delay is not.
  const stopped = new Promise(resolve => { recorder.onstop = resolve })
  recorder.start()
  live.replaceWith(rec)
  msg.textContent = 'Recording...'

  await new Promise(resolve => { rec.onclick = resolve })
  recorder.stop()
  await stopped
  // Release the microphone so the browser stops showing it as in use.
  stream.getTracks().forEach(track => track.stop())

  // Blob takes its MIME type from the `type` option, not `mimeType`.
  const  blob = new Blob(data, { type: recorder.mimeType })
  const  url = URL.createObjectURL(blob)
  const a = document.createElement('a')
  a.href = url
  a.download = 'audio.webm'
  a.click()
  container.removeChild(div)
  return await new Promise(resolve => {
    const reader = new FileReader()
    // A data URL looks like 'data:<mime>;base64,<payload>'; keep the payload.
    reader.onloadend = () => resolve(reader.result.split(',')[1])
    reader.readAsDataURL(blob)
  })
}
"""

Now run the cell below to record your voice.

In [None]:
from IPython.display import Javascript, display
from google.colab import output
from base64 import b64decode
import threading

def record_audio(filename='audio.webm'):
    """Show a Record/Stop widget in the cell output and save the recording.

    The browser-side JavaScript captures microphone audio as WebM/Opus and
    sends it back as a base64 data URL through a callback registered with
    ``google.colab.output``; the callback decodes it and writes ``filename``.

    The function returns as soon as the widget has been displayed. The file is
    written later, once the user has pressed Stop and the browser has invoked
    the callback.

    Args:
        filename: Path of the file the recording is written to.
    """

    js_code = """
        const record = async () => {
          const options = {  mimeType: 'audio/webm; codecs=opus' }
          // NOTE(review): assumes the output frame exposes '#output-area';
          // falls back to <body> when it does not.
          const container = document.querySelector('#output-area') || document.body
          const div = document.createElement('div')
          const live = document.createElement('button')
          live.appendChild(document.createTextNode('🎤 Record'))
          div.appendChild(live)

          const rec = document.createElement('button')
          rec.appendChild(document.createTextNode('⏹ Stop'))

          const msg = document.createElement('div')
          msg.appendChild(document.createTextNode('Press 🎤 to start recording'))
          div.appendChild(msg)
          container.appendChild(div)

          // Do not touch the microphone until the user presses Record.
          await new Promise(resolve => { live.onclick = resolve })

          const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
          const recorder = new MediaRecorder(stream, options)
          let  data = []
          recorder.ondataavailable = e => data.push(e.data)
          // The final chunk arrives before 'stop' fires, so waiting for the
          // event is reliable where a fixed delay is not.
          const stopped = new Promise(resolve => { recorder.onstop = resolve })
          recorder.start()
          live.replaceWith(rec)
          msg.textContent = 'Recording...'

          await new Promise(resolve => { rec.onclick = resolve })
          recorder.stop()
          await stopped
          // Release the microphone so the browser stops showing it as in use.
          stream.getTracks().forEach(track => track.stop())

          // Blob takes its MIME type from the `type` option, not `mimeType`.
          const  blob = new Blob(data, { type: recorder.mimeType })
          container.removeChild(div)

          // Read the blob and send it to the Python callback
          const reader = new FileReader();
          reader.onloadend = () => {
            google.colab.kernel.invokeFunction('notebook.record_audio_callback', [reader.result], {});
          };
          reader.readAsDataURL(blob);
        }
        // Surface failures (e.g. microphone permission denied) in the console
        // instead of leaving an unhandled promise rejection.
        record().catch(error => console.error('Recording failed:', error));
    """

    def record_audio_callback(data_url):
        """Decode the base64 payload of ``data_url`` and write it to ``filename``."""
        # A data URL is '<header>,<payload>'; partition yields an empty payload
        # rather than raising when the comma is missing.
        payload = data_url.partition(',')[2]
        binary = b64decode(payload)
        with open(filename, 'wb') as f:
            f.write(binary)
        print(f"Audio saved to {filename}")

    output.register_callback('notebook.record_audio_callback', record_audio_callback)
    display(Javascript(js_code))

# Run record_audio on a background thread so this cell does not wait on it.
# NOTE(review): record_audio only registers a callback and displays the widget,
# so calling it directly may work just as well — confirm before simplifying.
threading.Thread(target=record_audio).start()

In [None]:
!pip install openai-whisper transformers soundfile numpy torchaudio IPython

import whisper

# Load the "base" Whisper checkpoint. This repeats the load from the earlier
# cell so that this cell can also be run on its own.
model = whisper.load_model("base")

# Transcribe the file written by the recorder cell; the recognized text is
# read from the "text" entry of the result and kept for the cells below.
audio_file = "audio.webm"
result = model.transcribe(audio_file)
transcribed_text = result["text"]
print(f"Transcribed Text: {transcribed_text}")

Now we can use a language model to generate a response based on the transcribed text. You'll need an API key to use the Gemini API. If you don't already have one, create a key in Google AI Studio.

In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `GOOGLE_API_KEY`. Then pass the key to the SDK:

In [None]:
# Import the Python SDK
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata

# Read the key saved under the name GOOGLE_API_KEY in Colab's secrets manager
# and pass it to the Gemini SDK.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

Before you can make any API calls, you need to initialize the Generative Model.

In [None]:
# Create the Gemini model handle that the following cell uses to generate a
# reply. Requires genai.configure(...) to have been called with an API key.
gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

Now, let's generate a response using the transcribed text.

In [None]:
# Send the transcription, unchanged, to Gemini as the prompt.
# NOTE(review): presumably fails when transcribed_text is empty (no speech
# recognized) — confirm and guard if needed.
prompt = transcribed_text
response = gemini_model.generate_content(prompt)

# Print the text of the generated reply
print("Generated Response:")
print(response.text)

# Task
Create a simple web application using Python that records audio, transcribes it using the Whisper model, and then uses the Gemini model to generate a response based on the transcription. The application should display both the transcribed text and the generated response.

## Set up the web framework

### Subtask:
Install necessary libraries and set up a basic web framework (like Flask or Streamlit).


**Reasoning**:
Install the Flask library.



In [None]:
!pip install Flask

**Reasoning**:
Create a basic Flask application in a new file and define a home route.



In [None]:
# This code would typically go into a file named app.py
from flask import Flask

app = Flask(__name__)

@app.route('/')
def hello_world():
    """Answer requests for the root URL with a fixed greeting."""
    greeting = 'Hello, World!'
    return greeting

# To run this, you would typically use:
# if __name__ == '__main__':
#     app.run(debug=True)

# Note: In a standard Colab environment, running Flask directly like this
# might require additional setup for external access or using tools like ngrok.
# For this subtask, we are just setting up the basic structure in a file.

## Create the web interface

### Subtask:
Design a simple interface for the web app, including elements for recording audio and displaying the transcribed text and generated response.


**Reasoning**:
Create the HTML template file with the necessary elements and basic styling.



In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording: a click while idle starts capturing microphone audio,
        // a click while recording stops it and triggers the onstop handler below.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                // Permission denied or no microphone: tell the user instead of
                // leaving an unhandled promise rejection.
                transcribedTextDiv.textContent = 'Could not access the microphone.';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone; otherwise every recording leaves a live
                // audio track behind and the browser keeps showing the mic as in use.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
                // Here you would typically send the audioBlob to your Flask backend
                // For now, we'll just indicate recording stopped.
                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = '';

                // Example of how you might send the blob to the backend (requires fetch API)
                // const formData = new FormData();
                // formData.append('audio', audioBlob, 'audio.webm');
                // fetch('/upload_audio', {
                //     method: 'POST',
                //     body: formData
                // })
                // .then(response => response.json())
                // .then(data => {
                //     transcribedTextDiv.textContent = data.transcription;
                //     generatedResponseDiv.textContent = data.response;
                // })
                // .catch(error => {
                //     transcribedTextDiv.textContent = 'Error processing audio.';
                //     generatedResponseDiv.textContent = '';
                //     console.error('Error:', error);
                // });
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

**Reasoning**:
The previous attempt failed because the 'templates' directory does not exist. Create the directory first and then create the HTML file within it.



In [None]:
import os

# Create the templates directory if it doesn't exist
if not os.path.exists('templates'):
    os.makedirs('templates')

# Now write the HTML file into the templates directory
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording: a click while idle starts capturing microphone audio,
        // a click while recording stops it and triggers the onstop handler below.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                // Permission denied or no microphone: tell the user instead of
                // leaving an unhandled promise rejection.
                transcribedTextDiv.textContent = 'Could not access the microphone.';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone; otherwise every recording leaves a live
                // audio track behind and the browser keeps showing the mic as in use.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
                // Here you would typically send the audioBlob to your Flask backend
                // For now, we'll just indicate recording stopped.
                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = '';

                // Example of how you might send the blob to the backend (requires fetch API)
                // const formData = new FormData();
                // formData.append('audio', audioBlob, 'audio.webm');
                // fetch('/upload_audio', {
                //     method: 'POST',
                //     body: formData
                // })
                // .then(response => response.json())
                // .then(data => {
                //     transcribedTextDiv.textContent = data.transcription;
                //     generatedResponseDiv.textContent = data.response;
                // })
                // .catch(error => {
                //     transcribedTextDiv.textContent = 'Error processing audio.';
                //     generatedResponseDiv.textContent = '';
                //     console.error('Error:', error);
                // });
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

**Reasoning**:
The error is a `SyntaxError: invalid decimal literal`. `%%writefile` is a cell magic and is only recognized on the first line of a cell; placed after the Python code, it and the HTML that follows are parsed as Python. Move the directory-creation code into its own cell and keep `%%writefile` as the first line of a separate cell.



In [None]:
import os

# Create the templates directory if it doesn't exist
if not os.path.exists('templates'):
    os.makedirs('templates')


**Reasoning**:
Now that the directory is created, write the HTML template file into the 'templates' directory.



In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording: a click while idle starts capturing microphone audio,
        // a click while recording stops it and triggers the onstop handler below.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                // Permission denied or no microphone: tell the user instead of
                // leaving an unhandled promise rejection.
                transcribedTextDiv.textContent = 'Could not access the microphone.';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone; otherwise every recording leaves a live
                // audio track behind and the browser keeps showing the mic as in use.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
                // Here you would typically send the audioBlob to your Flask backend
                // For now, we'll just indicate recording stopped.
                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = '';

                // Example of how you might send the blob to the backend (requires fetch API)
                // const formData = new FormData();
                // formData.append('audio', audioBlob, 'audio.webm');
                // fetch('/upload_audio', {
                //     method: 'POST',
                //     body: formData
                // })
                // .then(response => response.json())
                // .then(data => {
                //     transcribedTextDiv.textContent = data.transcription;
                //     generatedResponseDiv.textContent = data.response;
                // })
                // .catch(error => {
                //     transcribedTextDiv.textContent = 'Error processing audio.';
                //     generatedResponseDiv.textContent = '';
                //     console.error('Error:', error);
                // });
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

**Reasoning**:
The HTML template file has been created. The next step is to configure the Flask application to render this template when the root URL is accessed. This involves modifying the Flask application code. Since the Flask app was set up in a previous subtask, we will create a new file or modify the existing one to include the rendering logic. We'll create a new file named `app.py` and include the `render_template` function.



In [None]:
%%writefile app.py
from flask import Flask, render_template, request, jsonify

app = Flask(__name__)

@app.route('/')
def index():
    """Serve the voice assistant page rendered from the index.html template."""
    page = render_template('index.html')
    return page

# In future steps, we will add routes for handling audio upload and processing
# @app.route('/upload_audio', methods=['POST'])
# def upload_audio():
#     if 'audio' not in request.files:
#         return jsonify({"error": "No audio file part"}), 400
#
#     audio_file = request.files['audio']
#     if audio_file.filename == '':
#         return jsonify({"error": "No selected file"}), 400
#
#     # Save the audio file and process it with Whisper and Gemini
#     # This part will be implemented in later subtasks
#     # transcribed_text = transcribe_audio(audio_file)
#     # generated_response = generate_response(transcribed_text)
#
#     # return jsonify({"transcription": transcribed_text, "response": generated_response})
#     return jsonify({"message": "Audio uploaded and processed (placeholder)"}) # Placeholder

if __name__ == '__main__':
    # Runs only when app.py is executed as a script, not when it is imported.
    # Starts Flask's built-in development server with debug mode enabled.
    # In a Colab environment, you might need a different approach for external access.
    app.run(debug=True)

## Integrate audio recording

### Subtask:
Implement the functionality to record audio directly within the web app.


**Reasoning**:
Modify the `index.html` file to include JavaScript for audio recording and sending the data to the backend, and update `app.py` with a route to handle the audio upload.



In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording: a click while idle starts capturing microphone audio,
        // a click while recording stops it and uploads the clip to /upload_audio.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                // Permission denied or no microphone: tell the user instead of
                // leaving an unhandled promise rejection.
                transcribedTextDiv.textContent = 'Could not access the microphone.';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone; otherwise every recording leaves a live
                // audio track behind and the browser keeps showing the mic as in use.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });

                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = '';

                const formData = new FormData();
                formData.append('audio', audioBlob, 'audio.webm');

                try {
                    const response = await fetch('/upload_audio', {
                        method: 'POST',
                        body: formData
                    });
                    const data = await response.json();

                    if (!response.ok) {
                        // Error replies carry `error`, not `message`; showing
                        // data.message here would render "undefined".
                        transcribedTextDiv.textContent = data.error || 'Error processing audio.';
                        return;
                    }

                    // Update these lines in subsequent steps with actual transcription and response
                    transcribedTextDiv.textContent = data.message; // Using message for now
                    generatedResponseDiv.textContent = 'Waiting for processing...'; // Placeholder
                } catch (error) {
                    // Network failure or a reply that is not valid JSON.
                    transcribedTextDiv.textContent = 'Error processing audio.';
                    generatedResponseDiv.textContent = '';
                    console.error('Error:', error);
                }
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

**Reasoning**:
Create or modify the `app.py` file to include the `/upload_audio` route to receive and save the audio file sent from the frontend.



In [None]:
%%writefile app.py
from flask import Flask, render_template, request, jsonify
import os

app = Flask(__name__)

@app.route('/')
def index():
    """Serve the voice assistant page rendered from the index.html template."""
    page = render_template('index.html')
    return page

@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Store the clip posted in the ``audio`` form field as ``audio.webm``.

    Returns:
        A JSON ``message`` on success, or a JSON ``error`` with status 400
        (field missing or empty filename) or 500 (saving failed).
    """
    upload = request.files.get('audio')
    if upload is None:
        return jsonify({"error": "No audio file part"}), 400
    if upload.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # Every upload is written to the same location.
    destination = 'audio.webm'

    try:
        upload.save(destination)
        print(f"Audio file saved to {destination}")  # record the successful save
        # Later steps process this file.
        payload = {"message": f"Audio file received and saved as {destination}"}
        return jsonify(payload)
    except Exception as e:
        print(f"Error saving audio file: {e}")  # record the failure
        failure = {"error": "Failed to save audio file"}
        return jsonify(failure), 500

if __name__ == '__main__':
    # Runs only when app.py is executed as a script, not when it is imported.
    # Starts Flask's built-in development server with debug mode enabled.
    # In a Colab environment, you might need a different approach for external access.
    # For development, you can run this and use ngrok to expose the port.
    app.run(debug=True)

## Integrate whisper transcription

### Subtask:
Use the recorded audio as input for the Whisper model to get the transcribed text.


**Reasoning**:
Modify the `upload_audio` route in `app.py` to load the Whisper model, transcribe the saved audio file, store the transcribed text, and include it in the JSON response.



In [None]:
%%writefile app.py
from flask import Flask, render_template, request, jsonify
import os
import whisper # Import whisper

app = Flask(__name__)

# Load the Whisper model globally to avoid reloading on each request
model = whisper.load_model("base")

@app.route('/')
def index():
    """Serve the voice assistant page rendered from the index.html template."""
    page = render_template('index.html')
    return page

@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Transcribe an uploaded audio clip with Whisper.

    Expects a multipart form upload with the clip in the ``audio`` field.

    Returns:
        A JSON response with ``transcription`` and ``message`` on success, or
        a JSON ``error`` with status 400 (field missing or empty filename) or
        500 (saving or transcription failed).
    """
    import tempfile  # local import: not among app.py's top-level imports

    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part"}), 400

    audio_file = request.files['audio']
    if audio_file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # Give every request its own file. With one shared fixed name, concurrent
    # uploads could overwrite or delete each other's audio mid-transcription.
    fd, audio_path = tempfile.mkstemp(suffix='.webm')
    os.close(fd)  # only the path is needed; the upload is saved by name below

    try:
        audio_file.save(audio_path)
        print(f"Audio file saved to {audio_path}") # Log successful save

        # Transcribe the audio file using Whisper
        result = model.transcribe(audio_path)
        transcribed_text = result["text"]
        print(f"Transcribed Text: {transcribed_text}") # Log transcribed text

        # Include the transcribed text in the JSON response
        return jsonify({"transcription": transcribed_text, "message": "Audio file received and transcribed."})

    except Exception as e:
        print(f"Error processing audio file: {e}") # Log the error
        return jsonify({"error": "Failed to process audio file"}), 500

    finally:
        # Runs on success and on failure, so the clip never lingers on disk.
        if os.path.exists(audio_path):
            os.remove(audio_path)
            print(f"Audio file {audio_path} removed.")

if __name__ == '__main__':
    # Runs only when app.py is executed as a script, not when it is imported.
    # In a Colab environment, you might need a different approach for external access.
    # For development, you can run this and use ngrok to expose the port.
    #
    # The server listens on all interfaces, so debug mode must not be on by
    # default: the interactive debugger lets anyone who can reach the port run
    # arbitrary Python. Opt in explicitly with FLASK_DEBUG=1 for local work.
    debug_enabled = os.environ.get('FLASK_DEBUG') == '1'
    # Use a different port if needed, e.g., port=5001
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)

**Reasoning**:
Update the `index.html` file to display the transcribed text received from the backend.



In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording: a click while idle starts capturing microphone audio,
        // a click while recording stops it, uploads the clip to /upload_audio
        // and shows what the backend returns.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                // Permission denied or no microphone: tell the user instead of
                // leaving an unhandled promise rejection.
                transcribedTextDiv.textContent = 'Could not access the microphone.';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone; otherwise every recording leaves a live
                // audio track behind and the browser keeps showing the mic as in use.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });

                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = ''; // Clear previous response

                const formData = new FormData();
                formData.append('audio', audioBlob, 'audio.webm');

                try {
                    const response = await fetch('/upload_audio', {
                        method: 'POST',
                        body: formData
                    });
                    const data = await response.json();

                    // Test for the key rather than truthiness: an empty
                    // transcription (silence) is a valid result, not an error.
                    if (!response.ok || !('transcription' in data)) {
                        transcribedTextDiv.textContent = data.error || 'Error processing audio.';
                        return;
                    }

                    transcribedTextDiv.textContent = data.transcription;
                    // Show the generated reply when the backend provides one;
                    // otherwise keep the placeholder for the next step.
                    generatedResponseDiv.textContent = 'response' in data
                        ? data.response
                        : 'Waiting for response generation...';
                } catch (error) {
                    // Network failure or a reply that is not valid JSON.
                    transcribedTextDiv.textContent = 'Error processing audio.';
                    generatedResponseDiv.textContent = '';
                    console.error('Error:', error);
                }
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

## Integrate gemini model

### Subtask:
Use the transcribed text as input for the Gemini model to generate a response.


**Reasoning**:
Modify `app.py` to include the Gemini model, use the transcribed text to generate a response, and include the response in the JSON output. Update `index.html` to display the generated response.



In [None]:
%%writefile app.py
from flask import Flask, render_template, request, jsonify
import os
import whisper
import google.generativeai as genai # Import genai
from google.colab import userdata # Import userdata for API key

app = Flask(__name__)

# Load the Whisper model once at import time so requests do not reload it
whisper_model = whisper.load_model("base")

# Configure the Gemini SDK with the key stored as GOOGLE_API_KEY and create the
# model handle once at import time.
# NOTE(review): google.colab.userdata is presumably only usable inside a Colab
# kernel — confirm it works when app.py is launched as a separate process.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

@app.route('/')
def index():
    """Serve the voice assistant page rendered from the index.html template."""
    page = render_template('index.html')
    return page

@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Transcribe an uploaded audio clip and generate a Gemini reply to it.

    Expects a multipart form upload with the clip in the ``audio`` field.

    Returns:
        A JSON response with ``transcription``, ``response`` and ``message``
        on success, or a JSON ``error`` with status 400 (field missing or
        empty filename) or 500 (saving, transcription or generation failed).
    """
    import tempfile  # local import: not among app.py's top-level imports

    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part"}), 400

    audio_file = request.files['audio']
    if audio_file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # Give every request its own file. With one shared fixed name, concurrent
    # uploads could overwrite or delete each other's audio mid-transcription.
    fd, audio_path = tempfile.mkstemp(suffix='.webm')
    os.close(fd)  # only the path is needed; the upload is saved by name below

    try:
        audio_file.save(audio_path)
        print(f"Audio file saved to {audio_path}") # Log successful save

        # Transcribe the audio file using Whisper
        result = whisper_model.transcribe(audio_path)
        transcribed_text = result["text"]
        print(f"Transcribed Text: {transcribed_text}") # Log transcribed text

        # Use a simple prompt, you can make this more complex
        prompt = transcribed_text
        if not prompt.strip():
            # Nothing was recognized (e.g. silence). Skip the model call rather
            # than send an empty prompt and report the request as failed.
            return jsonify({"transcription": transcribed_text, "response": "", "message": "No speech detected; response generation skipped."})

        # Generate a response using the Gemini model
        gemini_response = gemini_model.generate_content(prompt)
        generated_response_text = gemini_response.text
        print(f"Generated Response: {generated_response_text}") # Log generated response

        # Include both the transcribed text and generated response in the JSON response
        return jsonify({"transcription": transcribed_text, "response": generated_response_text, "message": "Audio processed and response generated."})

    except Exception as e:
        print(f"Error processing audio file: {e}") # Log the error
        return jsonify({"error": "Failed to process audio file"}), 500

    finally:
        # Runs on success and on failure, so the clip never lingers on disk.
        if os.path.exists(audio_path):
            os.remove(audio_path)
            print(f"Audio file {audio_path} removed.")

if __name__ == '__main__':
    # Development entry point. Binding to 0.0.0.0 makes the server reachable
    # from other hosts (and through a tunnel such as ngrok), so the interactive
    # debugger must not be on by default. Set FLASK_DEBUG=1 to opt in.
    # Change `port` here if 5000 is already taken.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)

**Reasoning**:
Update `index.html` to extract and display the generated response from the JSON payload.



In [None]:
%%writefile templates/index.html
<!DOCTYPE html>
<html>
<head>
    <title>Voice Assistant</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #transcribed_text, #generated_response {
            border: 1px solid #ccc;
            padding: 10px;
            margin-bottom: 10px;
            min-height: 100px;
            white-space: pre-wrap;
        }
        button { padding: 10px; cursor: pointer; }
    </style>
</head>
<body>
    <h1>Voice Assistant</h1>

    <div id="controls">
        <button id="recordButton">🎤 Record</button>
    </div>

    <h2>Transcribed Text:</h2>
    <div id="transcribed_text"></div>

    <h2>Generated Response:</h2>
    <div id="generated_response"></div>

    <script>
        const recordButton = document.getElementById('recordButton');
        const transcribedTextDiv = document.getElementById('transcribed_text');
        const generatedResponseDiv = document.getElementById('generated_response');
        let mediaRecorder;
        let audioChunks = [];

        // Toggle recording. The first click asks for the microphone and starts
        // a MediaRecorder; the second click stops it, which uploads the audio
        // to /upload_audio and renders the transcription and generated reply.
        recordButton.onclick = async () => {
            if (mediaRecorder && mediaRecorder.state === 'recording') {
                mediaRecorder.stop();
                recordButton.textContent = '🎤 Record';
                return;
            }

            let stream;
            try {
                // Rejects when permission is denied; also throws when
                // navigator.mediaDevices is undefined.
                stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            } catch (error) {
                transcribedTextDiv.textContent = 'Microphone access was denied or is unavailable.';
                generatedResponseDiv.textContent = '';
                console.error('Error:', error);
                return;
            }

            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = async () => {
                // Release the microphone so the browser stops capturing once
                // the recording is finished.
                stream.getTracks().forEach(track => track.stop());

                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });

                transcribedTextDiv.textContent = 'Processing audio...';
                generatedResponseDiv.textContent = ''; // Clear previous response

                const formData = new FormData();
                formData.append('audio', audioBlob, 'audio.webm');

                try {
                    const response = await fetch('/upload_audio', {
                        method: 'POST',
                        body: formData
                    });
                    const data = await response.json();

                    if (data.transcription) {
                        transcribedTextDiv.textContent = data.transcription;
                    } else {
                        transcribedTextDiv.textContent = data.error || 'Error processing audio.';
                    }

                    // Display the generated response
                    if (data.response) {
                        generatedResponseDiv.textContent = data.response;
                    } else {
                        generatedResponseDiv.textContent = data.error || 'Error generating response.';
                    }
                } catch (error) {
                    // Network failure or a non-JSON reply from the server.
                    transcribedTextDiv.textContent = 'Error processing audio.';
                    generatedResponseDiv.textContent = '';
                    console.error('Error:', error);
                }
            };
            mediaRecorder.start();
            recordButton.textContent = '⏹ Stop';
        };
    </script>
</body>
</html>

## Display results

### Subtask:
Display the transcribed text and the generated response on the web interface.


## Run the web app

### Subtask:
Provide instructions on how to run the web app locally or deploy it.


**Reasoning**:
Provide instructions on how to run the Flask application locally and how to access it, including the case of running it in a Colab-like environment.



In [None]:
# Print the run instructions for the web app (local use and Colab + ngrok).
# The text describes the final index.html, which clears the response area
# while audio is processed, and a threading example that disables the reloader.
print("""
To run the web application locally:

1.  Ensure you have Python installed.
2.  Save the `app.py` and `templates/index.html` files in the same directory structure (create a 'templates' folder and put 'index.html' inside it).
3.  Open a terminal or command prompt in the directory where `app.py` is saved.
4.  Run the Flask application using the command:
    ```bash
    python app.py
    ```
5.  You will see output in the terminal indicating that the Flask development server is running. It will typically show something like:
    ```
     * Serving Flask app 'app'
     * Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
     * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
    ```
6.  Open a web browser and go to the address shown in the output, which is usually `http://127.0.0.1:5000/`.

Interacting with the web app:

*   Once the page loads, you will see the "Voice Assistant" title and a "🎤 Record" button.
*   Click the "🎤 Record" button to start recording your voice. You might need to grant microphone permissions to your browser.
*   Click the "⏹ Stop" button to stop recording.
*   The application will then process the audio. The "Transcribed Text" area will first show "Processing audio..." and then display the text transcribed by the Whisper model.
*   The "Generated Response" area is cleared while the audio is processed and then displays the response generated by the Gemini model based on your transcribed text.

Running in environments like Google Colab:

If you are running this Flask application in an environment like Google Colab, the local address (like `http://127.0.0.1:5000/`) is not directly accessible from your web browser because it's running on a remote server. To access it from your local machine, you would typically need to use a service like ngrok to create a public URL that tunnels to your local Flask server.

Steps using ngrok in Colab:

1.  Install ngrok in your Colab notebook:
    ```bash
    !pip install pyngrok
    ```
2.  Import `ngrok` and run it, specifying the port your Flask app is running on (default is 5000):
    ```python
    from pyngrok import ngrok
    # Terminate open tunnels if any
    ngrok.kill()
    # Replace 5000 with the port your Flask app is running on
    public_url = ngrok.connect(5000)
    print(f" * ngrok tunnel established at: {public_url}")
    ```
3.  Run your Flask app in a separate cell (or using threading/background process) so it doesn't block the notebook:
    ```python
    # Make sure app is defined as in app.py
    # app.run(host='0.0.0.0', port=5000, debug=True)
    # You might need to run this in a separate thread or use a tool like `run_with_ngrok`
    # if you want it to run in the background in Colab.
    # A simple way for demonstration in Colab is to put the app.run call
    # in a cell below the ngrok cell and execute them sequentially.
    # However, this will block the cell. For non-blocking, threading is better.
    # Example using threading (requires 'import threading' at the top).
    # The debug reloader only works in the main thread, so turn it off here:
    # threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':5000,'debug':False,'use_reloader':False}).start()

    # A simpler approach for demonstration is to use a utility function if available
    # or just run the app.run() call in a cell and be aware it blocks.
    # For this example, assume app.py contains the if __name__ == '__main__': block
    # and you would run that cell.
    ```
4.  Open the `public_url` provided by ngrok in your web browser to access the application.

Remember to replace `5000` with the actual port number if you configured your Flask app to run on a different port.
""")

## Summary:

### Key Findings

*   The Flask web framework was successfully set up and a basic application structure was created.
*   A simple HTML interface was designed with elements for recording audio and displaying transcribed text and generated responses.
*   JavaScript was integrated into the frontend to handle audio recording using the `MediaRecorder` API and send the recorded audio to the backend via a POST request.
*   The backend Flask application was updated with an `/upload_audio` route to receive and save the audio file.
*   The Whisper model was successfully integrated into the backend to transcribe the saved audio file.
*   The Gemini model was integrated into the backend to generate a response based on the transcribed text.
*   The backend now returns both the transcribed text and the generated response to the frontend.
*   The frontend JavaScript was updated to display both the transcribed text and the generated response received from the backend.
*   Instructions were provided on how to run the web application locally and how to access it from environments like Google Colab using ngrok.

### Insights or Next Steps

*   Enhance the user interface with visual feedback during recording and processing states.
*   Implement more robust error handling and potentially add features like language selection for transcription or prompt engineering for the Gemini model.


In [None]:
!python app.py

In [None]:
!pip install openai-whisper

In [None]:
!python app.py

In [None]:
%%writefile app.py
"""Flask backend for the voice assistant: Whisper transcription plus a Gemini reply."""
from flask import Flask, render_template, request, jsonify
import os
import whisper
import google.generativeai as genai

try:
    # google.colab only exists inside Colab. Guarding the import lets app.py
    # also start on a local machine, which the run instructions describe.
    from google.colab import userdata
except ImportError:
    userdata = None

app = Flask(__name__)

# Load the Whisper model globally
whisper_model = whisper.load_model("base")


def _load_google_api_key():
    """Return the Gemini API key, or None when no source provides one.

    The GOOGLE_API_KEY environment variable is checked first; the Colab
    secret store is consulted only when that variable is unset or empty.
    """
    env_key = os.environ.get('GOOGLE_API_KEY')
    if env_key:
        return env_key
    if userdata is None:
        return None
    try:
        return userdata.get('GOOGLE_API_KEY')
    except Exception as exc:
        # NOTE(review): the secret lookup may be unavailable when this file is
        # launched as a separate process (`!python app.py`) -- confirm. Broad
        # catch on purpose: any failure here just means "no key from Colab".
        print(f"Could not read GOOGLE_API_KEY from Colab userdata: {exc}")
        return None


# Configure and load the Gemini model globally
GOOGLE_API_KEY = _load_google_api_key()
if GOOGLE_API_KEY is None:
    print("Warning: GOOGLE_API_KEY was not found; Gemini requests may fail.")
genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

@app.route('/')
def index():
    """Serve the recorder page rendered from the `index.html` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Transcribe an uploaded recording and generate a Gemini reply.

    Expects a multipart form with the recording in the 'audio' field.

    Returns:
        200 with {"transcription", "response", "message"} on success.
        400 with {"error"} when the 'audio' part is missing or unnamed.
        422 with {"error"} when Whisper finds no speech in the recording.
        500 with {"error"} when saving, transcription or generation fails.
    """
    import tempfile  # local import: only this handler needs it

    if 'audio' not in request.files:
        return jsonify({"error": "No audio file part"}), 400

    audio_file = request.files['audio']
    if audio_file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # A unique temp file per request: a fixed shared name would let two
    # concurrent uploads overwrite or delete each other's audio.
    audio_path = None

    try:
        fd, audio_path = tempfile.mkstemp(suffix='.webm')
        os.close(fd)  # only the path is needed; the upload is written by save()
        audio_file.save(audio_path)
        print(f"Audio file saved to {audio_path}")

        # Transcribe the audio file using Whisper
        result = whisper_model.transcribe(audio_path)
        transcribed_text = result["text"]
        print(f"Transcribed Text: {transcribed_text}")

        # A silent recording yields no text; report that directly instead of
        # sending an empty prompt to Gemini.
        if not transcribed_text.strip():
            return jsonify({"error": "No speech detected in the recording."}), 422

        # Generate a response using the Gemini model; the transcription is
        # used verbatim as the prompt.
        prompt = transcribed_text
        gemini_response = gemini_model.generate_content(prompt)
        generated_response_text = gemini_response.text
        print(f"Generated Response: {generated_response_text}")

        # Include both the transcribed text and generated response in the JSON response
        return jsonify({"transcription": transcribed_text, "response": generated_response_text, "message": "Audio processed and response generated."})

    except Exception as e:
        # Top-level request boundary: log the cause, return a generic error.
        print(f"Error processing audio file: {e}")
        return jsonify({"error": "Failed to process audio file"}), 500

    finally:
        # Single cleanup point for the success, no-speech and error paths.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
            print(f"Audio file {audio_path} removed.")

if __name__ == '__main__':
    # Development entry point. Binding to 0.0.0.0 makes the server reachable
    # from other hosts (and through a tunnel such as ngrok), so the interactive
    # debugger must not be on by default. Set FLASK_DEBUG=1 to opt in.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)

In [None]:
!python app.py