<a href="https://colab.research.google.com/github/Harris-giki/AI_VoiceAssistants/blob/main/ColabTestFile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install git+https://github.com/openai/whisper.git --quiet
!pip install transformers accelerate bitsandbytes gtts soundfile pydub --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [39]:
!pip uninstall -y bitsandbytes
!pip install bitsandbytes --quiet

Found existing installation: bitsandbytes 0.46.0
Uninstalling bitsandbytes-0.46.0:
  Successfully uninstalled bitsandbytes-0.46.0


In [42]:
!pip install bitsandbytes --quiet

In [51]:
import whisper
from gtts import gTTS
from IPython.display import Audio, display, Javascript
import tempfile
import os
import base64
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig
import torch
from google.colab import output
from pydub import AudioSegment

In [None]:
# Global variable to store audio data
audio_bytes = None

def record_audio(filename="input.wav", record_seconds=5):
    """Record audio with proper timeout handling and extended debugging"""
    global audio_bytes
    audio_bytes = None

    print(f"Recording for {record_seconds} seconds...")

    # JavaScript code with improved error handling and proper timeout
    js_code = f"""
    async function record(seconds) {{
        try {{
            console.log('Starting recording for', seconds, 'seconds');
            const stream = await navigator.mediaDevices.getUserMedia({{audio: true}});

            // Use webm format which is more reliable in browsers
            const mimeType = MediaRecorder.isTypeSupported('audio/webm') ? 'audio/webm' : 'audio/mp4';
            const mediaRecorder = new MediaRecorder(stream, {{ mimeType: mimeType }});

            let chunks = [];

            mediaRecorder.ondataavailable = function(event) {{
                if (event.data.size > 0) {{
                    chunks.push(event.data);
                }}
            }};

            mediaRecorder.onstop = function() {{
                console.log('Recording stopped, processing audio');
                const blob = new Blob(chunks, {{ type: mimeType }});
                const reader = new FileReader();

                reader.onloadend = function() {{
                    const base64data = reader.result;
                    console.log('Sending audio data to Python:', base64data.substring(0, 20) + '...');
                    google.colab.kernel.invokeFunction('notebook.receive_audio', [base64data], {{}});
                }};

                reader.readAsDataURL(blob);

                // Stop all tracks to release microphone
                stream.getTracks().forEach(track => track.stop());
            }};

            mediaRecorder.onerror = function(event) {{
                console.error('MediaRecorder error:', event.error);
                stream.getTracks().forEach(track => track.stop());
            }};

            // Start recording
            mediaRecorder.start();
            console.log('Recording started');

            // Set timeout to stop recording
            setTimeout(() => {{
                if (mediaRecorder.state === 'recording') {{
                    console.log('Stopping recording after timeout');
                    mediaRecorder.stop();
                }}
            }}, seconds * 1000);

        }} catch (error) {{
            console.error('Error in recording function:', error);
            alert('Error accessing microphone: ' + error.message);
        }}
    }}

    record({record_seconds});
    """

    display(Javascript(js_code))

    # Add a small delay to allow Colab to process the JavaScript
    time.sleep(1)

    def _receive_audio(base64_audio):
        global audio_bytes
        try:
            print(f"Callback triggered at {time.strftime('%H:%M:%S')}, data length: {len(base64_audio) if base64_audio else 0}")
            header, data = base64_audio.split(',', 1)
            audio_bytes = base64.b64decode(data)
            print(f"Audio data decoded successfully at {time.strftime('%H:%M:%S')}, length: {len(audio_bytes)} bytes")
        except Exception as e:
            print(f"Error processing audio data at {time.strftime('%H:%M:%S')}: {e}")
            audio_bytes = None  # Explicitly set to None on failure

    # Register the callback
    output.register_callback('notebook.receive_audio', _receive_audio)

    # Wait for audio with a longer timeout and polling
    timeout_counter = 0
    max_wait_time = record_seconds + 20  # 25 seconds total
    while audio_bytes is None and timeout_counter < max_wait_time * 10:
        time.sleep(0.1)
        timeout_counter += 1

        # Show progress
        if timeout_counter % 10 == 0:
            elapsed = timeout_counter / 10
            if elapsed <= record_seconds:
                print(f"Recording... {elapsed:.1f}s / {record_seconds}s")
            else:
                print(f"Waiting for audio data... {elapsed - record_seconds:.1f}s")

    if audio_bytes is None:
        print("Recording timeout or microphone access denied - no audio received")
        return None

    # Save as temporary file and convert to WAV
    temp_filename = "temp_audio.webm"
    with open(temp_filename, "wb") as f:
        f.write(audio_bytes)

    try:
        audio = AudioSegment.from_file(temp_filename)
        audio.export(filename, format="wav")
        print(f"Audio saved successfully to {filename} ({len(audio_bytes)} bytes)")
        os.remove(temp_filename)  # Clean up temp file
        return filename
    except Exception as e:
        print(f"Error converting/saving audio file: {e}")
        return None

# Load models
print("Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("Whisper model loaded successfully!")

print("Loading chatbot model...")
model_name = "large-traversaal/Alif-1.0-8B-Instruct"
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
except Exception as e:
    print(f"Error with quantization: {e}. Falling back to CPU without quantization...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu")

chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
print("Chatbot model loaded successfully!")

Loading Whisper model...
Whisper model loaded successfully!
Loading chatbot model...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

None of the available devices `available_devices = None` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {'npu', '"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)', 'cuda', 'hpu', 'mps', 'xpu'}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


Error with quantization: None of the available devices `available_devices = None` are supported by the bitsandbytes version you have installed: `bnb_supported_devices = {'npu', '"cpu" (needs an Intel CPU and intel_extension_for_pytorch installed and compatible with the PyTorch version)', 'cuda', 'hpu', 'mps', 'xpu'}`. Please check the docs to see if the backend you intend to use is available and how to install it: https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend. Falling back to CPU without quantization...


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [45]:
def transcribe_audio_file(file_path):
    """Transcribe audio file with Whisper"""
    if not file_path or not os.path.exists(file_path):
        print("Invalid audio file path")
        return ""

    try:
        result = whisper_model.transcribe(file_path)
        text = result.get("text", "").strip()
        print(f"You said: {text}")
        return text
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""

In [46]:
def generate_response(prompt, max_length=500, min_length=80, temperature=0.7):
    """Generate chatbot response with prompt"""
    try:
        # Enforce Urdu response
        prompt = f"Respond in Urdu only: {prompt}"
        response = chatbot(prompt, max_length=max_length, min_length=min_length,
                          do_sample=True, temperature=temperature)
        answer = response[0]["generated_text"].strip()
        return answer
    except Exception as e:
        print(f"Error generating response: {e}")
        return "معذرت، جواب بنانے میں خرابی ہوئی۔"

In [47]:
def speak(text, lang='ur'):
    """Speak the text using gTTS"""
    print(f"Assistant: {text}")
    try:
        tts = gTTS(text=text, lang=lang)
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
            tts.save(temp_file.name)
            display(Audio(temp_file.name, autoplay=True))
        os.remove(temp_file.name)  # Clean up after playback
    except Exception as e:
        print(f"Error in text-to-speech: {e}")

In [48]:
def main_colab():
    """Main loop for continuous voice assistant"""
    print("Voice assistant started. Say 'exit', 'quit', or 'stop' to end.\n")

    while True:
        try:
            print("\n" + "="*50)
            print("Please speak now...")

            audio_path = record_audio(record_seconds=5)

            if not audio_path:
                print("Failed to record audio. Please try again.")
                continue

            command = transcribe_audio_file(audio_path)

            if not command:
                print("No speech detected. Please try again.")
                continue

            # Check for exit commands
            if any(word in command.lower() for word in ["exit", "quit", "stop", "خروج", "بند"]):
                speak("خدا حافظ", lang='ur')
                print("Exiting voice assistant.")
                break

            # Generate and speak response
            answer = generate_response(command)
            speak(answer, lang='ur')

        except KeyboardInterrupt:
            print("\nStopped by user.")
            break
        except Exception as e:
            print(f"Error in main loop: {e}")
            print("Continuing...")

In [49]:
def test_recording():
    """Test the recording function"""
    print("Testing recording function...")
    audio_file = record_audio("test_audio.wav", 3)
    if audio_file:
        print("Recording test successful!")
        # Test transcription
        text = transcribe_audio_file(audio_file)
        print(f"Transcription test: {text}")
    else:
        print("Recording test failed!")

# Uncomment the line below to test recording first
# test_recording()

# Run the assistant
print("Starting Urdu Voice Assistant...")
print("Make sure to allow microphone access when prompted.")

In [50]:
main_colab()

Starting Urdu Voice Assistant...
Make sure to allow microphone access when prompted.
Voice assistant started. Say 'exit', 'quit', or 'stop' to end.


Please speak now...
Recording for 5 seconds...


<IPython.core.display.Javascript object>

Recording... 1.0s / 5s
Recording... 2.0s / 5s
Recording... 3.0s / 5s
Recording... 4.0s / 5s
Recording... 5.0s / 5s
Waiting for audio data... 1.0s
Waiting for audio data... 2.0s
Waiting for audio data... 3.0s
Waiting for audio data... 4.0s
Waiting for audio data... 5.0s
Waiting for audio data... 6.0s
Waiting for audio data... 7.0s
Waiting for audio data... 8.0s
Waiting for audio data... 9.0s
Waiting for audio data... 10.0s
Waiting for audio data... 11.0s
Waiting for audio data... 12.0s
Waiting for audio data... 13.0s
Waiting for audio data... 14.0s
Waiting for audio data... 15.0s
Waiting for audio data... 16.0s
Waiting for audio data... 17.0s
Waiting for audio data... 18.0s
Waiting for audio data... 19.0s
Waiting for audio data... 20.0s
Recording timeout or microphone access denied - no audio received
Failed to record audio. Please try again.

Please speak now...
Recording for 5 seconds...


<IPython.core.display.Javascript object>

Recording... 1.0s / 5s
Recording... 2.0s / 5s
Recording... 3.0s / 5s
Recording... 4.0s / 5s
Recording... 5.0s / 5s
Waiting for audio data... 1.0s
Waiting for audio data... 2.0s
Waiting for audio data... 3.0s

Stopped by user.
Callback triggered at 10:30:27, data length: 81831
Audio data decoded successfully at 10:30:27, length: 61356 bytes
Callback triggered at 10:30:27, data length: 89871
Audio data decoded successfully at 10:30:27, length: 67386 bytes
