In [None]:
# Mount Google Drive; later cells read the dataset and write artifacts under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Training the language classifier


now we have to see

In [None]:
# ========================================
# STEP 1: Imports
# ========================================
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import joblib

# ========================================
# STEP 2: Device
# ========================================
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ========================================
# STEP 3: Model & Feature Extractor
# ========================================
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
base_model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
base_model.eval()  # eval mode only (e.g. dropout off); this does NOT freeze weights - gradients are skipped via torch.no_grad() in extract_embedding

# ========================================
# STEP 4: Feature Extraction (Improved)
# ========================================
def extract_embedding(file_path, augment=False):
    """Return a pooled Wav2Vec2 embedding for one audio file, or None on failure.

    The file is loaded as 16 kHz mono, run through ``base_model`` without
    gradients, and the last hidden state is mean-pooled and max-pooled over
    the time axis; the two pooled vectors are concatenated.

    Args:
        file_path: Path to an audio file that librosa can load.
        augment: When True, add Gaussian noise, time-stretch by a random rate
            drawn from [0.9, 1.1) and pitch-shift by a random integer number
            of semitones in [-2, 2] before feature extraction.

    Returns:
        A NumPy array holding the concatenated embedding, or None if any step
        raised (the error is printed, not re-raised).
    """
    try:
        speech, sr = librosa.load(file_path, sr=16000, mono=True)

        if augment:
            speech = speech + 0.005*np.random.randn(len(speech))
            rate = np.random.uniform(0.9, 1.1)
            # `rate` must be passed by keyword: it is keyword-only in
            # librosa >= 0.10, and the keyword form also works on older releases.
            speech = librosa.effects.time_stretch(speech, rate=rate)
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr, n_steps=n_steps)

        inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            hidden_states = base_model(**inputs).last_hidden_state
            # Use mean + max pooling for richer embedding
            embedding = torch.cat([hidden_states.mean(dim=1), hidden_states.max(dim=1).values], dim=1)
            embedding = embedding.squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        # Best-effort: one unreadable file must not abort the whole dataset pass.
        print(f" Error processing {file_path}: {e}")
        return None

# ========================================
# STEP 5: Load Dataset
# ========================================
DATASET_DIR = "/content/drive/MyDrive/augmented_dataset"  # change path
X, y = [], []
# Each sub-directory is one class; sorting fixes the class-index assignment.
languages = sorted([d for d in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, d))])
print(" Languages detected:", languages)

for idx, lang in enumerate(languages):
    lang_dir = os.path.join(DATASET_DIR, lang)
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # (and therefore the seeded train/test split) is reproducible across runs.
    files = sorted(f for f in os.listdir(lang_dir) if f.endswith(".wav"))

    for f in tqdm(files, desc=f"Processing {lang}"):
        fpath = os.path.join(lang_dir, f)
        emb = extract_embedding(fpath, augment=False)
        # Files that failed to process return None and are skipped.
        if emb is not None:
            X.append(emb)
            y.append(idx)

# Fail early with a clear message instead of an obscure error further down.
if not X:
    raise RuntimeError(f"No usable .wav files found under {DATASET_DIR}")

X = np.array(X)
y = np.array(y)
print(" Dataset loaded:", X.shape, y.shape)

# ========================================
# STEP 6: Train/Test Split
# ========================================
# Split BEFORE fitting the scaler so that statistics of the held-out samples
# never influence the features the classifier is trained on.
# NOTE(review): the dataset folder is named "augmented_dataset"; if several
# files are augmented variants of one recording, a random split can place
# variants on both sides - consider a group-aware split. Confirm.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ========================================
# STEP 7: Normalize Embeddings (fit on the training split only)
# ========================================
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any later code expecting the fully scaled matrix still finds it.
X_scaled = scaler.transform(X)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

# ========================================
# STEP 8: Improved Neural Network
# ========================================
class LanguageClassifier(nn.Module):
    """Fully connected classifier: input -> 512 -> 256 -> num_classes.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.2).
    The final layer returns raw logits (no softmax).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        wide, narrow, drop_p = 512, 256, 0.2

        # Hidden stage 1
        self.fc1 = nn.Linear(input_dim, wide)
        self.bn1 = nn.BatchNorm1d(wide)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(drop_p)

        # Hidden stage 2
        self.fc2 = nn.Linear(wide, narrow)
        self.bn2 = nn.BatchNorm1d(narrow)
        self.dropout2 = nn.Dropout(drop_p)

        # Output logits
        self.fc3 = nn.Linear(narrow, num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to per-class logits."""
        hidden = self.dropout1(self.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout2(self.relu(self.bn2(self.fc2(hidden))))
        return self.fc3(hidden)

# Size the network from the data: feature width of the training matrix and one output per language folder.
input_dim = X_train.shape[1]
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)

# CrossEntropyLoss takes the raw logits returned by the model together with integer class indices.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)  # slightly lower LR

# ========================================
# STEP 9: Training Loop (Improved)
# ========================================
# Number of passes over the training tensors and the mini-batch size used by get_batches.
epochs = 35
batch_size = 32

def get_batches(X, y, batch_size):
    """Yield aligned (X, y) slices of at most `batch_size` items, in order.

    The last pair may be shorter when len(X) is not a multiple of
    `batch_size`. No shuffling is performed.
    """
    starts = range(0, len(X), batch_size)
    for start in starts:
        window = slice(start, start + batch_size)
        yield X[window], y[window]

# Train for `epochs` passes over the training tensors, one optimizer step per mini-batch.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for xb, yb in get_batches(X_train_tensor, y_train_tensor, batch_size):
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        # criterion returns the MEAN loss of the batch; weight it by the batch
        # size so that dividing by the sample count below yields the true
        # per-sample average (previously the value was ~1/batch_size too small).
        total_loss += loss.item() * len(xb)

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(X_train_tensor):.4f}")

# ========================================
# STEP 10: Evaluation
# ========================================
# Switch to eval mode and score the held-out split in a single forward pass without gradients.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    # Predicted class = index of the largest logit; move to NumPy for the scikit-learn metrics.
    y_pred = torch.argmax(outputs, dim=1).cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()

print("\nClassification Report:")
# target_names maps class indices back to the language folder names.
print(classification_report(y_true, y_pred, target_names=languages))

acc = accuracy_score(y_true, y_pred)
print(f" Final Accuracy: {acc:.4f}")

# ========================================
# STEP 11: Save Model & Encoder
# ========================================
# Write the artifacts to the directory the inference cell reads from
# (its MODEL_PATH / SCALER_PATH / LABELS_PATH all point at MyDrive/nn/);
# saving to MyDrive/ directly made a fresh top-to-bottom run fail on load.
ARTIFACT_DIR = "/content/drive/MyDrive/nn"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

torch.save(model.state_dict(), os.path.join(ARTIFACT_DIR, "language_classifier_nn.pth"))
joblib.dump(scaler, os.path.join(ARTIFACT_DIR, "language_scaler.pkl"))
joblib.dump(languages, os.path.join(ARTIFACT_DIR, "language_labels.pkl"))

print(" Saved classifier, scaler, and labels to Drive!")


Using device: cuda


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Languages detected: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']


Processing Assamese_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.15it/s]
Processing bengali_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.24it/s]
Processing english_augmented: 100%|██████████| 954/954 [01:11<00:00, 13.31it/s]
Processing gujarati_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.38it/s]
Processing hindi_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.25it/s]
Processing kannada_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.22it/s]
Processing malayalam_augmented: 100%|██████████| 1006/1006 [01:14<00:00, 13.49it/s]
Processing marathi_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.29it/s]
Processing nepali_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.33it/s]
Processing punjabi_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.25it/s]
Processing tamil_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.09it/s]
Processing telugu_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.18it/s]


✅ Dataset loaded: (12040, 2048) (12040,)
Epoch 1/35 - Loss: 0.0279
Epoch 2/35 - Loss: 0.0100
Epoch 3/35 - Loss: 0.0069
Epoch 4/35 - Loss: 0.0056
Epoch 5/35 - Loss: 0.0042
Epoch 6/35 - Loss: 0.0035
Epoch 7/35 - Loss: 0.0032
Epoch 8/35 - Loss: 0.0028
Epoch 9/35 - Loss: 0.0027
Epoch 10/35 - Loss: 0.0027
Epoch 11/35 - Loss: 0.0021
Epoch 12/35 - Loss: 0.0021
Epoch 13/35 - Loss: 0.0018
Epoch 14/35 - Loss: 0.0020
Epoch 15/35 - Loss: 0.0016
Epoch 16/35 - Loss: 0.0019
Epoch 17/35 - Loss: 0.0014
Epoch 18/35 - Loss: 0.0013
Epoch 19/35 - Loss: 0.0015
Epoch 20/35 - Loss: 0.0013
Epoch 21/35 - Loss: 0.0014
Epoch 22/35 - Loss: 0.0012
Epoch 23/35 - Loss: 0.0016
Epoch 24/35 - Loss: 0.0014
Epoch 25/35 - Loss: 0.0009
Epoch 26/35 - Loss: 0.0012
Epoch 27/35 - Loss: 0.0010
Epoch 28/35 - Loss: 0.0008
Epoch 29/35 - Loss: 0.0012
Epoch 30/35 - Loss: 0.0014
Epoch 31/35 - Loss: 0.0009
Epoch 32/35 - Loss: 0.0005
Epoch 33/35 - Loss: 0.0010
Epoch 34/35 - Loss: 0.0011
Epoch 35/35 - Loss: 0.0008

📊 Classification Repor

## Inference with the trained language classifier and Whisper transcription

In [None]:
# ===============================
# Install dependencies
# ===============================
#!pip install SpeechRecognition googletrans==4.0.0-rc1 gTTS pydub ipywidgets moviepy soundfile torch transformers
#!pip install openai-whisper
# ===============================
# Imports
# ===============================




import numpy as np
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
from IPython.display import Audio, display, clear_output, Javascript
from google.colab import drive, output as colab_output
import ipywidgets as widgets
import joblib
import torch
import torch.nn as nn
from moviepy.editor import VideoFileClip
import base64
import soundfile as sf
import io
import os
import time
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

# ===============================
# Mount Google Drive
# ===============================
#drive.mount('/content/drive')

# ===============================
# Load saved Wav2Vec2 classifier
# ===============================
# Paths to your trained files
# Locations of the classifier weights, the fitted scaler and the label list on Drive.
MODEL_PATH = "/content/drive/MyDrive/nn/language_classifier_nn.pth"
SCALER_PATH = "/content/drive/MyDrive/nn/language_scaler.pkl"
LABELS_PATH = "/content/drive/MyDrive/nn/language_labels.pkl"

# NOTE: joblib.load unpickles its input; only load files you produced yourself.
scaler = joblib.load(SCALER_PATH)
labels = joblib.load(LABELS_PATH)
languages = labels  # alias used below for num_classes and the dropdown options

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define same NN as training
# Define same NN as training (exact)
class LanguageClassifier(nn.Module):
    """MLP head with two hidden layers (512 and 256 units) and a logit output.

    Attribute names are fixed (fc1/bn1/fc2/bn2/fc3) because load_state_dict
    matches the saved weights by these names.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        first_width = 512
        second_width = 256
        dropout_rate = 0.2

        self.fc1 = nn.Linear(input_dim, first_width)
        self.bn1 = nn.BatchNorm1d(first_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        self.fc2 = nn.Linear(first_width, second_width)
        self.bn2 = nn.BatchNorm1d(second_width)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.fc3 = nn.Linear(second_width, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for a batch of feature vectors."""
        stage_one = self.fc1(x)
        stage_one = self.bn1(stage_one)
        stage_one = self.dropout1(self.relu(stage_one))

        stage_two = self.fc2(stage_one)
        stage_two = self.bn2(stage_two)
        stage_two = self.dropout2(self.relu(stage_two))

        return self.fc3(stage_two)

# Rebuild the classifier with the training-time dimensions, restore its weights and switch to eval mode.
input_dim = 2048  # mean-pooled + max-pooled hidden state concatenated; the training cell reported embeddings of shape (12040, 2048)
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)
# map_location lets the checkpoint load on whichever device was selected above.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()


# Load the Wav2Vec2 feature extractor and backbone used by extract_embedding below.
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
BASE_MODEL = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53").to(device)
BASE_MODEL.eval()

# ===============================
# gTTS language mapping
# ===============================
# Class label -> language code. The same code is passed to googletrans (dest=) in
# translate_text and to gTTS (lang=) in speak_text; unknown labels fall back to "en" there.
lang_code_map = {
    "english_augmented": "en",
    "hindi_augmented": "hi",
    "bengali_augmented": "bn",
    "tamil_augmented": "ta",
    "telugu_augmented": "te",
    "kannada_augmented": "kn",
    "malayalam_augmented": "ml",
    "marathi_augmented": "mr",
    "gujarati_augmented": "gu",
    "punjabi_augmented": "pa",
    "nepali_augmented": "ne",
    "Assamese_augmented": "as"
}

# ===============================
# Initialize recognizer & translator
# ===============================
# NOTE(review): `recognizer` is not referenced anywhere else in the visible cell - confirm it is still needed.
recognizer = sr.Recognizer()
translator = Translator()

# ===============================
# Feature extraction & prediction
# ===============================
# ===============================
# Feature extraction & prediction (training-style)
# ===============================
def extract_embedding(file_path, augment=False):
    """
    Extract embedding from audio using Wav2Vec2 backbone.
    Matches training: mean + max pooling of last hidden state.

    Args:
        file_path: Path to an audio file that librosa can load.
        augment: When True, apply random noise, time-stretch and pitch-shift
            before feature extraction.

    Returns:
        NumPy array with the concatenated pooled embedding, or None if any
        step raised (the error is printed, not re-raised).
    """
    try:
        # Load audio as 16 kHz mono
        speech, sr_ = librosa.load(file_path, sr=16000, mono=True)

        if augment:
            # Optional small augmentation
            speech = speech + 0.005*np.random.randn(len(speech))
            rate = np.random.uniform(0.9, 1.1)
            # `rate` must be passed by keyword: it is keyword-only in
            # librosa >= 0.10, and the keyword form also works on older releases.
            speech = librosa.effects.time_stretch(speech, rate=rate)
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr_, n_steps=n_steps)

        # Feature extraction
        inputs = FEATURE_EXTRACTOR(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through Wav2Vec2
        with torch.no_grad():
            hidden_states = BASE_MODEL(**inputs).last_hidden_state
            # mean + max pooling (matches training)
            embedding = torch.cat([hidden_states.mean(dim=1), hidden_states.max(dim=1).values], dim=1)
            embedding = embedding.squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do with None.
        print(f"  Error processing {file_path}: {e}")
        return None

def predict_language(file_path):
    """
    Classify the spoken language of one audio file.

    Returns the label at the arg-max class index, or the sentinel string
    "[Embedding Error]" when no embedding could be extracted.
    """
    features = extract_embedding(file_path)
    if features is None:
        return "[Embedding Error]"

    # Apply the loaded scaler to a one-row batch, then run the classifier.
    scaled_row = scaler.transform([features])
    batch = torch.tensor(scaled_row, dtype=torch.float32).to(device)

    with torch.no_grad():
        logits = model(batch)
        winner = torch.argmax(logits, dim=1).item()

    return labels[winner]


# ===============================
# Transcribe audio
# ===============================
import whisper

# Load Whisper model (choose "small", "medium", or "large")
whisper_model = whisper.load_model("large")  # "large" is already selected here; "small" or "medium" are the other sizes named above

# ===============================
# Transcribe audio with Whisper
# ===============================
def transcribe_audio(file_path):
    """
    Run Whisper on `file_path` and return the recognised text.

    Any failure is printed and replaced by the placeholder
    "[Could not transcribe audio]".
    """
    placeholder = "[Could not transcribe audio]"
    try:
        transcript = whisper_model.transcribe(file_path, fp16=False)["text"]
    except Exception as err:
        print(f"  Whisper transcription failed: {err}")
        return placeholder
    return transcript

# ===============================
# Translate & speak text
# ===============================
def translate_text(text, target_lang):
    """Translate `text` into the language selected by `target_lang`.

    Args:
        text: Text to translate.
        target_lang: Key of `lang_code_map`; unknown keys fall back to "en".

    Returns:
        The translated text, or `text` unchanged when translation fails.
    """
    t_code = lang_code_map.get(target_lang, "en")
    try:
        return translator.translate(text, dest=t_code).text
    except Exception as e:
        # Still best-effort, but no longer silent, and a bare `except:` would
        # also have trapped KeyboardInterrupt / SystemExit.
        print(f" Translation failed: {e}")
        return text

def speak_text(text, target_lang):
    """Synthesize `text` with gTTS, save it to output.mp3 and autoplay it.

    Args:
        text: Text to speak.
        target_lang: Key of `lang_code_map`; unknown keys fall back to "en".

    Failures (for example a language code gTTS rejects, empty text, or a
    network error) are printed instead of aborting the widget callback.
    """
    t_code = lang_code_map.get(target_lang, "en")
    try:
        tts = gTTS(text=text, lang=t_code)
        tts.save("output.mp3")
        display(Audio("output.mp3", autoplay=True))
    except Exception as e:
        print(f" Speech synthesis failed: {e}")

# ===============================
# Handle audio/video/mic
# ===============================
def handle_audio_file(file_path, target_lang):
    """Run the full pipeline on one audio file and show each step in `output_area`.

    Steps: language prediction -> Whisper transcription -> translation into
    `target_lang` -> speech synthesis of the translation.
    """
    with output_area:
        clear_output()
        print(f"Processing file: {file_path}")

        # 1) language identification
        language_guess = predict_language(file_path)
        print("Predicted Language:", language_guess)

        # 2) speech-to-text
        transcript = transcribe_audio(file_path)
        print("Transcribed Text:", transcript)

        # 3) translation, then 4) text-to-speech
        translation = translate_text(transcript, target_lang)
        print(f"Translated ({target_lang}):", translation)
        speak_text(translation, target_lang)

def handle_video_file(file_path, target_lang):
    """Extract the audio track of a video to a 16 kHz WAV and process it.

    Args:
        file_path: Path to the video file.
        target_lang: Translation target passed through to handle_audio_file.

    The WAV is written next to the video as "<name>_audio.wav". Videos
    without an audio track are reported in `output_area` and skipped.
    """
    audio_path = file_path.rsplit('.', 1)[0] + "_audio.wav"
    video = VideoFileClip(file_path)
    try:
        if video.audio is None:
            with output_area:
                clear_output()
                print(f"No audio track found in video: {file_path}")
            return
        video.audio.write_audiofile(audio_path, fps=16000)
    finally:
        # Release the reader resources held by the clip even if extraction failed.
        video.close()
    handle_audio_file(audio_path, target_lang)

# ===============================
# Mic recording via JS
# ===============================
# Raw bytes of the most recent microphone recording (set by the get_audio callback).
recorded_audio = None
# Browser-side recorder: adds a start/stop button, records with MediaRecorder and
# sends the result to the kernel as base64 through the 'notebook.get_audio' callback.
RECORD_JS = """
var recorder, gumStream;
var recordButton = document.createElement("button");
recordButton.innerHTML = "🎙️ Start Recording";
recordButton.style.fontSize = "20px";
recordButton.style.padding = "10px";
recordButton.style.margin = "10px";
recordButton.onclick = async () => {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      recordButton.innerHTML = "🎙️ Start Recording";
      return;
  }
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  let chunks = [];
  recorder.ondataavailable = e => { if (e.data.size > 0) chunks.push(e.data); };
  recorder.onstop = async ()=> {
      // Release the microphone once recording has finished.
      gumStream.getTracks().forEach(track => track.stop());
      // Label the blob with the recorder's real container type; MediaRecorder does not emit WAV.
      const blob = new Blob(chunks, { type: recorder.mimeType || 'audio/webm' });
      const bytes = new Uint8Array(await new Response(blob).arrayBuffer());
      // Convert in slices: spreading the whole array into String.fromCharCode
      // exceeds the engine's argument limit for anything but very short clips.
      let binary = "";
      const CHUNK = 0x8000;
      for (let i = 0; i < bytes.length; i += CHUNK) {
          binary += String.fromCharCode.apply(null, bytes.subarray(i, i + CHUNK));
      }
      const base64String = btoa(binary);
      google.colab.kernel.invokeFunction('notebook.get_audio', [base64String], {});
  };
  recorder.start();
  recordButton.innerHTML = "⏹️ Stop Recording";
};
document.body.appendChild(recordButton);
"""

def get_audio(b64string):
    """Kernel-side callback: decode the base64 payload and store it in `recorded_audio`."""
    global recorded_audio
    decoded = base64.b64decode(b64string)
    recorded_audio = decoded

# Expose get_audio to the browser under the name the recorder script invokes.
colab_output.register_callback('notebook.get_audio', get_audio)

# Microphone recordings are written to this Drive folder (created if missing).
save_dir = "/content/drive/MyDrive/mic_recordings"
os.makedirs(save_dir, exist_ok=True)

def save_recorded_audio(b64_bytes, save_path):
    """Write a microphone recording to `save_path` as a WAV file.

    Args:
        b64_bytes: Raw audio bytes (already base64-decoded by get_audio,
            despite the parameter name).
        save_path: Destination path of the WAV file.

    soundfile is tried first. Browser MediaRecorder output is normally a
    compressed container (for example WebM/Opus) that soundfile cannot read,
    so on failure the bytes are transcoded with the ffmpeg command-line tool
    (the same tool Whisper relies on). Errors are printed, never raised.
    """
    import subprocess
    import tempfile

    try:
        audio_buffer = io.BytesIO(b64_bytes)
        data, samplerate = sf.read(audio_buffer)
        sf.write(save_path, data, samplerate)
        print(f" Mic recording saved to Drive: {save_path}")
        return
    except Exception as e:
        decode_error = e

    # Fallback: hand the raw container to ffmpeg and let it produce a 16 kHz mono WAV.
    raw_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as raw_file:
            raw_file.write(b64_bytes)
            raw_path = raw_file.name
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "error", "-i", raw_path,
             "-ar", "16000", "-ac", "1", save_path],
            check=True,
        )
        print(f" Mic recording saved to Drive: {save_path}")
    except Exception as e:
        print(f"Could not save audio properly: {e} (soundfile error: {decode_error})")
    finally:
        if raw_path and os.path.exists(raw_path):
            os.remove(raw_path)

def handle_mic_click(b):
    """Button handler: process a pending recording, or show the recorder.

    Args:
        b: The clicked button (unused).

    When a recording is waiting it is saved under `save_dir` and run through
    handle_audio_file; otherwise the in-browser recorder is displayed.
    """
    global recorded_audio
    with output_area:
        clear_output()
        if recorded_audio:
            # Consume the buffer so the next click offers a fresh recording
            # instead of re-processing this one forever.
            audio_bytes, recorded_audio = recorded_audio, None
            filename = f"mic_recording_{int(time.time())}.wav"
            tmp_path = os.path.join(save_dir, filename)
            save_recorded_audio(audio_bytes, tmp_path)
            # save_recorded_audio only reports failures; check the file before using it.
            if os.path.exists(tmp_path):
                handle_audio_file(tmp_path, lang_dropdown.value)
            else:
                print(" Recording could not be saved; please record again.")
        else:
            display(Javascript(RECORD_JS))
            print(" Please record audio using the button above and click again.")

# ===============================
# GUI
# ===============================
# All pipeline output (prints and the audio player) is rendered inside this widget.
output_area = widgets.Output()

# Target-language selector; options are the classifier's label names, which are also the keys of lang_code_map.
lang_dropdown = widgets.Dropdown(
    options=languages,
    description="Translate to:",
    value="hindi_augmented"
)

# One file at a time for each upload control.
audio_upload = widgets.FileUpload(accept=".wav,.mp3", multiple=False)
video_upload = widgets.FileUpload(accept=".mp4,.avi,.mkv,.mov", multiple=False)
mic_button = widgets.Button(description="🎤 Record from Mic")

def on_audio_upload(change):
    """Observer for the audio upload widget: save the first uploaded file locally and process it."""
    if not change.new:
        return
    first_upload = list(change.new.values())[0]
    local_name = first_upload['metadata']['name']
    # Written to the current working directory under the uploaded file name.
    with open(local_name, "wb") as sink:
        sink.write(first_upload['content'])
    handle_audio_file(local_name, lang_dropdown.value)

def on_video_upload(change):
    """Observer for the video upload widget: save the first uploaded file locally and process it."""
    if not change.new:
        return
    first_upload = list(change.new.values())[0]
    local_name = first_upload['metadata']['name']
    # Written to the current working directory under the uploaded file name.
    with open(local_name, "wb") as sink:
        sink.write(first_upload['content'])
    handle_video_file(local_name, lang_dropdown.value)

# React to changes of each upload widget's `value` and to clicks on the mic button.
audio_upload.observe(on_audio_upload, names='value')
video_upload.observe(on_video_upload, names='value')
mic_button.on_click(handle_mic_click)

# Layout: title, the three input controls in a row, the language selector, then the output area.
display(widgets.VBox([
    widgets.Label("Upload Audio/Video or Record from Mic for Translation"),
    widgets.HBox([audio_upload, video_upload, mic_button]),
    lang_dropdown,
    output_area
]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='Upload Audio/Video or Record from Mic for Translation'), HBox(children=(FileUpload…

## Inference without the trained classifier weights (Whisper only)

In [None]:
# ===============================
# Install dependencies (torch and transformers are not installed by this cell)
# ===============================
# NOTE(review): only googletrans is version-pinned; consider pinning the rest for reproducible runs.
!pip install SpeechRecognition googletrans==4.0.0-rc1 gTTS pydub ipywidgets moviepy soundfile
!pip install openai-whisper

# ===============================
# Imports
# ===============================
import numpy as np
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
from IPython.display import Audio, display, clear_output, Javascript
from google.colab import output as colab_output
import ipywidgets as widgets
from moviepy.editor import VideoFileClip
import base64
import soundfile as sf
import io
import os
import time
import whisper

# ===============================
# Whisper model (for transcription)
# ===============================
# Loaded once at cell execution; every transcribe_audio call reuses this instance.
whisper_model = whisper.load_model("large")  # you can change to "medium" or "small" for faster results

# ===============================
# Translator & gTTS language mapping
# ===============================
translator = Translator()

# Display name -> language code. The code is passed to googletrans (dest=) and gTTS (lang=);
# the keys, in this order, are also the options of the language dropdown.
lang_code_map = {
    "english": "en",
    "hindi": "hi",
    "bengali": "bn",
    "tamil": "ta",
    "telugu": "te",
    "kannada": "kn",
    "malayalam": "ml",
    "marathi": "mr",
    "gujarati": "gu",
    "punjabi": "pa",
    "nepali": "ne",
    "assamese": "as"
}

# ===============================
# Core functions
# ===============================
def transcribe_audio(file_path):
    """Return Whisper's transcript of `file_path`, or a placeholder string on failure."""
    placeholder = "[Could not transcribe audio]"
    try:
        transcript = whisper_model.transcribe(file_path, fp16=False)["text"]
    except Exception as err:
        print(f" Whisper transcription failed: {err}")
        return placeholder
    return transcript

def translate_text(text, target_lang):
    """Translate `text` via googletrans; on any error print it and return `text` unchanged.

    `target_lang` is looked up in `lang_code_map`; unknown names fall back to "en".
    """
    dest_code = lang_code_map.get(target_lang, "en")
    try:
        translation = translator.translate(text, dest=dest_code)
        translated = translation.text
    except Exception as err:
        print(f" Translation failed: {err}")
        translated = text
    return translated

def speak_text(text, target_lang):
    """Synthesize `text` with gTTS into output.mp3 and autoplay it; errors are printed.

    `target_lang` is looked up in `lang_code_map`; unknown names fall back to "en".
    """
    voice_code = lang_code_map.get(target_lang, "en")
    try:
        speech = gTTS(text=text, lang=voice_code)
        speech.save("output.mp3")
        player = Audio("output.mp3", autoplay=True)
        display(player)
    except Exception as err:
        print(f" Speech synthesis failed: {err}")

# ===============================
# Audio/Video/Mic Handlers
# ===============================
def handle_audio_file(file_path, target_lang):
    """Transcribe, translate and speak one audio file, reporting each step in `output_area`."""
    with output_area:
        clear_output()
        print(f"Processing file: {file_path}")

        # speech-to-text
        transcript = transcribe_audio(file_path)
        print("Transcribed Text:", transcript)

        # translation, then text-to-speech
        translation = translate_text(transcript, target_lang)
        print(f"Translated ({target_lang}):", translation)
        speak_text(translation, target_lang)

def handle_video_file(file_path, target_lang):
    """Extract audio from video and process.

    Args:
        file_path: Path to the video file.
        target_lang: Translation target passed through to handle_audio_file.

    The audio is written next to the video as "<name>_audio.wav" at 16 kHz.
    Videos without an audio track are reported in `output_area` and skipped.
    """
    audio_path = file_path.rsplit('.', 1)[0] + "_audio.wav"
    video = VideoFileClip(file_path)
    try:
        if video.audio is None:
            with output_area:
                clear_output()
                print(f"No audio track found in video: {file_path}")
            return
        video.audio.write_audiofile(audio_path, fps=16000)
    finally:
        # Release the reader resources held by the clip even if extraction failed.
        video.close()
    handle_audio_file(audio_path, target_lang)

# ===============================
# Microphone Recording Setup
# ===============================
# Raw bytes of the most recent microphone recording (set by the get_audio callback).
recorded_audio = None

# Browser-side recorder: adds a start/stop button, records with MediaRecorder and
# sends the result to the kernel as base64 through the 'notebook.get_audio' callback.
RECORD_JS = """
var recorder, gumStream;
var recordButton = document.createElement("button");
recordButton.innerHTML = "🎙️ Start Recording";
recordButton.style.fontSize = "20px";
recordButton.style.padding = "10px";
recordButton.style.margin = "10px";
recordButton.onclick = async () => {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      recordButton.innerHTML = "🎙️ Start Recording";
      return;
  }
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  let chunks = [];
  recorder.ondataavailable = e => { if (e.data.size > 0) chunks.push(e.data); };
  recorder.onstop = async ()=> {
      // Release the microphone once recording has finished.
      gumStream.getTracks().forEach(track => track.stop());
      // Label the blob with the recorder's real container type; MediaRecorder does not emit WAV.
      const blob = new Blob(chunks, { type: recorder.mimeType || 'audio/webm' });
      const bytes = new Uint8Array(await new Response(blob).arrayBuffer());
      // Convert in slices: spreading the whole array into String.fromCharCode
      // exceeds the engine's argument limit for anything but very short clips.
      let binary = "";
      const CHUNK = 0x8000;
      for (let i = 0; i < bytes.length; i += CHUNK) {
          binary += String.fromCharCode.apply(null, bytes.subarray(i, i + CHUNK));
      }
      const base64String = btoa(binary);
      google.colab.kernel.invokeFunction('notebook.get_audio', [base64String], {});
  };
  recorder.start();
  recordButton.innerHTML = "⏹️ Stop Recording";
};
document.body.appendChild(recordButton);
"""

def get_audio(b64string):
    """Callback invoked from the browser: base64-decode the recording into `recorded_audio`."""
    global recorded_audio
    payload = base64.b64decode(b64string)
    recorded_audio = payload

# Expose get_audio to the browser under the name the recorder script invokes.
colab_output.register_callback('notebook.get_audio', get_audio)

# Microphone recordings are written to this local folder (created if missing).
save_dir = "/content/mic_recordings"
os.makedirs(save_dir, exist_ok=True)

def save_recorded_audio(b64_bytes, save_path):
    """Save microphone audio to `save_path` as a WAV file.

    Args:
        b64_bytes: Raw audio bytes (already base64-decoded by get_audio,
            despite the parameter name).
        save_path: Destination path of the WAV file.

    soundfile is tried first. Browser MediaRecorder output is normally a
    compressed container (for example WebM/Opus) that soundfile cannot read,
    so on failure the bytes are transcoded with the ffmpeg command-line tool
    (the same tool Whisper relies on). Errors are printed, never raised.
    """
    import subprocess
    import tempfile

    try:
        audio_buffer = io.BytesIO(b64_bytes)
        data, samplerate = sf.read(audio_buffer)
        sf.write(save_path, data, samplerate)
        print(f"Mic recording saved: {save_path}")
        return
    except Exception as e:
        decode_error = e

    # Fallback: hand the raw container to ffmpeg and let it produce a 16 kHz mono WAV.
    raw_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as raw_file:
            raw_file.write(b64_bytes)
            raw_path = raw_file.name
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "error", "-i", raw_path,
             "-ar", "16000", "-ac", "1", save_path],
            check=True,
        )
        print(f"Mic recording saved: {save_path}")
    except Exception as e:
        print(f" Could not save audio properly: {e} (soundfile error: {decode_error})")
    finally:
        if raw_path and os.path.exists(raw_path):
            os.remove(raw_path)

def handle_mic_click(b):
    """Button handler: process a pending recording, or show the recorder.

    Args:
        b: The clicked button (unused).

    When a recording is waiting it is saved under `save_dir` and run through
    handle_audio_file; otherwise the in-browser recorder is displayed.
    """
    global recorded_audio
    with output_area:
        clear_output()
        if recorded_audio:
            # Consume the buffer so the next click offers a fresh recording
            # instead of re-processing this one forever.
            audio_bytes, recorded_audio = recorded_audio, None
            filename = f"mic_recording_{int(time.time())}.wav"
            tmp_path = os.path.join(save_dir, filename)
            save_recorded_audio(audio_bytes, tmp_path)
            # save_recorded_audio only reports failures; check the file before using it.
            if os.path.exists(tmp_path):
                handle_audio_file(tmp_path, lang_dropdown.value)
            else:
                print(" Recording could not be saved; please record again.")
        else:
            display(Javascript(RECORD_JS))
            print(" Please record audio using the button above and click again.")

# ===============================
# GUI Setup
# ===============================
# All pipeline output (prints and the audio player) is rendered inside this widget.
output_area = widgets.Output()

# Target-language selector; options are the keys of lang_code_map in definition order.
lang_dropdown = widgets.Dropdown(
    options=list(lang_code_map.keys()),
    description="Translate to:",
    value="hindi"
)

# One file at a time for each upload control.
audio_upload = widgets.FileUpload(accept=".wav,.mp3", multiple=False)
video_upload = widgets.FileUpload(accept=".mp4,.avi,.mkv,.mov", multiple=False)
mic_button = widgets.Button(description="🎤 Record from Mic")

def on_audio_upload(change):
    """Observer for the audio upload widget: store the first uploaded file on disk, then process it."""
    if not change.new:
        return
    upload = list(change.new.values())[0]
    target_name = upload['metadata']['name']
    # Written to the current working directory under the uploaded file name.
    with open(target_name, "wb") as out_file:
        out_file.write(upload['content'])
    handle_audio_file(target_name, lang_dropdown.value)

def on_video_upload(change):
    """Observer for the video upload widget: store the first uploaded file on disk, then process it."""
    if not change.new:
        return
    upload = list(change.new.values())[0]
    target_name = upload['metadata']['name']
    # Written to the current working directory under the uploaded file name.
    with open(target_name, "wb") as out_file:
        out_file.write(upload['content'])
    handle_video_file(target_name, lang_dropdown.value)

# React to changes of each upload widget's `value` and to clicks on the mic button.
audio_upload.observe(on_audio_upload, names='value')
video_upload.observe(on_video_upload, names='value')
mic_button.on_click(handle_mic_click)

# Layout: title, the three input controls in a row, the language selector, then the output area.
display(widgets.VBox([
    widgets.Label(" Upload Audio/Video or Record from Mic for Translation"),
    widgets.HBox([audio_upload, video_upload, mic_button]),
    lang_dropdown,
    output_area
]))


Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* 

  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':

100%|█████████████████████████████████████| 2.88G/2.88G [02:26<00:00, 21.1MiB/s]


VBox(children=(Label(value='🎧 Upload Audio/Video or Record from Mic for Translation'), HBox(children=(FileUplo…