In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


feautures extracted but got error

In [None]:
# Full pipeline: sequence-embedding extraction -> CNN+BiLSTM training (PyTorch)
# + RandomForest & XGBoost baselines on mean embeddings
# Run in Colab with GPU. Edit paths below.

# 0) Install deps (uncomment if needed)
# !pip install -q transformers librosa soundfile scikit-learn xgboost tqdm

import os, sys, time, math
from pathlib import Path
import numpy as np
import librosa
import soundfile as sf
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib

# ============== USER CONFIG ==============
DATASET_DIR = "/content/drive/MyDrive/augmented_dataset"   # each subfolder is a language with .wav files
OUT_DIR = "/content/drive/MyDrive/lid_features"           # where .npy sequence embeddings & mean embeddings will be saved
HF_MODEL = "facebook/wav2vec2-base-960h"                  # fallback public model; change to ai4bharat/... if accessible
SAMPLE_RATE = 16000
MAX_FRAMES = 250      # number of frames (time steps) to pad/truncate sequence embeddings to
EMBED_DIM = None      # leave None -> derived from model
BATCH_SIZE = 32
NUM_EPOCHS = 40
LR = 1e-4
PATIENCE = 7
RANDOM_SEED = 42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_WORKERS = 2
# ==========================================

os.makedirs(OUT_DIR, exist_ok=True)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# ---------- 1) Prepare HF model (frame-level embeddings) ----------
print("Loading pretrained model:", HF_MODEL)
processor = Wav2Vec2Processor.from_pretrained(HF_MODEL)
base_model = Wav2Vec2Model.from_pretrained(HF_MODEL).to(DEVICE).eval()

# discover embedding dim
with torch.no_grad():
    # pass a tiny dummy to determine hidden size
    import torch as _t
    x = _t.zeros(1, 16000, dtype=_t.float32).to(DEVICE)
    inputs = processor(x.cpu().numpy()[0], sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
    inputs = {k: v.to(DEVICE) for k,v in inputs.items()}
    hs = base_model(**inputs).last_hidden_state
    EMBED_DIM = hs.shape[-1]
print("Embedding dim:", EMBED_DIM)

# ---------- 2) Extract and save per-file sequence embeddings (padded/truncated) ----------
# We'll also save mean embeddings for tree models
seq_dir = Path(OUT_DIR) / "seq"
mean_dir = Path(OUT_DIR) / "mean"
seq_dir.mkdir(parents=True, exist_ok=True)
mean_dir.mkdir(parents=True, exist_ok=True)

languages = sorted([d for d in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, d))])
print("Languages found:", languages)

def pad_truncate_feats(feats, max_frames=MAX_FRAMES):
    # feats: (T, D)
    T, D = feats.shape
    if T >= max_frames:
        return feats[:max_frames, :]
    else:
        pad = np.zeros((max_frames - T, D), dtype=feats.dtype)
        return np.vstack([feats, pad])

# Extract embeddings and save files; skip files already processed
print("Extracting embeddings and saving .npy files (this may take a while)")
for lang in languages:
    lang_in = Path(DATASET_DIR) / lang
    out_lang_seq = seq_dir / lang
    out_lang_seq.mkdir(parents=True, exist_ok=True)
    out_lang_mean = mean_dir / lang
    out_lang_mean.mkdir(parents=True, exist_ok=True)

    wav_files = [f for f in sorted(os.listdir(lang_in)) if f.lower().endswith(".wav")]
    for wav in tqdm(wav_files, desc=f"Extract {lang}", leave=False):
        in_path = str(lang_in / wav)
        base = os.path.splitext(wav)[0]
        seq_path = out_lang_seq / (base + ".npy")
        mean_path = out_lang_mean / (base + ".npy")
        if seq_path.exists() and mean_path.exists():
            continue
        try:
            speech, sr = librosa.load(in_path, sr=SAMPLE_RATE, mono=True)
            inputs = processor(speech, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
            inputs = {k: v.to(DEVICE) for k,v in inputs.items()}
            with torch.no_grad():
                hidden = base_model(**inputs).last_hidden_state.squeeze(0).cpu().numpy()  # (T, D)
            seq = pad_truncate_feats(hidden, MAX_FRAMES)  # (MAX_FRAMES, D)
            mean = seq.mean(axis=0)
            np.save(seq_path, seq.astype(np.float32))
            np.save(mean_path, mean.astype(np.float32))
        except Exception as e:
            print("Error:", in_path, e)

# ---------- 3) Build dataset lists (file paths + labels) ----------
seq_files = []
mean_files = []
labels = []
for idx, lang in enumerate(languages):
    seq_folder = seq_dir / lang
    mean_folder = mean_dir / lang
    files = sorted([f for f in os.listdir(seq_folder) if f.endswith(".npy")])
    for f in files:
        seq_files.append(str(seq_folder / f))
        mean_files.append(str(mean_folder / f))
        labels.append(idx)

seq_files = np.array(seq_files)
mean_files = np.array(mean_files)
labels = np.array(labels)
print("Total samples:", len(labels))

# train/test split (stratified)
train_idx, test_idx = train_test_split(np.arange(len(labels)), test_size=0.2, stratify=labels, random_state=RANDOM_SEED)
# optionally you can keep a small validation from train for early stopping; we'll do train/val split
train_idx, val_idx = train_test_split(train_idx, test_size=0.12, stratify=labels[train_idx], random_state=RANDOM_SEED)

# ---------- 4) PyTorch dataset (on-the-fly loading) ----------
class SeqDataset(Dataset):
    def __init__(self, seq_paths, labels):
        self.seq_paths = seq_paths
        self.labels = labels
    def __len__(self):
        return len(self.seq_paths)
    def __getitem__(self, idx):
        seq = np.load(self.seq_paths[idx])  # shape (MAX_FRAMES, D)
        seq = torch.tensor(seq, dtype=torch.float32)  # (T, D)
        label = int(self.labels[idx])
        return seq, label

train_ds = SeqDataset(seq_files[train_idx], labels[train_idx])
val_ds   = SeqDataset(seq_files[val_idx], labels[val_idx])
test_ds  = SeqDataset(seq_files[test_idx], labels[test_idx])

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

# ---------- 5) Model: Conv1D + BiLSTM + Attention pooling ----------
class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.att = nn.Linear(in_dim, 1)
    def forward(self, x):  # x: (B, T, H)
        weights = self.att(x)            # (B,T,1)
        weights = torch.softmax(weights, dim=1)  # (B,T,1)
        context = torch.sum(weights * x, dim=1)  # (B,H)
        return context

class CNN_BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_lstm=256, num_classes=12):
        super().__init__()
        # input: (B, T, D) -> convert to (B, D, T) for Conv1d
        self.conv1 = nn.Conv1d(input_dim, 128, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout = nn.Dropout(0.3)
        self.lstm = nn.LSTM(input_size=256, hidden_size=hidden_lstm, num_layers=1, batch_first=True, bidirectional=True)
        self.att = AttentionPooling(hidden_lstm*2)
        self.fc = nn.Linear(hidden_lstm*2, 256)
        self.out = nn.Linear(256, num_classes)
    def forward(self, x):
        # x: (B, T, D)
        x = x.transpose(1, 2)          # (B, D, T)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)               # (B, 128, T/2)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)               # (B, 256, T/4)
        x = x.transpose(1, 2)          # (B, T', 256)
        x = self.dropout(x)
        output, _ = self.lstm(x)       # (B, T', H*2)
        x = self.att(output)           # (B, H*2)
        x = F.relu(self.fc(x))
        x = self.dropout(x)
        x = self.out(x)
        return x

num_classes = len(languages)
model = CNN_BiLSTM(input_dim=EMBED_DIM, hidden_lstm=256, num_classes=num_classes).to(DEVICE)
print("Model params:", sum(p.numel() for p in model.parameters())/1e6, "M")

# ---------- 6) Training loop with early stopping & scheduler ----------
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3, verbose=True)

best_val = 0.0
patience_ctr = 0

for epoch in range(1, NUM_EPOCHS+1):
    model.train()
    running = 0.0
    n = 0
    for xb, yb in tqdm(train_loader, desc=f"Train epoch {epoch}", leave=False):
        xb = xb.to(DEVICE)   # (B, T, D)
        yb = yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        running += loss.item() * xb.size(0)
        n += xb.size(0)
    train_loss = running / n

    # val
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            logits = model(xb)
            preds = logits.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(yb.cpu().numpy())
    val_acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch} TrainLoss: {train_loss:.4f} ValAcc: {val_acc:.4f}")
    scheduler.step(val_acc)

    if val_acc > best_val + 1e-6:
        best_val = val_acc
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "best_cnn_bilstm.pt"))
        patience_ctr = 0
        print("  Saved best model (ValAcc %.4f)" % best_val)
    else:
        patience_ctr += 1
        if patience_ctr >= PATIENCE:
            print("Early stopping triggered at epoch", epoch)
            break

# ---------- 7) Evaluate best CNN+BiLSTM on test set ----------
model.load_state_dict(torch.load(os.path.join(OUT_DIR, "best_cnn_bilstm.pt"), map_location=DEVICE))
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE); yb = yb.to(DEVICE)
        logits = model(xb)
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(yb.cpu().numpy())

print("\n=== CNN+BiLSTM Test Results ===")
print(classification_report(all_labels, all_preds, target_names=languages))
print("Accuracy:", accuracy_score(all_labels, all_preds))

# ---------- 8) Baselines: RF & XGBoost on mean embeddings ----------
# Build arrays by loading mean npy files (we stored mean per file earlier)
mean_X = []
mean_y = []
for p, label in zip(mean_files, labels):
    mean_X.append(np.load(p))
    mean_y.append(int(label))
mean_X = np.array(mean_X); mean_y = np.array(mean_y)
print("Mean embeddings shape:", mean_X.shape)

X_tr, X_te, y_tr, y_te = train_test_split(mean_X, mean_y, test_size=0.2, stratify=mean_y, random_state=RANDOM_SEED)
# RandomForest
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_SEED)
rf.fit(X_tr, y_tr)
rf_pred = rf.predict(X_te)
print("\n--- Random Forest ---")
print("Acc:", accuracy_score(y_te, rf_pred))
print(classification_report(y_te, rf_pred, target_names=languages))

# XGBoost
xgb_clf = xgb.XGBClassifier(n_estimators=800, learning_rate=0.05, max_depth=6, verbosity=0, use_label_encoder=False)
xgb_clf.fit(X_tr, y_tr)
xgb_pred = xgb_clf.predict(X_te)
print("\n--- XGBoost ---")
print("Acc:", accuracy_score(y_te, xgb_pred))
print(classification_report(y_te, xgb_pred, target_names=languages))

# Save baselines & label map
joblib.dump(rf, os.path.join(OUT_DIR, "rf_mean.joblib"))
joblib.dump(xgb_clf, os.path.join(OUT_DIR, "xgb_mean.joblib"))
joblib.dump(languages, os.path.join(OUT_DIR, "languages.pkl"))
print("\nAll done. Models & features saved to:", OUT_DIR)


Loading pretrained model: facebook/wav2vec2-base-960h


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding dim: 768
Languages found: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']
Extracting embeddings and saving .npy files (this may take a while)




Total samples: 12040
Model params: 1.581965 M


TypeError: ReduceLROnPlateau.__init__() got an unexpected keyword argument 'verbose'

training


In [None]:
# ========================================
# STEP 1: Imports
# ========================================
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
import joblib

# ========================================
# STEP 2: Define Dataset (lazy loading)
# ========================================
EMB_DIR = "/content/drive/MyDrive/lid_features/seq"

languages = sorted([d for d in os.listdir(EMB_DIR) if os.path.isdir(os.path.join(EMB_DIR, d))])
print("✅ Languages found:", languages)

file_paths, labels = [], []
for idx, lang in enumerate(languages):
    lang_dir = os.path.join(EMB_DIR, lang)
    npy_files = sorted([os.path.join(lang_dir, f) for f in os.listdir(lang_dir) if f.endswith(".npy")])
    file_paths.extend(npy_files)
    labels.extend([idx] * len(npy_files))

print(f"✅ Total samples: {len(file_paths)}")

from sklearn.model_selection import train_test_split
train_files, test_files, train_labels, test_labels = train_test_split(
    file_paths, labels, test_size=0.2, stratify=labels, random_state=42
)

class LangDataset(Dataset):
    def __init__(self, files, labels):
        self.files = files
        self.labels = labels
    def __len__(self):
        return len(self.files)
    def __getitem__(self, idx):
        data = np.load(self.files[idx])  # (timesteps, features)
        data = data[np.newaxis, :, :]    # add channel dim → (1, timesteps, features)
        return torch.tensor(data, dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

train_ds = LangDataset(train_files, train_labels)
test_ds  = LangDataset(test_files, test_labels)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=16, shuffle=False, num_workers=2, pin_memory=True)

# ========================================
# STEP 3: CNN + BiLSTM Model
# ========================================
class CNNBiLSTM(nn.Module):
    def __init__(self, input_dim, num_classes, hidden_dim=256, num_layers=2):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, input_dim), padding=(1,0))
        self.bn1   = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d((2,1))

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3,1), padding=(1,0))
        self.bn2   = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d((2,1))

        self.lstm = nn.LSTM(input_size=64, hidden_size=hidden_dim,
                            num_layers=num_layers, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)

        x = x.squeeze(3).permute(0, 2, 1)  # (batch, time, channels)

        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]
        x = self.dropout(x)
        return self.fc(x)

# ========================================
# STEP 4: Training
# ========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Peek input dimension from one sample
sample = np.load(train_files[0])
input_dim = sample.shape[1]

model = CNNBiLSTM(input_dim=input_dim, num_classes=len(languages)).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

best_acc = 0
epochs = 30

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    train_loss = total_loss / len(train_loader.dataset)

    # Eval
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            pred_labels = preds.argmax(dim=1)
            all_preds.extend(pred_labels.cpu().numpy())
            all_labels.extend(yb.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    scheduler.step(acc)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f} - Acc: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "/content/drive/MyDrive/cnn_bilstm_best.pt")

print(f"🔥 Best Accuracy: {best_acc:.4f}")

print("\n📊 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=languages))

joblib.dump(languages, "/content/drive/MyDrive/language_labels.pkl")
print("💾 Saved model and labels to Drive!")


✅ Languages found: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']
✅ Total samples: 12040
Epoch 1/30 - Loss: 2.0816 - Acc: 0.3032
Epoch 2/30 - Loss: 1.7688 - Acc: 0.4012
Epoch 3/30 - Loss: 1.5499 - Acc: 0.4689
Epoch 4/30 - Loss: 1.3842 - Acc: 0.5415
Epoch 5/30 - Loss: 1.2365 - Acc: 0.6055
Epoch 6/30 - Loss: 1.1561 - Acc: 0.6184
Epoch 7/30 - Loss: 1.0519 - Acc: 0.6341
Epoch 8/30 - Loss: 0.9670 - Acc: 0.6474
Epoch 9/30 - Loss: 0.9089 - Acc: 0.6973
Epoch 10/30 - Loss: 0.8687 - Acc: 0.6790
Epoch 11/30 - Loss: 0.8126 - Acc: 0.6769
Epoch 12/30 - Loss: 0.7542 - Acc: 0.7251
Epoch 13/30 - Loss: 0.7102 - Acc: 0.7338
Epoch 14/30 - Loss: 0.6873 - Acc: 0.7633
Epoch 15/30 - Loss: 0.6179 - Acc: 0.7791
Epoch 16/30 - Loss: 0.6005 - Acc: 0.7550
Epoch 17/30 - Loss: 0.5495 - Acc: 0.7861
Epoch 18/30 - Loss:

In [None]:
import os
import numpy as np

MEAN_DIR = "/content/drive/MyDrive/lid_features/mean"

X_mean, y_mean = [], []
languages_mean = sorted([d for d in os.listdir(MEAN_DIR) if os.path.isdir(os.path.join(MEAN_DIR, d))])

print("✅ Languages found in mean features:", languages_mean)

for idx, lang in enumerate(languages_mean):
    lang_dir = os.path.join(MEAN_DIR, lang)
    npy_files = sorted([f for f in os.listdir(lang_dir) if f.endswith(".npy")])

    print(f"{lang}: {len(npy_files)} files")

    for f in npy_files:
        data = np.load(os.path.join(lang_dir, f))   # should be (features,)
        X_mean.append(data)
        y_mean.append(idx)

X_mean = np.array(X_mean)
y_mean = np.array(y_mean)

print("✅ Mean dataset loaded:", X_mean.shape, y_mean.shape)


✅ Languages found in mean features: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']
Assamese_augmented: 1008 files
bengali_augmented: 1008 files
english_augmented: 954 files
gujarati_augmented: 1008 files
hindi_augmented: 1008 files
kannada_augmented: 1008 files
malayalam_augmented: 1006 files
marathi_augmented: 1008 files
nepali_augmented: 1008 files
punjabi_augmented: 1008 files
tamil_augmented: 1008 files
telugu_augmented: 1008 files
✅ Mean dataset loaded: (12040, 768) (12040,)


In [None]:
# =========================================
# Full LID Pipeline: High-Accuracy Version
# =========================================

import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from torch.nn.utils.rnn import pad_sequence
import joblib

# -----------------------
# Paths
# -----------------------
MEAN_DIR = "/content/drive/MyDrive/lid_features/mean"
SEQ_DIR  = "/content/drive/MyDrive/lid_features/seq"
OUT_DIR  = "/content/drive/MyDrive/lid_models"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------
# Load mean features
# -----------------------
X_mean, y_mean = [], []
languages_mean = sorted([d for d in os.listdir(MEAN_DIR) if os.path.isdir(os.path.join(MEAN_DIR, d))])

for lang in languages_mean:
    lang_dir = os.path.join(MEAN_DIR, lang)
    npy_files = [f for f in os.listdir(lang_dir) if f.endswith(".npy")]
    for f in npy_files:
        arr = np.load(os.path.join(lang_dir, f))
        X_mean.append(arr)
        y_mean.append(lang)

X_mean = np.array(X_mean)
y_mean = np.array(y_mean)

le = LabelEncoder()
y_mean_enc = le.fit_transform(y_mean)

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_mean, y_mean_enc, test_size=0.2, random_state=42, stratify=y_mean_enc
)

print("✅ Mean features ready:", X_train_m.shape, X_test_m.shape)

# -----------------------
# RandomForest baseline
# -----------------------
rf = RandomForestClassifier(n_estimators=500, max_depth=50, random_state=42, n_jobs=-1)
rf.fit(X_train_m, y_train_m)
y_pred_rf = rf.predict(X_test_m)
print("RandomForest Accuracy:", accuracy_score(y_test_m, y_pred_rf))

# -----------------------
# XGBoost baseline (GPU)
# -----------------------
xgb = XGBClassifier(
    n_estimators=500, max_depth=12, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, eval_metric="mlogloss",
    tree_method="gpu_hist", gpu_id=0, n_jobs=-1
)
xgb.fit(X_train_m, y_train_m)
y_pred_xgb = xgb.predict(X_test_m)
print("XGBoost Accuracy:", accuracy_score(y_test_m, y_pred_xgb))

# Save baselines
joblib.dump(rf, os.path.join(OUT_DIR, "rf_mean.joblib"))
joblib.dump(xgb, os.path.join(OUT_DIR, "xgb_mean.joblib"))

# -----------------------
# Load sequence features
# -----------------------
X_seq, y_seq = [], []
languages_seq = sorted([d for d in os.listdir(SEQ_DIR) if os.path.isdir(os.path.join(SEQ_DIR, d))])

for lang in languages_seq:
    lang_dir = os.path.join(SEQ_DIR, lang)
    npy_files = [f for f in os.listdir(lang_dir) if f.endswith(".npy")]
    for f in npy_files:
        arr = np.load(os.path.join(lang_dir, f))
        X_seq.append(arr)
        y_seq.append(lang)

y_seq_enc = le.transform(y_seq)

X_seq_tensors = [torch.tensor(arr, dtype=torch.float32) for arr in X_seq]
X_seq_padded = pad_sequence(X_seq_tensors, batch_first=True)
y_seq_t = torch.tensor(y_seq_enc, dtype=torch.long)

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_seq_padded, y_seq_t, test_size=0.2, random_state=42, stratify=y_seq_t
)

train_data = TensorDataset(X_train_s, y_train_s)
test_data  = TensorDataset(X_test_s, y_test_s)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_data, batch_size=32)

print("✅ Sequence features ready:", X_train_s.shape, y_train_s.shape)

# -----------------------
# CNN + BiLSTM (enhanced)
# -----------------------
class CNN_BiLSTM(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256, num_classes=len(le.classes_)):
        super().__init__()
        self.conv1 = nn.Conv1d(input_dim, 256, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(256, 128, kernel_size=3, padding=1)
        self.relu  = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.lstm  = nn.LSTM(128, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.fc    = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        x = x.permute(0,2,1)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.permute(0,2,1)
        _, (h, _) = self.lstm(x)
        out = torch.cat((h[-2,:,:], h[-1,:,:]), dim=1)
        out = self.dropout(out)
        return self.fc(out)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN_BiLSTM().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)

# -----------------------
# Train CNN + BiLSTM with Early Stopping
# -----------------------
epochs = 25
best_acc = 0
patience = 5
wait = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluate
    model.eval()
    preds_all, labels_all = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds_all.extend(torch.argmax(out, dim=1).cpu().numpy())
            labels_all.extend(yb.cpu().numpy())

    acc = accuracy_score(labels_all, preds_all)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_loader):.4f} - Acc: {acc:.4f}")

    scheduler.step(acc)

    # Early stopping
    if acc > best_acc:
        best_acc = acc
        wait = 0
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "cnn_bilstm_best.pt"))
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping triggered!")
            break

# Final evaluation
print("Best CNN+BiLSTM Accuracy:", best_acc)
print(classification_report(labels_all, preds_all, target_names=le.classes_))
joblib.dump(le.classes_, os.path.join(OUT_DIR, "languages.pkl"))
print("💾 All models and label map saved!")


✅ Mean features ready: (9632, 768) (2408, 768)


In [None]:
# ========================================
# STEP 1: Imports
# ========================================
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import joblib

# ========================================
# STEP 2: Device
# ========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ========================================
# STEP 3: Model & Feature Extractor
# ========================================
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
base_model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
base_model.eval()  # Freeze by default

# ========================================
# STEP 4: Feature Extraction
# ========================================
def extract_embedding(file_path, augment=False):
    try:
        speech, sr = librosa.load(file_path, sr=16000, mono=True)

        # Optional augmentation
        if augment:
            # Small noise
            speech = speech + 0.005*np.random.randn(len(speech))
            # Time stretch
            rate = np.random.uniform(0.9, 1.1)
            speech = librosa.effects.time_stretch(speech, rate)
            # Pitch shift
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr, n_steps=n_steps)

        inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            hidden_states = base_model(**inputs).last_hidden_state
            embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

# ========================================
# STEP 5: Load Dataset
# ========================================
DATASET_DIR = "/content/drive/MyDrive/augmented_dataset"  # change this path
X, y = [], []
languages = sorted([d for d in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, d))])
print("✅ Languages detected:", languages)

for idx, lang in enumerate(languages):
    lang_dir = os.path.join(DATASET_DIR, lang)
    files = [f for f in os.listdir(lang_dir) if f.endswith(".wav")]

    for f in tqdm(files, desc=f"Processing {lang}"):
        fpath = os.path.join(lang_dir, f)
        emb = extract_embedding(fpath, augment=False)  # No augmentation for main embeddings
        if emb is not None:
            X.append(emb)
            y.append(idx)

X = np.array(X)
y = np.array(y)
print("✅ Dataset loaded:", X.shape, y.shape)

# ========================================
# STEP 6: Normalize Embeddings
# ========================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ========================================
# STEP 7: Train/Test Split
# ========================================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

# ========================================
# STEP 8: Neural Network Classifier
# ========================================
class LanguageClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

input_dim = X_train.shape[1]
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)

# Loss & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ========================================
# STEP 9: Training Loop
# ========================================
epochs = 25
batch_size = 32

def get_batches(X, y, batch_size):
    for i in range(0, len(X), batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in get_batches(X_train_tensor, y_train_tensor, batch_size):
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(X_train_tensor):.4f}")

# ========================================
# STEP 10: Evaluation
# ========================================
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = torch.argmax(outputs, dim=1).cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, target_names=languages))

acc = accuracy_score(y_true, y_pred)
print(f"✅ Final Accuracy: {acc:.4f}")

# ========================================
# STEP 11: Save Model & Encoder
# ========================================
torch.save(model.state_dict(), "/content/drive/MyDrive/language_classifier_nn.pth")
joblib.dump(scaler, "/content/drive/MyDrive/language_scaler.pkl")
joblib.dump(languages, "/content/drive/MyDrive/language_labels.pkl")

print("💾 Saved classifier, scaler, and labels to Drive!")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

✅ Languages detected: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']



Processing Assamese_augmented:   0%|          | 0/1008 [00:00<?, ?it/s][A
Processing Assamese_augmented:   0%|          | 1/1008 [00:33<9:19:01, 33.31s/it][A
Processing Assamese_augmented:   0%|          | 2/1008 [00:34<3:57:13, 14.15s/it][A
Processing Assamese_augmented:   0%|          | 3/1008 [00:35<2:16:21,  8.14s/it][A
Processing Assamese_augmented:   0%|          | 4/1008 [00:35<1:27:12,  5.21s/it][A
Processing Assamese_augmented:   0%|          | 5/1008 [00:36<1:01:11,  3.66s/it][A
Processing Assamese_augmented:   1%|          | 6/1008 [00:37<45:55,  2.75s/it]  [A
Processing Assamese_augmented:   1%|          | 7/1008 [00:38<34:29,  2.07s/it][A
Processing Assamese_augmented:   1%|          | 8/1008 [00:39<28:39,  1.72s/it][A
Processing Assamese_augmented:   1%|          | 9/1008 [00:39<23:17,  1.40s/it][A
Processing Assamese_augmented:   1%|          | 10/1008 [00:40<20:55,  1.26s/it][A
Processing Assamese_augmented:   1%|          | 11/1008 [00:41<18:09,  1.09s/it]

✅ Dataset loaded: (12040, 1024) (12040,)
Epoch 1/25 - Loss: 0.0292
Epoch 2/25 - Loss: 0.0130
Epoch 3/25 - Loss: 0.0101
Epoch 4/25 - Loss: 0.0086
Epoch 5/25 - Loss: 0.0079
Epoch 6/25 - Loss: 0.0088
Epoch 7/25 - Loss: 0.0082
Epoch 8/25 - Loss: 0.0079
Epoch 9/25 - Loss: 0.0073
Epoch 10/25 - Loss: 0.0058
Epoch 11/25 - Loss: 0.0056
Epoch 12/25 - Loss: 0.0054
Epoch 13/25 - Loss: 0.0051
Epoch 14/25 - Loss: 0.0064
Epoch 15/25 - Loss: 0.0043
Epoch 16/25 - Loss: 0.0039
Epoch 17/25 - Loss: 0.0061
Epoch 18/25 - Loss: 0.0083
Epoch 19/25 - Loss: 0.0054
Epoch 20/25 - Loss: 0.0046
Epoch 21/25 - Loss: 0.0050
Epoch 22/25 - Loss: 0.0032
Epoch 23/25 - Loss: 0.0049
Epoch 24/25 - Loss: 0.0032
Epoch 25/25 - Loss: 0.0043

📊 Classification Report:
                     precision    recall  f1-score   support

 Assamese_augmented       1.00      1.00      1.00       201
  bengali_augmented       1.00      1.00      1.00       201
  english_augmented       0.99      0.97      0.98       191
 gujarati_augmented   

above code is 95%

now we have to see

In [None]:
# ========================================
# STEP 1: Imports
# ========================================
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import joblib

# ========================================
# STEP 2: Device
# ========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ========================================
# STEP 3: Model & Feature Extractor
# ========================================
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
base_model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)
base_model.eval()  # Freeze backbone

# ========================================
# STEP 4: Feature Extraction (Improved)
# ========================================
def extract_embedding(file_path, augment=False):
    try:
        speech, sr = librosa.load(file_path, sr=16000, mono=True)

        if augment:
            speech = speech + 0.005*np.random.randn(len(speech))
            rate = np.random.uniform(0.9, 1.1)
            speech = librosa.effects.time_stretch(speech, rate)
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr, n_steps=n_steps)

        inputs = feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            hidden_states = base_model(**inputs).last_hidden_state
            # Use mean + max pooling for richer embedding
            embedding = torch.cat([hidden_states.mean(dim=1), hidden_states.max(dim=1).values], dim=1)
            embedding = embedding.squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

# ========================================
# STEP 5: Load Dataset
# ========================================
DATASET_DIR = "/content/drive/MyDrive/augmented_dataset"  # change path
X, y = [], []
languages = sorted([d for d in os.listdir(DATASET_DIR) if os.path.isdir(os.path.join(DATASET_DIR, d))])
print("✅ Languages detected:", languages)

for idx, lang in enumerate(languages):
    lang_dir = os.path.join(DATASET_DIR, lang)
    files = [f for f in os.listdir(lang_dir) if f.endswith(".wav")]

    for f in tqdm(files, desc=f"Processing {lang}"):
        fpath = os.path.join(lang_dir, f)
        emb = extract_embedding(fpath, augment=False)
        if emb is not None:
            X.append(emb)
            y.append(idx)

X = np.array(X)
y = np.array(y)
print("✅ Dataset loaded:", X.shape, y.shape)

# ========================================
# STEP 6: Normalize Embeddings
# ========================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ========================================
# STEP 7: Train/Test Split
# ========================================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

# ========================================
# STEP 8: Improved Neural Network
# ========================================
class LanguageClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

input_dim = X_train.shape[1]
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)  # slightly lower LR

# ========================================
# STEP 9: Training Loop (Improved)
# ========================================
epochs = 35
batch_size = 32

def get_batches(X, y, batch_size):
    for i in range(0, len(X), batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in get_batches(X_train_tensor, y_train_tensor, batch_size):
        optimizer.zero_grad()
        outputs = model(xb)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(X_train_tensor):.4f}")

# ========================================
# STEP 10: Evaluation
# ========================================
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = torch.argmax(outputs, dim=1).cpu().numpy()
    y_true = y_test_tensor.cpu().numpy()

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, target_names=languages))

acc = accuracy_score(y_true, y_pred)
print(f"✅ Final Accuracy: {acc:.4f}")

# ========================================
# STEP 11: Save Model & Encoder
# ========================================
torch.save(model.state_dict(), "/content/drive/MyDrive/language_classifier_nn.pth")
joblib.dump(scaler, "/content/drive/MyDrive/language_scaler.pkl")
joblib.dump(languages, "/content/drive/MyDrive/language_labels.pkl")

print("💾 Saved classifier, scaler, and labels to Drive!")


Using device: cuda


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Languages detected: ['Assamese_augmented', 'bengali_augmented', 'english_augmented', 'gujarati_augmented', 'hindi_augmented', 'kannada_augmented', 'malayalam_augmented', 'marathi_augmented', 'nepali_augmented', 'punjabi_augmented', 'tamil_augmented', 'telugu_augmented']


Processing Assamese_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.15it/s]
Processing bengali_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.24it/s]
Processing english_augmented: 100%|██████████| 954/954 [01:11<00:00, 13.31it/s]
Processing gujarati_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.38it/s]
Processing hindi_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.25it/s]
Processing kannada_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.22it/s]
Processing malayalam_augmented: 100%|██████████| 1006/1006 [01:14<00:00, 13.49it/s]
Processing marathi_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.29it/s]
Processing nepali_augmented: 100%|██████████| 1008/1008 [01:15<00:00, 13.33it/s]
Processing punjabi_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.25it/s]
Processing tamil_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.09it/s]
Processing telugu_augmented: 100%|██████████| 1008/1008 [01:16<00:00, 13.18it/s]


✅ Dataset loaded: (12040, 2048) (12040,)
Epoch 1/35 - Loss: 0.0279
Epoch 2/35 - Loss: 0.0100
Epoch 3/35 - Loss: 0.0069
Epoch 4/35 - Loss: 0.0056
Epoch 5/35 - Loss: 0.0042
Epoch 6/35 - Loss: 0.0035
Epoch 7/35 - Loss: 0.0032
Epoch 8/35 - Loss: 0.0028
Epoch 9/35 - Loss: 0.0027
Epoch 10/35 - Loss: 0.0027
Epoch 11/35 - Loss: 0.0021
Epoch 12/35 - Loss: 0.0021
Epoch 13/35 - Loss: 0.0018
Epoch 14/35 - Loss: 0.0020
Epoch 15/35 - Loss: 0.0016
Epoch 16/35 - Loss: 0.0019
Epoch 17/35 - Loss: 0.0014
Epoch 18/35 - Loss: 0.0013
Epoch 19/35 - Loss: 0.0015
Epoch 20/35 - Loss: 0.0013
Epoch 21/35 - Loss: 0.0014
Epoch 22/35 - Loss: 0.0012
Epoch 23/35 - Loss: 0.0016
Epoch 24/35 - Loss: 0.0014
Epoch 25/35 - Loss: 0.0009
Epoch 26/35 - Loss: 0.0012
Epoch 27/35 - Loss: 0.0010
Epoch 28/35 - Loss: 0.0008
Epoch 29/35 - Loss: 0.0012
Epoch 30/35 - Loss: 0.0014
Epoch 31/35 - Loss: 0.0009
Epoch 32/35 - Loss: 0.0005
Epoch 33/35 - Loss: 0.0010
Epoch 34/35 - Loss: 0.0011
Epoch 35/35 - Loss: 0.0008

📊 Classification Repor

In [None]:
# ===============================
# Install dependencies
# ===============================
#!pip install SpeechRecognition googletrans==4.0.0-rc1 gTTS pydub ipywidgets moviepy soundfile torch transformers

# ===============================
# Imports
# ===============================
import numpy as np
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
from IPython.display import Audio, display, clear_output, Javascript
from google.colab import drive, output as colab_output
import ipywidgets as widgets
import joblib
import torch
import torch.nn as nn
from moviepy.editor import VideoFileClip
import base64
import soundfile as sf
import io
import os
import time
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

# ===============================
# Mount Google Drive
# ===============================
#drive.mount('/content/drive')

# ===============================
# Load saved Wav2Vec2 classifier
# ===============================
# Paths to your trained files
MODEL_PATH = "/content/drive/MyDrive/nn/language_classifier_nn.pth"
SCALER_PATH = "/content/drive/MyDrive/nn/language_scaler.pkl"
LABELS_PATH = "/content/drive/MyDrive/nn/language_labels.pkl"

scaler = joblib.load(SCALER_PATH)
labels = joblib.load(LABELS_PATH)
languages = labels

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define same NN as training
# Define same NN as training (exact)
class LanguageClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Load model
input_dim = 2048  # matches Wav2Vec2 embedding size
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()


# Load Wav2Vec2 feature extractor
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
BASE_MODEL = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53").to(device)
BASE_MODEL.eval()

# ===============================
# gTTS language mapping
# ===============================
lang_code_map = {
    "english_augmented": "en",
    "hindi_augmented": "hi",
    "bengali_augmented": "bn",
    "tamil_augmented": "ta",
    "telugu_augmented": "te",
    "kannada_augmented": "kn",
    "malayalam_augmented": "ml",
    "marathi_augmented": "mr",
    "gujarati_augmented": "gu",
    "punjabi_augmented": "pa",
    "nepali_augmented": "ne",
    "Assamese_augmented": "as"
}

# ===============================
# Initialize recognizer & translator
# ===============================
recognizer = sr.Recognizer()
translator = Translator()

# ===============================
# Feature extraction & prediction
# ===============================
# ===============================
# Feature extraction & prediction (training-style)
# ===============================
def extract_embedding(file_path, augment=False):
    """
    Extract embedding from audio using Wav2Vec2 backbone.
    Matches training: mean + max pooling of last hidden state.
    """
    try:
        # Load audio
        speech, sr_ = librosa.load(file_path, sr=16000, mono=True)

        if augment:
            # Optional small augmentation
            speech = speech + 0.005*np.random.randn(len(speech))
            rate = np.random.uniform(0.9, 1.1)
            speech = librosa.effects.time_stretch(speech, rate)
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr_, n_steps=n_steps)

        # Feature extraction
        inputs = FEATURE_EXTRACTOR(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through Wav2Vec2
        with torch.no_grad():
            hidden_states = BASE_MODEL(**inputs).last_hidden_state
            # mean + max pooling (matches training)
            embedding = torch.cat([hidden_states.mean(dim=1), hidden_states.max(dim=1).values], dim=1)
            embedding = embedding.squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

def predict_language(file_path):
    """
    Predict the language of an audio file.
    """
    emb = extract_embedding(file_path)
    if emb is None:
        return "[Embedding Error]"

    emb_scaled = scaler.transform([emb])
    tensor = torch.tensor(emb_scaled, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(tensor)
        pred_idx = torch.argmax(outputs, dim=1).item()

    return labels[pred_idx]


# ===============================
# Transcribe audio
# ===============================
def transcribe_audio(file_path):
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
        try:
            return recognizer.recognize_google(audio)
        except:
            return "[Could not transcribe audio]"

# ===============================
# Translate & speak text
# ===============================
def translate_text(text, target_lang):
    t_code = lang_code_map.get(target_lang, "en")
    try:
        return translator.translate(text, dest=t_code).text
    except:
        return text

def speak_text(text, target_lang):
    t_code = lang_code_map.get(target_lang, "en")
    tts = gTTS(text=text, lang=t_code)
    tts.save("output.mp3")
    display(Audio("output.mp3", autoplay=True))

# ===============================
# Handle audio/video/mic
# ===============================
def handle_audio_file(file_path, target_lang):
    with output_area:
        clear_output()
        print(f"Processing file: {file_path}")
        detected_lang = predict_language(file_path)
        print("Predicted Language:", detected_lang)
        text = transcribe_audio(file_path)
        print("Transcribed Text:", text)
        translated = translate_text(text, target_lang)
        print(f"Translated ({target_lang}):", translated)
        speak_text(translated, target_lang)

def handle_video_file(file_path, target_lang):
    audio_path = file_path.rsplit('.', 1)[0] + "_audio.wav"
    video = VideoFileClip(file_path)
    video.audio.write_audiofile(audio_path, fps=16000)
    handle_audio_file(audio_path, target_lang)

# ===============================
# Mic recording via JS
# ===============================
recorded_audio = None
RECORD_JS = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
var recorder, gumStream;
var recordButton = document.createElement("button");
recordButton.innerHTML = "🎙️ Start Recording";
recordButton.style.fontSize = "20px";
recordButton.style.padding = "10px";
recordButton.style.margin = "10px";
recordButton.onclick = async () => {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      recordButton.innerHTML = "🎙️ Start Recording";
      return;
  }
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  let chunks = [];
  recorder.ondataavailable = e => { if (e.data.size > 0) chunks.push(e.data); };
  recorder.onstop = async ()=> {
      const blob = new Blob(chunks, { type: 'audio/wav' });
      let arrayBuffer = await new Response(blob).arrayBuffer();
      let base64String = btoa(String.fromCharCode(...new Uint8Array(arrayBuffer)));
      google.colab.kernel.invokeFunction('notebook.get_audio', [base64String], {});
  };
  recorder.start();
  recordButton.innerHTML = "⏹️ Stop Recording";
};
document.body.appendChild(recordButton);
"""

def get_audio(b64string):
    global recorded_audio
    recorded_audio = base64.b64decode(b64string)

colab_output.register_callback('notebook.get_audio', get_audio)

save_dir = "/content/drive/MyDrive/mic_recordings"
os.makedirs(save_dir, exist_ok=True)

def save_recorded_audio(b64_bytes, save_path):
    try:
        audio_buffer = io.BytesIO(b64_bytes)
        data, samplerate = sf.read(audio_buffer)
        sf.write(save_path, data, samplerate)
        print(f"✅ Mic recording saved to Drive: {save_path}")
    except Exception as e:
        print(f"⚠️ Could not save audio properly: {e}")

def handle_mic_click(b):
    with output_area:
        clear_output()
        if recorded_audio:
            filename = f"mic_recording_{int(time.time())}.wav"
            tmp_path = os.path.join(save_dir, filename)
            save_recorded_audio(recorded_audio, tmp_path)
            handle_audio_file(tmp_path, lang_dropdown.value)
        else:
            display(Javascript(RECORD_JS))
            print("⚠️ Please record audio using the button above and click again.")

# ===============================
# GUI
# ===============================
output_area = widgets.Output()

lang_dropdown = widgets.Dropdown(
    options=languages,
    description="Translate to:",
    value="hindi_augmented"
)

audio_upload = widgets.FileUpload(accept=".wav,.mp3", multiple=False)
video_upload = widgets.FileUpload(accept=".mp4,.avi,.mkv,.mov", multiple=False)
mic_button = widgets.Button(description="🎤 Record from Mic")

def on_audio_upload(change):
    if change.new:
        uploaded_file = list(change.new.values())[0]
        with open(uploaded_file['metadata']['name'], "wb") as f:
            f.write(uploaded_file['content'])
        handle_audio_file(uploaded_file['metadata']['name'], lang_dropdown.value)

def on_video_upload(change):
    if change.new:
        uploaded_file = list(change.new.values())[0]
        with open(uploaded_file['metadata']['name'], "wb") as f:
            f.write(uploaded_file['content'])
        handle_video_file(uploaded_file['metadata']['name'], lang_dropdown.value)

audio_upload.observe(on_audio_upload, names='value')
video_upload.observe(on_video_upload, names='value')
mic_button.on_click(handle_mic_click)

display(widgets.VBox([
    widgets.Label("Upload Audio/Video or Record from Mic for Translation"),
    widgets.HBox([audio_upload, video_upload, mic_button]),
    lang_dropdown,
    output_area
]))


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='Upload Audio/Video or Record from Mic for Translation'), HBox(children=(FileUpload…

with whisper

In [None]:
# ===============================
# Install dependencies
# ===============================
#!pip install SpeechRecognition googletrans==4.0.0-rc1 gTTS pydub ipywidgets moviepy soundfile torch transformers
#!pip install openai-whisper
# ===============================
# Imports
# ===============================




import numpy as np
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
from IPython.display import Audio, display, clear_output, Javascript
from google.colab import drive, output as colab_output
import ipywidgets as widgets
import joblib
import torch
import torch.nn as nn
from moviepy.editor import VideoFileClip
import base64
import soundfile as sf
import io
import os
import time
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

# ===============================
# Mount Google Drive
# ===============================
#drive.mount('/content/drive')

# ===============================
# Load saved Wav2Vec2 classifier
# ===============================
# Paths to your trained files
MODEL_PATH = "/content/drive/MyDrive/nn/language_classifier_nn.pth"
SCALER_PATH = "/content/drive/MyDrive/nn/language_scaler.pkl"
LABELS_PATH = "/content/drive/MyDrive/nn/language_labels.pkl"

scaler = joblib.load(SCALER_PATH)
labels = joblib.load(LABELS_PATH)
languages = labels

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define same NN as training
# Define same NN as training (exact)
class LanguageClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Load model
input_dim = 2048  # matches Wav2Vec2 embedding size
num_classes = len(languages)
model = LanguageClassifier(input_dim, num_classes).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()


# Load Wav2Vec2 feature extractor
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
BASE_MODEL = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53").to(device)
BASE_MODEL.eval()

# ===============================
# gTTS language mapping
# ===============================
lang_code_map = {
    "english_augmented": "en",
    "hindi_augmented": "hi",
    "bengali_augmented": "bn",
    "tamil_augmented": "ta",
    "telugu_augmented": "te",
    "kannada_augmented": "kn",
    "malayalam_augmented": "ml",
    "marathi_augmented": "mr",
    "gujarati_augmented": "gu",
    "punjabi_augmented": "pa",
    "nepali_augmented": "ne",
    "Assamese_augmented": "as"
}

# ===============================
# Initialize recognizer & translator
# ===============================
recognizer = sr.Recognizer()
translator = Translator()

# ===============================
# Feature extraction & prediction
# ===============================
# ===============================
# Feature extraction & prediction (training-style)
# ===============================
def extract_embedding(file_path, augment=False):
    """
    Extract embedding from audio using Wav2Vec2 backbone.
    Matches training: mean + max pooling of last hidden state.
    """
    try:
        # Load audio
        speech, sr_ = librosa.load(file_path, sr=16000, mono=True)

        if augment:
            # Optional small augmentation
            speech = speech + 0.005*np.random.randn(len(speech))
            rate = np.random.uniform(0.9, 1.1)
            speech = librosa.effects.time_stretch(speech, rate)
            n_steps = np.random.randint(-2, 3)
            speech = librosa.effects.pitch_shift(speech, sr=sr_, n_steps=n_steps)

        # Feature extraction
        inputs = FEATURE_EXTRACTOR(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through Wav2Vec2
        with torch.no_grad():
            hidden_states = BASE_MODEL(**inputs).last_hidden_state
            # mean + max pooling (matches training)
            embedding = torch.cat([hidden_states.mean(dim=1), hidden_states.max(dim=1).values], dim=1)
            embedding = embedding.squeeze().cpu().numpy()

        return embedding
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

def predict_language(file_path):
    """
    Predict the language of an audio file.
    """
    emb = extract_embedding(file_path)
    if emb is None:
        return "[Embedding Error]"

    emb_scaled = scaler.transform([emb])
    tensor = torch.tensor(emb_scaled, dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs = model(tensor)
        pred_idx = torch.argmax(outputs, dim=1).item()

    return labels[pred_idx]


# ===============================
# Transcribe audio
# ===============================
import whisper

# Load Whisper model (choose "small", "medium", or "large")
whisper_model = whisper.load_model("large")  # you can try "medium" or "large" for higher accuracy

# ===============================
# Transcribe audio with Whisper
# ===============================
def transcribe_audio(file_path):
    """
    Transcribe audio using OpenAI Whisper.
    """
    try:
        result = whisper_model.transcribe(file_path, fp16=False)
        return result["text"]
    except Exception as e:
        print(f"⚠️ Whisper transcription failed: {e}")
        return "[Could not transcribe audio]"

# ===============================
# Translate & speak text
# ===============================
def translate_text(text, target_lang):
    t_code = lang_code_map.get(target_lang, "en")
    try:
        return translator.translate(text, dest=t_code).text
    except:
        return text

def speak_text(text, target_lang):
    t_code = lang_code_map.get(target_lang, "en")
    tts = gTTS(text=text, lang=t_code)
    tts.save("output.mp3")
    display(Audio("output.mp3", autoplay=True))

# ===============================
# Handle audio/video/mic
# ===============================
def handle_audio_file(file_path, target_lang):
    with output_area:
        clear_output()
        print(f"Processing file: {file_path}")
        detected_lang = predict_language(file_path)
        print("Predicted Language:", detected_lang)
        text = transcribe_audio(file_path)
        print("Transcribed Text:", text)
        translated = translate_text(text, target_lang)
        print(f"Translated ({target_lang}):", translated)
        speak_text(translated, target_lang)

def handle_video_file(file_path, target_lang):
    audio_path = file_path.rsplit('.', 1)[0] + "_audio.wav"
    video = VideoFileClip(file_path)
    video.audio.write_audiofile(audio_path, fps=16000)
    handle_audio_file(audio_path, target_lang)

# ===============================
# Mic recording via JS
# ===============================
recorded_audio = None
RECORD_JS = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
var recorder, gumStream;
var recordButton = document.createElement("button");
recordButton.innerHTML = "🎙️ Start Recording";
recordButton.style.fontSize = "20px";
recordButton.style.padding = "10px";
recordButton.style.margin = "10px";
recordButton.onclick = async () => {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      recordButton.innerHTML = "🎙️ Start Recording";
      return;
  }
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  let chunks = [];
  recorder.ondataavailable = e => { if (e.data.size > 0) chunks.push(e.data); };
  recorder.onstop = async ()=> {
      const blob = new Blob(chunks, { type: 'audio/wav' });
      let arrayBuffer = await new Response(blob).arrayBuffer();
      let base64String = btoa(String.fromCharCode(...new Uint8Array(arrayBuffer)));
      google.colab.kernel.invokeFunction('notebook.get_audio', [base64String], {});
  };
  recorder.start();
  recordButton.innerHTML = "⏹️ Stop Recording";
};
document.body.appendChild(recordButton);
"""

def get_audio(b64string):
    global recorded_audio
    recorded_audio = base64.b64decode(b64string)

colab_output.register_callback('notebook.get_audio', get_audio)

save_dir = "/content/drive/MyDrive/mic_recordings"
os.makedirs(save_dir, exist_ok=True)

def save_recorded_audio(b64_bytes, save_path):
    try:
        audio_buffer = io.BytesIO(b64_bytes)
        data, samplerate = sf.read(audio_buffer)
        sf.write(save_path, data, samplerate)
        print(f"✅ Mic recording saved to Drive: {save_path}")
    except Exception as e:
        print(f"⚠️ Could not save audio properly: {e}")

def handle_mic_click(b):
    with output_area:
        clear_output()
        if recorded_audio:
            filename = f"mic_recording_{int(time.time())}.wav"
            tmp_path = os.path.join(save_dir, filename)
            save_recorded_audio(recorded_audio, tmp_path)
            handle_audio_file(tmp_path, lang_dropdown.value)
        else:
            display(Javascript(RECORD_JS))
            print("⚠️ Please record audio using the button above and click again.")

# ===============================
# GUI
# ===============================
output_area = widgets.Output()

lang_dropdown = widgets.Dropdown(
    options=languages,
    description="Translate to:",
    value="hindi_augmented"
)

audio_upload = widgets.FileUpload(accept=".wav,.mp3", multiple=False)
video_upload = widgets.FileUpload(accept=".mp4,.avi,.mkv,.mov", multiple=False)
mic_button = widgets.Button(description="🎤 Record from Mic")

def on_audio_upload(change):
    if change.new:
        uploaded_file = list(change.new.values())[0]
        with open(uploaded_file['metadata']['name'], "wb") as f:
            f.write(uploaded_file['content'])
        handle_audio_file(uploaded_file['metadata']['name'], lang_dropdown.value)

def on_video_upload(change):
    if change.new:
        uploaded_file = list(change.new.values())[0]
        with open(uploaded_file['metadata']['name'], "wb") as f:
            f.write(uploaded_file['content'])
        handle_video_file(uploaded_file['metadata']['name'], lang_dropdown.value)

audio_upload.observe(on_audio_upload, names='value')
video_upload.observe(on_video_upload, names='value')
mic_button.on_click(handle_mic_click)

display(widgets.VBox([
    widgets.Label("Upload Audio/Video or Record from Mic for Translation"),
    widgets.HBox([audio_upload, video_upload, mic_button]),
    lang_dropdown,
    output_area
]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='Upload Audio/Video or Record from Mic for Translation'), HBox(children=(FileUpload…