In [2]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive; both paths below live underneath it.
drive.mount('/content/drive')

# Dataset root: each entry inside it is listed below as a "state".
DATA_PATH = "/content/drive/MyDrive/dataset"
# Drive folder used as the output location for saved artifacts.
SAVE_PATH = "/content/drive/MyDrive"

# Sanity check: print at most the first ten dataset entries, in sorted order.
dataset_entries = os.listdir(DATA_PATH)
dataset_entries.sort()
print("States found:", dataset_entries[:10])



Mounted at /content/drive
States found: ['andhra_pradesh', 'gujrat', 'jharkhand', 'karnataka', 'kerala', 'tamil']


In [None]:
import numpy as np
import librosa
from tqdm import tqdm
import torch
from transformers import AutoFeatureExtractor, HubertModel

import joblib  # kept local to this cell, as in the original

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
MODEL_NAME = "facebook/hubert-base-ls960"
SAMPLE_RATE = 16000                       # Hz; every clip is resampled to this on load
MAX_SECONDS = 5                           # clips longer than this are truncated
MAX_SAMPLES = SAMPLE_RATE * MAX_SECONDS

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
hubert = HubertModel.from_pretrained(MODEL_NAME).to(device)
hubert.eval()


def _embed_clip(path):
    """Return one mean-pooled HuBERT embedding for the audio file at ``path``.

    The clip is loaded at ``SAMPLE_RATE``, peak-normalized, truncated to
    ``MAX_SAMPLES`` samples, passed through ``hubert`` without gradients, and
    averaged over the time axis.

    Returns:
        A 1-D NumPy array (the batch axis of size one is removed).

    Raises:
        Whatever ``librosa``, the feature extractor, or the model raise for an
        unreadable or unusable file; the caller decides how to handle it.
    """
    y, _ = librosa.load(path, sr=SAMPLE_RATE)
    # Normalize first, then cap the length (same order as before).
    y = librosa.util.normalize(y)[:MAX_SAMPLES]

    inputs = extractor(
        y, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True
    ).to(device)
    with torch.no_grad():
        hidden = hubert(**inputs).last_hidden_state  # (batch=1, frames, hidden)
    # squeeze(0) drops only the batch axis, so the result is always 1-D.
    return hidden.mean(dim=1).squeeze(0).cpu().numpy()


# ---------------------------------------------------------------------------
# Class list
# ---------------------------------------------------------------------------
# Only real, non-hidden sub-directories count as classes. A stray file would
# make os.listdir(folder) fail below, and a hidden folder such as
# ".ipynb_checkpoints" would sort first and shift every label index.
# Sorted so that label i always maps to states[i].
states = sorted(
    entry
    for entry in os.listdir(DATA_PATH)
    if not entry.startswith(".") and os.path.isdir(os.path.join(DATA_PATH, entry))
)

# ---------------------------------------------------------------------------
# Extraction loop
# ---------------------------------------------------------------------------
feature_rows, label_rows = [], []
skipped = []  # (state, filename) for every file that could not be processed

for label, state in enumerate(states):
    folder = os.path.join(DATA_PATH, state)
    # Sorted so the row order of the saved arrays is reproducible across runs.
    wavs = sorted(f for f in os.listdir(folder) if f.lower().endswith(".wav"))
    print(f"Processing {state}: {len(wavs)} files")

    for fname in tqdm(wavs, desc=state):
        try:
            emb = _embed_clip(os.path.join(folder, fname))
        except Exception as e:
            # Best-effort: one bad file must not abort a multi-hour run.
            # Record it and keep going; tqdm.write keeps the bar intact.
            skipped.append((state, fname))
            tqdm.write(f"skip: {state}/{fname} -> {e!r}")
            continue
        feature_rows.append(emb)
        label_rows.append(label)

# Refuse to overwrite previously saved outputs with empty arrays.
if not feature_rows:
    raise RuntimeError(
        f"No features were extracted from {DATA_PATH!r} "
        f"({len(states)} state folders, {len(skipped)} files skipped)."
    )

features = np.array(feature_rows, dtype=np.float32)
labels = np.array(label_rows, dtype=np.int64)
assert len(features) == len(labels), "features and labels are out of sync"

# ---------------------------------------------------------------------------
# Persist results
# ---------------------------------------------------------------------------
np.save(os.path.join(SAVE_PATH, "hubert_features.npy"), features)
np.save(os.path.join(SAVE_PATH, "hubert_labels.npy"), labels)
# Save the label -> state mapping (list index == label id) next to the arrays.
joblib.dump(states, os.path.join(SAVE_PATH, "hubert_states.pkl"))

print("✅ Saved:")
print("features:", features.shape, "| labels:", labels.shape)
print("states order:", states)
print("skipped files:", len(skipped))


Using device: cpu
Processing andhra_pradesh: 1794 files


andhra_pradesh: 100%|██████████| 1794/1794 [31:18<00:00,  1.05s/it]


Processing gujrat: 298 files


gujrat: 100%|██████████| 298/298 [05:43<00:00,  1.15s/it]


Processing jharkhand: 827 files


jharkhand: 100%|██████████| 827/827 [18:09<00:00,  1.32s/it]


Processing karnataka: 1686 files


karnataka: 100%|██████████| 1686/1686 [39:13<00:00,  1.40s/it]


Processing kerala: 1671 files


kerala: 100%|██████████| 1671/1671 [33:35<00:00,  1.21s/it]


Processing tamil: 1840 files


tamil: 100%|█████████▉| 1839/1840 [34:44<00:01,  1.21s/it]