In [1]:
import sys
# Print the path of the Python interpreter running this kernel, so the active
# environment can be confirmed before any heavy dependency is imported.
print(sys.executable)


C:\Users\Kushal\anaconda3\envs\accentid\python.exe


In [2]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Root folder holding one sub-folder of .wav files per region.
# Set the ACCENT_DATASET_PATH environment variable to point the notebook at a
# different location without editing this cell; when it is unset the original
# local path below is used unchanged.
DATASET_PATH = os.environ.get(
    "ACCENT_DATASET_PATH",
    r"C:\Users\Kushal\OneDrive - Vasavi College Of Engineering\Desktop\Kushal\Studies\IIITH RI\final_project\datasets",
)

# Region folder name -> language label used as the classification target.
# Keys are looked up with the on-disk folder names, so they must match exactly
# ("gujrat" is presumably the folder's actual spelling — confirm before renaming).
label_map = {
    "andhra_pradesh": "telugu",
    "gujrat": "gujarati",
    "jharkhand": "hindi",
    "karnataka": "kannada",
    "kerala": "malayalam",
    "tamil": "tamil"
}


In [4]:
# Collect every .wav file under DATASET_PATH together with its language label.
# audio_files[i] and labels[i] always describe the same clip.
audio_files = []
labels = []

# sorted() pins the sample order: os.listdir returns entries in arbitrary
# order, and the feature caches saved later in this notebook are index-aligned
# with audio_files.
for region_folder in sorted(os.listdir(DATASET_PATH)):
    folder_path = os.path.join(DATASET_PATH, region_folder)
    if not os.path.isdir(folder_path):
        continue

    language_label = label_map.get(region_folder)
    if language_label is None:
        # An unmapped folder would otherwise add None labels that only fail
        # (or get mixed in) much later, during label encoding.
        print(f"⚠️ Skipping folder without a label mapping: {region_folder}")
        continue

    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".wav"):
            audio_files.append(os.path.join(folder_path, file))
            labels.append(language_label)

print("Total audio samples:", len(audio_files))
if not audio_files:
    # Fail with an actionable message instead of an IndexError on audio_files[0].
    raise FileNotFoundError(f"No .wav files found under {DATASET_PATH!r}")
print("Example:", audio_files[0], "→", labels[0])


Total audio samples: 8116
Example: C:\Users\Kushal\OneDrive - Vasavi College Of Engineering\Desktop\Kushal\Studies\IIITH RI\final_project\datasets\andhra_pradesh\Andhra_speaker (1).wav → telugu


In [5]:
# Learn the string -> integer class mapping from the collected labels, then
# apply it to obtain one integer class id per audio file.
le = LabelEncoder().fit(labels)
encoded_labels = le.transform(labels)
print("Classes:", le.classes_)


Classes: ['gujarati' 'hindi' 'kannada' 'malayalam' 'tamil' 'telugu']


In [6]:
import librosa

def extract_mfcc(file_path, n_mfcc=40, max_len=200):
    """Load an audio file and return its MFCCs with a fixed number of frames.

    The audio is loaded at 16 kHz and ``n_mfcc`` coefficients are computed per
    frame. The time axis (axis 1) is then truncated, or zero-padded on the
    right, so that it has exactly ``max_len`` columns.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        max_len: Number of time frames in the returned matrix.

    Returns:
        The MFCC matrix with ``max_len`` columns.
    """
    signal, rate = librosa.load(file_path, sr=16000)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

    n_frames = coeffs.shape[1]
    if n_frames >= max_len:
        # Long enough already: keep only the first max_len frames.
        return coeffs[:, :max_len]

    # Too short: append zero frames on the right of the time axis only.
    missing = max_len - n_frames
    return np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')


In [7]:
import os
import numpy as np

# On-disk caches for the MFCC feature tensor and the encoded labels.
mfcc_cache = "mfcc_features.npy"
label_cache = "labels.npy"

if os.path.exists(mfcc_cache) and os.path.exists(label_cache):
    print("✅ MFCC features already extracted. Loading from cache...")
    X = np.load(mfcc_cache)
    y = np.load(label_cache)

    # Features and labels are stored in separate files; if they disagree the
    # cache is unusable, so stop here with a clear message.
    if len(X) != len(y):
        raise ValueError(
            f"Cached features ({len(X)}) and labels ({len(y)}) differ in length; "
            f"delete {mfcc_cache} and {label_cache} to re-extract."
        )
    # A cache built from a different dataset listing still loads, but its rows
    # no longer correspond to audio_files — surface that instead of hiding it.
    if len(X) != len(audio_files):
        print(
            f"⚠️ Cache holds {len(X)} samples but {len(audio_files)} audio files "
            f"were found; delete {mfcc_cache} and {label_cache} to re-extract."
        )
else:
    print("⏳ Extracting MFCC features... (one-time operation)")
    X = np.array([extract_mfcc(f) for f in tqdm(audio_files)])
    y = np.array(encoded_labels)

    np.save(mfcc_cache, X)
    np.save(label_cache, y)

print("MFCC feature shape:", X.shape)

✅ MFCC features already extracted. Loading from cache...
MFCC feature shape: (8116, 40, 200)


In [8]:
# Hold out 20% of the clips for testing. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
# NOTE(review): the split is per clip — if clips from one speaker land on both
# sides, the test score may be optimistic. Confirm against the speaker metadata.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train:", X_train.shape, " Test:", X_test.shape)


Train: (6492, 40, 200)  Test: (1624, 40, 200)


In [9]:
from sklearn.ensemble import RandomForestClassifier

# The random forest takes one feature row per sample, so flatten every MFCC
# matrix into a single vector.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Baseline classifier on the flattened MFCCs (seeded for repeatability).
clf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train_flat, y_train)

# Score the held-out split and print the per-class breakdown.
y_pred = clf.predict(X_test_flat)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Accuracy: 0.9901477832512315
              precision    recall  f1-score   support

    gujarati       1.00      0.97      0.98        60
       hindi       1.00      0.98      0.99       166
     kannada       1.00      0.99      1.00       337
   malayalam       0.99      0.99      0.99       334
       tamil       0.99      0.99      0.99       368
      telugu       0.98      0.99      0.98       359

    accuracy                           0.99      1624
   macro avg       0.99      0.99      0.99      1624
weighted avg       0.99      0.99      0.99      1624



In [10]:
import torch
import torchaudio
from transformers import HubertModel, AutoFeatureExtractor

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using:", device)

# Input pre-processor and pretrained encoder from the same checkpoint; the
# encoder is moved to the selected device once, here.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
hubert = HubertModel.from_pretrained("facebook/hubert-base-ls960")
hubert = hubert.to(device)


  from .autonotebook import tqdm as notebook_tqdm


Using: cpu


In [11]:
def extract_hubert_embedding(file_path):
    """Return one mean-pooled HuBERT embedding for an audio file.

    The clip is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz if necessary, passed through the module-level ``feature_extractor``
    and ``hubert`` model on ``device``, and the last hidden state is averaged
    over the time axis.

    Args:
        file_path: Path of the audio file to embed.

    Returns:
        A 1-D NumPy array holding the pooled hidden state (768 values for the
        base checkpoint loaded in this notebook).
    """
    waveform, sr = torchaudio.load(file_path)  # (channels, samples)

    # Multi-channel audio -> mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The model input below is declared as 16 kHz, so resample anything else.
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

    # Remove only the channel axis (a bare squeeze() would also collapse the
    # sample axis of a one-sample clip) and hand the extractor a NumPy array,
    # which is the input type it documents, rather than a torch tensor.
    samples = waveform.squeeze(0).numpy()

    inputs = feature_extractor(
        samples,
        sampling_rate=16000,
        return_tensors="pt"
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = hubert(**inputs).last_hidden_state  # (1, frames, hidden)

    # Average over the frame axis, then drop the batch axis -> (hidden,)
    return hidden_states.mean(dim=1).squeeze(0).cpu().numpy()


In [12]:
# Smoke-test the extractor on the first clip. The bare last expression makes
# the notebook display the embedding's shape.
emb = extract_hubert_embedding(audio_files[0])
emb.shape


(768,)

In [13]:
# On-disk cache for the per-clip HuBERT embeddings.
hubert_cache = "hubert_features.npy"

if not os.path.exists(hubert_cache):
    print("⏳ Extracting HuBERT embeddings... (this is the heavy step, only once)")
    # One embedding per clip, stacked in audio_files order, then saved for reuse.
    hubert_features = np.array(
        [extract_hubert_embedding(path) for path in tqdm(audio_files)]
    )
    np.save(hubert_cache, hubert_features)
else:
    print("✅ HuBERT embeddings already extracted. Loading from cache...")
    hubert_features = np.load(hubert_cache)

print("HuBERT feature matrix:", hubert_features.shape)


✅ HuBERT embeddings already extracted. Loading from cache...
HuBERT feature matrix: (8116, 768)


In [14]:
# Same 80/20 stratified split as for the MFCC features (same seed and labels),
# applied to the HuBERT embeddings.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    hubert_features,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train:", X_train_h.shape, "Test:", X_test_h.shape)


Train: (6492, 768) Test: (1624, 768)


In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Reshape

# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling repeat from run to run
# (as far as seeding alone allows; some GPU kernels may remain non-deterministic).
tf.keras.utils.set_random_seed(42)

# One output unit per class known to the label encoder.
num_classes = len(le.classes_)

# Classifier head on top of the pooled HuBERT embedding. Each 768-d vector is
# reshaped into a sequence of length 1 so it can be fed to the recurrent layer.
model = Sequential([
    Reshape((1, 768), input_shape=(768,)),       # convert vector to sequence of length 1
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.35),
    Dense(128, activation='relu'),
    Dropout(0.35),
    Dense(num_classes, activation='softmax')
])

# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 1, 768)            0         
                                                                 
 bidirectional (Bidirection  (None, 256)               918528    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 6)                 774       
                                                        

In [16]:
import numpy as np
import tensorflow as tf

# Record the NumPy and TensorFlow versions used for this run.
print("NumPy:", np.__version__)
print("TF:", tf.__version__)


NumPy: 1.26.4
TF: 2.14.0


In [17]:
# Train for 8 epochs in mini-batches of 32. The test split is passed as
# validation data, so its loss/accuracy are reported after every epoch and
# stored in `history`.
history = model.fit(
    x=X_train_h,
    y=y_train_h,
    batch_size=32,
    epochs=8,
    validation_data=(X_test_h, y_test_h),
)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [18]:
# Class probabilities for the held-out embeddings; the predicted class is the
# index of the largest probability in each row.
y_pred_probs = model.predict(X_test_h)
y_pred = y_pred_probs.argmax(axis=1)

print("BiLSTM Accuracy:", accuracy_score(y_test_h, y_pred))
print(classification_report(y_test_h, y_pred, target_names=le.classes_))


BiLSTM Accuracy: 0.9926108374384236
              precision    recall  f1-score   support

    gujarati       1.00      1.00      1.00        60
       hindi       0.99      1.00      0.99       166
     kannada       0.99      1.00      1.00       337
   malayalam       0.99      0.99      0.99       334
       tamil       1.00      0.99      0.99       368
      telugu       0.99      0.99      0.99       359

    accuracy                           0.99      1624
   macro avg       0.99      0.99      0.99      1624
weighted avg       0.99      0.99      0.99      1624



In [19]:
# Partition the clips by whether the file name contains "child"
# (case-insensitive). Everything else is treated as an adult recording.
adult_files, adult_labels = [], []
child_files, child_labels = [], []

for path, lang in zip(audio_files, labels):
    if "child" not in os.path.basename(path).lower():
        adult_files.append(path)
        adult_labels.append(lang)
        continue
    child_files.append(path)
    child_labels.append(lang)

print("Adult samples:", len(adult_files))
print("Child samples:", len(child_files))


Adult samples: 8116
Child samples: 0


In [20]:
# Partition the clips by length: anything shorter than 2.5 s is treated as a
# single word, everything else as a sentence.
word_files = []
sentence_files = []
word_labels = []
sentence_labels = []

for f, lbl in zip(audio_files, labels):
    # Read the duration from the file header instead of decoding and
    # resampling the whole clip just to measure it. For clips not stored at
    # 16 kHz this can differ from the post-resampling length by less than one
    # sample, which only matters for a clip sitting exactly on the threshold.
    duration = sf.info(f).duration

    if duration < 2.5:
        word_files.append(f)
        word_labels.append(lbl)
    else:
        sentence_files.append(f)
        sentence_labels.append(lbl)

print("Word samples:", len(word_files))
print("Sentence samples:", len(sentence_files))


Word samples: 730
Sentence samples: 7386


In [21]:
import os
import numpy as np

# Cache files for the word-level and sentence-level subsets.
word_cache = "word_hubert_features.npy"
word_label_cache = "word_labels.npy"
sentence_cache = "sentence_hubert_features.npy"
sentence_label_cache = "sentence_labels.npy"


def _load_or_extract_hubert(files, string_labels, emb_path, label_path, level):
    """Load cached HuBERT embeddings and encoded labels, or build and cache them.

    Args:
        files: Audio paths to embed when no cache exists.
        string_labels: Language label of each path, encoded with ``le``.
        emb_path: ``.npy`` file holding the embedding matrix.
        label_path: ``.npy`` file holding the encoded labels.
        level: Subset name used in the progress messages ("word" / "sentence").

    Returns:
        Tuple ``(embeddings, encoded_labels)``.
    """
    # Both files must be present for the cache to be used.
    if os.path.exists(emb_path) and os.path.exists(label_path):
        print(f"✅ {level.capitalize()}-level HuBERT embeddings already cached. Loading...")
        return np.load(emb_path), np.load(label_path)

    print(f"⏳ Extracting {level}-level embeddings... (one-time operation)")
    embeddings = np.array([extract_hubert_embedding(f) for f in tqdm(files)])
    encoded = le.transform(string_labels)

    np.save(emb_path, embeddings)
    np.save(label_path, encoded)
    print(f"✅ Saved {level}-level embeddings for future reuse.")
    return embeddings, encoded


# WORD FEATURES
word_emb, word_labels_enc = _load_or_extract_hubert(
    word_files, word_labels, word_cache, word_label_cache, "word"
)

# SENTENCE FEATURES
sentence_emb, sentence_labels_enc = _load_or_extract_hubert(
    sentence_files, sentence_labels, sentence_cache, sentence_label_cache, "sentence"
)

print("Word Embeddings:", word_emb.shape)
print("Sentence Embeddings:", sentence_emb.shape)


✅ Word-level HuBERT embeddings already cached. Loading...
✅ Sentence-level HuBERT embeddings already cached. Loading...
Word Embeddings: (730, 768)
Sentence Embeddings: (7386, 768)


In [22]:
# Predicted class = index of the highest probability, per subset.
# NOTE(review): word_emb / sentence_emb appear to cover every file in
# audio_files, including clips the model was trained on, so these are
# presumably not held-out scores — confirm before reporting them.
word_preds = model.predict(word_emb).argmax(axis=1)
sentence_preds = model.predict(sentence_emb).argmax(axis=1)

print("Word Accuracy:", accuracy_score(word_labels_enc, word_preds))
print("Sentence Accuracy:", accuracy_score(sentence_labels_enc, sentence_preds))


Word Accuracy: 0.9945205479452055
Sentence Accuracy: 0.9983753046303818


In [23]:
def get_speaker_id(path):
    """Derive a grouping key from the leading part of an audio file name.

    The extension is removed, the remaining name is cut at the first "(",
    then at the first "_", then at the first "-", and the result is
    lower-cased and stripped of surrounding whitespace.

    NOTE(review): for a name such as "Andhra_speaker (1).wav" this yields
    "andhra", i.e. the prefix before the first underscore rather than a
    per-speaker id — confirm that this matches the dataset's naming scheme.

    Args:
        path: Path (or bare file name) of the audio file.

    Returns:
        The lower-cased, stripped prefix of the file name.
    """
    # splitext removes only the real extension; splitting on the first "."
    # would also cut names that contain a dot elsewhere (e.g. "dr.rao_1.wav").
    stem, _ext = os.path.splitext(os.path.basename(path))
    # Cut at the first parenthesis, underscore or hyphen, in that order.
    for sep in ("(", "_", "-"):
        stem = stem.split(sep)[0]
    return stem.lower().strip()

# One id per clip, in audio_files order; the set size is the number of
# distinct ids produced by get_speaker_id.
speaker_ids = list(map(get_speaker_id, audio_files))

print("Unique speakers:", len(set(speaker_ids)))


Unique speakers: 7


In [24]:
test_file = audio_files[5678]   # or any path

# Embed the clip and add a leading batch axis, since the model predicts on
# batches of embeddings.
emb = extract_hubert_embedding(test_file).reshape(1, -1)
# Most probable class index, mapped back to its language name.
pred = model.predict(emb).argmax(axis=1)
print("Predicted Accent:", le.inverse_transform(pred)[0])


Predicted Accent: malayalam


In [25]:
import pickle
import numpy as np
import tensorflow as tf

# Output locations for everything this cell persists
MODEL_PATH = "accent_bilstm_model.h5"
ENCODER_PATH = "label_encoder.pkl"
HISTORY_PATH = "train_history.pkl"
MFCC_PATH = "mfcc_features.npy"
HUBERT_PATH = "hubert_features.npy"
LABELS_PATH = "labels.npy"

# ----------------------------
# SAVE SECTION
# ----------------------------

# Trained network
model.save(MODEL_PATH)

# Fitted label encoder (needed to map predictions back to language names)
with open(ENCODER_PATH, "wb") as encoder_file:
    pickle.dump(le, encoder_file)

# Per-epoch metrics dictionary recorded by model.fit
with open(HISTORY_PATH, "wb") as history_file:
    pickle.dump(history.history, history_file)

# Feature arrays: each one is written only if a variable with that name
# currently exists in the notebook namespace.
notebook_vars = globals()
for var_name, target_path in (
    ("X", MFCC_PATH),
    ("hubert_features", HUBERT_PATH),
    ("y", LABELS_PATH),
):
    if var_name in notebook_vars:
        np.save(target_path, notebook_vars[var_name])

print("✅ All model components saved successfully.")


  saving_api.save_model(


✅ All model components saved successfully.


In [26]:
import pickle
import numpy as np
import tensorflow as tf

# Locations of the artifacts to restore
MODEL_PATH = "accent_bilstm_model.h5"
ENCODER_PATH = "label_encoder.pkl"
HISTORY_PATH = "train_history.pkl"
MFCC_PATH = "mfcc_features.npy"
HUBERT_PATH = "hubert_features.npy"
LABELS_PATH = "labels.npy"

# Trained network
model = tf.keras.models.load_model(MODEL_PATH)

# Label encoder and training history.
# pickle.load can execute arbitrary code from the file it reads, so only load
# pickles that you created yourself.
with open(ENCODER_PATH, "rb") as encoder_file:
    le = pickle.load(encoder_file)

with open(HISTORY_PATH, "rb") as history_file:
    history_dict = pickle.load(history_file)

# Cached feature arrays, loaded in the order MFCC -> HuBERT -> labels
X, hubert_features, y = (
    np.load(array_path) for array_path in (MFCC_PATH, HUBERT_PATH, LABELS_PATH)
)

print("✅ Model, encoder, features, and history loaded successfully.")
print("Feature shapes:", X.shape, hubert_features.shape)


✅ Model, encoder, features, and history loaded successfully.
Feature shapes: (8116, 40, 200) (8116, 768)
