In [2]:
import torch
import torch.nn as nn
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Device:", device)

# The preprocessing config and the encoder weights come from the same checkpoint.
HUBERT_CHECKPOINT = "facebook/hubert-base-ls960"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)

hubert = HubertModel.from_pretrained(HUBERT_CHECKPOINT)
hubert = hubert.to(device)
# Inference only. Kept as the last expression so the cell still displays the module.
hubert.eval()


Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

# **Age generalisation for the CNN-BN model**

In [4]:
import soundfile as sf

def extract_hubert_features(path, target_len=300):
    """Return a fixed-length sequence of HuBERT frame embeddings for one audio file.

    The file is read with soundfile, mixed down to mono, resampled to 16 kHz,
    run through the module-level ``feature_extractor`` and ``hubert`` model, and
    the resulting frame sequence is cropped or zero-padded along time.

    Args:
        path: Path of the audio file to read.
        target_len: Number of frames in the returned tensor.

    Returns:
        Tensor of shape ``(target_len, hidden)`` on the same device as the model
        output, where ``hidden`` is the model's last-hidden-state width.
    """
    target_sr = 16000

    # soundfile yields (frames,) for mono and (frames, channels) for multi-channel.
    wav, sr = sf.read(path)
    wav = torch.tensor(wav, dtype=torch.float32)

    # Multi-channel audio: average the channels to get a 1-D mono waveform.
    if wav.ndim > 1:
        wav = wav.mean(dim=1)

    wav = torchaudio.functional.resample(wav, sr, target_sr)

    # The waveform is 1-D at this point; the feature extractor adds the batch axis.
    inputs = feature_extractor(
        wav,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True
    )

    input_values = inputs.input_values.to(device)

    with torch.no_grad():
        out = hubert(input_values).last_hidden_state.squeeze(0)  # (T, hidden)

    n_frames, hidden = out.shape
    if n_frames > target_len:
        return out[:target_len]

    # Build the zero padding to match the model output (width, dtype, device)
    # instead of assuming a 768-wide tensor and the global device.
    pad = torch.zeros(
        target_len - n_frames, hidden, dtype=out.dtype, device=out.device
    )
    return torch.cat([out, pad], 0)


In [5]:
#CNN baseline + batch normalization
class CNN1D_BN(nn.Module):
    """1-D CNN classifier with batch normalisation over 768-wide frame features.

    Two Conv1d -> BatchNorm1d -> ReLU stages (768 -> 256 -> 128 channels) are
    followed by global average pooling over time and a linear layer that maps
    the 128 pooled channels to ``num_classes`` logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # Kernel 5 with padding 2 leaves the time length unchanged.
            return [
                nn.Conv1d(in_channels, out_channels, kernel_size=5, padding=2),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(),
            ]

        # Attribute names and layer order are part of the checkpoint layout.
        self.conv = nn.Sequential(
            *conv_stage(768, 256),
            *conv_stage(256, 128),
            nn.AdaptiveAvgPool1d(1),
        )
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        channels_first = x.permute(0, 2, 1)   # (B, T, 768) -> (B, 768, T)
        pooled = self.conv(channels_first)    # (B, 128, 1)
        return self.fc(pooled.squeeze(-1))    # (B, num_classes)


In [6]:
import torch.serialization
# The checkpoint is loaded as a full pickled model (not a state_dict), so the
# CNN1D_BN class must already be defined in this session and weights_only must
# be False: PyTorch 2.6+ defaults it to True, which rejects arbitrary classes.
# SECURITY: unpickling runs code stored in the file -- only load trusted checkpoints.
model = torch.load(
    "/content/drive/MyDrive/models/cnn_bn_fullmodel.pt",
    map_location=device,  # lets a checkpoint saved on GPU load on a CPU-only runtime
    weights_only=False
)
model.to(device)
model.eval()

print("Model loaded successfully.")



model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Model loaded successfully.


In [7]:
child_folder = "/content/drive/MyDrive/childrendset-wavs/*.wav"
# glob gives no ordering guarantee; sort so every per-file list (preds, fnames,
# and anything else that iterates `files`) has the same order on each run.
files = sorted(glob.glob(child_folder))
if not files:
    # Fail here instead of silently producing empty results further down.
    raise FileNotFoundError(f"No .wav files matched {child_folder!r}")

preds = []
fnames = []

for f in files:
    emb = extract_hubert_features(f)      # (300,768)
    emb = emb.unsqueeze(0).to(device)     # (1,300,768)

    with torch.no_grad():
        out = model(emb)
        pred = out.argmax(1).item()       # index of the highest-scoring class

    preds.append(pred)
    fnames.append(f)

print("Total children files:", len(preds))



Total children files: 75


In [8]:
# Class names listed in class-index order (position == index).
# NOTE(review): presumably this must match the label order used at training time -- confirm.
_CLASS_NAMES = ("andhra", "gujrat", "jharkhand", "karnataka", "kerala", "tamil")

# name -> index
label_to_idx = {name: idx for idx, name in enumerate(_CLASS_NAMES)}

# index -> name (inverse of the mapping above)
idx_to_label = {idx: name for name, idx in label_to_idx.items()}


In [9]:
from collections import Counter

# Tally how many files were assigned to each predicted class index.
child_count = Counter(preds)

# One line per predicted class, in first-seen order, with the index mapped back to its name.
for class_idx in child_count:
    print(f"{idx_to_label[class_idx]} : {child_count[class_idx]}")


tamil : 55
karnataka : 4
andhra : 14
jharkhand : 2


In [10]:
# Highest softmax probability per file, in the same order as `files`.
confidences = []

for wav_path in files:
    batch = extract_hubert_features(wav_path)
    batch = batch.unsqueeze(0).to(device)   # add the batch dimension

    with torch.no_grad():
        logits = model(batch)
        class_probs = torch.softmax(logits, dim=1)

    confidences.append(class_probs.max().item())


In [11]:
import numpy as np

# Mean / min / max of the per-file top-class probabilities collected in `confidences`.
confidence_summary = (
    ("Avg confidence:", np.mean),
    ("Min confidence:", np.min),
    ("Max confidence:", np.max),
)
for caption, reduce_fn in confidence_summary:
    print(caption, reduce_fn(confidences))


Avg confidence: 0.664229862689972
Min confidence: 0.3184518814086914
Max confidence: 0.9802047610282898


# **Age generalisation for the MFCC model**

In [26]:
import joblib

rf_model_path = "/content/drive/MyDrive/models/mfcc_random_forest.pkl"
# NOTE: joblib.load unpickles the file, which can execute code -- only load trusted files.
randf = joblib.load(rf_model_path)


In [27]:
# Sanity check: show the class of the object that joblib loaded into `randf`.
print(type(randf))


<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [31]:
import librosa
import numpy as np

def extract_mfcc_features(path, n_mfcc=13):
    """Summarise one audio file as its time-averaged MFCC vector.

    The file is loaded with librosa at 16 kHz, ``n_mfcc`` coefficients are
    computed per frame, and each coefficient is averaged over the time axis.

    Args:
        path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        Array of shape ``(1, n_mfcc)``: a single feature row.
    """
    signal, rate = librosa.load(path, sr=16000)

    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)  # (n_mfcc, T)

    # Collapse time, then add a leading axis so the result is one row.
    return coeffs.mean(axis=1)[np.newaxis, :]


In [29]:
import glob

# glob makes no ordering guarantee; sort so that every per-file result list
# built from `children_files` has the same order on each run.
children_files = sorted(glob.glob("/content/drive/MyDrive/childrendset-wavs/*.wav"))
len(children_files)


75

In [32]:
rf_preds = []         # predicted class label per file
rf_confidences = []   # probability of that predicted class per file

for f in children_files:
    feat = extract_mfcc_features(f, n_mfcc=13)

    # Run the forest once per file: predict() would recompute the same class
    # probabilities and take their argmax, so derive the label from `prob`.
    prob = randf.predict_proba(feat)[0]
    best = np.argmax(prob)

    rf_preds.append(randf.classes_[best])
    rf_confidences.append(prob[best])




In [33]:
import numpy as np

# Mean / min / max of the random forest's per-file top-class probabilities.
rf_confidence_summary = (
    ("MFCC RF – Children Avg Confidence:", np.mean),
    ("MFCC RF – Children Min Confidence:", np.min),
    ("MFCC RF – Children Max Confidence:", np.max),
)
for caption, reduce_fn in rf_confidence_summary:
    print(caption, reduce_fn(rf_confidences))


MFCC RF – Children Avg Confidence: 0.35840000000000005
MFCC RF – Children Min Confidence: 0.24
MFCC RF – Children Max Confidence: 0.58
