<a href="https://colab.research.google.com/github/MohiteYash/baby/blob/main/Fusion_old_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchaudio torchvision transformers
import torch
import torchaudio
import torchaudio.transforms as T
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from transformers import ViTModel, ViTFeatureExtractor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa
import librosa.display
import matplotlib.pyplot as plt


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import ViTModel, ViTFeatureExtractor
from torch.cuda.amp import autocast, GradScaler  # For mixed precision training

# 1. Define Dataset Class
class BabyCryDataset(Dataset):
    def __init__(self, file_paths, labels, sr=22050, max_len=100):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.max_len = max_len
        self.vit_processor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio
        y, sr = librosa.load(file_path, sr=self.sr)

        # Compute Mel Spectrogram (ResNet & ViT)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)  # (1, 128, Time)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear", align_corners=False).squeeze(0)

        # Convert 1-channel spectrogram to 3-channel for ResNet & ViT
        vit_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)
        resnet_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)

        # Compute MFCCs (Bi-LSTM)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfcc = np.pad(mfcc, ((0, 0), (0, max(0, self.max_len - mfcc.shape[1]))), mode='constant')[:, :self.max_len]
        mfcc = torch.tensor(mfcc).T  # (max_len, 20)

        return vit_input, resnet_input, mfcc, torch.tensor(label, dtype=torch.long)

#2. Load Dataset
data_dir = "/content/drive/MyDrive/augmented_baby_cry"
file_paths, labels = [], []

for class_idx, class_name in enumerate(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, class_name)
    if not os.path.isdir(class_path) or not os.listdir(class_path):  # Skip empty folders
        continue
    for fname in os.listdir(class_path):
        if fname.endswith(".wav"):
            file_paths.append(os.path.join(class_path, fname))
            labels.append(class_idx)

# Split dataset
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = BabyCryDataset(train_paths, train_labels)
test_dataset = BabyCryDataset(test_paths, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 📌 3. Define Fusion Model
class FusionModel(nn.Module):
    def __init__(self, num_classes=5):
        super(FusionModel, self).__init__()

        # ResNet-50 for spectrograms
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.resnet.fc = nn.Identity()  # Remove final classification layer

        # ViT for spectrograms
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.vit_fc = nn.Linear(768, 512)

        # Bi-LSTM for MFCCs
        self.lstm = nn.LSTM(input_size=20, hidden_size=128, num_layers=2, batch_first=True)
        self.lstm_fc = nn.Linear(128, 256)

        # Fusion & Classification
        self.fc1 = nn.Linear(512 + 2048 + 256, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, vit_input, resnet_input, mfcc):
        resnet_features = self.resnet(resnet_input)
        vit_outputs = self.vit(vit_input)
        vit_features = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        vit_features = self.vit_fc(vit_features)

        lstm_out, _ = self.lstm(mfcc)
        lstm_features = self.lstm_fc(lstm_out[:, -1, :])

        fusion = torch.cat((resnet_features, vit_features, lstm_features), dim=1)
        x = self.fc1(fusion)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

# 📌 4. Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FusionModel(num_classes=5).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = GradScaler()  # Mixed precision scaler

num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for vit_input, resnet_input, mfcc, labels in train_loader:
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)
        labels = labels.squeeze().long()

        optimizer.zero_grad()

        with autocast():
            outputs = model(vit_input, resnet_input, mfcc)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 131MB/s]


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

  scaler = GradScaler()  # Mixed precision scaler
  with autocast():


Epoch 1/50, Loss: 1.0299
Epoch 2/50, Loss: 0.6337
Epoch 3/50, Loss: 0.5196
Epoch 4/50, Loss: 0.5330
Epoch 5/50, Loss: 0.6162
Epoch 6/50, Loss: 0.6144
Epoch 7/50, Loss: 0.6157
Epoch 8/50, Loss: 0.4748
Epoch 9/50, Loss: 0.4919
Epoch 10/50, Loss: 0.3194
Epoch 11/50, Loss: 0.2583
Epoch 12/50, Loss: 0.2324
Epoch 13/50, Loss: 0.2739
Epoch 14/50, Loss: 0.4246
Epoch 15/50, Loss: 0.1926
Epoch 16/50, Loss: 0.4673
Epoch 17/50, Loss: 0.2483
Epoch 18/50, Loss: 0.4703
Epoch 19/50, Loss: 0.5408
Epoch 20/50, Loss: 0.3388
Epoch 21/50, Loss: 0.3693
Epoch 22/50, Loss: 0.3352
Epoch 23/50, Loss: 0.3002
Epoch 24/50, Loss: 0.3081
Epoch 25/50, Loss: 0.2119
Epoch 26/50, Loss: 0.2459
Epoch 27/50, Loss: 0.3049
Epoch 28/50, Loss: 0.1145
Epoch 29/50, Loss: 0.1580
Epoch 30/50, Loss: 0.1244
Epoch 31/50, Loss: 0.1612
Epoch 32/50, Loss: 0.1400
Epoch 33/50, Loss: 0.2514
Epoch 34/50, Loss: 0.0844
Epoch 35/50, Loss: 0.0624
Epoch 36/50, Loss: 0.0605
Epoch 37/50, Loss: 0.0937
Epoch 38/50, Loss: 0.1315
Epoch 39/50, Loss: 0.

In [3]:
torch.save(model.state_dict(), "fusion_model.pth")
print("Model saved successfully!")


Model saved successfully!


In [4]:
model = FusionModel(num_classes=5).to(device)
model.load_state_dict(torch.load("fusion_model.pth"))
model.eval()  # Set to evaluation mode
print("Model loaded successfully!")


Model loaded successfully!


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Move model to evaluation mode
model.eval()

true_labels = []
predicted_labels = []

with torch.no_grad():
    for vit_input, resnet_input, mfcc, labels in test_loader:
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)

        outputs = model(vit_input, resnet_input, mfcc)
        _, preds = torch.max(outputs, 1)  # Get the highest probability class

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(preds.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Display precision, recall, F1-score
print("Classification Report:\n", classification_report(true_labels, predicted_labels))


Test Accuracy: 99.31%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        73
           1       1.00      1.00      1.00        77
           2       0.97      1.00      0.99        74
           4       1.00      0.99      0.99        67

    accuracy                           0.99       291
   macro avg       0.99      0.99      0.99       291
weighted avg       0.99      0.99      0.99       291



In [6]:
class CustomDataset(Dataset):
    def __init__(self, file_paths, sr=22050, max_len=100):
        self.file_paths, self.sr, self.max_len = file_paths, sr, max_len

    def __len__(self): return len(self.file_paths)

    def __getitem__(self, idx):
        y, sr = librosa.load(self.file_paths[idx], sr=self.sr)
        mel_spec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128), ref=np.max)
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear").squeeze(0)
        vit_input = resnet_input = torch.cat([mel_spec_resized] * 3, dim=0)
        mfcc = torch.tensor(np.pad(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20), ((0, 0), (0, max(0, self.max_len - mel_spec.shape[1]))), mode='constant')[:, :self.max_len]).T
        return vit_input, resnet_input, mfcc

# Load Dataset
test_path = "/content/drive/MyDrive/test_baby_mix"
file_paths = [os.path.join(test_path, f) for f in os.listdir(test_path) if f.endswith(".wav")]

if not file_paths:
    raise FileNotFoundError("🚨 No .wav files found in the directory!")

test_loader = DataLoader(CustomDataset(file_paths), batch_size=1, shuffle=False)
print(f"✅ {len(file_paths)} Test Samples Loaded!")


✅ 47 Test Samples Loaded!


In [7]:
predictions = []
with torch.no_grad():
    for vit_input, resnet_input, mfcc in test_loader:
        vit_input, resnet_input, mfcc = vit_input.to(device), resnet_input.to(device), mfcc.to(device)
        _, predicted = torch.max(model(vit_input, resnet_input, mfcc), 1)
        predictions.append(predicted.item())

print("✅ Inference Complete! 🎯")
print(f"Predicted Classes: {predictions}")


✅ Inference Complete! 🎯
Predicted Classes: [0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 4, 0, 2, 4, 0, 0, 4, 4, 4, 4, 0, 2, 2, 4, 0, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [8]:
import os
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, root_dir, sr=22050, max_len=100):
        self.sr = sr
        self.max_len = max_len
        self.file_paths = []
        self.labels = []

        # Scan all subdirectories
        class_folders = sorted(os.listdir(root_dir))  # Ensure consistent order for class labels
        self.class_to_idx = {class_name: idx for idx, class_name in enumerate(class_folders)}

        for class_name in class_folders:
            class_path = os.path.join(root_dir, class_name)
            if os.path.isdir(class_path):  # Ensure it's a directory
                for file_name in os.listdir(class_path):
                    if file_name.endswith(".wav"):
                        self.file_paths.append(os.path.join(class_path, file_name))
                        self.labels.append(self.class_to_idx[class_name])  # Assign correct label

    def __len__(self): return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio
        y, sr = librosa.load(file_path, sr=self.sr)

        # Compute Mel Spectrogram (for ResNet & ViT)
        mel_spec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128), ref=np.max)
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)  # (1, 128, Time)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear").squeeze(0)

        # Convert grayscale spectrogram to 3-channel for ResNet & ViT
        vit_input = resnet_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)

        # Compute MFCCs (for Bi-LSTM)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfcc = np.pad(mfcc, ((0, 0), (0, max(0, self.max_len - mfcc.shape[1]))), mode='constant')[:, :self.max_len]
        mfcc = torch.tensor(mfcc).T  # (max_len, 20)

        return vit_input, resnet_input, mfcc, torch.tensor(label, dtype=torch.long)

# Load Dataset
test_path = "/content/drive/MyDrive/testing data_baby"
test_dataset = CustomDataset(test_path)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print(f"✅ {len(test_dataset)} Test Samples Loaded with {len(test_dataset.class_to_idx)} Classes!")
print(f"Class Mapping: {test_dataset.class_to_idx}")  # Debugging class-label mapping


✅ 109 Test Samples Loaded with 5 Classes!
Class Mapping: {'belly_pain': 0, 'burping': 1, 'discomfort': 2, 'hungry': 3, 'tired': 4}


In [9]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for vit_input, resnet_input, mfcc, labels in test_loader:
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)

        outputs = model(vit_input, resnet_input, mfcc)  # ✅ No .unsqueeze(0)
        _, predicted = torch.max(outputs, 1)

        y_true.append(labels.item())
        y_pred.append(predicted.item())

print(f"Predicted Labels: {y_pred}")
print(f"True Labels: {y_true}")


Predicted Labels: [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 0, 0, 4, 4, 2, 4, 0, 2, 1, 0, 4, 2, 4, 0, 4, 0, 4, 2, 2, 4, 0, 1, 0, 0, 4, 4, 2, 4, 0, 0, 4, 4, 0, 4, 0, 4, 2, 4, 4, 2, 4, 4, 0, 0, 1, 0, 0, 4, 0, 2, 0, 4, 2, 0, 2, 1, 4, 2, 0, 4, 1, 4, 4, 4, 2, 4, 1, 4, 2, 4, 2, 1, 2, 0, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
True Labels: [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(f"📊 Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"📊 Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
print(f"📊 Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
print(f"📊 F1 Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")


📊 Accuracy: 0.2661
📊 Precision: 0.0735
📊 Recall: 0.2661
📊 F1 Score: 0.1145


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
