<a href="https://colab.research.google.com/github/MohiteYash/baby/blob/main/baby_fus_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchaudio torchvision transformers
import torch
import torchaudio
import torchaudio.transforms as T
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
from transformers import ViTModel, ViTFeatureExtractor
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import librosa
import librosa.display
import matplotlib.pyplot as plt


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [3]:
class BabyCryDataset(Dataset):
    def __init__(self, file_paths, labels, sr=22050, max_len=100):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.max_len = max_len
        self.vit_processor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio
        y, sr = librosa.load(file_path, sr=self.sr)

        # Compute Mel Spectrogram (ResNet & ViT)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)  # (1, 128, Time)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear", align_corners=False).squeeze(0)

        # ViT expects 3 channels, duplicate spectrogram across channels
        vit_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)

        # Compute MFCCs (Bi-LSTM)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfcc = np.pad(mfcc, ((0, 0), (0, max(0, self.max_len - mfcc.shape[1]))), mode='constant')[:, :self.max_len]
        mfcc = torch.tensor(mfcc).T  # (max_len, 20)

        return vit_input, mel_spec_resized, mfcc, torch.tensor(label, dtype=torch.long)

# Load dataset
data_dir = "/content/drive/MyDrive/augmented_baby_cry"
file_paths, labels = [], []

for class_idx, class_name in enumerate(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, class_name)

    if not os.path.isdir(class_path) or not os.listdir(class_path):  # Skip non-folders and empty folders
        continue

    for fname in os.listdir(class_path):
        file_path = os.path.join(class_path, fname)
        if not file_path.endswith(".wav"):  # Ensure only .wav files are used
            continue
        file_paths.append(file_path)
        labels.append(class_idx)

# Split dataset
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = BabyCryDataset(train_paths, train_labels)
test_dataset = BabyCryDataset(test_paths, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [4]:
class FusionModel(nn.Module):
    def __init__(self, num_classes=5):
        super(FusionModel, self).__init__()

        # ResNet-50 for spectrograms
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Identity()  # Remove final classification layer (2048-dim output)

        # ViT for spectrograms
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.vit_fc = nn.Linear(768, 512)  # Project ViT output to match ResNet

        # Bi-LSTM for MFCCs
        self.lstm = nn.LSTM(input_size=20, hidden_size=128, num_layers=2, batch_first=True)
        self.lstm_fc = nn.Linear(128, 256)

        # Fusion & Classification
        self.fc1 = nn.Linear(512 + 2048 + 256, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, vit_input, resnet_input, mfcc):
        # ResNet Feature Extraction (batch, 3, 224, 224) → (batch, 2048)
        resnet_features = self.resnet(resnet_input)

        # ViT Feature Extraction (batch, 3, 224, 224) → (batch, 768)
        vit_outputs = self.vit(vit_input)
        vit_features = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        vit_features = self.vit_fc(vit_features)

        # LSTM Feature Extraction (batch, sequence_length=100, features=20)
        lstm_out, _ = self.lstm(mfcc)  # (batch, seq_len, 128)
        lstm_features = self.lstm_fc(lstm_out[:, -1, :])  # Extract last time-step

        # Fusion (batch, 2048 + 512 + 256) → (batch, 512)
        fusion = torch.cat((resnet_features, vit_features, lstm_features), dim=1)
        x = self.fc1(fusion)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [7]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FusionModel(num_classes=5).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for vit_input, resnet_input, mfcc, labels in train_loader:
        # Move data to GPU
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)
        labels = labels.squeeze().long()  # Ensure labels are in correct shape

        # Forward pass
        optimizer.zero_grad()
        outputs = model(vit_input, resnet_input, mfcc)

        # Compute loss
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update parameters
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[16, 1, 224, 224] to have 3 channels, but got 1 channels instead

In [8]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import ViTModel, ViTFeatureExtractor
from torch.cuda.amp import autocast, GradScaler  # For mixed precision training

# 📌 1. Define Dataset Class
class BabyCryDataset(Dataset):
    def __init__(self, file_paths, labels, sr=22050, max_len=100):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.max_len = max_len
        self.vit_processor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio
        y, sr = librosa.load(file_path, sr=self.sr)

        # Compute Mel Spectrogram (ResNet & ViT)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)  # (1, 128, Time)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear", align_corners=False).squeeze(0)

        # Convert 1-channel spectrogram to 3-channel for ResNet & ViT
        vit_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)
        resnet_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)

        # Compute MFCCs (Bi-LSTM)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        mfcc = np.pad(mfcc, ((0, 0), (0, max(0, self.max_len - mfcc.shape[1]))), mode='constant')[:, :self.max_len]
        mfcc = torch.tensor(mfcc).T  # (max_len, 20)

        return vit_input, resnet_input, mfcc, torch.tensor(label, dtype=torch.long)

# 📌 2. Load Dataset
data_dir = "/content/drive/MyDrive/augmented_baby_cry"
file_paths, labels = [], []

for class_idx, class_name in enumerate(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, class_name)
    if not os.path.isdir(class_path) or not os.listdir(class_path):  # Skip empty folders
        continue
    for fname in os.listdir(class_path):
        if fname.endswith(".wav"):
            file_paths.append(os.path.join(class_path, fname))
            labels.append(class_idx)

# Split dataset
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = BabyCryDataset(train_paths, train_labels)
test_dataset = BabyCryDataset(test_paths, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 📌 3. Define Fusion Model
class FusionModel(nn.Module):
    def __init__(self, num_classes=5):
        super(FusionModel, self).__init__()

        # ResNet-50 for spectrograms
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        self.resnet.fc = nn.Identity()  # Remove final classification layer

        # ViT for spectrograms
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.vit_fc = nn.Linear(768, 512)

        # Bi-LSTM for MFCCs
        self.lstm = nn.LSTM(input_size=20, hidden_size=128, num_layers=2, batch_first=True)
        self.lstm_fc = nn.Linear(128, 256)

        # Fusion & Classification
        self.fc1 = nn.Linear(512 + 2048 + 256, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, vit_input, resnet_input, mfcc):
        resnet_features = self.resnet(resnet_input)
        vit_outputs = self.vit(vit_input)
        vit_features = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        vit_features = self.vit_fc(vit_features)

        lstm_out, _ = self.lstm(mfcc)
        lstm_features = self.lstm_fc(lstm_out[:, -1, :])

        fusion = torch.cat((resnet_features, vit_features, lstm_features), dim=1)
        x = self.fc1(fusion)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

# 📌 4. Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FusionModel(num_classes=5).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scaler = GradScaler()  # Mixed precision scaler

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for vit_input, resnet_input, mfcc, labels in train_loader:
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)
        labels = labels.squeeze().long()

        optimizer.zero_grad()

        with autocast():
            outputs = model(vit_input, resnet_input, mfcc)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


  scaler = GradScaler()  # Mixed precision scaler
  with autocast():


Epoch 1/10, Loss: 0.9702
Epoch 2/10, Loss: 0.6490
Epoch 3/10, Loss: 0.6608
Epoch 4/10, Loss: 0.4686
Epoch 5/10, Loss: 0.5847
Epoch 6/10, Loss: 0.4410
Epoch 7/10, Loss: 0.8218
Epoch 8/10, Loss: 0.6894
Epoch 9/10, Loss: 0.6811
Epoch 10/10, Loss: 0.4462


In [9]:
torch.save(model.state_dict(), "fusion_model.pth")
print("Model saved successfully!")


Model saved successfully!


In [10]:
model = FusionModel(num_classes=5).to(device)
model.load_state_dict(torch.load("fusion_model.pth"))
model.eval()  # Set to evaluation mode
print("Model loaded successfully!")


Model loaded successfully!


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Move model to evaluation mode
model.eval()

true_labels = []
predicted_labels = []

with torch.no_grad():
    for vit_input, resnet_input, mfcc, labels in test_loader:
        vit_input, resnet_input, mfcc, labels = vit_input.to(device), resnet_input.to(device), mfcc.to(device), labels.to(device)

        outputs = model(vit_input, resnet_input, mfcc)
        _, preds = torch.max(outputs, 1)  # Get the highest probability class

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(preds.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Display precision, recall, F1-score
print("Classification Report:\n", classification_report(true_labels, predicted_labels))


Test Accuracy: 82.47%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.47      0.64        73
           1       0.94      0.96      0.95        77
           2       0.88      0.91      0.89        74
           4       0.64      0.97      0.77        67

    accuracy                           0.82       291
   macro avg       0.86      0.83      0.81       291
weighted avg       0.87      0.82      0.81       291

