<a href="https://colab.research.google.com/github/MohiteYash/baby/blob/Vit/baby_VIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio transformers librosa


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [26]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import ViTModel, ViTFeatureExtractor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Custom Dataset Class
class BabyCryDataset(Dataset):
    def __init__(self, file_paths, labels, sr=22050, max_len=224):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.max_len = max_len
        self.vit_processor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio
        y, sr = librosa.load(file_path, sr=self.sr)

        # Compute Mel Spectrogram
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Resize to (224, 224) for ViT
        mel_spec = torch.tensor(mel_spec).unsqueeze(0)  # (1, 128, Time)
        mel_spec_resized = torch.nn.functional.interpolate(mel_spec.unsqueeze(0), size=(224, 224), mode="bilinear", align_corners=False).squeeze(0)

        # Convert to 3-channel format for ViT
        vit_input = torch.cat([mel_spec_resized] * 3, dim=0)  # (3, 224, 224)

        return vit_input, torch.tensor(label, dtype=torch.long)


In [27]:
# Set data path
data_dir = "/content/drive/MyDrive/augmented_baby_cry"
file_paths, labels = [], []

# Load all .wav files & labels
for class_idx, class_name in enumerate(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, class_name)
    if not os.path.isdir(class_path) or not os.listdir(class_path):  # Skip empty folders
        continue
    for fname in os.listdir(class_path):
        if fname.endswith(".wav"):
            file_paths.append(os.path.join(class_path, fname))
            labels.append(class_idx)

# Split dataset
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

# Create Datasets
train_dataset = BabyCryDataset(train_paths, train_labels)
test_dataset = BabyCryDataset(test_paths, test_labels)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True)


In [28]:
# Define ViT Model for Classification
class ViTClassifier(nn.Module):
    def __init__(self, num_classes=5):
        super(ViTClassifier, self).__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.fc = nn.Linear(768, num_classes)  # ViT outputs 768 features

    def forward(self, x):
        vit_outputs = self.vit(x)
        cls_token = vit_outputs.last_hidden_state[:, 0, :]  # CLS token
        out = self.fc(cls_token)
        return out


In [31]:
from torch.amp import autocast, GradScaler  # ✅ Correct AMP usage

# Training Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViTClassifier(num_classes=5).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scaler = GradScaler()

num_epochs = 20
best_accuracy = 0.0  # Track best accuracy for model saving
best_model_path = "best_vit_model.pth"  # Define the path for saving the best model

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for vit_input, labels in train_loader:
        vit_input, labels = vit_input.to(device), labels.to(device)
        optimizer.zero_grad()

        with autocast("cuda"):  # ✅ Fixed usage
            outputs = model(vit_input)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluate after each epoch
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for vit_input, labels in test_loader:
            vit_input, labels = vit_input.to(device), labels.to(device)
            outputs = model(vit_input)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

    # ✅ Save only if accuracy improves
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), best_model_path)
        print(f"✅ Best model saved with Accuracy: {best_accuracy * 100:.2f}%")
    else:
        print("🔹 No improvement, model not saved.")


Epoch 1/20, Loss: 1.4142
Test Accuracy: 31.96%
✅ Best model saved with Accuracy: 31.96%
Epoch 2/20, Loss: 1.3052
Test Accuracy: 46.74%
✅ Best model saved with Accuracy: 46.74%
Epoch 3/20, Loss: 1.0231
Test Accuracy: 61.51%
✅ Best model saved with Accuracy: 61.51%
Epoch 4/20, Loss: 0.6028
Test Accuracy: 70.45%
✅ Best model saved with Accuracy: 70.45%
Epoch 5/20, Loss: 0.2619
Test Accuracy: 73.88%
✅ Best model saved with Accuracy: 73.88%
Epoch 6/20, Loss: 0.0930
Test Accuracy: 78.69%
✅ Best model saved with Accuracy: 78.69%
Epoch 7/20, Loss: 0.0367
Test Accuracy: 80.07%
✅ Best model saved with Accuracy: 80.07%
Epoch 8/20, Loss: 0.0209
Test Accuracy: 80.41%
✅ Best model saved with Accuracy: 80.41%
Epoch 9/20, Loss: 0.0145
Test Accuracy: 81.10%
✅ Best model saved with Accuracy: 81.10%
Epoch 10/20, Loss: 0.0115
Test Accuracy: 80.41%
🔹 No improvement, model not saved.
Epoch 11/20, Loss: 0.0095
Test Accuracy: 80.41%
🔹 No improvement, model not saved.
Epoch 12/20, Loss: 0.0080
Test Accuracy: 7

In [32]:
test_acc, class_report = evaluate_model(model, test_loader)


Test Accuracy: 80.07%

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81        73
           1       0.91      0.87      0.89        77
           2       0.78      0.69      0.73        74
           4       0.71      0.82      0.76        67

    accuracy                           0.80       291
   macro avg       0.80      0.80      0.80       291
weighted avg       0.80      0.80      0.80       291

