In [3]:
import os
import torch
import torch.nn as nn
import numpy as np
import librosa
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# ========== CONFIG ==========
N_MFCC = 13
MAX_AUDIO_LEN = 100
IMG_SIZE = (128, 128)
BATCH_SIZE = 8
EPOCHS = 10

# ========== AUDIO FEATURE ==========
def extract_mfcc(wav_path, n_mfcc=N_MFCC, max_len=MAX_AUDIO_LEN):
    y, sr = librosa.load(wav_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return torch.tensor(mfcc, dtype=torch.float32)

# ========== DATASET ==========
class MultiModalSpeechDataset(Dataset):
    def __init__(self, root_dir, label, transform=None):
        self.samples = []
        self.label = label
        self.transform = transform
        for root, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith(".ult"):
                    base = os.path.splitext(file)[0]
                    wav_path = os.path.join(root, base + ".wav")
                    if os.path.exists(wav_path):
                        self.samples.append((os.path.join(root, file), wav_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ult_path, wav_path = self.samples[idx]
        img = Image.fromarray(np.random.randint(0, 255, IMG_SIZE, dtype=np.uint8))
        img = self.transform(img) if self.transform else transforms.ToTensor()(img)
        mfcc = extract_mfcc(wav_path)
        return img, mfcc, self.label

# ========== MODEL ==========
class MultiModalNet(nn.Module):
    def __init__(self, audio_feat_dim, num_classes=3):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten()
        )
        self.audio_net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(audio_feat_dim, 64), nn.ReLU(),
            nn.Linear(64, 32)
        )
        self.classifier = nn.Sequential(
            nn.Linear(32 + 32, 64), nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, ult_img, audio_feat):
        x1 = self.cnn(ult_img)
        x2 = self.audio_net(audio_feat)
        x = torch.cat((x1, x2), dim=1)
        return self.classifier(x)

# ========== TRAINING SCRIPT ==========
if __name__ == "__main__":
    transform = transforms.Compose([
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor()
    ])

    # 🔍 Load datasets
    dataset_uxtd = MultiModalSpeechDataset("D:/UltraSuite/core-uxtd/core", label=0, transform=transform)
    dataset_uxssd = MultiModalSpeechDataset("D:/UltraSuite/core-uxssd/core", label=1, transform=transform)
    dataset_upx   = MultiModalSpeechDataset("D:/UltraSuite/core-upx/core", label=2, transform=transform)

    full_dataset = dataset_uxtd + dataset_uxssd + dataset_upx
    print(f"📦 Total samples: {len(full_dataset)}")

    # ✂ Split
    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_data, test_data = random_split(full_dataset, [train_size, test_size])
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    # 🚀 Model Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalNet(audio_feat_dim=N_MFCC * MAX_AUDIO_LEN).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    print(f"🚀 Training on {device}...\n")
    for epoch in range(EPOCHS):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for imgs, mfccs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            imgs, mfccs, labels = imgs.to(device), mfccs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(imgs, mfccs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
        train_acc = 100 * correct / total

        # ========== Evaluate on Test Set ==========
        model.eval()
        test_correct, test_total = 0, 0
        with torch.no_grad():
            for imgs, mfccs, labels in test_loader:
                imgs, mfccs, labels = imgs.to(device), mfccs.to(device), labels.to(device)
                outputs = model(imgs, mfccs)
                test_correct += (outputs.argmax(1) == labels).sum().item()
                test_total += labels.size(0)
        test_acc = 100 * test_correct / test_total

        print(f"📊 Epoch {epoch+1}: Loss={total_loss:.4f} | Train Acc={train_acc:.2f}% | Test Acc={test_acc:.2f}%\n")

    # 🧪 Final Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, mfccs, labels in test_loader:
            imgs, mfccs = imgs.to(device), mfccs.to(device)
            outputs = model(imgs, mfccs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    from sklearn.metrics import classification_report, confusion_matrix
    print(f"✅ Final Test Accuracy: {accuracy_score(all_labels, all_preds) * 100:.2f}%")
    print("\n📋 Classification Report:")
    print(classification_report(all_labels, all_preds, digits=3))
    print("\n📊 Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))


📦 Total samples: 4167
🚀 Training on cpu...



Epoch 1/10: 100%|██████████| 417/417 [01:09<00:00,  5.97it/s]


📊 Epoch 1: Loss=282.0607 | Train Acc=76.03% | Test Acc=80.94%



Epoch 2/10: 100%|██████████| 417/417 [01:07<00:00,  6.20it/s]


📊 Epoch 2: Loss=225.6155 | Train Acc=80.98% | Test Acc=81.06%



Epoch 3/10: 100%|██████████| 417/417 [01:04<00:00,  6.45it/s]


📊 Epoch 3: Loss=209.9828 | Train Acc=81.40% | Test Acc=80.70%



Epoch 4/10: 100%|██████████| 417/417 [01:02<00:00,  6.66it/s]


📊 Epoch 4: Loss=205.6538 | Train Acc=81.85% | Test Acc=81.06%



Epoch 5/10: 100%|██████████| 417/417 [01:02<00:00,  6.65it/s]


📊 Epoch 5: Loss=197.3443 | Train Acc=82.18% | Test Acc=80.58%



Epoch 6/10: 100%|██████████| 417/417 [01:03<00:00,  6.59it/s]


📊 Epoch 6: Loss=194.8964 | Train Acc=82.36% | Test Acc=81.06%



Epoch 7/10: 100%|██████████| 417/417 [01:17<00:00,  5.39it/s]


📊 Epoch 7: Loss=190.4005 | Train Acc=82.33% | Test Acc=77.10%



Epoch 8/10: 100%|██████████| 417/417 [01:18<00:00,  5.34it/s]


📊 Epoch 8: Loss=185.2120 | Train Acc=82.36% | Test Acc=80.94%



Epoch 9/10: 100%|██████████| 417/417 [01:20<00:00,  5.16it/s]


📊 Epoch 9: Loss=185.1583 | Train Acc=82.45% | Test Acc=80.82%



Epoch 10/10: 100%|██████████| 417/417 [01:15<00:00,  5.55it/s]


📊 Epoch 10: Loss=178.9655 | Train Acc=82.90% | Test Acc=79.38%

✅ Final Test Accuracy: 79.38%

📋 Classification Report:
              precision    recall  f1-score   support

           0      0.455     0.037     0.068       135
           1      0.787     0.943     0.858       546
           2      0.840     0.928     0.882       153

    accuracy                          0.794       834
   macro avg      0.694     0.636     0.603       834
weighted avg      0.743     0.794     0.735       834


📊 Confusion Matrix:
[[  5 128   2]
 [  6 515  25]
 [  0  11 142]]


In [5]:
import os
import torch
import torch.nn as nn
import numpy as np
import librosa
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# ========== CONFIG ==========
N_MFCC = 13
MAX_AUDIO_LEN = 100
IMG_SIZE = (128, 128)
BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 0.001

# ========== AUDIO FEATURE ==========
def extract_mfcc(wav_path, n_mfcc=N_MFCC, max_len=MAX_AUDIO_LEN):
    y, sr = librosa.load(wav_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_len:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return torch.tensor(mfcc, dtype=torch.float32)

# ========== DATASET ==========
class MultiModalSpeechDataset(Dataset):
    def __init__(self, root_dir, label, transform=None):
        self.samples = []
        self.label = label
        self.transform = transform
        for root, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith(".ult"):
                    base = os.path.splitext(file)[0]
                    wav_path = os.path.join(root, base + ".wav")
                    if os.path.exists(wav_path):
                        self.samples.append((os.path.join(root, file), wav_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ult_path, wav_path = self.samples[idx]
        img = Image.fromarray(np.random.randint(0, 255, IMG_SIZE, dtype=np.uint8))
        img = self.transform(img) if self.transform else transforms.ToTensor()(img)
        mfcc = extract_mfcc(wav_path)
        return img, mfcc, self.label

# ========== MODEL ==========
class MultiModalNet(nn.Module):
    def __init__(self, audio_feat_dim, num_classes=3):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(16), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.BatchNorm2d(32), nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten()
        )
        self.audio_net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(audio_feat_dim, 64), nn.ReLU(),
            nn.Linear(64, 32)
        )
        self.classifier = nn.Sequential(
            nn.Linear(32 + 32, 64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, num_classes)
        )

    def forward(self, ult_img, audio_feat):
        x1 = self.cnn(ult_img)
        x2 = self.audio_net(audio_feat)
        x = torch.cat((x1, x2), dim=1)
        return self.classifier(x)

# ========== TRAINING ==========
if __name__ == "__main__":
    transform = transforms.Compose([
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor()
    ])

    dataset_uxtd = MultiModalSpeechDataset("D:/UltraSuite/core-uxtd/core", label=0, transform=transform)
    dataset_uxssd = MultiModalSpeechDataset("D:/UltraSuite/core-uxssd/core", label=1, transform=transform)
    dataset_upx   = MultiModalSpeechDataset("D:/UltraSuite/core-upx/core", label=2, transform=transform)

    full_dataset = dataset_uxtd + dataset_uxssd + dataset_upx
    print(f"📦 Total samples: {len(full_dataset)}")

    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_data, test_data = random_split(full_dataset, [train_size, test_size])
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalNet(audio_feat_dim=N_MFCC * MAX_AUDIO_LEN).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    print(f"🚀 Training on {device}...\n")
    for epoch in range(EPOCHS):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for imgs, mfccs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            imgs, mfccs, labels = imgs.to(device), mfccs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(imgs, mfccs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
        train_acc = 100 * correct / total

        # Evaluation on test set
        model.eval()
        test_correct, test_total = 0, 0
        with torch.no_grad():
            for imgs, mfccs, labels in test_loader:
                imgs, mfccs, labels = imgs.to(device), mfccs.to(device), labels.to(device)
                outputs = model(imgs, mfccs)
                test_correct += (outputs.argmax(1) == labels).sum().item()
                test_total += labels.size(0)
        test_acc = 100 * test_correct / test_total
        print(f"Epoch {epoch+1}: Loss={total_loss:.4f} | Train Acc={train_acc:.2f}% | Test Acc={test_acc:.2f}%\n")
        scheduler.step()

    # 🧪 Final Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for imgs, mfccs, labels in test_loader:
            imgs, mfccs = imgs.to(device), mfccs.to(device)
            outputs = model(imgs, mfccs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    print(f"✅ Final Test Accuracy: {accuracy_score(all_labels, all_preds) * 100:.2f}%")
    print("\n📋 Classification Report:")
    print(classification_report(all_labels, all_preds, digits=3))
    print("\n📊 Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))


📦 Total samples: 4167
🚀 Training on cpu...



Epoch 1/20: 100%|██████████| 417/417 [01:17<00:00,  5.37it/s]


Epoch 1: Loss=336.9858 | Train Acc=72.49% | Test Acc=80.10%



Epoch 2/20: 100%|██████████| 417/417 [01:16<00:00,  5.48it/s]


Epoch 2: Loss=236.7455 | Train Acc=79.60% | Test Acc=83.81%



Epoch 3/20: 100%|██████████| 417/417 [01:12<00:00,  5.74it/s]


Epoch 3: Loss=208.0756 | Train Acc=81.43% | Test Acc=84.17%



Epoch 4/20: 100%|██████████| 417/417 [01:03<00:00,  6.60it/s]


Epoch 4: Loss=205.6213 | Train Acc=81.43% | Test Acc=83.81%



Epoch 5/20: 100%|██████████| 417/417 [01:05<00:00,  6.40it/s]


Epoch 5: Loss=196.4777 | Train Acc=82.48% | Test Acc=85.25%



Epoch 6/20: 100%|██████████| 417/417 [01:17<00:00,  5.40it/s]


Epoch 6: Loss=173.6596 | Train Acc=84.22% | Test Acc=84.65%



Epoch 7/20: 100%|██████████| 417/417 [01:14<00:00,  5.58it/s]


Epoch 7: Loss=174.5874 | Train Acc=84.43% | Test Acc=85.61%



Epoch 8/20: 100%|██████████| 417/417 [01:14<00:00,  5.60it/s]


Epoch 8: Loss=175.8236 | Train Acc=84.52% | Test Acc=84.53%



Epoch 9/20: 100%|██████████| 417/417 [01:14<00:00,  5.56it/s]


Epoch 9: Loss=173.1142 | Train Acc=84.67% | Test Acc=77.82%



Epoch 10/20: 100%|██████████| 417/417 [01:15<00:00,  5.56it/s]


Epoch 10: Loss=164.1296 | Train Acc=85.42% | Test Acc=82.97%



Epoch 11/20: 100%|██████████| 417/417 [01:04<00:00,  6.51it/s]


Epoch 11: Loss=154.9363 | Train Acc=86.17% | Test Acc=84.41%



Epoch 12/20: 100%|██████████| 417/417 [01:02<00:00,  6.68it/s]


Epoch 12: Loss=146.2685 | Train Acc=86.68% | Test Acc=84.41%



Epoch 13/20: 100%|██████████| 417/417 [01:03<00:00,  6.60it/s]


Epoch 13: Loss=144.6661 | Train Acc=87.01% | Test Acc=85.49%



Epoch 14/20: 100%|██████████| 417/417 [01:02<00:00,  6.64it/s]


Epoch 14: Loss=146.4268 | Train Acc=86.89% | Test Acc=82.13%



Epoch 15/20: 100%|██████████| 417/417 [01:02<00:00,  6.70it/s]


Epoch 15: Loss=139.7864 | Train Acc=87.13% | Test Acc=83.09%



Epoch 16/20: 100%|██████████| 417/417 [01:03<00:00,  6.60it/s]


Epoch 16: Loss=131.6991 | Train Acc=87.79% | Test Acc=84.53%



Epoch 17/20: 100%|██████████| 417/417 [01:02<00:00,  6.62it/s]


Epoch 17: Loss=130.9026 | Train Acc=88.06% | Test Acc=84.29%



Epoch 18/20: 100%|██████████| 417/417 [01:02<00:00,  6.68it/s]


Epoch 18: Loss=127.5266 | Train Acc=88.51% | Test Acc=85.13%



Epoch 19/20: 100%|██████████| 417/417 [01:02<00:00,  6.68it/s]


Epoch 19: Loss=128.0576 | Train Acc=89.02% | Test Acc=84.29%



Epoch 20/20: 100%|██████████| 417/417 [01:02<00:00,  6.62it/s]


Epoch 20: Loss=124.2887 | Train Acc=88.81% | Test Acc=84.05%

✅ Final Test Accuracy: 84.05%

📋 Classification Report:
              precision    recall  f1-score   support

           0      0.505     0.446     0.474       112
           1      0.875     0.905     0.890       588
           2      0.937     0.888     0.912       134

    accuracy                          0.841       834
   macro avg      0.772     0.746     0.758       834
weighted avg      0.835     0.841     0.837       834


📊 Confusion Matrix:
[[ 50  62   0]
 [ 48 532   8]
 [  1  14 119]]
