In [3]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB 65.4 kB/s eta 0:03:16
     ---------------------------------------- 0.0/12.8 MB 65.4 kB/s eta 0:03:16
     ---------------------------------------- 0.0/12.8 MB 65.4 kB/s eta 0:03:16
     ---

In [9]:
!pip install torch torchvision torchaudio scikit-learn tqdm spacy
!python -m spacy download en_core_web_sm


Collecting numpy (from torchvision)
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.8 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 816.2 kB/s eta 0:00:00
Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl (12.9 MB)
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
   ----------------------------------------

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
catboost 1.2.7 requires numpy<2.0,>=1.16.0, but you have numpy 2.2.6 which is incompatible.
pandas-plink 2.3.1 requires numpy<2.0,>=1.0, but you have numpy 2.2.6 which is incompatible.
pandas-stubs 2.2.2.240805 requires numpy<2.0.0,>=1.23.5; python_version >= "3.9" and python_version < "3.12", but you have numpy 2.2.6 which is incompatible.
tensorflow-intel 2.17.0 requires h5py>=3.10.0, but you have h5py 3.9.0 which is incompatible.
tensorflow-intel 2.17.0 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 2.2.6 which is incompatible.
tensorflow-intel 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 6.31.1 which is incompatible.


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 907.3 kB/s eta 0:00:15
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.4/12.8 MB 2.2 MB/s eta 0:00:06
     -- ------------------------------------- 0.8/12.8 MB 4.0 MB/s eta 0:00:04
     --- ------------------------------------ 1.3/12.8 MB 5.0 MB/s eta 0:00:03
     ----- ---------------------------------- 1.9/12.8 MB 6.2 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 6.5 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.8

In [12]:
import os
import torch
import torch.nn as nn
import numpy as np
import librosa
import spacy
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# ========== CONFIG ==========
# Number of MFCC coefficients requested from librosa (default of extract_mfcc).
N_MFCC = 13
# Fixed size of the MFCC matrix's second axis: extract_mfcc pads or truncates to it.
MAX_AUDIO_LEN = 100
# Shape of the randomly generated image and the target size of transforms.Resize.
IMG_SIZE = (128, 128)
# Mini-batch size used by both the train and the test DataLoader.
BATCH_SIZE = 8
# Number of passes over the training split.
EPOCHS = 10
# Also the length of the zero vector text_to_tensor returns when no vector exists,
# and the input width of the model's text branch.
TEXT_EMBED_DIM = 96  # SpaCy small model gives 96-dim vectors

# ========== TEXT MODEL ==========
# Loaded once at module level and shared by every text_to_tensor call.
nlp = spacy.load("en_core_web_sm")

def text_to_tensor(text):
    """Embed `text` with the module-level spaCy pipeline.

    Returns `doc.vector` converted to a float32 tensor when spaCy reports that
    the document has a vector; otherwise a zero tensor of length TEXT_EMBED_DIM.
    """
    parsed = nlp(text)
    if not parsed.has_vector:
        # Nothing to embed: hand back an all-zero placeholder of the expected width.
        return torch.zeros(TEXT_EMBED_DIM)
    return torch.tensor(parsed.vector, dtype=torch.float32)

# ========== AUDIO FEATURE ==========
def extract_mfcc(wav_path, n_mfcc=N_MFCC, max_len=MAX_AUDIO_LEN):
    """Load an audio file and return its MFCC matrix with a fixed second axis.

    Args:
        wav_path: Path handed to librosa.load.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        max_len: Required size of the result's second axis.

    Returns:
        A float32 tensor built from the MFCC matrix after its second axis has
        been cut to `max_len` columns or zero-padded on the right up to it.
    """
    # sr=None is forwarded as-is to librosa.load (no explicit target sampling rate).
    signal, sample_rate = librosa.load(wav_path, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Clip first (a no-op for short clips), then pad whatever is still missing.
    coeffs = coeffs[:, :max_len]
    shortfall = max_len - coeffs.shape[1]
    if shortfall > 0:
        coeffs = np.pad(coeffs, ((0, 0), (0, shortfall)), mode='constant')
    return torch.tensor(coeffs, dtype=torch.float32)

# ========== DATASET ==========
class MultiModalSpeechDataset(Dataset):
    """Dataset of (image, MFCC, text embedding, label) items found under a directory.

    A sample is registered for every ``*.ult`` file that has a ``.wav`` and a
    ``.txt`` file with the same stem in the same directory. Every sample of one
    instance carries the same ``label``.

    NOTE: the ``.ult`` file is only used to discover samples. Its content is never
    read; ``__getitem__`` substitutes freshly generated uniform noise for the
    image, so the image input carries no information about the recording and
    differs on every access.

    Args:
        root_dir: Directory that is walked recursively. A missing directory
            yields an empty dataset (os.walk reports nothing).
        label: Value returned as the target for every item.
        transform: Optional callable applied to the PIL image; when omitted the
            image is converted with ``transforms.ToTensor()``.
    """

    def __init__(self, root_dir, label, transform=None):
        self.samples = []
        self.label = label
        self.transform = transform
        for root, dirs, files in os.walk(root_dir):
            # os.walk yields entries in filesystem order; sorting the directory
            # list in place and the file names keeps the sample order stable, so
            # a seeded split selects the same files on every run and machine.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(".ult"):
                    base = os.path.splitext(file)[0]
                    wav_path = os.path.join(root, base + ".wav")
                    txt_path = os.path.join(root, base + ".txt")
                    # Keep only utterances for which all three files are present.
                    if os.path.exists(wav_path) and os.path.exists(txt_path):
                        self.samples.append((os.path.join(root, file), wav_path, txt_path))

    def __len__(self):
        """Return the number of registered samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, mfcc, text_vector, label)`` for sample ``idx``."""
        _, wav_path, txt_path = self.samples[idx]
        # Simulate ultrasound image (placeholder noise; the .ult file is not read).
        # randint's upper bound is exclusive, so 256 is needed to cover 0..255.
        img = Image.fromarray(np.random.randint(0, 256, IMG_SIZE, dtype=np.uint8))
        img = self.transform(img) if self.transform else transforms.ToTensor()(img)

        mfcc = extract_mfcc(wav_path)

        # Decode explicitly instead of relying on the platform's locale encoding;
        # undecodable bytes are replaced rather than aborting the whole epoch.
        with open(txt_path, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read().strip()
        text_vec = text_to_tensor(text)

        return img, mfcc, text_vec, self.label

# ========== MODEL ==========
class TriModalNet(nn.Module):
    """Late-fusion classifier over an image, audio features and a text vector.

    Each input goes through its own branch that produces a 32-wide embedding;
    the three embeddings are concatenated and a two-layer MLP maps them to
    `num_classes` output scores.

    Args:
        audio_feat_dim: Number of audio features per sample after flattening.
        text_feat_dim: Width of the text vector.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, audio_feat_dim, text_feat_dim, num_classes=3):
        super().__init__()
        branch_dim = 32  # width of every per-modality embedding

        # Image branch: two 3x3 convolutions on a single-channel input, then a
        # global average pool so the output is one value per channel.
        image_layers = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, branch_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*image_layers)

        # Audio branch: flatten everything but the batch axis, then a small MLP.
        audio_layers = [
            nn.Flatten(),
            nn.Linear(audio_feat_dim, 64),
            nn.ReLU(),
            nn.Linear(64, branch_dim),
        ]
        self.audio_net = nn.Sequential(*audio_layers)

        # Text branch: a single projection followed by ReLU.
        text_layers = [
            nn.Linear(text_feat_dim, branch_dim),
            nn.ReLU(),
        ]
        self.text_net = nn.Sequential(*text_layers)

        # Fusion head over the three concatenated embeddings.
        head_layers = [
            nn.Linear(3 * branch_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, img, audio_feat, text_feat):
        """Return the classifier output for one batch of the three inputs."""
        branch_outputs = [
            self.cnn(img),
            self.audio_net(audio_feat),
            self.text_net(text_feat),
        ]
        fused = torch.cat(branch_outputs, dim=1)
        return self.classifier(fused)

# ========== MAIN ==========
if __name__ == "__main__":
    # Seed every RNG this script draws from (weight init, DataLoader shuffling,
    # the NumPy-generated placeholder images) so a re-run reproduces the results.
    SEED = 42
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    def _collect_predictions(model, loader, device):
        """Run `model` over `loader` without gradients.

        Returns two equally long lists of Python ints: predicted class indices
        and the corresponding targets.
        """
        model.eval()
        predicted, targets = [], []
        with torch.no_grad():
            for imgs, mfccs, texts, labels in loader:
                outputs = model(imgs.to(device), mfccs.to(device), texts.to(device))
                predicted.extend(outputs.argmax(1).cpu().tolist())
                targets.extend(labels.tolist())
        return predicted, targets

    transform = transforms.Compose([
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor()
    ])

    # Load data from each class: the label identifies the directory tree a
    # sample was found in.
    dataset_uxtd = MultiModalSpeechDataset("D:/UltraSuite/core-uxtd/core", label=0, transform=transform)
    dataset_uxssd = MultiModalSpeechDataset("D:/UltraSuite/core-uxssd/core", label=1, transform=transform)
    dataset_upx   = MultiModalSpeechDataset("D:/UltraSuite/core-upx/core", label=2, transform=transform)

    full_dataset = dataset_uxtd + dataset_uxssd + dataset_upx
    print(f"📦 Total samples: {len(full_dataset)}")

    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    # A wrong or missing data path yields zero samples without any error from
    # os.walk; stop here with a clear message instead of failing later inside
    # the DataLoader or on a division by zero.
    if train_size == 0 or test_size == 0:
        raise RuntimeError(
            f"Need samples for both splits but found {len(full_dataset)} in total; "
            "check the dataset paths."
        )
    # NOTE(review): the split is per sample, so files from the same sub-directory
    # can end up on both sides of it.
    train_data, test_data = random_split(
        full_dataset,
        [train_size, test_size],
        generator=torch.Generator().manual_seed(SEED),
    )

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TriModalNet(audio_feat_dim=N_MFCC * MAX_AUDIO_LEN, text_feat_dim=TEXT_EMBED_DIM).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    print(f"\n🚀 Training on {device}...\n")
    for epoch in range(EPOCHS):
        model.train()
        # total_loss is the SUM of the per-batch losses over the epoch, so its
        # magnitude scales with the number of batches.
        total_loss, correct, total = 0, 0, 0
        for imgs, mfccs, texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            imgs, mfccs, texts, labels = imgs.to(device), mfccs.to(device), texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(imgs, mfccs, texts)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
        train_acc = 100 * correct / total

        # Evaluate on the held-out split after every epoch.
        epoch_preds, epoch_labels = _collect_predictions(model, test_loader, device)
        test_correct = sum(p == t for p, t in zip(epoch_preds, epoch_labels))
        test_acc = 100 * test_correct / len(epoch_labels)
        print(f"📊 Epoch {epoch+1}: Loss={total_loss:.4f} | Train Acc={train_acc:.2f}% | Test Acc={test_acc:.2f}%")

    # Final report (a fresh pass over the test split with the trained model).
    print("\n✅ Final Evaluation:")
    all_preds, all_labels = _collect_predictions(model, test_loader, device)

    print(f"\n✅ Final Test Accuracy: {accuracy_score(all_labels, all_preds)*100:.2f}%")
    print("\n📋 Classification Report:")
    print(classification_report(all_labels, all_preds, digits=3))
    print("\n📊 Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))


📦 Total samples: 4167

🚀 Training on cpu...



Epoch 1/10: 100%|██████████| 417/417 [02:13<00:00,  3.12it/s]


📊 Epoch 1: Loss=196.5577 | Train Acc=83.35% | Test Acc=96.64%


Epoch 2/10: 100%|██████████| 417/417 [01:37<00:00,  4.27it/s]


📊 Epoch 2: Loss=52.2527 | Train Acc=96.61% | Test Acc=96.64%


Epoch 3/10: 100%|██████████| 417/417 [01:40<00:00,  4.17it/s]


📊 Epoch 3: Loss=35.1641 | Train Acc=97.60% | Test Acc=97.48%


Epoch 4/10: 100%|██████████| 417/417 [01:31<00:00,  4.56it/s]


📊 Epoch 4: Loss=26.8690 | Train Acc=97.96% | Test Acc=96.40%


Epoch 5/10: 100%|██████████| 417/417 [01:31<00:00,  4.56it/s]


📊 Epoch 5: Loss=16.2731 | Train Acc=98.62% | Test Acc=97.72%


Epoch 6/10: 100%|██████████| 417/417 [01:31<00:00,  4.56it/s]


📊 Epoch 6: Loss=16.1564 | Train Acc=98.83% | Test Acc=98.56%


Epoch 7/10: 100%|██████████| 417/417 [01:31<00:00,  4.56it/s]


📊 Epoch 7: Loss=9.5687 | Train Acc=99.22% | Test Acc=98.44%


Epoch 8/10: 100%|██████████| 417/417 [01:30<00:00,  4.59it/s]


📊 Epoch 8: Loss=12.9816 | Train Acc=99.13% | Test Acc=98.08%


Epoch 9/10: 100%|██████████| 417/417 [01:36<00:00,  4.32it/s]


📊 Epoch 9: Loss=7.6796 | Train Acc=99.28% | Test Acc=97.96%


Epoch 10/10: 100%|██████████| 417/417 [01:50<00:00,  3.76it/s]


📊 Epoch 10: Loss=4.6288 | Train Acc=99.70% | Test Acc=98.68%

✅ Final Evaluation:

✅ Final Test Accuracy: 98.68%

📋 Classification Report:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       128
           1      0.981     1.000     0.991       578
           2      1.000     0.914     0.955       128

    accuracy                          0.987       834
   macro avg      0.994     0.971     0.982       834
weighted avg      0.987     0.987     0.987       834


📊 Confusion Matrix:
[[128   0   0]
 [  0 578   0]
 [  0  11 117]]
