In [None]:
%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128

In [None]:
%pip install opencv-python-headless
%pip install opencv-python
%pip install scikit-image
%pip install scikit-learn
%pip install tqdm
%pip install sympy==1.13.3
%pip install librosa soundfile

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import librosa
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


In [None]:
class AudioSpectrogramDataset(Dataset):
    """Paired waveform / spectrogram dataset stored as one sub-folder per class.

    For every ``<wav_dir>/<class>/<name>.wav`` that has a matching
    ``<spec_dir>/<class>/<name>.npy`` one sample is registered; ``.wav`` files
    without a spectrogram are skipped.

    Args:
        wav_dir: Root folder that contains one sub-folder of ``.wav`` files per class.
        spec_dir: Root folder with the same layout holding the ``.npy`` spectrograms.
        classes: Class folder names; the position in this sequence is the integer label.
        segment_len: Number of samples every waveform is padded / truncated to.
        sample_rate: Rate the audio is resampled to when it is loaded.
    """

    def __init__(self, wav_dir, spec_dir, classes, segment_len=160000, sample_rate=16000):
        self.wav_dir = wav_dir
        self.spec_dir = spec_dir
        self.classes = classes
        self.segment_len = segment_len
        self.sample_rate = sample_rate
        # One (wav_path, spec_path, label_index) tuple per usable sample.
        self.data = []

        for cls_index, cls_name in enumerate(classes):
            wav_cls_path = os.path.join(wav_dir, cls_name)
            spec_cls_path = os.path.join(spec_dir, cls_name)

            # os.listdir order is arbitrary; sorting keeps sample indices
            # reproducible between runs and machines.
            for filename in sorted(os.listdir(wav_cls_path)):
                if not filename.endswith(".wav"):
                    continue
                base = os.path.splitext(filename)[0]
                wav_path = os.path.join(wav_cls_path, filename)
                spec_path = os.path.join(spec_cls_path, base + ".npy")

                if os.path.exists(spec_path):
                    self.data.append((wav_path, spec_path, cls_index))

    def __len__(self):
        """Return the number of (wav, spectrogram) pairs found."""
        return len(self.data)

    @staticmethod
    def _prepare_waveform(y, segment_len):
        """Peak-normalise ``y`` and force it to exactly ``segment_len`` samples.

        A silent (all-zero) or empty signal is left un-scaled instead of being
        divided by zero, which would otherwise fill the sample with NaN.
        """
        y = np.asarray(y, dtype=np.float32)
        peak = np.max(np.abs(y)) if y.size else 0.0
        if peak > 0:
            y = y / peak

        if len(y) < segment_len:
            # Zero-pad at the end only.
            return np.pad(y, (0, segment_len - len(y)))
        return y[:segment_len]

    def __getitem__(self, idx):
        """Return ``(waveform, spectrogram, label)`` tensors for sample ``idx``.

        The waveform is float32 of length ``segment_len``; the spectrogram is
        the ``.npy`` array as stored, converted to float32; the label is a
        0-d long tensor.
        """
        wav_path, spec_path, label = self.data[idx]

        y, _ = librosa.load(wav_path, sr=self.sample_rate)
        y = self._prepare_waveform(y, self.segment_len)

        mel = np.load(spec_path)

        return (
            torch.tensor(y, dtype=torch.float32),
            torch.tensor(mel, dtype=torch.float32),
            torch.tensor(label, dtype=torch.long)
        )


In [None]:
import os

# Sanity check: every .wav segment of the test split must have a .npy
# spectrogram with the same base name in the mirrored class folder.
wav_root = "D:/Licenta/Datasets/Audio/data/Split_Wav/test"
npy_root = "D:/Licenta/Datasets/Audio/data/MelSpectrograms/test"

# (class folder, expected .npy file name) for each segment without a spectrogram.
missing = []

# Sorted so the report is stable between runs (os.listdir order is arbitrary).
for cls in sorted(os.listdir(wav_root)):
    wav_cls_path = os.path.join(wav_root, cls)
    # A stray file next to the class folders would make the os.listdir call
    # below raise, so only directories are treated as classes.
    if not os.path.isdir(wav_cls_path):
        continue
    npy_cls_path = os.path.join(npy_root, cls)

    for file in sorted(os.listdir(wav_cls_path)):
        if not file.endswith(".wav"):
            continue
        base = os.path.splitext(file)[0]
        npy_file = os.path.join(npy_cls_path, base + ".npy")
        if not os.path.exists(npy_file):
            missing.append((cls, base + ".npy"))

if missing:
    print("Lipsesc fișiere .npy pentru următoarele segmente:")
    for cls, fname in missing:
        print(f" - {cls}/{fname}")
else:
    print("Toate fișierele .wav au .npy corespunzător.")


In [None]:
# The position of a name in this list is the integer label of that class.
class_names = ["Alzheimer", "Parkinson", "Healthy"]

# Train split: waveform segments plus their pre-computed spectrograms.
train_dataset = AudioSpectrogramDataset(
    "D:/Licenta/Datasets/Audio/data/Split_Wav/train",
    "D:/Licenta/Datasets/Audio/data/MelSpectrograms/train",
    class_names,
)

# Test split, same folder layout.
test_dataset = AudioSpectrogramDataset(
    "D:/Licenta/Datasets/Audio/data/Split_Wav/test",
    "D:/Licenta/Datasets/Audio/data/MelSpectrograms/test",
    class_names,
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

print(f"Train: {len(train_dataset)} samples |  Test: {len(test_dataset)} samples")


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class AudioSpectrogramClassifier(nn.Module):
    """Two-branch classifier fusing a raw-waveform CNN with a ResNet-18 on spectrograms.

    Args:
        n_classes: Number of output logits.
        pretrained_resnet: When true, the ResNet-18 branch starts from the
            ImageNet weights shipped with torchvision.
    """

    def __init__(self, n_classes=3, pretrained_resnet=True):
        super().__init__()

        # Waveform branch: three strided 1-D convolutions, then pooling to a
        # fixed 32 time steps so the flattened size (64 * 32) does not depend
        # on the input length.
        self.audio_branch = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=9, stride=2, padding=4),
            nn.ReLU(),
            nn.BatchNorm1d(16),

            nn.Conv1d(16, 32, kernel_size=9, stride=2, padding=4),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Conv1d(32, 64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.BatchNorm1d(64),

            nn.AdaptiveAvgPool1d(32),
            nn.Flatten(),
            nn.Dropout(0.3)
        )

        # The boolean `pretrained=` flag is deprecated since torchvision 0.13
        # in favour of `weights=`; IMAGENET1K_V1 is the checkpoint the old
        # flag selected. Older torchvision releases only know the flag.
        if hasattr(models, "ResNet18_Weights"):
            weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained_resnet else None
            self.spec_branch = models.resnet18(weights=weights)
        else:
            self.spec_branch = models.resnet18(pretrained=pretrained_resnet)

        # Spectrograms have a single channel, so the RGB stem is swapped for a
        # 1-channel one. NOTE: this new layer is freshly initialised, i.e. the
        # pretrained first-layer filters are not reused.
        self.spec_branch.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Drop the ImageNet head so the branch returns its pooled features.
        self.spec_branch.fc = nn.Identity()

        # 512 spectrogram features + 64 * 32 waveform features.
        self.fusion_dim = 512 + 64 * 32
        # Element-wise gate in [-1, 1] computed from the fused vector itself.
        self.attention = nn.Sequential(
            nn.Linear(self.fusion_dim, self.fusion_dim),
            nn.Tanh()
        )

        self.classifier = nn.Sequential(
            nn.Linear(self.fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_classes)
        )

    def forward(self, audio_input, mel_input):
        """Return class logits of shape ``(batch, n_classes)``.

        Args:
            audio_input: Waveforms without a channel axis, ``(batch, samples)``.
            mel_input: Spectrograms without a channel axis, ``(batch, height, width)``.
        """
        # A channel axis is inserted for both branches.
        x1 = self.audio_branch(audio_input.unsqueeze(1))
        x2 = self.spec_branch(mel_input.unsqueeze(1))
        x = torch.cat((x1, x2), dim=1)
        att = self.attention(x)
        x = x * att
        return self.classifier(x)


In [None]:
# Build the network and move it to the selected device.
model = AudioSpectrogramClassifier(n_classes=3, pretrained_resnet=True)
model = model.to(device)

# Multi-class objective on raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)


In [None]:
def evaluate(model, dataloader):
    """Return the top-1 accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module called as ``model(audio, mel)`` and returning logits of
            shape ``(batch, n_classes)``.
        dataloader: Iterable of ``(audio, mel, labels)`` batches.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        the dataloader yields no samples.

    Note:
        The model is switched to eval mode and left in it.
    """
    # Batches follow the model's own device, so the function does not rely on
    # a notebook-level ``device`` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for audio, mel, labels in dataloader:
            audio, mel, labels = audio.to(target), mel.to(target), labels.to(target)
            outputs = model(audio, mel)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # An empty loader used to raise ZeroDivisionError.
    return correct / total if total else 0.0


In [None]:
# Training driver: runs n_epochs passes over train_loader, records per-epoch
# metrics, and keeps the weights with the best test accuracy on disk.
n_epochs = 20
best_acc = 0.0  # highest test accuracy seen so far

# Per-epoch history (one entry appended per completed epoch).
train_losses = []
train_accuracies = []
test_accuracies = []

for epoch in range(n_epochs):
    # evaluate() below leaves the model in eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    running_loss = 0.0

    for audio, mel, labels in train_loader:
        audio, mel, labels = audio.to(device), mel.to(device), labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(audio, mel)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses (each batch weighted equally).
    avg_loss = running_loss / len(train_loader)
    # Both accuracies are measured after the epoch's updates, in eval mode.
    train_acc = evaluate(model, train_loader)
    test_acc = evaluate(model, test_loader)

    train_losses.append(avg_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    print(f"Epoch {epoch+1}/{n_epochs} | Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

    # Checkpoint only on a strict improvement of the test accuracy.
    # NOTE(review): the checkpoint is selected on the test split itself, so the
    # reported best test accuracy is optimistically biased; consider a
    # separate validation split for model selection.
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), "best_model.pt")
        print(f"Model salvat (Test Acc: {best_acc:.4f})")


In [None]:
import matplotlib.pyplot as plt

# The x-axis follows the history that was actually recorded. Building it from
# n_epochs made every plot call fail with a length mismatch when training was
# interrupted before the last epoch.
epochs = range(1, len(train_losses) + 1)

# Explicit axes instead of the implicit pyplot "current subplot" state.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: train vs. test accuracy.
acc_ax.plot(epochs, train_accuracies, label='Train Accuracy')
acc_ax.plot(epochs, test_accuracies, label='Test Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Accuracy per Epoch')
acc_ax.legend()

# Right panel: mean training loss.
loss_ax.plot(epochs, train_losses, label='Train Loss', color='orange')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss per Epoch')
loss_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Persist the current weights of `model` (a separate file from the
# best-accuracy checkpoint "best_model.pt" written during training).
final_state = model.state_dict()
torch.save(final_state, "dual_branch_model.pth")
print("Model salvat cu succes în dual_branch_model.pth")

In [None]:
def predict(model, wav_path, spec_path, classes):
    """Classify one recording from its waveform and a pre-computed spectrogram.

    Args:
        model: Module called as ``model(audio, mel)`` and returning logits.
        wav_path: Audio file; loaded at 16 kHz, peak-normalised and padded /
            truncated to 160000 samples.
        spec_path: ``.npy`` spectrogram; must have shape ``(128, 128)``.
        classes: Class names indexed by the predicted label.

    Returns:
        The entry of ``classes`` with the highest logit.

    Raises:
        ValueError: If the stored spectrogram is not ``(128, 128)``.
    """
    segment_len = 160000

    y, sr = librosa.load(wav_path, sr=16000)
    # Silent or empty audio has no peak to scale by; dividing anyway would
    # fill the input with NaN (or raise on an empty array).
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    if len(y) < segment_len:
        y = np.pad(y, (0, segment_len - len(y)))
    else:
        y = y[:segment_len]

    mel = np.load(spec_path)
    if mel.shape != (128, 128):
        raise ValueError("Spectrogram should have shape (128, 128)")

    # Inputs follow the model's own device rather than a notebook-level
    # ``device`` variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    # unsqueeze(0) adds the batch axis expected by the model.
    audio_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(0).to(target)
    mel_tensor = torch.tensor(mel, dtype=torch.float32).unsqueeze(0).to(target)

    model.eval()
    with torch.no_grad():
        output = model(audio_tensor, mel_tensor)
        _, pred = torch.max(output, 1)
        return classes[pred.item()]


In [None]:

# Every parameter and buffer is overwritten by load_state_dict below, so the
# ImageNet weights are not needed (skips building / downloading them).
model = AudioSpectrogramClassifier(n_classes=3, pretrained_resnet=False).to(device)
model.load_state_dict(torch.load("dual_branch_model.pth", map_location=device))

class_names = ["Alzheimer", "Parkinson", "Healthy"]

# Raw strings: "\A" is not a valid escape sequence and triggers a
# SyntaxWarning on recent Python versions. The resulting paths are unchanged.
# NOTE(review): the waveform is the "vol_down" variant while the spectrogram
# is the "vol_up" variant of the same segment - confirm this pairing is
# intended; the dataset pairs files that share the same base name.
wav_path = r"D:/Licenta/Datasets/Audio/data/Split_Wav/test\Alzheimer/add_reverb_54_adrso232_vol_down_part8.wav"
spec_path = r"D:/Licenta/Datasets/Audio/data/MelSpectrograms/test\Alzheimer/add_reverb_54_adrso232_vol_up_part8.npy"

predicted_class = predict(model, wav_path, spec_path, class_names)
print("Predicție:", predicted_class)


In [None]:
%pip install scikit-learn matplotlib seaborn

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
def evaluate_with_metrics(model, dataloader, class_names):
    """Print a classification report and draw the confusion matrix.

    Args:
        model: Module called as ``model(audio, mel)`` and returning logits.
        dataloader: Iterable of ``(audio, mel, labels)`` batches.
        class_names: Class names; index ``i`` names integer label ``i``.

    Returns:
        None. The report is printed and the heatmap is shown.
    """
    # Inputs follow the model's own device rather than a notebook-level
    # ``device`` variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for audio, mel, labels in dataloader:
            outputs = model(audio.to(target), mel.to(target))
            _, preds = torch.max(outputs, 1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Pin the label set explicitly. Without it, a class that never occurs in
    # the evaluated data shrinks the matrix (tick labels no longer line up)
    # and makes classification_report reject ``target_names``.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    cr = classification_report(all_labels, all_preds, labels=label_ids, target_names=class_names)

    print("Classification Report:\n", cr)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()


In [None]:
# Classification report and confusion matrix for the test loader.
evaluate_with_metrics(model=model, dataloader=test_loader, class_names=class_names)

In [None]:
def predict_from_wav(model, wav_path, class_names, device="cuda", show_probs=False):
    """Classify a recording directly from its audio file.

    The spectrogram is computed on the fly from the first 160000 samples of
    the 16 kHz waveform: 128 mel bands, converted to dB, standardised to zero
    mean / unit variance, and padded or cropped to 128 frames.

    Args:
        model: Module called as ``model(audio, mel)`` and returning logits.
        wav_path: Path of the audio file.
        class_names: Class names indexed by the predicted label.
        device: Device the inputs are moved to. When a CUDA device is
            requested but CUDA is unavailable, the model's own device is used.
        show_probs: Also print the softmax probability of every class.

    Returns:
        The predicted class name (which is also printed).
    """
    import librosa

    n_mels = 128
    target_shape = (128, 128)
    segment_len = 160000

    # The default "cuda" used to crash on CPU-only machines.
    target = torch.device(device)
    if target.type == "cuda" and not torch.cuda.is_available():
        first_param = next(model.parameters(), None)
        target = first_param.device if first_param is not None else torch.device("cpu")

    y, sr = librosa.load(wav_path, sr=16000)
    # Silent or empty audio has no peak to scale by; dividing anyway yields NaN.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    if len(y) < segment_len:
        y = np.pad(y, (0, segment_len - len(y)))
    else:
        y = y[:segment_len]

    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # Standardise; a constant spectrogram (std == 0) is only centred so the
    # model never receives NaN.
    spread = np.std(mel_db)
    mel_db = mel_db - np.mean(mel_db)
    if spread > 0:
        mel_db = mel_db / spread

    # Force exactly target_shape[1] time frames (zero-pad or crop at the end).
    if mel_db.shape[1] < target_shape[1]:
        mel_db = np.pad(mel_db, ((0, 0), (0, target_shape[1] - mel_db.shape[1])))
    else:
        mel_db = mel_db[:, :target_shape[1]]

    # unsqueeze(0) adds the batch axis expected by the model.
    audio_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(0).to(target)
    mel_tensor = torch.tensor(mel_db, dtype=torch.float32).unsqueeze(0).to(target)

    model.eval()
    with torch.no_grad():
        outputs = model(audio_tensor, mel_tensor)
        probs = torch.softmax(outputs, dim=1).cpu().numpy()[0]
        pred = int(np.argmax(probs))

    print(f"Predicție: {class_names[pred]}")
    if show_probs:
        for i, c in enumerate(class_names):
            print(f"  {c}: {probs[i]:.4f}")
    return class_names[pred]


In [None]:
# map_location keeps the load working on a machine whose devices differ from
# the one the checkpoint was saved on (same as the earlier load of this file).
model.load_state_dict(torch.load("dual_branch_model.pth", map_location=device))

# Raw strings: sequences such as "\L", "\D", "\A" and "\d" are not valid
# escapes and trigger a SyntaxWarning on recent Python versions. The
# resulting path strings are unchanged.
predict_from_wav(model, r"D:/Licenta/Datasets/Audio/data/Split_Wav/test\Parkinson/add_reverb_47_ID18_pd_4_3_3_readtext_pitch_down_part2.wav", class_names)
predict_from_wav(model, r"D:\Licenta\Datasets\Audio\data\Augmented_Output\Alzheimer/add_reverb_20_adrso031_noise.wav", class_names)
predict_from_wav(model, r"d:\Licenta\Datasets\Audio\data\Augmented\Alzheimer/adrso031.wav", class_names)
predict_from_wav(model, r"D:\Licenta\Datasets\Audio\data\Augmented_Output\Healthy/add_reverb_20_adrso286_noise.wav", class_names)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa.display

def view_npy_spectrogram(npy_path, title=None):
    """Load the array stored at ``npy_path`` and display it as a spectrogram.

    Args:
        npy_path: Path of the ``.npy`` file to show.
        title: Figure title; when empty or omitted, a title built from the
            path is used instead.
    """
    spectrogram = np.load(npy_path)

    if title:
        heading = title
    else:
        heading = f"Spectrogram: {npy_path}"

    plt.figure(figsize=(10, 4))
    # Time on the x-axis, mel-scaled frequency on the y-axis.
    librosa.display.specshow(spectrogram, x_axis='time', y_axis='mel', cmap='magma')
    plt.colorbar(format='%+2.0f dB')
    plt.title(heading)
    plt.tight_layout()
    plt.show()

In [None]:
# Raw string: "\L", "\D", "\A", "\d" and "\M" are not valid escape sequences
# and trigger a SyntaxWarning on recent Python versions. The path is unchanged.
view_npy_spectrogram(r"D:\Licenta\Datasets\Audio\data\MelSpectrograms/test/Alzheimer/add_reverb_19_adrso215_vol_up_part5.npy")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa.display

def view_multiple_spectrograms(npy_paths, titles=None):
    """Display several stored spectrograms stacked vertically in one figure.

    Args:
        npy_paths: Paths of the ``.npy`` files, drawn top to bottom.
        titles: Optional per-plot titles, matched to ``npy_paths`` by
            position. Plots without a corresponding entry get a title built
            from their index and path.

    Returns:
        None. Nothing is drawn when ``npy_paths`` is empty.
    """
    npy_paths = list(npy_paths)
    # An empty list used to request a zero-height figure.
    if not npy_paths:
        return

    n_plots = len(npy_paths)
    plt.figure(figsize=(10, 4 * n_plots))

    for i, path in enumerate(npy_paths):
        mel = np.load(path)
        plt.subplot(n_plots, 1, i + 1)
        librosa.display.specshow(mel, x_axis='time', y_axis='mel', cmap='magma')
        plt.colorbar(format='%+2.0f dB')
        # A titles list shorter than npy_paths used to raise IndexError.
        if titles and i < len(titles):
            plt.title(titles[i])
        else:
            plt.title(f"Spectrogram {i+1}: {path}")

    plt.tight_layout()
    plt.show()


In [None]:
# Raw strings: the backslash sequences in these paths are not valid escapes
# and trigger a SyntaxWarning on recent Python versions. The paths are unchanged.
view_multiple_spectrograms(
    [
        r"D:\Licenta\Datasets\Audio\data\MelSpectrograms/test\Alzheimer/add_reverb_44_adrso144_shifted_part2.npy",
        r"D:\Licenta\Datasets\Audio\data\MelSpectrograms/test\Healthy/adrso002_vol_up_part4.npy",
        r"D:\Licenta\Datasets\Audio\data\MelSpectrograms/test\Parkinson/add_reverb_47_ID18_pd_4_3_3_readtext_pitch_down_part2.npy"
    ],
    # Titles follow the order of the paths above (the Healthy and Parkinson
    # labels were previously swapped).
    titles=["Alzheimer", "Healthy", "Parkinson"]
)


In [None]:
import random
import matplotlib.pyplot as plt

def show_random_predictions(model, dataloader, class_names, count=100):
    """Print a random sample of predictions, colour-coded by correctness.

    Every sample of ``dataloader`` is classified first; then up to ``count``
    of the results are drawn at random and printed one per line with the true
    class, the predicted class and the softmax confidence of the prediction.
    Correct lines are wrapped in the ANSI green code, wrong ones in red.
    """
    model.eval()
    records = []

    with torch.no_grad():
        for audio, mel, labels in dataloader:
            audio = audio.to(device)
            mel = mel.to(device)
            labels = labels.to(device)

            probs = torch.softmax(model(audio, mel), dim=1)
            preds = torch.argmax(probs, dim=1)

            for row, (truth, guess) in enumerate(zip(labels, preds)):
                records.append({
                    "true": truth.item(),
                    "pred": guess.item(),
                    # Probability assigned to the predicted class.
                    "conf": probs[row][guess].item()
                })

    how_many = min(count, len(records))

    for position, record in enumerate(random.sample(records, how_many), start=1):
        if record["true"] == record["pred"]:
            label_color = "\033[92m"
        else:
            label_color = "\033[91m"
        print(f"{label_color}[{position}] True: {class_names[record['true']]:10} | Pred: {class_names[record['pred']]:10} | Conf: {record['conf']:.2f}\033[0m")


In [None]:
# Print up to 100 randomly chosen predictions from the test loader.
show_random_predictions(model=model, dataloader=test_loader, class_names=class_names, count=100)

In [None]:
import matplotlib.pyplot as plt
import librosa.display

def show_wrong_spectrograms(model, dataloader, class_names, max_examples=100):
    """Report the misclassification rate and plot misclassified spectrograms.

    Args:
        model: Module called as ``model(audio, mel)`` and returning logits.
        dataloader: Iterable of ``(audio, mel, labels)`` batches.
        class_names: Class names; index ``i`` names integer label ``i``.
        max_examples: Upper bound on the number of spectrograms plotted.

    Returns:
        None. The error count is printed; a grid with three columns is shown
        when at least one misclassified example was collected.
    """
    # Inputs follow the model's own device rather than a notebook-level
    # ``device`` variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    wrong_examples = []
    total = 0
    wrong = 0

    with torch.no_grad():
        for audio, mel, labels in dataloader:
            audio = audio.to(target)
            mel = mel.to(target)
            labels = labels.to(target)

            outputs = model(audio, mel)
            preds = torch.argmax(outputs, dim=1)

            # The whole loader is always scanned, so the printed rate covers
            # every sample. Previously the scan stopped once max_examples
            # errors were found and the statistic described only a prefix.
            mistakes = (preds != labels).nonzero(as_tuple=True)[0].tolist()
            total += labels.size(0)
            wrong += len(mistakes)

            for i in mistakes:
                if len(wrong_examples) >= max_examples:
                    break
                wrong_examples.append({
                    "mel": mel[i].cpu().numpy(),
                    "true": labels[i].item(),
                    "pred": preds[i].item()
                })

    # An empty loader used to raise ZeroDivisionError.
    error_rate = (wrong / total) * 100 if total else 0.0
    print(f"Predicții greșite: {wrong}/{total} ({error_rate:.2f}%)")

    # Nothing to draw; a zero-row figure would otherwise be requested.
    if not wrong_examples:
        return

    n = len(wrong_examples)
    cols = 3
    rows = (n + cols - 1) // cols  # ceiling division
    plt.figure(figsize=(cols * 5, rows * 4))

    for idx, item in enumerate(wrong_examples):
        plt.subplot(rows, cols, idx + 1)
        librosa.display.specshow(item["mel"], x_axis='time', y_axis='mel', cmap='magma')
        plt.title(f"True: {class_names[item['true']]}\nPred: {class_names[item['pred']]}", color='red')
        plt.colorbar(format='%+2.0f dB')

    plt.tight_layout()
    plt.show()


In [None]:
# Plot up to 200 misclassified spectrograms from the test loader.
show_wrong_spectrograms(model=model, dataloader=test_loader, class_names=class_names, max_examples=200)