In [None]:
import librosa
import numpy as np
import matplotlib
import os
import glob
import random
import soundfile as sf
from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

In [None]:
def load_fix_audio(path, target_sr=16000, target_len_sec=3.0):
    """Load an audio file as mono and force it to a fixed number of samples.

    Args:
        path: Location of the audio file, handed straight to ``librosa.load``.
        target_sr: Sample rate requested from ``librosa.load``.
        target_len_sec: Desired clip duration in seconds.

    Returns:
        1-D array with exactly ``int(target_sr * target_len_sec)`` samples:
        shorter clips are zero-padded at the end, longer clips are cut off.
    """
    waveform, _ = librosa.load(path, sr=target_sr, mono=True)
    wanted = int(target_sr * target_len_sec)

    # Cutting is a no-op for clips that are already short enough.
    waveform = waveform[:wanted]

    missing = wanted - len(waveform)
    if missing > 0:
        # Append trailing zeros (silence) up to the requested length.
        waveform = np.pad(waveform, (0, missing), mode='constant')

    return waveform

In [None]:
# Function for splitting the data into train / validation / test sets
def split_dataset(root_dir, test_size=0.15, val_size=0.15, min_per_class=2):
    """Collect ``*.wav`` files under ``root_dir`` and split them into train/val/test.

    A file gets label 1 ("barbie") when the substring "barbie" occurs in its
    path *relative to* ``root_dir`` and label 0 ("puppy") otherwise, so the
    name of the root directory itself can never influence the labels.
    Both splits are stratified by label and use ``random_state=42``.

    Args:
        root_dir: Directory that is searched recursively for ``*.wav`` files.
        test_size: Fraction of all files that goes to the test subset.
        val_size: Fraction of all files that goes to the validation subset.
        min_per_class: Accepted for backward compatibility; not used by the
            current implementation.

    Returns:
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test)`` where each
        ``X_*`` is a list of path strings and each ``y_*`` a list of 0/1 labels.

    Raises:
        ValueError: If no ``*.wav`` file is found under ``root_dir``.
    """
    root_path = Path(root_dir)
    # rglob order depends on the file system; sorting makes the seeded split
    # identical from machine to machine.
    wav_paths = sorted(root_path.rglob("*.wav"))
    if not wav_paths:
        raise ValueError(f"No .wav files found under {root_dir!r}")

    files = [str(p) for p in wav_paths]
    labels = [
        1 if "barbie" in p.relative_to(root_path).as_posix() else 0
        for p in wav_paths
    ]

    n_barbie = sum(labels)
    n_puppy = len(labels) - n_barbie
    print(f"Всего: {len(files)} файлов (barbie={n_barbie}, puppy={n_puppy})")

    holdout_size = test_size + val_size
    X_train, X_temp, y_train, y_temp = train_test_split(
        files, labels, test_size=holdout_size, stratify=labels, random_state=42
    )

    # train_test_split's ``test_size`` is the share of the SECOND returned
    # subset (X_test here), so it must be the test share of the hold-out part.
    rel_test = test_size / holdout_size

    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=rel_test, stratify=y_temp, random_state=42
    )

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


In [None]:
def extract_logmel(y, sr=16000, n_mels=64):
    """Compute a log-scaled mel spectrogram of a waveform.

    Args:
        y: Audio samples passed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db``, using the
        spectrogram's own maximum (``ref=np.max``) as the reference.
    """
    return librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels),
        ref=np.max,
    )

In [None]:
# Two augmentation pipelines: a weak one (almost the original signal) and a strong one (for self-augmentation).
# Idea: always train on the weak view; the strong view is "plugged in" through a consistency loss once the model is confident.

# Weak view: light Gaussian noise only, applied with probability 0.3.
weak_transform = Compose([
    AddGaussianNoise(min_amplitude=0.0003, max_amplitude=0.003, p=0.3),
])

# Strong view: heavier noise plus time-stretch, pitch-shift and shift,
# each applied independently with its own probability ``p``.
strong_transform = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.6),
    TimeStretch(min_rate=0.85, max_rate=1.20, p=0.6),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.6),
    Shift(p=0.5),
])


In [None]:
class AudioDataset(Dataset):
    """Dataset for binary audio classification.

    Every item is a triple ``(x_weak, x_strong, y)``:

    * ``train=True`` and ``self_augment=True``: ``x_weak`` is built from the
      ``weak_transform`` output and ``x_strong`` from the ``strong_transform``
      output of the same clip.
    * otherwise: both views hold the features of the un-augmented clip, so the
      training and evaluation code can share one batch layout.

    Args:
        files: Sequence of audio file paths.
        labels: Sequence of integer class labels, aligned with ``files``.
        feature_type: Feature extractor name; only ``"logmel"`` is supported.
        train: Whether the dataset is used for training.
        self_augment: Whether to build the weak/strong augmented pair
            (only effective together with ``train=True``).
        sample_rate: Sample rate used for loading, augmentation and feature
            extraction.

    Raises:
        ValueError: If ``files`` and ``labels`` differ in length.
    """

    def __init__(self, files, labels, feature_type="logmel", train=True, self_augment=False,
                 sample_rate=16000):
        if len(files) != len(labels):
            raise ValueError(
                f"files and labels must have the same length, got {len(files)} and {len(labels)}"
            )
        self.files = files
        self.labels = labels
        self.feature_type = feature_type
        self.train = train
        self.self_augment = self_augment
        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.files)

    def _to_features(self, y):
        """Turn a waveform into a normalised float32 tensor of shape [1, n_mels, time].

        Raises:
            ValueError: If ``self.feature_type`` is not ``"logmel"``.
        """
        if self.feature_type == "logmel":
            feat = extract_logmel(y, sr=self.sample_rate)
        else:
            raise ValueError("Unknown feature type")

        # Per-example normalisation; the epsilon guards against a constant map.
        feat = (feat - feat.mean()) / (feat.std() + 1e-8)
        return torch.tensor(feat, dtype=torch.float32).unsqueeze(0)  # [1, n_mels, time]

    def __getitem__(self, idx):
        path = self.files[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Fixed-length waveform at the dataset's sample rate.
        y = load_fix_audio(path, target_sr=self.sample_rate)

        if self.train and self.self_augment:
            y_w = weak_transform(samples=y, sample_rate=self.sample_rate)
            y_s = strong_transform(samples=y, sample_rate=self.sample_rate)
            return self._to_features(y_w), self._to_features(y_s), label

        # No augmentation: both views are identical, so extract features once
        # and hand out a copy instead of running the extractor a second time.
        x = self._to_features(y)
        return x, x.clone(), label


In [None]:
class MLP(nn.Module):
    """Fully connected classifier over a flattened 2-D feature map.

    Layout: Flatten, then two hidden blocks of
    Linear -> BatchNorm1d -> ReLU -> Dropout(0.3), then a Linear layer that
    emits 2 logits.

    Args:
        input_shape: ``(freq, time)`` size of one feature map.
        hidden_size: Width of both hidden layers.
    """

    def __init__(self, input_shape, hidden_size=256):
        super().__init__()

        n_freq, n_frames = input_shape

        layers = [nn.Flatten()]
        in_features = n_freq * n_frames
        # Two identical hidden blocks; only the first one sees the raw input width.
        for _ in range(2):
            layers += [
                nn.Linear(in_features, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
            in_features = hidden_size
        layers.append(nn.Linear(hidden_size, 2))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 2-class logits for a batch ``x``."""
        return self.model(x)

In [None]:
# Make the run repeatable: weight initialisation and DataLoader shuffling use
# the PyTorch RNG; the Python and NumPy generators are seeded as well because
# the audio augmentations presumably draw from them (confirm for the installed
# audiomentations version).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

root_dir = "data"
(train_files, train_labels), (val_files, val_labels), (test_files, test_labels) = split_dataset(root_dir)

feature_type = "logmel"

# Self-augmentation is switched on for the training set only
train_ds = AudioDataset(train_files, train_labels, feature_type=feature_type, train=True,  self_augment=True)
val_ds   = AudioDataset(val_files,   val_labels,   feature_type=feature_type, train=False, self_augment=False)
test_ds  = AudioDataset(test_files,  test_labels,  feature_type=feature_type, train=False, self_augment=False)

# Only the training loader shuffles; val/test keep the file order of the split.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl   = DataLoader(val_ds,   batch_size=16, shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size=16, shuffle=False)


Всего: 98 файлов (barbie=50, puppy=48)


In [None]:
# Sanity check: number of training examples after the split
# (AudioDataset.__len__ returns the number of files it was given).
len(train_ds)

68

In [None]:
# Pull one training example to find out the feature-map size the MLP has to flatten.
sample_x_w, sample_x_s, _ = train_ds[0]

# Features come as [1, n_mels, time]; skip the leading singleton channel axis.
input_shape = sample_x_w.shape[1:]
input_shape


torch.Size([64, 94])

In [None]:
# Build the classifier for the measured feature-map size and move it to the chosen device.
model = MLP(input_shape, hidden_size=256)
model = model.to(device)

# Supervised loss and optimiser; run_epoch reads `criterion` as a module-level name.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def rampup_weight(epoch, warmup_epochs=3, rampup_epochs=5, max_weight=1.0):
    """Weight of the consistency loss: zero during warm-up, then a smooth ramp.

    Args:
        epoch: Current epoch number.
        warmup_epochs: Epochs (inclusive) during which the weight stays 0.
        rampup_epochs: Number of epochs over which the weight grows to its maximum.
        max_weight: Value reached once the ramp-up is complete.

    Returns:
        ``0.0`` while ``epoch <= warmup_epochs``; afterwards
        ``max_weight * smoothstep(t)`` with ``t`` the clamped ramp-up progress.
    """
    if epoch <= warmup_epochs:
        return 0.0

    # Share of the ramp-up already completed, clamped to [0, 1].
    progress = (epoch - warmup_epochs) / max(1, rampup_epochs)
    progress = min(1.0, max(0.0, progress))

    # smoothstep: 3t^2 - 2t^3
    smooth = progress * progress * (3 - 2 * progress)
    return max_weight * smooth


def run_epoch(model, dataloader, epoch, optimizer=None,
              conf_thresh=0.80, warmup_epochs=3, rampup_epochs=5, lam_max=1.0):
    """Run one pass over ``dataloader``; trains when ``optimizer`` is given.

    Training objective per batch::

        loss = CE(logits_w, y) + lam(epoch) * mean_b( I[conf_b > th] * KL(p_w || p_s)_b )

    where ``p_w`` / ``p_s`` are the softmax outputs for the weak / strong view,
    ``conf`` is the largest entry of ``p_w`` and ``p_w`` is treated as a
    constant target (no gradient flows through it).

    Validation/test (``optimizer=None``): only the cross-entropy on the weak
    view is computed, ``x_s`` is ignored and autograd is switched off.

    Uses the module-level names ``device``, ``criterion`` and ``rampup_weight``.

    Args:
        model: Network mapping a feature batch to class logits.
        dataloader: Iterable yielding ``(x_w, x_s, y)`` batches.
        epoch: Current epoch number, used for the consistency weight schedule.
        optimizer: Optimiser to step; ``None`` selects evaluation mode.
        conf_thresh: Confidence a weak prediction must exceed for the sample
            to take part in the consistency term.
        warmup_epochs: Forwarded to ``rampup_weight``.
        rampup_epochs: Forwarded to ``rampup_weight``.
        lam_max: Forwarded to ``rampup_weight`` as ``max_weight``.

    Returns:
        ``(avg_loss, avg_acc, avg_sup, avg_con, masked_ratio, lam)``: per-sample
        averages of the total, supervised and consistency loss, the accuracy,
        the share of samples above the confidence threshold and the
        consistency weight used. All averages are 0 for an empty dataloader.
    """
    train_mode = optimizer is not None
    model.train(train_mode)

    total_loss = 0.0
    total_sup = 0.0
    total_con = 0.0
    total_correct = 0
    total_samples = 0
    total_masked = 0

    lam = rampup_weight(epoch, warmup_epochs=warmup_epochs, rampup_epochs=rampup_epochs, max_weight=lam_max)
    # Consistency is used only while training and only once its weight is positive.
    use_consistency = train_mode and lam > 0

    # No autograd graph is needed for evaluation; skipping it saves memory and time.
    with torch.set_grad_enabled(train_mode):
        for x_w, x_s, y in dataloader:
            x_w = x_w.to(device)
            y = y.to(device)
            batch_size = y.size(0)

            if train_mode:
                optimizer.zero_grad()

            logits_w = model(x_w)
            loss_sup = criterion(logits_w, y)
            loss = loss_sup

            if use_consistency:
                x_s = x_s.to(device)

                # The weak prediction is the fixed target and decides which samples count.
                with torch.no_grad():
                    p_w = torch.softmax(logits_w, dim=1)
                    conf, _ = p_w.max(dim=1)
                    mask = (conf > conf_thresh).float()  # [B]

                # kl_div expects log-probabilities as input and probabilities as
                # target; log_softmax is numerically safer than log(softmax + eps).
                log_p_s = torch.log_softmax(model(x_s), dim=1)
                per_sample_kl = torch.nn.functional.kl_div(
                    log_p_s, p_w, reduction="none"
                ).sum(dim=1)

                loss_con = (per_sample_kl * mask).mean()
                loss = loss + lam * loss_con

                total_con += loss_con.item() * batch_size
                total_masked += int(mask.sum().item())

            if train_mode:
                loss.backward()
                optimizer.step()

            total_loss += loss.item() * batch_size
            total_sup += loss_sup.item() * batch_size

            preds = logits_w.argmax(dim=1)
            total_correct += (preds == y).sum().item()
            total_samples += batch_size

    # Guard against an empty dataloader instead of dividing by zero.
    denom = max(1, total_samples)
    avg_loss = total_loss / denom
    avg_sup = total_sup / denom
    avg_con = total_con / denom
    avg_acc = total_correct / denom
    masked_ratio = total_masked / denom

    return avg_loss, avg_acc, avg_sup, avg_con, masked_ratio, lam


In [None]:
# Self-augmentation hyperparameters
num_epochs = 15
conf_thresh = 0.80   # minimum weak-view confidence for a sample to enter the consistency term
warmup_epochs = 3    # epochs trained with the supervised loss only (consistency weight is 0)
rampup_epochs = 6    # epochs over which the consistency weight grows to lam_max
lam_max = 1.0        # final weight of the consistency term

# Epochs are numbered from 1 so that they line up with the warm-up/ramp-up schedule.
for epoch in range(1, num_epochs + 1):
    # Training pass: passing the optimizer switches run_epoch into training mode.
    train_loss, train_acc, train_sup, train_con, train_masked, lam = run_epoch(
        model, train_dl, epoch,
        optimizer=optimizer,
        conf_thresh=conf_thresh,
        warmup_epochs=warmup_epochs,
        rampup_epochs=rampup_epochs,
        lam_max=lam_max
    )

    # Validation pass: optimizer=None means evaluation mode, so only the
    # supervised loss is computed and the consistency settings have no effect.
    val_loss, val_acc, val_sup, val_con, val_masked, _ = run_epoch(
        model, val_dl, epoch,
        optimizer=None,
        conf_thresh=conf_thresh,
        warmup_epochs=warmup_epochs,
        rampup_epochs=rampup_epochs,
        lam_max=lam_max
    )

    # One summary line per epoch: consistency weight, training loss breakdown, accuracies.
    print(
        f"Epoch {epoch:02d} | "
        f"lam={lam:.2f} | "
        f"train_loss={train_loss:.4f} (sup={train_sup:.4f}, con={train_con:.4f}, masked={train_masked:.2%}) | "
        f"train_acc={train_acc:.3f} | "
        f"val_loss={val_loss:.4f} | val_acc={val_acc:.3f}"
    )


Epoch 01 | train_loss=0.6010, train_acc=0.647 | val_loss=0.6160, val_acc=0.600
Epoch 02 | train_loss=0.7575, train_acc=0.500 | val_loss=0.5863, val_acc=0.667
Epoch 03 | train_loss=0.7216, train_acc=0.574 | val_loss=0.6048, val_acc=0.600
Epoch 04 | train_loss=0.6836, train_acc=0.603 | val_loss=0.6602, val_acc=0.533
Epoch 05 | train_loss=0.7230, train_acc=0.662 | val_loss=0.7342, val_acc=0.533
Epoch 06 | train_loss=0.7144, train_acc=0.559 | val_loss=0.7167, val_acc=0.467
Epoch 07 | train_loss=0.7191, train_acc=0.632 | val_loss=0.6645, val_acc=0.600
Epoch 08 | train_loss=0.6779, train_acc=0.618 | val_loss=0.6582, val_acc=0.667
Epoch 09 | train_loss=0.7070, train_acc=0.529 | val_loss=0.6516, val_acc=0.533
Epoch 10 | train_loss=0.6929, train_acc=0.662 | val_loss=0.6401, val_acc=0.667
Epoch 11 | train_loss=0.6730, train_acc=0.618 | val_loss=0.6364, val_acc=0.600
Epoch 12 | train_loss=0.6510, train_acc=0.691 | val_loss=0.6299, val_acc=0.600
Epoch 13 | train_loss=0.6400, train_acc=0.632 | val_

In [None]:
# Final evaluation on the held-out test split. Leaving `optimizer` at its
# default (None) selects evaluation mode, where the epoch number only sets the
# returned (and discarded) consistency weight.
test_loss, test_acc, test_sup, test_con, test_masked, _ = run_epoch(model, test_dl, num_epochs)
print(f"\nTest: loss={test_loss:.4f}, acc={test_acc:.3f}")



Test: loss=0.6481, acc=0.467


In [None]:
def predict(model, dataloader, device):
    """Collect hard class predictions and true labels over a dataloader.

    The model is switched to eval mode and run without gradient tracking on
    the weak view of every batch; the strong view is ignored.

    Args:
        model: Network mapping a feature batch to class logits.
        dataloader: Iterable yielding ``(x_w, x_s, y)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(predictions, labels)`` as two NumPy arrays in dataloader order.
    """
    model.eval()
    pred_chunks, label_chunks = [], []

    with torch.no_grad():
        for x_w, _, y in dataloader:
            batch_logits = model(x_w.to(device))
            pred_chunks.append(batch_logits.argmax(dim=1).cpu().numpy())
            label_chunks.append(y.cpu().numpy())

    if not pred_chunks:
        # Nothing was iterated: same empty arrays np.array([]) would give.
        return np.array([]), np.array([])

    return np.concatenate(pred_chunks), np.concatenate(label_chunks)


In [None]:
# Hard predictions for the test split. Without this call `test_preds` is never
# defined and the cell fails on a fresh kernel.
test_preds, test_true = predict(model, test_dl, device)

# test_dl is built with shuffle=False, so the predictions must line up with test_labels.
assert list(test_true) == list(test_labels), "prediction order does not match test_labels"

# minlength=2 keeps both class counts visible even when one class is absent.
print("Распределение в тестовой выборке:")
print(f"  True labels:  unique={np.unique(test_labels)}, counts={np.bincount(test_labels, minlength=2)}")
print(f"  Predictions:  unique={np.unique(test_preds)}, counts={np.bincount(test_preds, minlength=2)}")

Распределение в тестовой выборке:
  True labels:  unique=[0 1], counts=[8 7]
  Predictions:  unique=[0 1], counts=[6 9]


In [None]:
# Label i is reported under class_names[i]: 0 -> 'puppy', 1 -> 'barbie'
# (files whose path contains "barbie" are labelled 1 when the data is split).
class_names = ['puppy', 'barbie']

# labels=[0, 1] requests a row for both classes; zero_division=0 is passed so
# undefined metrics are reported as 0.
print(classification_report(test_labels, test_preds, labels=[0, 1],
                            target_names=class_names, zero_division=0))


              precision    recall  f1-score   support

       puppy       0.00      0.00      0.00         0
      barbie       1.00      1.00      1.00        15

    accuracy                           1.00        15
   macro avg       0.50      0.50      0.50        15
weighted avg       1.00      1.00      1.00        15

