In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torchaudio
import librosa
import random
import matplotlib.pyplot as plt
from IPython.display import Audio
from IPython.display import FileLink
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from tqdm.notebook import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (pandas, sklearn, torchaudio, ...)
# raised by later cells will not be shown either.
warnings.filterwarnings("ignore")

In [None]:
# Global parameters
sample_rate = 16_000 # sampling rate every clip is resampled to
seconds_length = 30 # clip duration in seconds
target_length = sample_rate * seconds_length # clip length in samples
num_classes = 3 # size of the classifier output
batch_size = 4
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
device

# Prepare data

### Create csv

In [None]:
# Build the records for a DataFrame that has a sub-class column
def collect_file_info(root_dir):
    """Collect one record per file found in the class sub-directories of ``root_dir``.

    Every immediate sub-directory of ``root_dir`` is treated as a class; every
    regular file directly inside it becomes one record. Entries of ``root_dir``
    that are not directories, and entries of a class directory that are not
    regular files, are skipped.

    Args:
        root_dir: Path of the dataset root directory.

    Returns:
        list[dict]: Records with keys ``'path'`` (full file path), ``'label'``
        and ``'sub_class'`` (both set to the class directory name).
    """
    file_info = []
    # os.listdir returns entries in arbitrary order; sort so that the row order
    # (and anything positional or seeded that depends on it) is reproducible.
    for main_class in sorted(os.listdir(root_dir)):
        main_class_path = os.path.join(root_dir, main_class)
        if not os.path.isdir(main_class_path):
            continue
        for file_name in sorted(os.listdir(main_class_path)):
            file_path = os.path.join(main_class_path, file_name)
            if os.path.isfile(file_path):
                file_info.append({
                    'path': file_path,
                    'label': main_class,
                    'sub_class': main_class
                })

    return file_info

In [None]:
# Read the files of one specific class from a flat directory
def collect_file_info_v(root_dir, label='wolf', sub_class='wolf_v'):
    """Collect one record per regular file located directly in ``root_dir``.

    Args:
        root_dir: Directory whose files all belong to a single class.
        label: Value stored under ``'label'`` for every record (default ``'wolf'``).
        sub_class: Value stored under ``'sub_class'`` for every record
            (default ``'wolf_v'``).

    Returns:
        list[dict]: Records with keys ``'path'``, ``'label'`` and ``'sub_class'``,
        ordered by file name.
    """
    file_info = []
    # Sorted listing: os.listdir order is arbitrary, and callers that subsample
    # by position need a stable order to pick the same files on every run.
    for file_name in sorted(os.listdir(root_dir)):
        file_path = os.path.join(root_dir, file_name)
        if os.path.isfile(file_path):
            file_info.append({
                'path': file_path,
                'label': label,
                'sub_class': sub_class
            })

    return file_info

In [None]:
root_dir = '/kaggle/input/wolf-dog-cutted-dataset/new_dataset' # Path to the dataset
# One record per audio file; the class name is taken from the parent directory.
file_info = collect_file_info(root_dir)
df = pd.DataFrame(file_info)
# Show which class names were found.
df['label'].unique()

In [None]:
# Number of files per class in the main dataset.
class_counts = df['label'].value_counts()
print("Количество элементов в каждом классе:")
print(class_counts)
df.head(5)

In [None]:
# Every file in this directory is recorded with label 'wolf' and sub_class 'wolf_v'.
df2 = pd.DataFrame(collect_file_info_v('/kaggle/input/wolf-dog-val/val_dataset/wolf')) # Path to the second dataset
print(len(df2))

In [None]:
# Keep every second row of the second DataFrame.
# The selection is positional, so which files are kept depends on the row order of df2.
df2_cutted = df2.iloc[::2]
df2_cutted

In [None]:
# Concatenate the two DataFrames; ignore_index=True renumbers the rows from 0.
df = pd.concat([df, df2_cutted], ignore_index=True)
df

In [None]:
# Write the file list to a CSV in the current working directory (the index is not saved).
df.to_csv('file_info.csv', index=False)

### Read and split

In [None]:
# Read the CSV back.
# NOTE(review): the file was written with a relative path ('file_info.csv'); this absolute path
# assumes the notebook's working directory is /kaggle/working — confirm when running elsewhere.
df = pd.read_csv("/kaggle/working/file_info.csv")
df

In [None]:
# Keep only a limited number of clips from the 'negative' / 'negative_new' files
def filter_by_index(row, max_index=1000):
    """Decide whether a file path should be kept in the dataset.

    Paths that do not contain ``'negative'`` are always kept. For paths that do,
    the numeric index is parsed from the text between the last ``'_'`` and the
    following ``'.'`` (e.g. ``.../negative_new_12.wav`` -> ``12``), and the file
    is kept only when that index is ``<= max_index``.

    Args:
        row: File path. Non-string values (e.g. NaN from a CSV) are kept.
        max_index: Largest index to keep (default 1000).

    Returns:
        bool: ``True`` to keep the row, ``False`` to drop it. Paths whose index
        cannot be parsed as an integer are kept.
    """
    if not isinstance(row, str) or 'negative' not in row:
        return True
    try:
        index = int(row.split('_')[-1].split('.')[0])
    except ValueError:
        # No numeric suffix: keep the file rather than silently dropping it.
        return True
    return index <= max_index
# Boolean-mask the frame: keep the rows whose path passes filter_by_index.
df_filtred = df[df['path'].apply(filter_by_index)]
df_filtred

In [None]:
# From here on `df` refers to the filtered frame.
df = df_filtred
df

### Delete some subclasses

In [None]:
# Build the name -> integer id mappings for classes and sub-classes (and their inverses), then print them
class_names = df['label'].unique()
# NOTE: this enumerated mapping is overwritten by the hard-coded one on the next line.
class_dict = {class_name: idx for idx, class_name in enumerate(class_names)}
class_dict = {'wolf': 2, 'dog': 1, 'negative': 0}
subclass_names = df['sub_class'].unique()
# NOTE: likewise overwritten by the hard-coded sub-class mapping below.
subclass_dict = {subclass_name: idx for idx, subclass_name in enumerate(subclass_names)}
subclass_dict = {'wolf': 2, 'dog': 1, 'negative': 0, 'wolf_v': 3}
# Inverse mappings: integer id -> name.
class_name_reverse = {idx: class_name for class_name, idx in class_dict.items()}
subclass_name_reverse = {idx: subclass_name for subclass_name, idx in subclass_dict.items()}
print("\nClass dictionary:")
print(class_dict)

print("\nSubclass dictionary:")
print(subclass_dict)

In [None]:
# Convert class / sub-class names to their integer ids.
# Build a new frame instead of aliasing `df` and assigning into it: `df` keeps its
# string labels, so re-running this cell gives the same result (mapping already
# encoded integers through the name dictionaries would produce NaN).
data = df.assign(
    label=df['label'].map(class_dict),
    sub_class=df['sub_class'].map(subclass_dict),
)

print(data['label'].value_counts())
print(data.groupby(['label', 'sub_class']).size())
data.head(5)

In [None]:
# Split the data into training and validation sets (80/20, stratified by class label, fixed seed)
train, test = train_test_split(data, test_size=0.2, random_state=52, stratify=data["label"])
train = train.reset_index(drop=True)

# Number of objects per class in the training set (ids mapped back to class names for display)
class_counts = train['label'].replace(class_name_reverse).value_counts()
print(class_counts, '\n\n')
train.head(5)

In [None]:
# Number of objects per class in the validation set (ids mapped back to class names for display)
class_counts = test['label'].replace(class_name_reverse).value_counts()
print(class_counts, '\n\n')
test.head(5)

# Preprocess data

In [None]:
# Augmentation pipeline
class ComposeTransforms:
    """Chain of augmentations, each applied independently with its own probability.

    ``transforms_probs`` is an iterable of ``(callable, probability)`` pairs.
    On every call the pairs are visited in order; one random number is drawn
    per pair and the callable is applied when the draw is below its probability.
    """

    def __init__(self, transforms_probs):
        self.transforms_probs = transforms_probs

    def __call__(self, waveform):
        result = waveform
        for augment, probability in self.transforms_probs:
            # One draw per augmentation, whether or not it ends up being applied.
            if not random.random() < probability:
                continue
            result = augment(result)
        return result

In [None]:
# Dataset class
class CustomAudioDataset(Dataset):
    """Dataset that loads audio files as mono waveforms of a fixed length.

    Args:
        paths: Sequence of audio file paths.
        labels: Sequence of labels, aligned with ``paths``.
        rate: Sampling rate every clip is resampled to.
        target_length: Number of samples every returned waveform has.
        transform: Optional callable applied to the waveform after loading.
    """

    def __init__(self, paths, labels, rate, target_length=480000, transform=None):
        self.paths = paths
        self.labels = labels
        self.rate = rate
        self.target_length = target_length
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(waveform, label, idx)`` for the item at position ``idx``."""
        audio = self.length_processing(idx)
        if self.transform:
            audio = self.transform(audio)
        return audio, self.labels[idx], idx

    # bring the clip to the required sampling rate and length
    def length_processing(self, idx):
        """Load clip ``idx``, mix it down to mono, resample, and crop / zero-pad to ``target_length``."""
        signal, native_rate = torchaudio.load(self.paths[idx])
        # Average the channels but keep a leading channel axis for the ops below.
        signal = signal.mean(dim=0, keepdim=True)
        if native_rate != self.rate:
            signal = torchaudio.functional.resample(signal, native_rate, self.rate)
        missing = self.target_length - signal.shape[1]
        if missing < 0:
            # Too long: keep the first target_length samples.
            signal = signal[:, :self.target_length]
        elif missing > 0:
            # Too short: append zeros at the end.
            signal = torch.nn.functional.pad(signal, (0, missing))
        # Collapse the single-channel axis.
        return signal.mean(dim=0)

In [None]:
# Add noise
def add_noise(waveform):
    """Return ``waveform`` with scaled uniform random noise added.

    The noise tensor is created with the same shape, dtype and device as
    ``waveform``, so both 1-D ``(time,)`` and multi-dimensional inputs get an
    independent noise value per sample.

    Args:
        waveform: Audio tensor.

    Returns:
        torch.Tensor: ``waveform + noise_factor * noise`` where ``noise`` is
        uniform on ``[0, 1)`` and ``noise_factor`` is picked at random from
        ``[0.05, 0.025, 0.001]``.
    """
    # NOTE(review): the noise is uniform on [0, 1), i.e. not zero-mean, so it adds a small
    # positive offset to the signal — confirm this is intended before enabling the transform.
    noise = torch.rand_like(waveform)
    noise_factor = random.choice([0.05, 0.025, 0.001])
    return waveform + noise_factor * noise

In [None]:
# Pitch shift
def change_pitch(waveform, sample_rate=16000):
    """Shift the pitch of ``waveform`` by a randomly chosen number of steps.

    Args:
        waveform: Tensor holding the audio; converted with ``.numpy()`` before
            being handed to librosa.
        sample_rate: Sampling rate passed to librosa as ``sr``.

    Returns:
        torch.Tensor: The pitch-shifted audio.
    """
    semitones = random.choice([-2, -1, 1, 2])
    shifted = librosa.effects.pitch_shift(
        waveform.numpy(),
        sr=sample_rate,
        n_steps=semitones,
    )
    return torch.tensor(shifted)

In [None]:
# Speed change
def change_speed(waveform, length= 30 * 16000):
    """Time-stretch ``waveform`` by a random rate and return exactly ``length`` samples.

    The stretch rate is drawn from ``[0.9, 0.95, 1.1, 1.2]``. A result longer
    than ``length`` is cut to its first ``length`` samples; otherwise it is
    zero-padded on both sides (the extra sample, if any, goes to the end).

    Args:
        waveform: Tensor holding the audio; converted with ``.numpy()`` before
            being handed to librosa.
        length: Number of samples in the returned tensor.

    Returns:
        torch.Tensor: The stretched audio, ``length`` samples long.
    """
    rate = random.choice([0.9, 0.95, 1.1, 1.2])
    stretched = librosa.effects.time_stretch(waveform.numpy(), rate=rate)
    n_samples = len(stretched)
    if n_samples > length:
        return torch.tensor(stretched[:length])
    deficit = length - n_samples
    left = deficit // 2
    right = deficit - left
    return torch.tensor(np.pad(stretched, (left, right), "constant"))

In [None]:
# Set up the augmentations: each (function, probability) pair is applied independently
transform = ComposeTransforms([
    #(add_noise, 0.25),
    (change_pitch, 0.1),
    (change_speed, 0.1)
])

In [None]:
# Create the datasets and DataLoaders.
# target_length is passed explicitly so the clip length follows the global
# sample_rate * seconds_length setting instead of the dataset's hard-coded default.
# Augmentations are applied to the training set only; the validation set is not shuffled.
train_audio_dataset = CustomAudioDataset(train["path"].tolist(), train["label"].tolist(), sample_rate, target_length=target_length, transform=transform)
test_audio_dataset = CustomAudioDataset(test["path"].tolist(), test["label"].tolist(), sample_rate, target_length=target_length)
train_dataloader = DataLoader(train_audio_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_audio_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Play a wolf-class clip (label 2) from the training DataLoader
samples, labels, idx = next(iter(train_dataloader))
# Each next(iter(...)) creates a fresh iterator and draws a new first batch (the loader shuffles);
# repeat until the first item of the drawn batch is a wolf. Only labels[0] is inspected.
while int(labels[0]) != 2:
    samples, labels, idx = next(iter(train_dataloader))
print(samples[0].shape)
print(labels[0])
Audio(samples[0].numpy(), rate=sample_rate)

In [None]:
# Play the first clip of the first batch from the validation DataLoader
# (test_dataloader is built with shuffle=False, so this is always the same clip).
samples, labels, idx = next(iter(test_dataloader))
print(samples[0].shape)
print(labels[0])
Audio(samples[0].numpy(), rate=sample_rate)

# Model

In [None]:
# Fetch and load the pretrained model (torchaudio WAV2VEC2_BASE bundle)
bundle = torchaudio.pipelines.WAV2VEC2_BASE
model = bundle.get_model()

In [None]:
# Model class for our task
class CustomAudioModel(nn.Module):
    """Backbone plus a small classification head.

    Args:
        model: Backbone module. Its feature width is read from
            ``model.encoder.transformer.layers[-1].final_layer_norm`` and its
            forward output is indexed with ``[0]`` to obtain the features.
        num_classes: Number of output logits.
    """

    def __init__(self, model, num_classes):
        super().__init__()
        self.model = model
        self.num_classes = num_classes
        last_norm = model.encoder.transformer.layers[-1].final_layer_norm
        self.num_features = last_norm.normalized_shape[0]

        # Classification head
        self.linear_stack = nn.Sequential(
            nn.Linear(self.num_features, 64),
            nn.ReLU(),
            nn.Linear(64, self.num_classes),
        )

    def forward(self, x):
        """Run the backbone, average its features over dim 1, and classify."""
        hidden = self.model(x)[0]
        pooled = hidden.mean(dim=1)
        return self.linear_stack(pooled)

# Training

In [None]:
# Training loop
batchs_for_update = 16  # number of mini-batches accumulated before each optimizer step
def train_epoch(model, loss_fn, optimizer, dataloader):
    """Train ``model`` for one epoch with gradient accumulation.

    Gradients are accumulated over ``batchs_for_update`` mini-batches before each
    optimizer step. Each mini-batch loss is divided by ``batchs_for_update``
    before ``backward()`` so the accumulated gradient is the average over the
    window (as for one large batch) rather than the sum.

    Args:
        model: Module to train; switched to train mode.
        loss_fn: Callable ``loss_fn(pred, labels)`` returning a scalar tensor.
        optimizer: Optimizer over the model parameters.
        dataloader: Iterable of ``(samples, labels, idx)`` batches.

    Returns:
        float: Mean (unscaled) loss over the batches that were actually
        processed; ``nan`` when no batch was processed.
    """
    model.train()
    # Start from clean gradients so nothing left over from an earlier call is applied.
    optimizer.zero_grad()
    sum_loss = 0.0
    processed = 0  # batches whose backward pass has completed
    pbar = tqdm(dataloader, ascii=True, desc='Train')
    try:
        for samples, labels, idx in pbar:
            samples, labels = samples.to(device), labels.to(device)
            pred = model(samples)
            loss = loss_fn(pred, labels)
            sum_loss += loss.item()
            # Scale so that the accumulated gradient is an average over the window.
            (loss / batchs_for_update).backward()
            processed += 1

            # Do not update on every batch but once per window, because the batch is small
            if processed % batchs_for_update == 0:
                optimizer.step()
                optimizer.zero_grad()
    except Exception as e:
        # Best effort: report the error and finish the epoch with what was processed so far.
        print(f"Error\n:{e}\n")

    # Apply the gradients of an incomplete final window (also reached after an error).
    if processed % batchs_for_update != 0:
        optimizer.step()
        optimizer.zero_grad()
    if processed == 0:
        return float('nan')
    # Average over the batches that really ran, not over len(dataloader).
    avg_loss = sum_loss / processed
    return avg_loss

In [None]:
# Compute the metrics for each class
def compute_metrics(preds, labels):
    """Return per-class precision, recall and F1 for class ids 0, 1 and 2.

    Args:
        preds: Predicted class ids.
        labels: True class ids.

    Returns:
        tuple: ``(precision, recall, f1)``, each holding one value per class.
    """
    per_class_scores = precision_recall_fscore_support(
        labels,
        preds,
        average=None,
        labels=[0, 1, 2],
    )
    # The fourth element (support) is not needed by the caller.
    precision, recall, fscore = per_class_scores[:3]
    return precision, recall, fscore

# Validation loop
def eval_epoch(model, loss_fn, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print the loss and classification metrics.

    Predictions and labels are collected over the whole loader, and the metrics
    are computed once on the full set. Macro-averaging inside each mini-batch and
    then averaging those values over batches does not give the dataset-level
    macro score, because a small batch rarely contains every class.

    Args:
        model: Module to evaluate; switched to eval mode.
        loss_fn: Callable ``loss_fn(pred, labels)`` returning a scalar tensor.
        dataloader: Sized iterable of ``(samples, labels, idx)`` batches.

    Returns:
        float: Mean of the per-batch losses.
    """
    model.eval()
    sum_loss = 0
    all_preds = []
    all_labels = []
    pbar = tqdm(dataloader, ascii=True, desc='Val')
    with torch.no_grad():
        for samples, labels, idx in pbar:
            samples, labels = samples.to(device), labels.to(device)
            pred = model(samples)
            loss = loss_fn(pred, labels)
            sum_loss += loss.item()

            # Predicted class = index of the largest logit.
            _, predicted_labels = torch.max(pred, 1)
            all_preds.extend(predicted_labels.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    avg_loss = sum_loss / len(dataloader)
    print(f"Eval Loss: {avg_loss}")

    # Per-class metrics over the whole validation set.
    precision, recall, f1 = compute_metrics(all_preds, all_labels)
    for j, (p, r, f) in enumerate(zip(precision, recall, f1)):
        print(f"Class {j} - Precision: {p:.4f}, Recall: {r:.4f}, F1-score: {f:.4f}")

    # Macro averages, also computed once over the whole validation set.
    avg_precision = precision_score(all_labels, all_preds, zero_division=0, average='macro')
    avg_recall = recall_score(all_labels, all_preds, zero_division=0, average='macro')
    avg_f1 = f1_score(all_labels, all_preds, zero_division=0, average='macro')
    print(f"Average - Precision: {avg_precision:.4f}, Recall: {avg_recall:.4f}, F1 score: {avg_f1:.4f}")
    return avg_loss

In [None]:
# Training parameters
torch.cuda.empty_cache()
learning_rate = 0.0001
# Wrap the pretrained backbone with the classification head and move it to the selected device.
custom_model = CustomAudioModel(model, num_classes)
custom_model = custom_model.to(device)

loss_function = nn.CrossEntropyLoss()
# The optimizer receives all parameters of custom_model (backbone and head).
optimizer = torch.optim.AdamW(custom_model.parameters(), lr=learning_rate)

In [None]:
# Main loop
counter = 0 # epochs in a row without a validation-loss improvement
early_stop = 7 # stop once `counter` reaches this value
best_val_loss = 1000000 # sentinel larger than any expected loss
epochs = 50
for i in range(epochs):
    print(f"\nEpoch {i+1}\n----------------------------------------------------------------------------------")
    
    train_loss = train_epoch(custom_model, loss_function, optimizer, train_dataloader)
    print(f"\nTrain loss: {train_loss}")
    
    val_loss = eval_epoch(custom_model, loss_function, test_dataloader)
    torch.save(custom_model.state_dict(), "checkpoint.pth") # Save the model after every epoch
    
    # Early stopping and saving of the best model (based on the validation loss)
    if (val_loss < best_val_loss):
        best_val_loss = val_loss
        counter = 0
        torch.save(custom_model.state_dict(), "best_model.pth")
    else:
        counter += 1
        if counter >= early_stop:
            print(f"Early stop on the epoch: {i + 1}, best validation loss: {best_val_loss}")
            break

### Links for downloading best and last model

In [None]:
# Download link for the weights with the lowest validation loss.
FileLink(r'best_model.pth')

In [None]:
# Download link for the weights saved after the last completed epoch.
FileLink(r'checkpoint.pth')