SincNet основана на функциях sinc, которые реализуют полосовые фильтры. В отличие от стандартных CNN, в SincNet непосредственно из данных обучаются только нижние и верхние частоты среза.
Нижняя и верхняя частоты среза являются единственными параметрами фильтра, обучаемыми по данным.
Сеть начинается со слоя SincNet, за которым следуют стандартные слои CNN, и завершается полносвязными слоями для классификации.
![image.png](attachment:36eb2123-5fdd-4ff7-86c6-ae85ba225a40.png)


In [None]:
# Download the `awsaf49/asvpoof-2019-dataset` archive with the Kaggle CLI.
!kaggle datasets download -d awsaf49/asvpoof-2019-dataset
# Unpack the downloaded archive.
!unzip asvpoof-2019-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/awsaf49/asvpoof-2019-dataset
License(s): ODC Attribution License (ODC-By)
Downloading asvpoof-2019-dataset.zip to /content
 19% 4.49G/23.6G [03:56<15:24, 22.2MB/s]

In [None]:
# Delete the /content/PA directory; the cells below only read paths under LA.
!sudo rm -rf /content/PA

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_curve

In [None]:
# Location of the training audio files and of the training protocol file
# (one row per utterance) that is parsed with pandas below.
train_audio_folder = 'C:/Users/Ksenia/Desktop/content/LA/LA/ASVspoof2019_LA_train/flac'
train_metadata_file = 'C:/Users/Ksenia/Desktop/content/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'

In [None]:
# Location of the evaluation audio files and of the evaluation protocol file
# that is parsed with pandas below.
test_audio_folder = 'C:/Users/Ksenia/Desktop/content/LA/LA/ASVspoof2019_LA_eval/flac'
test_metadata_file = 'C:/Users/Ksenia/Desktop/content/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt'

In [None]:
# Parse the space-separated training protocol (no header row), name its five
# columns and keep only the file name and the class label.
train_metadata = (
    pd.read_csv(train_metadata_file, sep=' ', header=None)
    .set_axis(['speaker', 'filename', 'sep1', 'sep2', 'label'], axis=1)
    .loc[:, ['filename', 'label']]
)

In [None]:
# Parse the space-separated evaluation protocol (no header row), name its five
# columns and keep only the file name and the class label.
test_metadata = (
    pd.read_csv(test_metadata_file, sep=' ', header=None)
    .set_axis(['speaker', 'filename', 'sep1', 'sep2', 'label'], axis=1)
    .loc[:, ['filename', 'label']]
)

In [None]:
# Parallel lists for the training split: raw labels and the matching file
# names with the ".flac" extension appended.
train_labels = train_metadata['label'].tolist()
train_audio_file_names = [
    f"{file_name}.flac"
    for file_name in train_metadata['filename'].tolist()
]

In [None]:
# Parallel lists for the evaluation split: raw labels and the matching file
# names with the ".flac" extension appended.
test_labels = test_metadata['label'].tolist()
test_audio_file_names = [
    f"{file_name}.flac"
    for file_name in test_metadata['filename'].tolist()
]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the string class labels to integer ids.
# The encoder is fitted on the training labels only and then *reused* for the
# evaluation labels, so both splits are guaranteed to share one label -> id
# mapping.  `transform` raises ValueError if the evaluation split contains a
# label that was never seen during fitting, instead of silently building a
# second, possibly different mapping.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

In [None]:
class AudioMNISTDataset(torch.utils.data.Dataset):
    """Dataset that loads audio files as fixed-length mono waveforms.

    Every item is loaded with ``torchaudio.load``, averaged down to a single
    channel, then truncated or right-padded with zeros so that its last
    dimension is exactly ``num_samples`` long.

    Args:
        audio_dir_path: directory that contains the audio files.
        audio_file_names: file names (relative to ``audio_dir_path``), one
            per item.
        num_samples: target length of every returned waveform, in samples.
        labels: one label per item, in the same order as
            ``audio_file_names``.
    """

    def __init__(self, audio_dir_path, audio_file_names, num_samples, labels):
        super().__init__()
        self.audio_dir_path = audio_dir_path
        self.audio_file_names = audio_file_names
        self.num_samples = num_samples
        self.labels = labels

    def __getitem__(self, index):
        """Return ``(signal, label)`` where ``signal`` is ``(1, num_samples)``."""
        path = os.path.join(self.audio_dir_path, self.audio_file_names[index])
        # The sample rate returned by torchaudio is not used: no resampling
        # is performed, the waveform is only cut/padded by sample count.
        signal, _ = torchaudio.load(path.replace("\\", "/"))
        signal = self.mix_down_if_necessary(signal)
        signal = self.cut_if_necessary(signal)
        signal = self.right_pad_if_necessary(signal)
        return signal, self.labels[index]

    def __len__(self):
        """Return the number of items (one per label)."""
        return len(self.labels)

    def mix_down_if_necessary(self, signal):
        """Average a multi-channel ``(channels, time)`` signal to one channel."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def cut_if_necessary(self, signal):
        """Keep only the first ``self.num_samples`` samples of a long signal."""
        # Use the instance's own target length, not a module-level global.
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def right_pad_if_necessary(self, signal):
        """Zero-pad a short signal on the right up to ``self.num_samples``."""
        length = signal.shape[1]
        if self.num_samples > length:
            # (left, right) padding of the last dimension only.
            pad_last_dim = (0, self.num_samples - length)
            signal = torch.nn.functional.pad(signal, pad_last_dim)
        return signal

In [None]:
# Fixed waveform length, in samples, passed to both datasets below.
num_samples = 16000

In [None]:
# Wrap both splits in datasets that yield fixed-length waveforms and labels.
train_dataset = AudioMNISTDataset(
    audio_dir_path=train_audio_folder,
    audio_file_names=train_audio_file_names,
    num_samples=num_samples,
    labels=train_labels,
)
test_dataset = AudioMNISTDataset(
    audio_dir_path=test_audio_folder,
    audio_file_names=test_audio_file_names,
    num_samples=num_samples,
    labels=test_labels,
)

In [None]:
# Batches of 128 items; only the training split is shuffled.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
)

In [None]:
class SincConv(nn.Module):
    """1-D convolution whose kernels are Hamming-windowed sinc band-pass filters.

    Each output channel is described by just two numbers, a low and a high
    cut-off frequency in Hz, stored as one row ``[low, high]`` of the
    ``band_pass`` parameter.  The kernels are rebuilt from ``band_pass`` with
    differentiable torch operations on every forward pass, so gradients reach
    the cut-offs and the optimizer can actually train them.

    Args:
        out_channels: number of band-pass filters (output channels).
        kernel_size: length of every filter, in samples.
        sample_rate: sampling rate of the input in Hz; used to convert the
            cut-offs from Hz to cycles per sample.
    """

    def __init__(self, out_channels, kernel_size, sample_rate):
        super().__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.sample_rate = sample_rate
        # Column 0: low cut-off (Hz), column 1: high cut-off (Hz).
        self.band_pass = nn.Parameter(torch.empty(out_channels, 2))
        self.init_kernels()

    def init_kernels(self):
        """Spread the initial cut-offs linearly: low 30-300 Hz, high 3-8 kHz."""
        with torch.no_grad():
            self.band_pass[:, 0] = torch.linspace(30, 300, self.out_channels)
            self.band_pass[:, 1] = torch.linspace(3000, 8000, self.out_channels)

    def forward(self, x):
        """Filter ``x`` of shape ``(batch, 1, time)`` with the sinc kernels."""
        filters = self.create_filters()
        return nn.functional.conv1d(x, filters, stride=1, padding=self.kernel_size // 2)

    def create_filters(self):
        """Return all kernels as a ``(out_channels, 1, kernel_size)`` tensor."""
        # Keep a trailing axis of size 1 so the cut-offs broadcast against the
        # sample-index axis inside `sinc_filter` (no per-channel Python loop).
        low = self.band_pass[:, 0:1]
        high = self.band_pass[:, 1:2]
        return self.sinc_filter(low, high).unsqueeze(1)

    def sinc_filter(self, low, high):
        """Build windowed band-pass impulse response(s) for the given cut-offs.

        Args:
            low: low cut-off in Hz (scalar, or tensor broadcastable against a
                trailing axis of length ``kernel_size``).
            high: high cut-off in Hz, same shape as ``low``.

        Returns:
            Tensor whose last dimension has length ``kernel_size``; it stays
            attached to the autograd graph of ``low`` and ``high``.
        """
        ref = self.band_pass
        # Integer sample offsets centred on zero (exactly symmetric).
        n = torch.arange(self.kernel_size, device=ref.device, dtype=ref.dtype)
        n = n - (self.kernel_size - 1) / 2
        # Hz -> cycles per sample, so frequency and time axes share units.
        f_low = low / self.sample_rate
        f_high = high / self.sample_rate
        # Ideal band-pass = difference of two low-pass sinc responses.
        # torch.sinc is the normalised sinc and is well defined at 0, so the
        # centre tap needs no special-casing.
        response = (2 * f_high * torch.sinc(2 * f_high * n)
                    - 2 * f_low * torch.sinc(2 * f_low * n))
        # Symmetric Hamming window: 0.54 - 0.46 * cos(2*pi*k / (K - 1)).
        window = torch.hamming_window(
            self.kernel_size, periodic=False, device=ref.device, dtype=ref.dtype
        )
        return response * window

In [None]:
class BasicBlock(nn.Module):
    """Residual block: two 3-tap Conv1d + BatchNorm1d layers with a skip path.

    Args:
        in_channels: number of channels of the input.
        out_channels: number of channels produced by the block.
        stride: stride of the first convolution (default 1).

    When ``stride == 1`` and ``in_channels == out_channels`` the skip path is
    the identity and the block has exactly the parameters of its two conv/BN
    pairs.  Otherwise the input is projected with a strided 1x1 convolution
    (plus batch norm) so that it can be added to the main branch.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        if stride != 1 or in_channels != out_channels:
            # Kernel 1 / no padding with the same stride yields the same
            # output length as conv1 (kernel 3 / padding 1).
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm1d(out_channels),
            )
        else:
            # Identity skip: no extra sub-module, no extra parameters.
            self.shortcut = None

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, in_channels, time)``."""
        residual = x if self.shortcut is None else self.shortcut(x)
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = out + residual
        out = self.relu(out)
        return out

In [None]:
class SincNetResNet(nn.Module):
    """Binary classifier: SincConv front-end, residual blocks, linear head.

    Args:
        kernel_size: kernel length handed to ``SincConv``.
        sample_rate: sample rate handed to ``SincConv``.
        resnet_blocks: how many ``BasicBlock`` modules to stack.
        sinc_out_channels: channel count used by the front-end, by every
            residual block and as the input size of the linear head
            (default 20).
    """

    def __init__(self, kernel_size, sample_rate, resnet_blocks, sinc_out_channels=20):
        super().__init__()
        width = sinc_out_channels
        self.sinc_conv = SincConv(width, kernel_size, sample_rate)
        stacked = []
        for _ in range(resnet_blocks):
            stacked.append(BasicBlock(width, width))
        self.resnet_blocks = nn.Sequential(*stacked)
        self.fc = nn.Linear(width, 1)

    def forward(self, x):
        """Return one sigmoid-activated score per input item."""
        features = self.resnet_blocks(self.sinc_conv(x))
        # Average over the last (time) axis before the linear head.
        pooled = features.mean(dim=-1)
        return torch.sigmoid(self.fc(pooled))

In [None]:
import tqdm
from tqdm.auto import tqdm

def train_stochastic(model, loader, criterion, optimizer, num_epoch):
    """Train ``model`` with mini-batch gradient steps and report per-epoch stats.

    After every epoch the equal error rate (via ``EER``) over all training
    predictions made during that epoch and the mean batch loss are printed.

    Args:
        model: module called as ``model(X_batch)``.
        loader: iterable of ``(X_batch, y_batch)`` pairs.
        criterion: loss called as ``criterion(outputs, y_batch)`` where
            ``y_batch`` is float and has a trailing dimension of size 1.
        optimizer: optimizer stepping the model's parameters.
        num_epoch: number of passes over ``loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    # Batches follow the model: whatever device holds its parameters is used.
    device = getattr(next(model.parameters(), None), 'device', None)
    # Make sure BatchNorm etc. are in training mode even if the model was
    # switched to eval mode earlier.
    model.train()

    for epoch in tqdm(range(num_epoch)):
        y_true = []
        y_pred = []
        epoch_loss = []

        for X_batch, y_batch in loader:
            y_batch = y_batch.float().unsqueeze(1)
            if device is not None:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

            outputs = model(X_batch)
            y_true.append(y_batch.detach().cpu().numpy())
            y_pred.append(outputs.detach().cpu().numpy())

            loss = criterion(outputs, y_batch)
            epoch_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
        train_eer = EER(y_true, y_pred)
        print(f'Epoch {epoch+1}, EER: {train_eer}')

        print(f'Epoch {epoch+1}, Loss: {np.mean(epoch_loss)}')

    return model

In [None]:
def EER(labels, outputs):
    """Estimate the equal error rate of binary scores.

    Args:
        labels: ground-truth labels; the value 1 is treated as the positive
            class.
        outputs: predicted scores, higher meaning "more likely positive".

    Returns:
        The false-positive rate at the ROC operating point where the
        false-positive and false-negative rates are closest to each other.
    """
    fpr, tpr, _ = roc_curve(labels, outputs, pos_label=1)
    fnr = 1 - tpr
    # Index of the ROC point where |FNR - FPR| is smallest (NaNs ignored).
    eer_index = np.nanargmin(np.absolute(fnr - fpr))
    return fpr[eer_index]

In [None]:
# Name of the compute device: 'cuda' when a CUDA device is available,
# otherwise 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Hyper-parameters of the first experiment.
kernel_size = 251
sample_rate = 16000
resnet_blocks = 3
learning_rate = 0.001
num_epochs = 10

model = SincNetResNet(kernel_size, sample_rate, resnet_blocks)
# SincNetResNet.forward already ends with a sigmoid, so the loss must take
# probabilities.  BCEWithLogitsLoss would apply a second sigmoid on top of
# them, squashing the scores and distorting the gradients.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the first model on the training split; the returned model is the
# cell's value.
train_stochastic(
    model=model,
    loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epoch=num_epochs,
)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1, EER: 0.29496124031007753
Epoch 1, Loss: 0.2799726954506869
Epoch 2, EER: 0.14534883720930233
Epoch 2, Loss: 0.20754224822000045
Epoch 3, EER: 0.14186046511627906
Epoch 3, Loss: 0.19375415689232361
Epoch 4, EER: 0.12829457364341085
Epoch 4, Loss: 0.1794133602674283
Epoch 5, EER: 0.12248062015503876
Epoch 5, Loss: 0.17163822717552807
Epoch 6, EER: 0.11976744186046512
Epoch 6, Loss: 0.16738982120500737
Epoch 7, EER: 0.11317829457364341
Epoch 7, Loss: 0.16208714246749878
Epoch 8, EER: 0.11356589147286822
Epoch 8, Loss: 0.16042103614639397
Epoch 9, EER: 0.11124031007751937
Epoch 9, Loss: 0.1587205512364905
Epoch 10, EER: 0.1127906976744186
Epoch 10, Loss: 0.15913340739493992


SincNetResNet(
  (sinc_conv): SincConv()
  (resnet_blocks): Sequential(
    (0): BasicBlock(
      (conv1): Conv1d(20, 20, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(20, 20, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv1d(20, 20, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(20, 20, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): BasicBlock(
      (conv1): Conv1d(20, 20, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(20, eps=1e-05, 

In [None]:
# Collect the model's scores and the ground-truth labels over the test split.
new_outputs = []  # per-batch model predictions
new_labels = []   # per-batch ground-truth labels

model.eval()
# Not used by the visible cells; kept so any later cell that reads them works.
correct = 0
total = 0
# Feed batches on whatever device holds the model's parameters.
eval_device = getattr(next(model.parameters(), None), 'device', None)
with torch.no_grad():
    for X, y in test_loader:
        if eval_device is not None:
            X = X.to(eval_device)
        outputs = model(X)

        # Predictions go to `new_outputs`, labels to `new_labels`, so that
        # EER(labels, outputs) later receives its arguments in the right order.
        new_outputs.append(outputs.detach().cpu().numpy())
        new_labels.append(y.detach().cpu().numpy())

In [None]:
# Merge the per-batch arrays into one array each and show their shapes.
labels, outputs = (
    np.concatenate(batches) for batches in (new_labels, new_outputs)
)
print(labels.shape, outputs.shape)

(71237, 1) (71237,)


In [None]:
# Equal error rate over the whole test split (value is shown as cell output).
EER(labels=labels, outputs=outputs)

0.5420320532204415

In [None]:
# Second experiment: same settings as before but with 50 sinc filters.
kernel_size, sample_rate = 251, 16000
resnet_blocks, num_epochs = 3, 10
learning_rate = 0.001

model2 = SincNetResNet(
    kernel_size=kernel_size,
    sample_rate=sample_rate,
    resnet_blocks=resnet_blocks,
    sinc_out_channels=50,
)
# The model's forward ends with a sigmoid, hence plain binary cross-entropy.
criterion = nn.BCELoss()
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

In [None]:
# Train the second model with the shared training helper instead of a
# copy-pasted loop: same batches, same optimisation steps, same per-epoch
# "EER" / "Loss" printouts.  The helper returns the model it was given;
# binding the result avoids dumping the model repr as cell output.
model2 = train_stochastic(model2, train_loader, criterion, optimizer, num_epochs)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1, EER: 0.2957364341085271
Epoch 1, Loss: 0.2748691032280275
Epoch 2, EER: 0.14573643410852713
Epoch 2, Loss: 0.20665016534490202
Epoch 3, EER: 0.1387596899224806
Epoch 3, Loss: 0.19543003818797106
Epoch 4, EER: 0.12906976744186047
Epoch 4, Loss: 0.1797170125734267


KeyboardInterrupt: 

Думала, может, получше будет, но это заняло очень много времени, а результат пока не сильно отличается — значит, я тут где-то напортачила, эхэх.

GitHub: https://github.com/mravanelli/SincNet