In [None]:
!pip install librosa



In [82]:
import os
import zipfile
import requests
import librosa
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [8]:
zip_url = 'https://github.com/NancyBlackkk/hindi-speech-classification/raw/ba7539879fc9419ec242a2db4b6cf291e8996f27/hindi_audio.zip'
csv_url = 'https://github.com/NancyBlackkk/hindi-speech-classification/raw/ba7539879fc9419ec242a2db4b6cf291e8996f27/hindi.csv'

data_dir = '/content/'
os.makedirs(data_dir, exist_ok=True)

def download_file(url, destination):
    response = requests.get(url)
    if response.status_code == 200:
        with open(destination, 'wb') as file:
            file.write(response.content)
        print(f"Файл загружен: {destination}")
    else:
        print(f"Ошибка при загрузке файла: {url}")

zip_file_path = os.path.join(data_dir, 'hindi.zip')
download_file(zip_url, zip_file_path)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)

print("ZIP-файл успешно распакован!")

csv_file_path = os.path.join(data_dir, 'hindi.csv')
download_file(csv_url, csv_file_path)

Файл загружен: /content/hindi.zip
ZIP-файл успешно распакован!
Файл загружен: /content/hindi.csv


In [29]:
labels_df = pd.read_csv(csv_file_path)
print("Первые несколько строк меток:")
print(labels_df.head())

Первые несколько строк меток:
                    file_id gender
0  common_voice_hi_26042241   male
1  common_voice_hi_24026282   male
2  common_voice_hi_24026319   male
3  common_voice_hi_26019764   male
4  common_voice_hi_25579063   male


In [30]:
labels_df['gender'] = labels_df['gender'].map({'male': 0, 'female': 1})

In [33]:
audio_dir = os.path.join(data_dir, 'hindi')

In [34]:
def find_max_mfcc_length(audio_dir):
    max_len = 0
    for file_name in os.listdir(audio_dir):
        if file_name.endswith(".mp3"):
            mp3_file = os.path.join(audio_dir, file_name)
            signal, sr = librosa.load(mp3_file, sr=None)
            mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
            if mfcc.shape[1] > max_len:
                max_len = mfcc.shape[1]
    return max_len

In [14]:
max_len = find_max_mfcc_length(audio_dir)
print(f"Максимальная длина MFCC: {max_len}")

Максимальная длина MFCC: 892


In [48]:
def add_noise(signal, noise_factor=0.005):
    noise = np.random.randn(len(signal))
    augmented_signal = signal + noise_factor * noise
    return augmented_signal

def pitch_shift(signal, sr, n_steps=2):
    return librosa.effects.pitch_shift(signal, sr=sr, n_steps=n_steps)

def time_stretch(signal, speed_factor=0.8):
    return librosa.effects.time_stretch(signal, rate=speed_factor)

def reverse_signal(signal):
    return np.flip(signal)

In [58]:
def extract_mfcc_with_augmentation(mp3_file, n_mfcc=13, max_len=max_len, augment=False):
    try:
        signal, sr = librosa.load(mp3_file, sr=None)

        if augment:
            if random.random() < 0.25:  # Добавляем шум с вероятностью 25%
                signal = add_noise(signal)
            if random.random() < 0.25:  # Меняем высоту тона с вероятностью 25%
                signal = pitch_shift(signal, sr)
            if random.random() < 0.25:  # Меняем скорость воспроизведения с вероятностью 25%
                signal = time_stretch(signal)
            if random.random() < 0.25:  # Реверсируем сигнал с вероятностью 25%
                signal = reverse_signal(signal)

        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

        if mfcc.shape[1] < max_len:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), mode='constant')

        return mfcc
    except Exception as e:
        print(f"Ошибка при обработке файла {mp3_file}: {e}")
        return None

In [68]:
def process_audio_data(audio_dir, labels_df, augment=False):
    mfcc_data = []
    file_ids = []

    for file_name in os.listdir(audio_dir):
        if file_name.endswith(".mp3"):
            mp3_file = os.path.join(audio_dir, file_name)
            mfcc = extract_mfcc_with_augmentation(mp3_file, augment=augment)

            if mfcc is not None:
                mfcc_data.append(mfcc)
                file_id = file_name.replace('.mp3', '')
                label = labels_df[labels_df['file_id'] == file_id]['gender'].values

                if len(label) > 0:
                    file_ids.append(file_id)
                else:
                    print(f"Метка для файла {file_id} не найдена")

    return np.array(mfcc_data), file_ids

In [109]:
mfcc_data, file_ids = process_audio_data(audio_dir, labels_df)
labels = np.array([labels_df[labels_df['file_id'] == file_id]['gender'].values[0] for file_id in file_ids])

In [110]:
mfcc_data_augmented, file_ids_augmented = process_audio_data(audio_dir, labels_df, augment=True)
labels_augmented = np.array([labels_df[labels_df['file_id'] == file_id]['gender'].values[0] for file_id in file_ids_augmented])

In [111]:
train_data_combined = np.concatenate((mfcc_data, mfcc_data_augmented), axis=0)
train_labels_combined = np.concatenate((labels, labels_augmented), axis=0)

In [112]:
train_data, test_data, train_labels, test_labels = train_test_split(train_data_combined, train_labels_combined, test_size=0.25, random_state=42)

train_data = torch.tensor(train_data, dtype=torch.float32)
test_data = torch.tensor(test_data, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)

In [113]:
# Проверим размер выборок
print(f"Размер обучающей выборки: {train_data.shape}")
print(f"Размер тестовой выборки: {test_data.shape}")

Размер обучающей выборки: torch.Size([750, 13, 892])
Размер тестовой выборки: torch.Size([250, 13, 892])


In [127]:
class SingleLayerCNN(nn.Module):
    def __init__(self, max_len):
        super(SingleLayerCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=13, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = None

        self._initialize_weights(max_len)

    def _initialize_weights(self, max_len):
        x = torch.zeros(1, 13, max_len)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        self.output_size = x.size(1)
        self.fc1 = nn.Linear(self.output_size, 2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x


In [129]:
model = SingleLayerCNN(max_len)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

num_epochs = 15
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(train_data.permute(0, 1, 2))
    loss = criterion(outputs, train_labels)
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    model.eval()
    with torch.no_grad():
        test_outputs = model(test_data.permute(0, 1, 2))
        test_loss = criterion(test_outputs, test_labels)
        test_losses.append(test_loss.item())

    _, predicted = torch.max(outputs, 1)
    train_accuracy = (predicted == train_labels).float().mean()
    train_accuracies.append(train_accuracy.item())

    _, test_predicted = torch.max(test_outputs, 1)
    test_accuracy = (test_predicted == test_labels).float().mean()
    test_accuracies.append(test_accuracy.item())

    print(f"Эпоха [{epoch + 1}/{num_epochs}], Потери: {loss.item():.3f}, Точность (train): {train_accuracy:.3f}, Точность (test): {test_accuracy:.3f}")

Эпоха [1/15], Потери: 2.685, Точность (train): 0.707, Точность (test): 0.820
Эпоха [2/15], Потери: 6.083, Точность (train): 0.839, Точность (test): 0.644
Эпоха [3/15], Потери: 6.233, Точность (train): 0.615, Точность (test): 0.780
Эпоха [4/15], Потери: 3.119, Точность (train): 0.800, Точность (test): 0.820
Эпоха [5/15], Потери: 4.151, Точность (train): 0.827, Точность (test): 0.820
Эпоха [6/15], Потери: 3.581, Точность (train): 0.840, Точность (test): 0.808
Эпоха [7/15], Потери: 2.065, Точность (train): 0.804, Точность (test): 0.612
Эпоха [8/15], Потери: 3.699, Точность (train): 0.605, Точность (test): 0.808
Эпоха [9/15], Потери: 1.799, Точность (train): 0.757, Точность (test): 0.820
Эпоха [10/15], Потери: 2.223, Точность (train): 0.825, Точность (test): 0.820
Эпоха [11/15], Потери: 2.433, Точность (train): 0.851, Точность (test): 0.820
Эпоха [12/15], Потери: 2.147, Точность (train): 0.836, Точность (test): 0.832
Эпоха [13/15], Потери: 1.501, Точность (train): 0.809, Точность (test): 0