# Импорт модулей

Датасет взят с [Hugging Face](https://huggingface.co/datasets/speech_commands).

In [None]:
!pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import os
import json
from google.colab import drive

import wave
import torch
import librosa
import torchaudio
import librosa.display
import soundfile as sf
from scipy import signal
import IPython.display as ipd
from pydub import AudioSegment
import torchaudio.functional as F 
from torchaudio.utils import download_asset

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

import shutil
from google.colab import files

# Функции

In [None]:
def get_mean(file_path):
    """Measure the duration of every ``.wav`` file below the sub-directories of ``file_path``.

    Each immediate sub-directory of ``file_path`` is walked recursively. Files
    located directly in ``file_path`` itself are not inspected, and files
    without a ``.wav`` extension are skipped.

    Args:
        file_path: Root directory of the data set, with or without a trailing
            path separator.

    Returns:
        A tuple ``(mean, durations, too_big, too_low)`` where

        * ``mean`` is the average duration in seconds (NaN if nothing was found),
        * ``durations`` is a NumPy array with the duration of every file,
        * ``too_big`` lists the paths of files longer than 1.0 s,
        * ``too_low`` lists ``(path, duration)`` pairs of files shorter than 1.0 s.

        A file lasting exactly 1.0 s is counted in ``durations`` only.

    Raises:
        wave.Error: If a ``.wav`` file cannot be parsed by the ``wave`` module.
    """
    durations = []
    too_big = []
    too_low = []

    # Only the first level is taken from this walk; every sub-directory is then
    # walked on its own so the real parent directory of each file is known.
    _, class_dirs, _ = next(os.walk(file_path), (file_path, [], []))

    for class_dir in class_dirs:
        for current_root, _, file_names in os.walk(os.path.join(file_path, class_dir)):
            for file_name in file_names:
                if not file_name.lower().endswith('.wav'):
                    continue

                path = os.path.join(current_root, file_name)

                with wave.open(path) as wav_file:
                    duration_seconds = wav_file.getnframes() / wav_file.getframerate()

                if duration_seconds < 1.0:
                    too_low.append((path, duration_seconds))
                elif duration_seconds > 1.0:
                    too_big.append(path)

                durations.append(duration_seconds)

    durations = np.array(durations)
    # np.mean of an empty array warns and yields NaN; return NaN quietly instead.
    mean_duration = np.mean(durations) if durations.size else np.nan

    return mean_duration, durations, too_big, too_low

In [None]:
def get_silence(data, sr, path_to_write):
    """Cut ``data`` into consecutive one-second clips and write each clip to a WAV file.

    Every complete second of the signal becomes one file named
    ``f'{path_to_write}{index}.wav'`` with ``index`` counting from 0. A trailing
    remainder shorter than one second is discarded.

    The previous loop bound (``int(len(data) / sr) - 1``) also discarded the last
    complete second, so a recording between one and two seconds long produced no
    clip at all.

    Args:
        data: Sliceable sequence of samples (e.g. a 1-D NumPy array).
        sr: Sample rate as an integer number of samples per second.
        path_to_write: Path prefix of the output files, without index and extension.
    """
    full_seconds = len(data) // sr

    for index in range(full_seconds):
        clip = data[index * sr:(index + 1) * sr]
        sf.write(f'{path_to_write}{index}.wav', clip, sr)

In [None]:
def augented_data(file_path, sr, samples, mu, sigma, sounds, SAMPLE_RIR):
    """Write ``samples`` augmented copies of the recordings stored in ``file_path``.

    The ``.wav`` files located directly in ``file_path`` are listed once, sorted,
    and cycled through. The augmentation depends on the copy index ``i``:

    * ``i`` in the first third: convolution with a slice of the room impulse
      response, written to ``<name>_reverb_<i>.wav``;
    * ``i`` in the second third: additive Gaussian noise drawn from
      ``N(mu, sigma)``, written to ``<name>_noise_<i>.wav``;
    * remaining ``i``: overlay of ``sounds[i % len(sounds)]`` lowered by 25 dB,
      written to ``<name>_back_<i>.wav``.

    Output files are created next to their source file. Nothing is done when
    ``samples`` is not positive or the directory holds no ``.wav`` file.

    Args:
        file_path: Directory containing the source recordings.
        sr: Sample rate the recordings and the impulse response are loaded with.
        samples: Number of augmented files to create.
        mu: Mean of the Gaussian noise.
        sigma: Standard deviation of the Gaussian noise.
        sounds: Paths of the background sounds used for the overlay.
        SAMPLE_RIR: Path of the room-impulse-response recording.
    """
    # The listing is taken before anything is written, so files created by this
    # call are never picked up as sources during the same call.
    _, _, file_names = next(os.walk(file_path), (file_path, [], []))
    file_names = sorted(name for name in file_names if name.lower().endswith('.wav'))
    if samples <= 0 or not file_names:
        return

    rir_samples = int(1/3 * samples)
    noise_samples = int(1/3 * samples) + rir_samples

    # The impulse response is identical for every copy: slice and normalise it
    # once, staying in NumPy (no torch.Tensor([ndarray]) conversion).
    rir_raw, rir_sr = librosa.load(SAMPLE_RIR, sr=sr)
    rir = rir_raw[int(rir_sr * 1.01):int(rir_sr * 1.3)]
    rir = rir / np.linalg.norm(rir)

    for i in range(samples):
        path = os.path.join(file_path, file_names[i % len(file_names)])
        # splitext drops only the extension, even if the path holds other dots.
        path_write = os.path.splitext(path)[0]

        if i < rir_samples:
            data, data_sr = librosa.load(path, sr=sr)
            augmented = signal.fftconvolve(data, rir)
            sf.write(f'{path_write}_reverb_{i}.wav', augmented, data_sr)

        elif i < noise_samples:
            data, data_sr = librosa.load(path, sr=sr)
            noise = np.random.normal(mu, sigma, data.shape[0])
            sf.write(f'{path_write}_noise_{i}.wav', data + noise, data_sr)

        else:
            # pydub reads the file itself, so no librosa decoding is needed here.
            speech = AudioSegment.from_file(path, format="wav")
            background = AudioSegment.from_file(sounds[i % len(sounds)], format="wav") - 25

            overlay = speech.overlay(background, position=0)
            overlay.export(f'{path_write}_back_{i}.wav', format="wav")

# Загрузка данных

In [None]:
# Read the dataset description; the context manager closes the file even if parsing fails.
with open('/content/dataset_infos.json') as f:
    data = json.load(f)

In [None]:
# Display the 'download_checksums' entry stored for dataset version 'v0.02'.
data['v0.02']['download_checksums']

{'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_train.tar.gz': {'num_bytes': 1944462432,
  'checksum': 'acfc1a9e5f020ef5d20f13bb5c1035dcc19a3cc6d5fd1fe775d99814ce840399'},
 'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_validation.tar.gz': {'num_bytes': 229117586,
  'checksum': '868bdecd3dc12276ee55d2aeca5b1f02d913d6f17875181c1bf9d465fa2f7be1'},
 'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_test.tar.gz': {'num_bytes': 112395851,
  'checksum': '45aedb39cb2c9f03e098a8d5c98350d6d8473c432ad4558fce26c6feb478a812'}}

In [None]:
# Mount Google Drive at /content/gdrive/ (Colab helper).
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [None]:
# Create one directory per data split; exist_ok keeps the cell re-runnable.
for split_dir in ('/content/train', '/content/test', '/content/dev'):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
!unzip '/content/sounds.zip'

In [None]:
!tar -xvf '/content/v0.02_train.tar' -C '/content/train'
!tar -xvf '/content/v0.02_validation.tar' -C '/content/dev'

# Обработка бэков для аугментации

In [None]:
# Babble-noise and room-impulse-response recordings fetched with torchaudio's download_asset.
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")

# NOTE: the former `!mv '/content/sounds' '/content/sounds'` was dropped - moving a directory
# onto itself always fails and never changed anything.

In [None]:
sr = 44100
data, sr = librosa.load('/content/sounds/CarTiresOnGravelEn PE856402.wav', sr=sr)
data = data[:80000]
sf.write(f'/content/sounds/cars.wav', data, sr)

!rm '/content/sounds/CarTiresOnGravelEn PE856402.wav'
!mv '/content/sounds/DogsBarkingCUandDistInfuriated PEHD014302.wav' '/content/sounds/DogsBarking.wav'

In [None]:
file_path = '/content/sounds'

# Names of the files lying directly in the sounds directory (empty list if the directory is missing).
dops_sounds = next(os.walk(file_path), (file_path, [], []))[2]

dops_sounds

In [None]:
file_path = '/content/sounds/'

# Cut every background recording into one-second clips named '<file index>_<clip index>.wav'.
# The index variable is called `idx` so the builtin `iter` is not overwritten for the rest of the session.
for root, dirs, files in os.walk(file_path):
    for idx, name in enumerate(files):
        path = file_path + name
        data, sr = librosa.load(path, sr=44100)

        get_silence(data, sr, file_path + str(idx) + '_')
    break

# Number of files per directory after splitting.
for root, dirs, files in os.walk(file_path):
    print(len(files))

In [None]:
# Cut the babble-noise asset into one-second clips as well. The file-index prefix continues after the
# recordings listed in `dops_sounds`, so the clip names cannot collide with theirs
# (the prefix used to be a hard-coded 6).
data, sr = librosa.load(SAMPLE_NOISE, sr=44100)
get_silence(data, sr, '/content/sounds/' + str(len(dops_sounds)) + '_')

file_path = '/content/sounds/'

for root, dirs, files in os.walk(file_path):
    print(len(files))
    break

In [None]:
# Remove the original (unsplit) recordings. os.remove copes with spaces in file names,
# which the unquoted shell `rm $path` did not.
for name in dops_sounds:
    path = '/content/sounds/' + name
    if os.path.isfile(path):
        os.remove(path)

file_path = '/content/sounds/'
sounds = []

# Full paths of the remaining one-second clips.
for root, dirs, files in os.walk(file_path):
    sounds = [file_path + name for name in files]
    print(len(files))
    break

# Аугментация и починка

## Обучающая выборка

### Разбиение длинных записей

In [None]:
# Scan the training set: mean duration, all durations, files longer than 1 s, files shorter than 1 s.
mean_dur_train, dur_train, big_data_train, small_data_train = get_mean('/content/train/')

In [None]:
# Cut every over-long recording into one-second clips written next to it.
# splitext removes only the extension, so a dot elsewhere in the path cannot truncate the prefix.
for path in big_data_train:
    data, sr = librosa.load(path, sr=44100)
    get_silence(data, sr, os.path.splitext(path)[0])

# Delete the originals only after all of them have been split.
for path in big_data_train:
    os.remove(path)

In [None]:
# Total number of files anywhere below the training directory.
common_len = sum(len(names) for _, _, names in os.walk('/content/train/'))

common_len

### Удаление поврежденных данных

In [None]:
file_path = '/content/train/marvin/'
low_data_marvin_train = []

# Collect (path, duration) for every 'marvin' recording shorter than one second.
# Paths are built from the directory os.walk is currently in, so files in nested folders resolve too.
for root, dirs, files in os.walk(file_path):
    for name in files:
        path = os.path.join(root, name)

        with wave.open(path) as mywav:
            duration_seconds = mywav.getnframes() / mywav.getframerate()

        if duration_seconds < 1.0:
            low_data_marvin_train.append((path, duration_seconds))

In [None]:
# Paths of the 'marvin' recordings shorter than 0.8 s.
count = [path for path, duration in low_data_marvin_train if duration < 0.8]

In [None]:
# Number of recordings selected for removal.
len(count)

In [None]:
# Delete the selected recordings in pure Python (no shell interpolation of the path).
for path in count:
    os.remove(path)

In [None]:
# Two specific 'marvin' recordings are deleted by hand. The reason is not recorded in this notebook;
# presumably they are damaged (this section is about removing corrupted data) - TODO confirm.
!rm '/content/train/marvin/88e85150_nohash_0.wav'
!rm '/content/train/marvin/88e85150_nohash_1.wav'

### Аугментация

In [None]:
file_path = '/content/train/'
common = 0

# Files below every class directory of the training set ('marvin' included).
for root, dirs, files in os.walk(file_path):
    common = sum(
        len(names)
        for class_dir in dirs
        for _, _, names in os.walk(file_path + class_dir)
    )
    break

# Files lying directly in the 'marvin' directory.
file_path = '/content/train/marvin'
marvin = len(next(os.walk(file_path), (file_path, [], []))[2])

# Copies needed so that 'marvin' holds as many files as all other classes together.
samples = common - 2 * marvin
print('Сколько надо дозаполнить: ', samples)

# Parameters of the additive Gaussian noise.
mu, sigma = 0, 0.02

Сколько надо дозаполнить:  81841


In [None]:
# Create `samples` augmented copies of the training 'marvin' recordings, loaded at 44100 Hz.
augented_data('/content/train/marvin/', 44100, samples, mu, sigma, sounds, SAMPLE_RIR)

  rir_raw = torch.Tensor([rir_raw])


In [None]:
file_path = '/content/train/'
lenbgt = 0

# Add up the files lying directly inside each class directory.
for root, dirs, files in os.walk(file_path):
    lenbgt = sum(
        len(next(os.walk(file_path + class_dir), (None, [], []))[2])
        for class_dir in dirs
    )
    break

lenbgt

166930

In [None]:
# Files lying directly in the training 'marvin' directory (0 if the directory is missing).
file_path = '/content/train/marvin'
marvin = len(next(os.walk(file_path), (file_path, [], []))[2])

marvin

83465

In [None]:
# Half of the training total computed above; it should equal the 'marvin' count.
# Uses the variable instead of a number copied from an earlier output, so it stays valid on re-run.
lenbgt / 2

83465.0

## Валидационная выборка
### Разбиение длинных записей

In [None]:
# Scan the validation set: mean duration, all durations, files longer than 1 s, files shorter than 1 s.
mean_dur_dev, dur_dev, big_data_dev, small_data_dev = get_mean('/content/dev/')

In [None]:
# Load the long 'running_tap' recording at 44100 Hz and cut it into one-second clips
# named 'silence<index>.wav' in the same directory.
data, sr = librosa.load('/content/dev/_silence_/running_tap.wav', sr=44100)
get_silence(data, sr, '/content/dev/_silence_/silence')
# Remove the original long recording once the clips are written.
!rm '/content/dev/_silence_/running_tap.wav'

### Аугментация

In [None]:
file_path = '/content/dev/'
common = 0

# Files below every class directory of the validation set ('marvin' included).
for root, dirs, files in os.walk(file_path):
    common = sum(
        len(names)
        for class_dir in dirs
        for _, _, names in os.walk(file_path + class_dir)
    )
    break

# Files lying directly in the 'marvin' directory.
file_path = '/content/dev/marvin/'
marvin = len(next(os.walk(file_path), (file_path, [], []))[2])

# Copies needed so that 'marvin' holds as many files as all other classes together.
samples = common - 2 * marvin
print('Сколько надо дозаполнить: ', samples)

# Parameters of the additive Gaussian noise.
mu, sigma = 0, 0.02

Сколько надо дозаполнить:  9651


In [None]:
# Create `samples` augmented copies of the validation 'marvin' recordings, loaded at 44100 Hz.
augented_data('/content/dev/marvin/', 44100, samples, mu, sigma, sounds, SAMPLE_RIR)

In [None]:
# Files lying directly in the validation 'marvin' directory (0 if the directory is missing).
file_path = '/content/dev/marvin'
marvin = len(next(os.walk(file_path), (file_path, [], []))[2])

marvin

9846

In [None]:
file_path = '/content/dev/'
lenbgt = 0

# Add up the files lying directly inside each class directory.
for root, dirs, files in os.walk(file_path):
    lenbgt = sum(
        len(next(os.walk(file_path + class_dir), (None, [], []))[2])
        for class_dir in dirs
    )
    break

lenbgt

19692

In [None]:
# Half of the validation total computed above; it should equal the 'marvin' count.
# Uses the variable instead of a number copied from an earlier output, so it stays valid on re-run.
lenbgt / 2

9846.0