# Импорт модулей

Датасет взят с [Hugging face](https://huggingface.co/datasets/speech_commands)

In [1]:
!pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import os
import json
from google.colab import drive

import wave
import torch
import librosa
import torchaudio
import librosa.display
import soundfile as sf
from scipy import signal
import IPython.display as ipd
from pydub import AudioSegment
import torchaudio.functional as F 
from torchaudio.utils import download_asset

import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt

import shutil
from google.colab import files

# Функции

In [3]:
def get_mean(file_path): #получаю среднее длины аудио
    sum_lenght = []
    too_big = []
    too_low = []

    for root, dirs, files in os.walk(file_path):
        for i in dirs:
            for root2, dirs2, files2 in os.walk(file_path+i):
                for j in files2:
                    path = file_path + i + '/' + j

                    with wave.open(path) as mywav:
                        duration_seconds = mywav.getnframes() / mywav.getframerate()

                    if (duration_seconds < 1.0): 
                        too_low.append((path, duration_seconds))

                    if (duration_seconds > 1.0): 
                        too_big.append(path)

                    sum_lenght.append(duration_seconds)
    
    sum_lenght = np.array(sum_lenght)
    
    return np.mean(sum_lenght), sum_lenght, too_big, too_low

In [4]:
def get_silence(data, sr, path_to_write): # получаю ролики по секунде из длинных
    for i in range(0, int(len(data) / sr) - 1):
        dur = data[i * sr:sr * (i + 1)]
        if(len(dur) < sr): continue

        sf.write(f'{path_to_write}{i}.wav', dur, sr)

In [5]:
def augented_data(file_path, sr, samples, mu, sigma, sounds, SAMPLE_RIR):

    rir_samples = int(1/3 * samples)
    noise_samples = int(1/3 * samples) + rir_samples

    rir_raw, sample_rate = librosa.load(SAMPLE_RIR, sr=sr)
    rir_raw = torch.Tensor([rir_raw])

    sounds_lenght = len(sounds)

    for root, dirs, files in os.walk(file_path):
        lenght = len(files)
        for i in range(samples):
            iter = i % lenght

            path = file_path + files[iter]
            path_write = path.split('.')[0]

            data, sr = librosa.load(path, sr=sr)
            data = torch.Tensor([data])

            if i < rir_samples:
                rir = rir_raw[:, int(sr * 1.01) : int(sr * 1.3)]
                rir = rir / torch.norm(rir, p=2)

                augmented = signal.fftconvolve(data, rir)
                sf.write(f'{path_write}_reverb_{i}.wav', augmented[0], sr)

            elif i < noise_samples:
                noise2 = np.random.normal(mu, sigma, [1,data.shape[1]])
                sf.write(f'{path_write}_noise_{i}.wav', (data + noise2)[0], sr)

            else:
                iter_sound = i % sounds_lenght

                data = AudioSegment.from_file(path, format="wav")
                noise = AudioSegment.from_file(sounds[iter_sound], format="wav") - 25

                overlay = data.overlay(noise, position=0)

                overlay.export(f'{path_write}_back_{i}.wav', format="wav")

        break

# Загрузка данных

In [None]:
f = open('/content/dataset_infos.json')
data = json.load(f)
f.close()

In [None]:
data['v0.02']['download_checksums']

{'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_train.tar.gz': {'num_bytes': 1944462432,
  'checksum': 'acfc1a9e5f020ef5d20f13bb5c1035dcc19a3cc6d5fd1fe775d99814ce840399'},
 'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_validation.tar.gz': {'num_bytes': 229117586,
  'checksum': '868bdecd3dc12276ee55d2aeca5b1f02d913d6f17875181c1bf9d465fa2f7be1'},
 'https://s3.amazonaws.com/datasets.huggingface.co/SpeechCommands/v0.02/v0.02_test.tar.gz': {'num_bytes': 112395851,
  'checksum': '45aedb39cb2c9f03e098a8d5c98350d6d8473c432ad4558fce26c6feb478a812'}}

In [6]:
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [7]:
!mkdir '/content/train'
!mkdir '/content/test'
!mkdir '/content/dev'

In [8]:
!unzip '/content/gdrive/MyDrive/Colab Notebooks/neymark/sounds to neymark.zip'

Archive:  /content/gdrive/MyDrive/Colab Notebooks/neymark/sounds to neymark.zip
   creating: sounds to neymark/
  inflating: sounds to neymark/ELG2ANV-aggressive.mp3  
  inflating: __MACOSX/sounds to neymark/._ELG2ANV-aggressive.mp3  
  inflating: sounds to neymark/VUGND8F-mellow-background.mp3  
  inflating: __MACOSX/sounds to neymark/._VUGND8F-mellow-background.mp3  
  inflating: sounds to neymark/DogsBarkingCUandDistInfuriated PEHD014302.wav  
  inflating: __MACOSX/sounds to neymark/._DogsBarkingCUandDistInfuriated PEHD014302.wav  
  inflating: sounds to neymark/CarTiresOnGravelEn PE856402.wav  
  inflating: __MACOSX/sounds to neymark/._CarTiresOnGravelEn PE856402.wav  
  inflating: sounds to neymark/mixkit-sleepy-cat-135.mp3  
  inflating: __MACOSX/sounds to neymark/._mixkit-sleepy-cat-135.mp3  
  inflating: sounds to neymark/mixkit-tech-house-vibes-130.mp3  
  inflating: __MACOSX/sounds to neymark/._mixkit-tech-house-vibes-130.mp3  


In [31]:
# !tar -xvf '/content/gdrive/MyDrive/Colab Notebooks/neymark/v0.02_train.tar' -C '/content/train'
!tar -xvf '/content/gdrive/MyDrive/Colab Notebooks/neymark/v0.02_validation.tar' -C '/content/dev'
# !tar -xvf '/content/gdrive/MyDrive/Colab Notebooks/neymark/v0.02_test.tar' -C '/content/test'

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
off/ab7b5acd_nohash_0.wav
off/e54a0f16_nohash_0.wav
off/856eb138_nohash_0.wav
off/d3831f6a_nohash_0.wav
off/7195ffa6_nohash_0.wav
off/a9ca1818_nohash_0.wav
off/525eaa62_nohash_2.wav
off/538e1856_nohash_1.wav
off/cc8b3228_nohash_2.wav
off/258f4559_nohash_2.wav
off/1aed7c6d_nohash_0.wav
off/67c7fecb_nohash_0.wav
off/48bfde8e_nohash_1.wav
off/794cdfc5_nohash_2.wav
off/7622d95b_nohash_0.wav
off/cab100c9_nohash_0.wav
off/067f61e2_nohash_0.wav
off/0ab3b47d_nohash_0.wav
off/099d52ad_nohash_2.wav
off/3cc595de_nohash_3.wav
off/d55aa56c_nohash_0.wav
off/989a2213_nohash_1.wav
off/3aa6f4e2_nohash_0.wav
off/3a929277_nohash_3.wav
off/56eb74ae_nohash_2.wav
off/19b05529_nohash_0.wav
off/c6389ab0_nohash_1.wav
off/c24d96eb_nohash_0.wav
off/ad63d93c_nohash_3.wav
off/73f20b00_nohash_4.wav
off/bdee441c_nohash_4.wav
off/32ad5b65_nohash_2.wav
off/525eaa62_nohash_4.wav
off/989a2213_nohash_0.wav
off/56eb74ae_nohash_4.wav
off/31f0

# Обработка бэков для аугментации

In [10]:
SAMPLE_NOISE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo-8000hz.wav")
SAMPLE_RIR = download_asset("tutorial-assets/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo-8000hz.wav")

!mv '/content/sounds to neymark' '/content/sounds'

100%|██████████| 78.2k/78.2k [00:00<00:00, 12.2MB/s]
100%|██████████| 31.3k/31.3k [00:00<00:00, 528kB/s]


In [11]:
sr = 22050
data, sr = librosa.load('/content/sounds/CarTiresOnGravelEn PE856402.wav', sr=sr)
data = data[:80000]
sf.write(f'/content/sounds/cars.wav', data, sr)

!rm '/content/sounds/CarTiresOnGravelEn PE856402.wav'
!mv '/content/sounds/DogsBarkingCUandDistInfuriated PEHD014302.wav' '/content/sounds/DogsBarking.wav'

In [12]:
dops_sounds = []

file_path = '/content/sounds'

for root, dirs, files in os.walk(file_path):
    dops_sounds = files
    break

dops_sounds

['ELG2ANV-aggressive.mp3',
 'mixkit-tech-house-vibes-130.mp3',
 'DogsBarking.wav',
 'cars.wav',
 'VUGND8F-mellow-background.mp3',
 'mixkit-sleepy-cat-135.mp3']

In [13]:
file_path = '/content/sounds/'

for root, dirs, files in os.walk(file_path):
    for iter, i in enumerate(files):
        path = file_path + i
        data, sr = librosa.load(path, sr=22050)

        get_silence(data, sr, '/content/sounds/' + str(iter) + '_')
    break
    
for root, dirs, files in os.walk(file_path):
    print(len(files))

1072


In [14]:
file_path = SAMPLE_NOISE

data, sr = librosa.load(SAMPLE_NOISE, sr=22050)
get_silence(data, sr, '/content/sounds/' + str(6) + '_')

file_path = '/content/sounds/'

for root, dirs, files in os.walk(file_path):
    print(len(files))
    break

1076


In [15]:
for i in dops_sounds:
    path = '/content/sounds/' + i
    !rm $path

file_path = '/content/sounds/'
sounds = []

for root, dirs, files in os.walk(file_path):
    for i in files:
        path = file_path + i
        sounds.append(path)
    print(len(files))
    break

1070


# Аугментация и починка

## Обучающая выборка

### Разбиенние длинных записей

In [16]:
mean_dur_train, dur_train, big_data_train, small_data_train = get_mean('/content/train/')

In [17]:
for i in big_data_train:
    data, sr = librosa.load(i, sr=22050)
    get_silence(data, sr, i.split('.')[0])

for i in big_data_train:
    !rm $i

In [18]:
common_len = 0
for root, dirs, files in os.walk('/content/train/'):
    common_len += len(files)

common_len

85177

### Удаление поврежденных данных

In [19]:
file_path = '/content/train/marvin/'
low_data_marvin_train = []

for root, dirs, files in os.walk(file_path):
    for i in files:
        path = file_path + i

        with wave.open(path) as mywav:
            duration_seconds = mywav.getnframes() / mywav.getframerate()

        if (duration_seconds < 1.0): 
            low_data_marvin_train.append((path, duration_seconds))

In [20]:
count = []
for i in low_data_marvin_train:
    if i[1] < 0.8:
        count.append(i[0])

In [21]:
len(count)

84

In [22]:
for i in count:
    !rm $i

In [23]:
!rm '/content/train/marvin/88e85150_nohash_0.wav'
!rm '/content/train/marvin/88e85150_nohash_1.wav'

### Аугментация

In [24]:
file_path = '/content/train/'
common = 0

for root, dirs, files in os.walk(file_path):
    for i in dirs:
        for root2, dirs2, files2 in os.walk(file_path + i):
            common += len(files2)
    # print(len(files))
    break

marvin = 0
file_path = '/content/train/marvin'
for root, dirs, files in os.walk(file_path):
    marvin = len(files)
    # print(len(files))
    break

samples = common - 2 * marvin
print('Сколько надо дозаполнить: ', samples)

mu, sigma = 0, 0.02

Сколько надо дозаполнить:  81841


In [25]:
augented_data('/content/train/marvin/', 22050, samples, mu, sigma, sounds, SAMPLE_RIR)

  rir_raw = torch.Tensor([rir_raw])


In [26]:
file_path = '/content/train/'
lenbgt = 0

for root, dirs, files in os.walk(file_path):
    for i in dirs:
        path = file_path + i

        for j, k, l in os.walk(path):
            lenbgt += len(l)
            break

    break

lenbgt

166930

In [27]:
marvin = 0
file_path = '/content/train/marvin'
for root, dirs, files in os.walk(file_path):
    marvin = len(files)
    # print(len(files))
    break

marvin

83465

In [None]:
166930 / 2

83465.0

In [28]:
shutil.make_archive("train_without_effects", "zip", "train")
!cp train_without_effects.zip "/content/gdrive/MyDrive/Colab Notebooks/neymark"
# files.download('/content/train_without_effects.zip')

In [None]:
!cp train_without_effects.zip "/content/gdrive/MyDrive/Colab Notebooks/neymark"

## Валидационная выборка
### Разбиенние длинных записей

In [32]:
mean_dur_dev, dur_dev, big_data_dev, small_data_dev = get_mean('/content/dev/')

In [33]:
data, sr = librosa.load('/content/dev/_silence_/running_tap.wav', sr=22050)
get_silence(data, sr, '/content/dev/_silence_/silence')
!rm '/content/dev/_silence_/running_tap.wav'

### Аугментация

In [34]:
file_path = '/content/dev/'
common = 0

for root, dirs, files in os.walk(file_path):
    for i in dirs:
        for root2, dirs2, files2 in os.walk(file_path + i):
            common += len(files2)
    # print(len(files))
    break

marvin = 0
file_path = '/content/dev/marvin/'
for root, dirs, files in os.walk(file_path):
    marvin = len(files)
    # print(len(files))
    break

samples = common - 2 * marvin
print('Сколько надо дозаполнить: ', samples)

mu, sigma = 0, 0.02

Сколько надо дозаполнить:  9651


In [35]:
augented_data('/content/dev/marvin/', 22050, samples, mu, sigma, sounds, SAMPLE_RIR)

In [36]:
marvin = 0
file_path = '/content/dev/marvin'
for root, dirs, files in os.walk(file_path):
    marvin = len(files)
    # print(len(files))
    break

marvin

9846

In [37]:
file_path = '/content/dev/'
lenbgt = 0

for root, dirs, files in os.walk(file_path):
    for i in dirs:
        path = file_path + i

        for j, k, l in os.walk(path):
            lenbgt += len(l)
            break

    break

lenbgt

19692

In [None]:
19692 / 2

9846.0

In [38]:
shutil.make_archive("dev_without_effects", "zip", "dev")
!cp dev_without_effects.zip "/content/gdrive/MyDrive/Colab Notebooks/neymark"
# files.download('dev_without_effects.zip')