In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# changing the working directory
%cd '/content/drive/MyDrive/Experiment1'

/content/drive/MyDrive/Experiment1


In [3]:
!ls

feature_extraction_training_and_valdiation_hw.ipynb    Training_Audio_Files    Validation_MS_PCEN
feature_extraction_training_and_valdiation_nohw.ipynb  Training_MS_PCEN
model.ipynb					       Validation_Audio_Files


### MS-PCEN for Training (NoMK)

In [4]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

audio_dir = r'/content/drive/MyDrive/Experiment1/Training_Audio_Files/NoDC'
spectrogram_dir = r'/content/drive/MyDrive/Experiment1/Training_MS_PCEN/NoDC'

Training = True # Statement: If it is true, you will generate MS-PCEN with agumentation

os.makedirs(spectrogram_dir, exist_ok=True)

# pitch shifting
shift_steps = [8, 4] # rate of the semitones

# time shifting
time_shifts = [1.0] # time in second
count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048,
                                                 hop_length=512, win_length=2048,
                                                 n_mels=128, power=1) # I think he was using 128 as default
    mel_pcen = librosa.pcen(spectrogram, sr=sr, hop_length=512,
                            gain=0.98, bias=0.2, power=0.9,
                            time_constant=0.0004, eps=1e-6)
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

    if Training==True:
        for shift_step in shift_steps:
            shifted_audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=shift_step)

            pitch_shifted_spectrogram = librosa.feature.melspectrogram(y=shifted_audio, sr=sr, n_fft=2048,
                                                         hop_length=512, win_length=2048,
                                                         n_mels=128, power=1)

            pitch_shifted_mel_pcen = librosa.pcen(pitch_shifted_spectrogram, sr=sr, hop_length=512,
                                            gain=0.98, bias=0.2, power=0.9,
                                            time_constant=0.0004, eps=1e-6)

            plt.figure(figsize=(8, 4))
            librosa.display.specshow(pitch_shifted_mel_pcen, sr=sr, x_axis='time', y_axis='mel')
            plt.tight_layout()

            shifted_save_path = os.path.join(spectrogram_dir, f'{file_name}_shifted_{shift_step}.png')
            plt.axis('off')
            plt.savefig(shifted_save_path, bbox_inches='tight', pad_inches=0)
            plt.close()
            # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
            shifted_spec_image = Image.open(shifted_save_path)
            shifted_spec_image_rgb = shifted_spec_image.convert('RGB')
            shifted_spec_image_rgb_array = np.array(shifted_spec_image_rgb)
            shifted_spec_image_rgb_array_norm = 255*((shifted_spec_image_rgb_array\
                                                 -np.min(shifted_spec_image_rgb_array))\
                                                /(np.max(shifted_spec_image_rgb_array)\
                                                  -np.min(shifted_spec_image_rgb_array)))
            shifted_spec_image_rgb_im = Image.fromarray(shifted_spec_image_rgb_array_norm.astype(np.uint8))
            shifted_spec_image_rgb_im.save(shifted_save_path)

        for time_shift in time_shifts:
            # time shifting
            time_shifted_spectrogram = np.roll(mel_pcen, round((sr * time_shift)/512.0), axis=1)


            plt.figure(figsize=(8, 4))
            librosa.display.specshow(time_shifted_spectrogram, sr=sr, x_axis='time', y_axis='mel')
            plt.tight_layout()

            time_shifted_save_path = os.path.join(spectrogram_dir, f'{file_name}_shifted_{time_shift}.png')
            plt.axis('off')
            plt.savefig(time_shifted_save_path, bbox_inches='tight', pad_inches=0)
            plt.close()
            # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
            time_shifted_spec_image = Image.open(time_shifted_save_path)
            time_shifted_spec_image_rgb = time_shifted_spec_image.convert('RGB')
            time_shifted_spec_image_rgb_array = np.array(time_shifted_spec_image_rgb)
            time_shifted_spec_image_rgb_array_norm = 255*((time_shifted_spec_image_rgb_array\
                                                      -np.min(time_shifted_spec_image_rgb_array))\
                                                     /(np.max(time_shifted_spec_image_rgb_array)\
                                                       -np.min(time_shifted_spec_image_rgb_array)))
            time_shifted_spec_image_rgb_im = Image.fromarray(time_shifted_spec_image_rgb_array_norm.astype(np.uint8))
            time_shifted_spec_image_rgb_im.save(time_shifted_save_path)

The data sample 5316B776_0.wav is processed: 1/60
The data sample 530D2511_60.wav is processed: 2/60
The data sample 52A0F175_130.wav is processed: 3/60
The data sample 52F0B0B1_180.wav is processed: 4/60
The data sample 53E0901E_100.wav is processed: 5/60
The data sample 53C4C801_0.wav is processed: 6/60
The data sample 53CA1C11_170.wav is processed: 7/60
The data sample 53B3340A_230.wav is processed: 8/60
The data sample 53D7CD25_220.wav is processed: 9/60
The data sample 5367F5E6_30.wav is processed: 10/60
The data sample 542D8CB6_100.wav is processed: 11/60
The data sample 5451CDF5_50.wav is processed: 12/60
The data sample 540F5D5C_50.wav is processed: 13/60
The data sample 5428E165_200.wav is processed: 14/60
The data sample 585FF075_200.wav is processed: 15/60
The data sample 57F0D006_10.wav is processed: 16/60
The data sample 584823E1_160.wav is processed: 17/60
The data sample 58310715_260.wav is processed: 18/60
The data sample 582B96E5_130.wav is processed: 19/60
The data sa

### MS-PCEN for Validation (NoHW)

In [5]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

audio_dir = r'/content/drive/MyDrive/Experiment1/Validation_Audio_Files/NoDC'
spectrogram_dir = r'/content/drive/MyDrive/Experiment1/Validation_MS_PCEN/NoDC'

Training = False # Statement: If it is true, you will generate MS-PCEN with agumentation

os.makedirs(spectrogram_dir, exist_ok=True)

# pitch shifting
shift_steps = [8, 4] # rate of the semitones

# time shifting
time_shifts = [1.0] # time in second
count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048,
                                                 hop_length=512, win_length=2048,
                                                 n_mels=128, power=1) # I think he was using 128 as default
    mel_pcen = librosa.pcen(spectrogram, sr=sr, hop_length=512,
                            gain=0.98, bias=0.2, power=0.9,
                            time_constant=0.0004, eps=1e-6)
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

    if Training==True:
        for shift_step in shift_steps:
            shifted_audio = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=shift_step)

            pitch_shifted_spectrogram = librosa.feature.melspectrogram(y=shifted_audio, sr=sr, n_fft=2048,
                                                         hop_length=512, win_length=2048,
                                                         n_mels=128, power=1)

            pitch_shifted_mel_pcen = librosa.pcen(pitch_shifted_spectrogram, sr=sr, hop_length=512,
                                            gain=0.98, bias=0.2, power=0.9,
                                            time_constant=0.0004, eps=1e-6)

            plt.figure(figsize=(8, 4))
            librosa.display.specshow(pitch_shifted_mel_pcen, sr=sr, x_axis='time', y_axis='mel')
            plt.tight_layout()

            shifted_save_path = os.path.join(spectrogram_dir, f'{file_name}_shifted_{shift_step}.png')
            plt.axis('off')
            plt.savefig(shifted_save_path, bbox_inches='tight', pad_inches=0)
            plt.close()
            # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
            shifted_spec_image = Image.open(shifted_save_path)
            shifted_spec_image_rgb = shifted_spec_image.convert('RGB')
            shifted_spec_image_rgb_array = np.array(shifted_spec_image_rgb)
            shifted_spec_image_rgb_array_norm = 255*((shifted_spec_image_rgb_array\
                                                 -np.min(shifted_spec_image_rgb_array))\
                                                /(np.max(shifted_spec_image_rgb_array)\
                                                  -np.min(shifted_spec_image_rgb_array)))
            shifted_spec_image_rgb_im = Image.fromarray(shifted_spec_image_rgb_array_norm.astype(np.uint8))
            shifted_spec_image_rgb_im.save(shifted_save_path)

        for time_shift in time_shifts:
            # time shifting
            time_shifted_spectrogram = np.roll(mel_pcen, round((sr * time_shift)/512.0), axis=1)


            plt.figure(figsize=(8, 4))
            librosa.display.specshow(time_shifted_spectrogram, sr=sr, x_axis='time', y_axis='mel')
            plt.tight_layout()

            time_shifted_save_path = os.path.join(spectrogram_dir, f'{file_name}_shifted_{time_shift}.png')
            plt.axis('off')
            plt.savefig(time_shifted_save_path, bbox_inches='tight', pad_inches=0)
            plt.close()
            # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
            time_shifted_spec_image = Image.open(time_shifted_save_path)
            time_shifted_spec_image_rgb = time_shifted_spec_image.convert('RGB')
            time_shifted_spec_image_rgb_array = np.array(time_shifted_spec_image_rgb)
            time_shifted_spec_image_rgb_array_norm = 255*((time_shifted_spec_image_rgb_array\
                                                      -np.min(time_shifted_spec_image_rgb_array))\
                                                     /(np.max(time_shifted_spec_image_rgb_array)\
                                                       -np.min(time_shifted_spec_image_rgb_array)))
            time_shifted_spec_image_rgb_im = Image.fromarray(time_shifted_spec_image_rgb_array_norm.astype(np.uint8))
            time_shifted_spec_image_rgb_im.save(time_shifted_save_path)

The data sample 53E9FF59_0.wav is processed: 1/21
The data sample 53CFB672_50.wav is processed: 2/21
The data sample 53B64781_70.wav is processed: 3/21
The data sample 5438CF85_280.wav is processed: 4/21
The data sample 53F48B59_280.wav is processed: 5/21
The data sample 58D36EDD_90.wav is processed: 6/21
The data sample 5932D5E9_80.wav is processed: 7/21
The data sample 589F3BFA_140.wav is processed: 8/21
The data sample 5998BF5D_40.wav is processed: 9/21
The data sample 59848F64_250.wav is processed: 10/21
The data sample 596F0A69_200.wav is processed: 11/21
The data sample 5332DAF9_240.wav is processed: 12/21
The data sample 538C4535_150.wav is processed: 13/21
The data sample 53AB04B1_50.wav is processed: 14/21
The data sample 542F3299_90.wav is processed: 15/21
The data sample 58014E49_230.wav is processed: 16/21
The data sample 57FCF051_0.wav is processed: 17/21
The data sample 57EAA1FD_130.wav is processed: 18/21
The data sample 587970F9_60.wav is processed: 19/21
The data sampl