### Pitch and Loudness Extraction from Wav

In [None]:
import torchcrepe
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import librosa
import torch
import torch.nn.functional as F
import os
import json
import random
import pickle
from scipy.signal import butter, lfilter, filtfilt, resample
from transformers import Wav2Vec2Processor, Wav2Vec2Model, HubertModel

In [None]:
# Audio is expected to be sampled at 16 kHz
sr = 16000
# Samples per analysis hop; EMA is sampled at 200 Hz, so this yields one hop per EMA frame
hop_length = sr // 200

# Pitch search range (Hz) that is reasonable for speech
fmin = 50
fmax = 550

# torchcrepe model capacity -- one of "tiny" or "full"
model = "tiny"

# Device used for inference
device = "cuda:1"

# Batch size for pitch inference; lower it if the GPU runs out of memory
batch_size = 2048

# wav2vec2 processor and speech model (moved to `device`, in eval mode).
# NOTE(review): the processor and the model are loaded from different
# checkpoints -- confirm this pairing is intended.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
speech_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")
speech_model = speech_model.eval().to(device)

def extract_pitch(signal):
    """Estimate the pitch contour of ``signal`` with torchcrepe.

    Uses the module-level settings ``sr``, ``hop_length``, ``fmin``, ``fmax``,
    ``model``, ``batch_size`` and ``device``.

    Args:
        signal: NumPy array of audio samples (presumably mono, 1-D -- a batch
            dimension of size one is prepended before prediction).

    Returns:
        Flattened 1-D NumPy array with the predicted pitch values.
    """
    # torchcrepe is fed a float tensor with a leading batch dimension.
    waveform = torch.from_numpy(signal).unsqueeze(0).float()

    # Periodicity is requested alongside the pitch but is not used here.
    f0, _periodicity = torchcrepe.predict(
        waveform,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=True,
    )
    return f0.flatten().numpy()

In [None]:
def extract_loudness(signal, frame_size=None):
    """Return the peak absolute amplitude of each consecutive frame of ``signal``.

    The signal is split into non-overlapping frames of ``frame_size`` samples
    (the last frame may be shorter) and the maximum of ``|signal|`` is taken
    within each frame.

    Args:
        signal: 1-D array-like of audio samples.
        frame_size: Number of samples per frame. Defaults to the module-level
            ``hop_length`` so that one value is produced per analysis hop.

    Returns:
        1-D NumPy array with ``ceil(len(signal) / frame_size)`` peak values;
        an empty array when ``signal`` is empty.

    Raises:
        ValueError: If ``frame_size`` is not positive.
    """
    if frame_size is None:
        frame_size = hop_length
    if frame_size <= 0:
        raise ValueError(f"frame_size must be positive, got {frame_size}")

    magnitude = np.abs(np.asarray(signal))
    if magnitude.size == 0:
        return np.array([])

    # reduceat takes the maximum over [start_k, start_{k+1}) for every frame
    # start (the final frame runs to the end of the signal). This replaces a
    # Python-level max() over every sample with one vectorized pass; a NaN
    # sample now consistently propagates to its frame's value.
    frame_starts = np.arange(0, magnitude.size, frame_size)
    return np.maximum.reduceat(magnitude, frame_starts)

In [None]:
def get_filenames_without_extension(directory):
    """Return the set of entry names in ``directory`` with their final extension removed.

    Every entry reported by ``os.listdir`` is included; entries that differ
    only in their extension collapse into a single name.
    """
    stems = set()
    for entry in os.listdir(directory):
        stem, _extension = os.path.splitext(entry)
        stems.add(stem)
    return stems

In [None]:
# Data folders to prepare.
# NOTE: `_` appears to be a placeholder -- replace both values with real
# paths before running this cell.
data_folder_name = _
output_folder_name = _

# Input and output directories
wav_dir = os.path.join(data_folder_name, "wav")
loudness_dir = os.path.join(output_folder_name, "loudness")
pitch_dir = os.path.join(output_folder_name, "pitch_tiny")
ema_dir_input = os.path.join(data_folder_name, "nema_npy")
ema_dir_output = os.path.join(output_folder_name, "nema_npy")

# Make sure the output directories exist
os.makedirs(loudness_dir, exist_ok=True)
os.makedirs(pitch_dir, exist_ok=True)
# Was `ema_dir`, a name that is never defined (NameError).
os.makedirs(ema_dir_output, exist_ok=True)


# Process the wav file of every utterance that has an EMA file.
# Sorted so that the processing order is reproducible across runs.
for file in sorted(get_filenames_without_extension(ema_dir_input)):
    # Load the audio at the configured rate `sr`. `hop_length` was derived
    # from that rate, so loading at the file's native rate (sr=None) and
    # rebinding `sr` would silently break the 200 Hz frame alignment for any
    # file that is not already sampled at `sr`.
    y, sr = librosa.load(os.path.join(wav_dir, file + '.wav'), sr=sr)

    # Extract the loudness and pitch
    l = extract_loudness(y)
    p = extract_pitch(y)

    # Save the loudness and pitch to their own directories
    # We use numpy's save function to save the arrays to .npy files
    npy_file = file + '.npy'
    np.save(os.path.join(loudness_dir, npy_file), l)
    np.save(os.path.join(pitch_dir, npy_file), p)
