# ZRE Naive Autotune
Author: Marek Hric xhricma00  
GitHub repo: https://github.com/Marek324/butfit-zre/  
Google Colab notebook: TBD (to be added after completion)

Original audio source: https://www.kaggle.com/datasets/pavanelisetty/sample-audio-files-for-speech-recognition?resource=download&select=harvard.wav

## Project pipeline
1. Load downsampled audio file ([code](#load-downsampled-audio-file))
2. Center the signal ([code](#center))
3. Frame the signal ([code](#frame))
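4. Detect voiced frames ([code](#voiced-frames))
5. Estimate the fundamental frequency ([code](#fundamental-frequency))
6. Generate a melody ([code](#melody-generation))
7. Synthesis ([code](#synthesis))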

#### Plotting

In [131]:
# Global plotting switch: the `if PLOT:` blocks in the cells below draw their
# diagnostic figures only when this is set to True.
PLOT = False

### Load downsampled audio file

In [150]:
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import scipy.signal as sg
import librosa
from IPython.display import Audio, display

!pip install vad
from vad import EnergyVAD

# Load the recording; sr=16000 requests a 16 kHz sample rate, which is
# returned alongside the samples as Fs.
s, Fs = librosa.load("harvard.wav", sr=16000)

# Report the size of the loaded signal in samples and in seconds.
print(f"Signal length: {len(s)} samples")
print(f"Duration: {len(s)/Fs:.2f} s")

# Inline player for listening to the loaded signal.
display(Audio(s, rate=Fs))

Signal length: 160000 samples
Duration: 10.00 s


### Center

In [200]:
# Offline centering: subtract the mean of the whole signal from every sample.
sc = s - np.mean(s)

# Optional diagnostic: original signal, centered signal and their difference
# (the difference is the constant offset that was removed).
if PLOT:
  plt.figure(figsize=(12,6))
  plt.title('Offline centering')
  plt.plot(s, label='Original signal')
  plt.plot(sc, label='Centered signal')
  plt.plot(s-sc, label='Difference')
  plt.legend()
  plt.show()

# The rest of the notebook works on the centered signal.
s = sc

### Frame

In [197]:
def frame(s, Fs=16000, len_s=0.02, overlap_s=0.01) -> tuple[np.ndarray, int, int]:
  """Cut a signal into overlapping, Hann-windowed frames.

  Args:
    s: One-dimensional signal.
    Fs: Sampling frequency in Hz.
    len_s: Frame length in seconds.
    overlap_s: Overlap between neighbouring frames in seconds.

  Returns:
    Tuple ``(frames, fl, fo)``: an array with one windowed frame per row,
    the frame length in samples and the overlap in samples. Samples at the
    end of the signal that do not fill a whole frame are dropped.
  """
  frame_len = int(len_s * Fs)
  overlap = int(overlap_s * Fs)
  hop = frame_len - overlap

  # Number of complete frames that fit into the signal.
  n_frames = int(1 + np.floor((len(s) - frame_len) / hop))
  window = np.hanning(frame_len)

  windowed = []
  for k in range(n_frames):
    start = k * hop
    windowed.append(s[start:start + frame_len] * window)

  return np.array(windowed), frame_len, overlap

# Frame the centered signal with the defaults of frame(); fl and fo are the
# frame length and overlap in samples and are reused by the cells below.
frames, fl, fo = frame(s)


# Optional diagnostic: show a single windowed frame (index 122, i.e. the
# 123rd frame).
if PLOT:
  plt.title("Frame 123")
  plt.plot(frames[122])
  plt.show()

### Voiced frames

In [203]:
# EnergyVAD takes frame length and shift in milliseconds, so convert the
# sample counts produced by framing. round() is used instead of int() so a
# value such as 19.999... ms caused by floating-point error is not truncated.
fl_ms = round(fl / Fs * 1000)
fs_ms = round((fl - fo) / Fs * 1000)

# Pass the sample rate of the loaded signal rather than a second hard-coded
# 16000, so the detector cannot drift out of sync with Fs.
vad = EnergyVAD(sample_rate = Fs,
  frame_length = fl_ms,
  frame_shift = fs_ms,
  energy_threshold = 0.05)

# Per-frame voice activity decisions; later cells index this by frame number
# and treat 0 as "unvoiced".
vframes = vad(s)

if PLOT:
  plt.title("Voiced frames")
  plt.plot(vframes)
  plt.show()

### Fundamental frequency

In [204]:
def frame_lags(frames:np.ndarray, s:np.ndarray, fl:int, fo:int, Lmin:int, Lmax:int, voiced:"np.ndarray | None"=None) -> np.ndarray:
  """Estimate the pitch lag of every frame with the normalized cross-correlation (NCCF).

  For each voiced frame the current segment of ``s`` is correlated with
  copies of the signal delayed by ``Lmin`` .. ``Lmax - 1`` samples; the delay
  with the highest normalized correlation is the frame's lag.

  Args:
    frames: Framed signal; only its length (number of frames) is used.
    s: The signal the frames were cut from.
    fl: Frame length in samples.
    fo: Frame overlap in samples.
    Lmin: Smallest lag to consider (inclusive), in samples.
    Lmax: Largest lag to consider (exclusive), in samples.
    voiced: Per-frame voicing flags (0 = unvoiced). When omitted, the
      notebook-level ``vframes`` is used, as in the original implementation.

  Returns:
    Array with one lag per frame; unvoiced frames get lag 0.
  """
  if voiced is None:
    # Backward-compatible fallback to the global computed in the VAD cell.
    voiced = vframes

  n_frames = len(frames)
  hop = fl - fo
  # Zero-pad the start so delayed segments of the first frames stay in range.
  padded = np.pad(s, (Lmax, 0))

  lags = np.zeros(n_frames)
  for fi in range(n_frames):
    if voiced[fi] == 0:
      continue  # unvoiced frame: lag stays 0

    start = fi * hop
    current = s[start:start + fl]
    e_current = np.sum(current**2)

    # Only lags in [Lmin, Lmax) can win, so only those are evaluated.
    nccf = np.zeros(Lmax - Lmin)
    for lag in range(Lmin, Lmax):
      delayed = padded[start - lag + Lmax:start + fl - lag + Lmax]
      denom = np.sqrt(e_current * np.sum(delayed**2))
      # A silent segment would give 0/0 = NaN, and argmax would then pick the
      # NaN position as the "best" lag; treat it as zero correlation instead.
      if denom > 0:
        nccf[lag - Lmin] = np.sum(current * delayed) / denom

    lags[fi] = Lmin + np.argmax(nccf)

  return lags

# Per-frame pitch lags (search range 30..279 samples), smoothed with a
# 3-point median filter to remove isolated outliers.
Lf = frame_lags(frames, s, fl, fo, 30, 280)
Lf = sg.medfilt(Lf, 3)

# F0 values and frame indices of the first run of voiced frames.
first_seq_f0 = []
first_seq_i = []

for i, fv in enumerate(vframes):
  if fv == 0:
    if first_seq_f0:
      break  # the first voiced run has ended
    continue  # still in the leading unvoiced part
  if Lf[i] <= 0:
    # The median filter can zero the lag of a voiced frame whose neighbours
    # are unvoiced; skip it instead of dividing by zero (F0 = inf).
    continue
  first_seq_f0.append(Fs/Lf[i])
  first_seq_i.append(i)

# Base pitch of the speaker: median F0 over the first voiced run.
F0base = np.median(first_seq_f0)
print(F0base)

# Plot only when there is data; max()/min() raise on an empty list.
if PLOT and first_seq_f0:
  plt.title("Fundamental frequencies in first voiced sequence")
  plt.plot(first_seq_i, first_seq_f0)
  plt.axhline(y=max(first_seq_f0), color='b', linestyle='--', label=f'Max: {max(first_seq_f0):.2f} Hz')
  plt.axhline(y=min(first_seq_f0), color='g', linestyle='--', label=f'Min: {min(first_seq_f0):.2f} Hz')
  plt.axhline(y=F0base, color='r', linestyle='--', label=f'F0base: {F0base:.2f} Hz')
  plt.xlabel("Frame [n]")
  plt.xticks(first_seq_i)
  plt.ylabel("F0 [Hz]")
  # Without legend() the labels given to axhline above are never displayed.
  plt.legend()
  plt.show()


119.03971678282997


### Melody generation

In [206]:
def generate_melody(F0base, num_notes=8):
    """Build a note sequence that walks up a major scale from a base pitch.

    The scale spans one octave above ``F0base`` (root through octave); when
    more notes are requested than the scale has degrees, the walk starts
    over from the root.

    Args:
        F0base: Frequency of the scale root.
        num_notes: How many notes to produce.

    Returns:
        A list of ``num_notes`` frequencies.
    """
    # Semitone offsets of the major-scale degrees, root through octave.
    major_scale = (0, 2, 4, 5, 7, 9, 11, 12)
    scale_len = len(major_scale)

    # One semitone is a frequency ratio of 2 ** (1 / 12).
    return [
        F0base * (2 ** (major_scale[step % scale_len] / 12))
        for step in range(num_notes)
    ]

# Example melody of 32 notes built on the estimated base pitch; the 8-entry
# scale in generate_melody() is cycled four times.
example_melody = generate_melody(F0base, 32)
# Optional diagnostic: frequency of each melody note in order.
if PLOT:
  plt.title("Melody frequencies")
  plt.plot(example_melody)
  plt.xlabel("Sample [n]")
  plt.ylabel("Frequency [Hz]")
  plt.show()

### Synthesis

In [136]:
# Overlap-add synthesis: put every windowed frame back at the position it was
# cut from and sum the overlapping parts.
ss = np.zeros_like(s)

n = 0
for f in frames:
   ss[n:n+fl] += f
   # Advance by the frame hop (fl - fo), the step used when framing. The
   # overlap fo equals the hop only at exactly 50 % overlap.
   n += fl - fo

display(Audio(ss, rate=Fs))
sf.write('harvard_synthesized.wav', ss, Fs)