In [85]:
import os
import torch
import torchaudio
from IPython.display import Audio
import numpy as np
from scipy.fft import fft, ifft
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import random
from tqdm import tqdm

In [662]:
# Path to the input recording. Defaults to the original local file; set the
# AUTOTUNE_WAV_PATH environment variable to point the notebook at a different
# recording (or machine) without editing this cell.
wav_path = os.environ.get('AUTOTUNE_WAV_PATH', '/Users/ginoprasad/autotune/m4a_files/guy.m4a')

In [663]:
# Decode the recording at `wav_path`; the call's two results are unpacked into
# the waveform and its sample rate.
wav, sample_rate = torchaudio.load(wav_path)
# Keep only index 0 along the leading axis
# (presumably the first audio channel -- confirm for multi-channel files).
wav = wav[0]
# Optional trim of the leading samples, currently disabled:
# wav = wav[300000:]
# Last expression of the cell: an inline audio player for the loaded signal.
Audio(wav, rate=sample_rate)

In [664]:
# wav_slice = wav[:50000]
# Audio(wav_slice, rate=sample_rate)

# Assigning Pitches to Frequencies

In [665]:
# Reference table for one octave: (note name, frequency) pairs for the twelve
# semitones of the octave numbered `prelim_notes_octave`.
prelim_notes_octave = 4
prelim_notes = list(zip(
    ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"),
    (261.63, 277.18, 293.66, 311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16, 493.88),
))
prelim_notes

[('C', 261.63),
 ('C#', 277.18),
 ('D', 293.66),
 ('D#', 311.13),
 ('E', 329.63),
 ('F', 349.23),
 ('F#', 369.99),
 ('G', 392.0),
 ('G#', 415.3),
 ('A', 440.0),
 ('A#', 466.16),
 ('B', 493.88)]

In [666]:
# Expand the single reference octave into octaves 0-7: each octave step away from
# `prelim_notes_octave` doubles or halves the frequency. Entries are
# (frequency, "<name><octave>") in ascending frequency order.
notes = [
    (pitch_hz * (2 ** (octave_number - prelim_notes_octave)), f"{pitch_name}{octave_number}")
    for octave_number in range(8)
    for pitch_name, pitch_hz in prelim_notes
]
pd.DataFrame(notes)

Unnamed: 0,0,1
0,16.351875,C0
1,17.323750,C#0
2,18.353750,D0
3,19.445625,D#0
4,20.601875,E0
...,...,...
91,3136.000000,G7
92,3322.400000,G#7
93,3520.000000,A7
94,3729.280000,A#7


In [667]:
# Display the complete list of (frequency, note name) tuples.
notes

[(16.351875, 'C0'),
 (17.32375, 'C#0'),
 (18.35375, 'D0'),
 (19.445625, 'D#0'),
 (20.601875, 'E0'),
 (21.826875, 'F0'),
 (23.124375, 'F#0'),
 (24.5, 'G0'),
 (25.95625, 'G#0'),
 (27.5, 'A0'),
 (29.135, 'A#0'),
 (30.8675, 'B0'),
 (32.70375, 'C1'),
 (34.6475, 'C#1'),
 (36.7075, 'D1'),
 (38.89125, 'D#1'),
 (41.20375, 'E1'),
 (43.65375, 'F1'),
 (46.24875, 'F#1'),
 (49.0, 'G1'),
 (51.9125, 'G#1'),
 (55.0, 'A1'),
 (58.27, 'A#1'),
 (61.735, 'B1'),
 (65.4075, 'C2'),
 (69.295, 'C#2'),
 (73.415, 'D2'),
 (77.7825, 'D#2'),
 (82.4075, 'E2'),
 (87.3075, 'F2'),
 (92.4975, 'F#2'),
 (98.0, 'G2'),
 (103.825, 'G#2'),
 (110.0, 'A2'),
 (116.54, 'A#2'),
 (123.47, 'B2'),
 (130.815, 'C3'),
 (138.59, 'C#3'),
 (146.83, 'D3'),
 (155.565, 'D#3'),
 (164.815, 'E3'),
 (174.615, 'F3'),
 (184.995, 'F#3'),
 (196.0, 'G3'),
 (207.65, 'G#3'),
 (220.0, 'A3'),
 (233.08, 'A#3'),
 (246.94, 'B3'),
 (261.63, 'C4'),
 (277.18, 'C#4'),
 (293.66, 'D4'),
 (311.13, 'D#4'),
 (329.63, 'E4'),
 (349.23, 'F4'),
 (369.99, 'F#4'),
 (392.0, '

# Step 1: distance

Ideally, for a signal with period $T$: $x_t - x_{t+T} = 0$ for all $t$

$d_t (\tau) = \sum_{j=t+1}^{t+W} (x_j - x_{j+\tau})^2$

$= r_t(0) + r_{t+\tau}(0) - 2r_t(\tau)$

Where $r_t(\tau) = \sum_{j=t+1}^{t+W} x_j x_{j+\tau}$

In [668]:
# Integration window length, in samples, used by the difference function d():
# each lag compares a W-sample window against its shifted copy.
W = 1000

In [669]:
def randints(n, k):
    """Draw ``k`` random integers, each uniform over ``0 .. n-1``.

    Values are drawn independently (repeats are possible) from the module-level
    ``random`` generator, so the result is reproducible after ``random.seed``.

    Returns:
        list[int] of length ``k``.
    """
    draws = []
    for _ in range(k):
        draws.append(random.randint(0, n - 1))
    return draws

In [708]:
def d(wav_slice, t):
    """Difference function d_t(tau) of ``wav_slice`` starting at sample ``t``.

    Uses the identity d_t(tau) = r_t(0) + r_{t+tau}(0) - 2 * r_t(tau), where the
    window length is the module-level ``W``.

    Args:
        wav_slice: 1-D tensor exposing ``.numpy()`` and slicing.
        t: index of the first sample of the reference window.

    Returns:
        NumPy array with one value per lag for which the window still fits
        inside ``wav_slice[t:]``; index 0 is lag 0.

    Raises:
        AssertionError: if 10 or fewer lags are available.
    """
    tail = wav_slice.numpy()[t:]
    # Convolving with the time-reversed reference window is a sliding dot
    # product, i.e. r_t(tau) for every admissible lag.
    reversed_window = wav_slice[t:t+W].numpy()[::-1]
    autocorrelation = scipy.signal.convolve(tail, reversed_window, mode='valid')
    # Sliding sum of squares: energy[tau] == r_{t+tau}(0), energy[0] == r_t(0).
    squared = tail * tail
    energy = scipy.signal.convolve(squared, np.ones(W), mode='valid')
    distance = (energy + energy[0]) - (2 * autocorrelation)
    assert len(distance) > 10
    return distance

In [709]:
def normalized_d(wav_slice, t):
    """Cumulative-mean-normalized difference function at offset ``t``.

    Following the YIN definition:

        d'(0)   = 1
        d'(tau) = d(tau) / ((1 / tau) * sum_{j=1..tau} d(j))     for tau >= 1

    The running sum includes d(tau) itself. The previous implementation added
    d(tau) only after dividing, so the denominator lagged one step behind and
    was (numerically) zero at tau = 1.

    Args:
        wav_slice: 1-D tensor passed through to ``d``.
        t: index of the first sample of the analysis window.

    Returns:
        (normalized, distance): two NumPy arrays of equal length indexed by lag;
        ``distance`` is the raw output of ``d(wav_slice, t)``.
    """
    distance = d(wav_slice, t)
    raw = np.asarray(distance, dtype=np.float64)[1:]
    lags = np.arange(1, len(raw) + 1, dtype=np.float64)
    running_sum = np.cumsum(raw)

    # Where the running sum is exactly zero (e.g. digital silence) the ratio is
    # undefined; keep the large sentinel so such lags never pass a threshold.
    tail = np.full(len(raw), 1000.0)
    defined = running_sum != 0
    tail[defined] = raw[defined] * lags[defined] / running_sum[defined]

    normalized = np.concatenate(([1.0], tail))
    return normalized, distance

In [710]:
def parabolic_interpolation(y):
    """Estimate the sub-sample position of the minimum of ``y``.

    Fits a least-squares parabola ``a*x**2 + b*x + c`` through the points
    ``(0, y[0]), (1, y[1]), ...`` and returns its vertex ``-b / (2a)``.

    The vertex is only trusted when it is a genuine minimum located inside the
    sampled window. In every other case the function falls back to the index of
    the smallest sample:
      * fewer than 3 points, or non-finite values (no meaningful fit);
      * ``a <= 0`` (flat or concave fit: the vertex would be a maximum);
      * vertex outside ``[0, len(y) - 1]`` (extrapolation).

    Args:
        y: 1-D sequence of values sampled at consecutive integer positions.

    Returns:
        Float vertex position, or the integer ``np.argmin(y)`` on fallback.
    """
    y = np.asarray(y, dtype=np.float64)
    if len(y) < 3 or not np.all(np.isfinite(y)):
        return np.argmin(y)

    x = np.arange(len(y), dtype=np.float64)
    # Coefficients come back highest degree first.
    a, b, _ = np.polyfit(x, y, 2)
    if a <= 0:
        return np.argmin(y)

    vertex = -b / (2 * a)
    if not (0 <= vertex <= len(y) - 1):
        return np.argmin(y)
    return vertex

In [741]:
def pitch(wav_slice, x_threshold=15, y_threshold=0.15, t=0, width=3):
    """Estimate the fundamental frequency of ``wav_slice`` at offset ``t``.

    Args:
        wav_slice: 1-D tensor passed through to ``normalized_d``.
        x_threshold: lags at or below this value are never selected.
        y_threshold: a lag qualifies when its normalized difference is below this.
        t: index of the first sample of the analysis window.
        width: number of lags on each side of the chosen lag used for refinement.

    Returns:
        ``sample_rate`` divided by the refined (fractional) lag.

    Raises:
        AssertionError: if no lag qualifies and there are not more than
            ``x_threshold`` lags to fall back on.
    """
    ls, dist = normalized_d(wav_slice, t)
    last_index = len(ls) - 1

    # First lag past x_threshold that dips under y_threshold and is not followed
    # by a smaller value, i.e. the bottom of the first sufficiently deep dip.
    coarse_lag = next(
        (
            x
            for x, val in enumerate(ls)
            if x > x_threshold and val < y_threshold and x < last_index and ls[x+1] >= val
        ),
        None,
    )
    if coarse_lag is None:
        # Nothing under the threshold: take the global minimum past x_threshold.
        assert len(ls) > x_threshold
        coarse_lag = x_threshold + np.argmin(ls[x_threshold:])

    # Refine on the raw (un-normalized) difference values around the coarse lag.
    window_start = max(coarse_lag - width, 0)
    refined_lag = window_start + parabolic_interpolation(dist[window_start:coarse_lag + width + 1])
    return sample_rate / refined_lag

In [742]:
def pitch_predict(wav_slice, iterations=30):
    """Robust pitch estimate for ``wav_slice`` from several random offsets.

    Runs ``pitch`` at ``iterations`` random start offsets and returns the median
    candidate. The result is always one of the observed estimates: for an even
    number of candidates the lower of the two middle values is used. (Averaging
    the two middle values, as ``np.median`` does, yields a number that is usually
    not in the list, which previously made the function fall back to the
    arbitrary first candidate.)

    Args:
        wav_slice: 1-D tensor of audio samples.
        iterations: number of random offsets to evaluate.

    Returns:
        The median pitch candidate, or ``None`` when the slice is too short to
        place an analysis window or no finite candidate was produced.
    """
    # Offsets are drawn from [0, num_offsets); a slice shorter than the window
    # plus margin has no valid offset at all.
    num_offsets = len(wav_slice) - W - 15
    if num_offsets <= 0:
        return None

    pitch_candidates = []
    for t in randints(num_offsets, iterations):
        pitch_ = pitch(wav_slice, t=t)
        # Skip inf/nan estimates so they cannot end up as the median.
        if pitch_ is not None and np.isfinite(pitch_):
            pitch_candidates.append(pitch_)
    if not pitch_candidates:
        return None

    pitch_candidates.sort()
    return pitch_candidates[(len(pitch_candidates) - 1) // 2]

In [743]:
amplitude = 1
def get_frequency(frequency, num_samples=None, rate=None):
    """Synthesize a sine tone at ``frequency``.

    The sample grid is built at call time. It used to be a module-level ``base``
    array computed from ``k`` when this cell ran, but ``k`` is only defined in a
    later cell, so the notebook failed on a fresh kernel run from top to bottom.

    Args:
        frequency: tone frequency, in cycles per ``rate`` samples.
        num_samples: number of samples to generate; defaults to the global ``k``.
        rate: sample rate; defaults to the global ``sample_rate``.

    Returns:
        float64 NumPy array of length ``num_samples`` starting at phase 0,
        scaled by the module-level ``amplitude``.
    """
    if num_samples is None:
        num_samples = k
    if rate is None:
        rate = sample_rate
    # Phase advance per sample, in radians.
    phase_step = (frequency * 2 * np.pi) / rate
    sample_index = np.arange(0, num_samples).astype(np.float64)
    return amplitude * np.sin(phase_step * sample_index)

In [744]:
def autotune(wav_slice, precision=100):
    """Shift ``wav_slice`` so its detected pitch lands on the nearest note.

    The pitch is estimated with ``pitch_predict``, the closest frequency in the
    global ``notes`` table is looked up, and the slice is resampled by the ratio
    detected / target. The resampled slice is meant to be played back at the
    original sample rate.

    Args:
        wav_slice: 1-D tensor of audio samples.
        precision: integer resolution of the resampling ratio; the ratio is
            expressed as ``new_freq / precision``.

    Returns:
        ``(audio, freq)``, always a 2-tuple. ``audio`` is the resampled slice and
        ``freq`` the detected pitch. When no usable pitch is found, ``audio`` is
        the unmodified input and ``freq`` is ``None``.
    """
    freq = pitch_predict(wav_slice)
    if freq is None or not np.isfinite(freq) or freq <= 0:
        print("NO CLOSEST\n---------")
        # Same tuple shape as the success path so callers can always unpack.
        return wav_slice, None

    note_freqs = np.array([note_freq for note_freq, _ in notes])
    closest = note_freqs[np.argmin(np.abs(note_freqs - freq))]

    # Round (rather than truncate) so the quantized ratio is as close as possible
    # to freq / closest; never let it collapse to 0, which is not a valid rate.
    new_freq = max(1, int(round(precision * freq / closest)))
    resampled = torchaudio.functional.resample(wav_slice, orig_freq=precision, new_freq=new_freq)
    return resampled, freq

In [745]:
# Chunk length in samples: 0.15 x the sample rate.
k = int(0.15 * sample_rate)
def get_kmers(wav):
    """Split ``wav`` into consecutive, non-overlapping chunks of exactly ``k`` samples.

    A trailing remainder shorter than ``k`` is dropped.

    Returns:
        list of slices of ``wav``, in order.
    """
    chunks = []
    total = len(wav)
    for start in range(0, total, k):
        stop = start + k
        if stop <= total:
            chunks.append(wav[start:stop])
    return chunks

In [746]:
# Render the detected pitch track: one sine segment per chunk at that chunk's
# estimated pitch. Only the frequency is needed, so pitch_predict is called
# directly instead of autotune(kmer)[1], which resampled every chunk just to
# throw the audio away.
tone_segments = []
for kmer in tqdm(get_kmers(wav)):
    freq = pitch_predict(kmer)
    if freq is None:
        # No pitch estimate for this chunk: keep timing intact with silence.
        tone_segments.append(np.zeros(len(kmer)))
    else:
        tone_segments.append(get_frequency(freq))
autotuned = np.concatenate(tone_segments)
Audio(autotuned, rate=sample_rate)

100%|█████████████████████████| 134/134 [00:13<00:00, 10.04it/s]
