In [1]:
# -*- coding: utf-8 -*-
# Sine Detector: CNN (classification) + DSP (precise f0) + Tkinter widget for file selection
# Requirements: pip install librosa tensorflow

import os, glob, sys, threading
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
# ===================== Config =====================
# --- CNN input framing ---
SR_CNN = 16_000                       # Hz: sample rate of the audio fed to the CNN
DUR_CNN = 0.5                         # s: length of the analysis window fed to the CNN
N_SAMPLES = int(SR_CNN * DUR_CNN)     # samples per CNN window

# --- Frequency band reported as "in range" ---
F_MIN = 200.0                         # Hz
F_MAX = 4000.0                        # Hz

# --- Inference defaults ---
DEFAULT_CLS_THRESH = 0.5              # minimum P(sine) to label a file as a sine
DEFAULT_F0_WINDOW = 1.0               # s: segment length used for the precise f0 estimate

In [3]:
# ===================== Synthetic generators (quick training) =====================
def gen_sine(freq, sr=SR_CNN, dur=DUR_CNN):
    """Synthesize one noisy sine tone with a random starting phase.

    Args:
        freq: Tone frequency in Hz.
        sr: Sample rate in Hz.
        dur: Duration in seconds.

    Returns:
        float32 array of ``int(sr*dur)`` samples: a 0.9-amplitude sine plus
        Gaussian noise scaled by 0.005.
    """
    n_samples = int(sr*dur)
    time_axis = np.arange(n_samples) / sr
    # Random phase so the tone does not always start at a zero crossing.
    phi = np.random.rand() * 2*np.pi
    omega = 2*np.pi*freq
    tone = 0.9 * np.sin(omega*time_axis + phi)
    noisy = tone + 0.005*np.random.randn(n_samples)
    return noisy.astype(np.float32)

def gen_other():
    """Synthesize one non-sine example: white noise, a square wave or a sawtooth.

    The waveform type is drawn uniformly at random; the periodic ones get a
    frequency drawn uniformly from [F_MIN, F_MAX].

    Returns:
        float32 array of N_SAMPLES samples with a little Gaussian noise added.
    """
    time_axis = np.arange(N_SAMPLES) / SR_CNN
    kind = np.random.choice(["noise","square","saw"])
    if kind == "noise":
        wave = 0.5*np.random.randn(N_SAMPLES)
    else:
        f = np.random.uniform(F_MIN, F_MAX)
        if kind == "square":
            # Sign of a sine gives a +/-1 square wave.
            wave = 0.7*np.sign(np.sin(2*np.pi*f*time_axis))
        else:
            # Fractional part of f*t ramps 0..1; rescale it to -1..1.
            wave = 0.5*(2*((time_axis*f) % 1) - 1)
    noisy = wave + 0.01*np.random.randn(len(wave))
    return noisy.astype(np.float32)

In [4]:
# ===================== Feature: log-mel spectrogram =====================
def to_logmel(y, sr=SR_CNN, n_mels=64):
    """Turn a waveform into a rescaled log-mel spectrogram.

    Args:
        y: 1-D audio signal.
        sr: Sample rate of ``y`` in Hz.
        n_mels: Number of mel bands.

    Returns:
        float32 array from ``librosa.feature.melspectrogram`` converted to dB
        relative to its own maximum, then mapped linearly so that -80 dB
        becomes 0.0 and 0 dB becomes 1.0.
    """
    mel_kwargs = dict(n_fft=1024, hop_length=256, n_mels=n_mels, fmin=50, fmax=sr//2)
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, **mel_kwargs)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    db_floor, db_ceil = -80.0, 0.0
    scaled = (mel_db - db_floor) / (db_ceil - db_floor)
    return scaled.astype(np.float32)

def center_crop_or_pad(y, target_len):
    """Force a 1-D signal to exactly ``target_len`` samples, keeping it centred.

    Args:
        y: 1-D signal.
        target_len: Desired number of samples.

    Returns:
        ``y`` itself when the length already matches, the central slice when it
        is too long, or a zero-padded copy (extra sample on the right when the
        padding is odd) when it is too short.
    """
    surplus = len(y) - target_len
    if surplus == 0:
        return y
    if surplus > 0:
        offset = surplus // 2
        return y[offset:offset + target_len]
    deficit = -surplus
    left = deficit // 2
    return np.pad(y, (left, deficit - left))

In [5]:
# ===================== Synthetic batch (for training) =====================
def make_batch(batch_size=64, n_mels=64):
    """Build one synthetic training batch of log-mel features and binary labels.

    Each example is a sine (label 1.0) or a non-sine (label 0.0) with equal
    probability; sine frequencies are drawn uniformly from [F_MIN, F_MAX].

    Args:
        batch_size: Number of examples.
        n_mels: Number of mel bands per spectrogram.

    Returns:
        ``(X, y_cls)``: features with a trailing channel axis, i.e.
        (B, n_mels, T, 1), and float32 labels of shape (B,).
    """
    features = []
    labels = []
    for _ in range(batch_size):
        is_sine = np.random.rand() < 0.5
        if is_sine:
            wave = gen_sine(np.random.uniform(F_MIN, F_MAX))
        else:
            wave = gen_other()
        labels.append(1.0 if is_sine else 0.0)
        features.append(to_logmel(wave, sr=SR_CNN, n_mels=n_mels))
    X = np.array(features)[..., np.newaxis]   # (B, n_mels, T, 1)
    return X, np.array(labels, dtype=np.float32)


In [6]:
# ===================== CNN model (binary classification) =====================
def build_classifier(n_mels=64):
    """Build and compile the small CNN that scores "sine vs. not sine".

    The input shape is (n_mels, None, 1), so the time axis may vary; global
    average pooling collapses it before the dense head.

    Args:
        n_mels: Height (mel bands) of the input spectrogram.

    Returns:
        A compiled Keras model with one sigmoid output named "cls", using the
        Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """
    inputs = layers.Input(shape=(n_mels, None, 1))
    feat = inputs
    # Two conv + max-pool stages, then a final conv without pooling.
    for n_filters in (16, 32):
        feat = layers.Conv2D(n_filters, 3, activation="relu", padding="same")(feat)
        feat = layers.MaxPool2D(2)(feat)
    feat = layers.Conv2D(64, 3, activation="relu", padding="same")(feat)
    feat = layers.GlobalAveragePooling2D()(feat)
    feat = layers.Dense(128, activation="relu")(feat)
    prob = layers.Dense(1, activation="sigmoid", name="cls")(feat)

    classifier = models.Model(inputs, prob)
    classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return classifier

In [7]:
def quick_train(model, steps=200, batch_size=64, n_mels=64):
    """Train ``model`` in place on freshly generated synthetic batches.

    Args:
        model: Compiled Keras model exposing ``train_on_batch``.
        steps: Number of batches to train on.
        batch_size: Examples per batch.
        n_mels: Mel bands per spectrogram (must match the model input).
    """
    for _step in range(steps):
        batch = make_batch(batch_size=batch_size, n_mels=n_mels)
        model.train_on_batch(*batch)

In [8]:
# ===================== File I/O =====================
def load_for_cnn(path, target_sr=SR_CNN, dur_sec=DUR_CNN):
    """Load an audio file as mono at ``target_sr`` and fit it to the CNN window.

    Args:
        path: Audio file to read.
        target_sr: Sample rate to resample to, in Hz.
        dur_sec: Window duration in seconds.

    Returns:
        1-D signal of ``int(target_sr*dur_sec)`` samples, centre-cropped or
        zero-padded as needed.
    """
    audio, _ = librosa.load(path, sr=target_sr, mono=True)
    n_target = int(target_sr*dur_sec)
    return center_crop_or_pad(audio, n_target)

def load_native(path):
    """Load an audio file as mono at its own sample rate (no resampling).

    Args:
        path: Audio file to read.

    Returns:
        ``(samples, sr)``: the float32 signal and its sample rate as an int.
    """
    # sr=None keeps the file's native rate, which the precise f0 estimate relies on.
    samples, native_sr = librosa.load(path, sr=None, mono=True)
    samples = samples.astype(np.float32)
    return samples, int(native_sr)

In [10]:
# ===================== DSP: precise f0 estimation =====================
def estimate_f0_acf(y, sr, fmin=50.0, fmax=8000.0, peak_ratio=0.7):
    """Estimate f0 from the first strong peak of the autocorrelation function.

    Taking the global maximum of the autocorrelation over the lag range is
    unreliable: when the true period is not a whole number of samples, a
    multiple of the period that happens to fall on an integer lag scores
    higher (subharmonic error), and with a small minimum lag the zero-lag lobe
    can win. This version skips the zero-lag lobe and takes the FIRST local
    maximum that reaches ``peak_ratio`` of the strongest value in the range.

    Args:
        y: 1-D signal.
        sr: Sample rate in Hz.
        fmin: Lowest frequency searched (sets the largest lag).
        fmax: Highest frequency searched (sets the smallest lag).
        peak_ratio: Fraction of the strongest in-range autocorrelation value
            that a local maximum must reach to be accepted.

    Returns:
        The estimated frequency in Hz as a float, or None when the signal is
        empty, shorter than one ``fmin`` period, silent/constant, or has no
        usable peak in the lag range.
    """
    y = np.asarray(y, dtype=np.float64)
    n = len(y)
    if n == 0 or n < int(sr / fmin):
        return None

    # Remove DC, then peak-normalise; a silent or constant signal has no pitch.
    y = y - np.mean(y)
    peak = float(np.max(np.abs(y)))
    if not np.isfinite(peak) or peak == 0.0:
        return None
    y = y / peak

    # Linear autocorrelation via FFT; zero-padding to >= 2n avoids circular wrap-around.
    n_fft = 1 << int(np.ceil(np.log2(2 * n)))
    spectrum = np.fft.rfft(y, n=n_fft)
    corr = np.fft.irfft(spectrum.real ** 2 + spectrum.imag ** 2, n=n_fft)[:n]

    min_lag = max(1, int(sr / fmax))
    max_lag = min(n - 1, int(sr / fmin))
    if max_lag <= min_lag + 2:
        return None

    # Skip the zero-lag lobe: start searching at the first non-positive lag.
    nonpositive = np.flatnonzero(corr[:max_lag] <= 0.0)
    if nonpositive.size == 0:
        return None
    start = max(min_lag, int(nonpositive[0]))

    lags = np.arange(start, max_lag)
    vals = corr[lags]
    strongest = float(vals.max())
    if strongest <= 0.0:
        return None

    # First local maximum that is strong enough; later ones are period multiples.
    is_peak = (
        (vals >= peak_ratio * strongest)
        & (vals >= corr[lags - 1])
        & (vals > corr[lags + 1])
    )
    hits = np.flatnonzero(is_peak)
    if hits.size == 0:
        return None
    k = int(lags[hits[0]])

    # Parabolic interpolation around the peak for a sub-sample lag estimate.
    a, b, c = corr[k - 1], corr[k], corr[k + 1]
    denom = a - 2.0 * b + c
    lag = float(k)
    if denom != 0.0:
        lag += 0.5 * (a - c) / denom
    if not np.isfinite(lag) or lag <= 0.0:
        return None
    return float(sr / lag)

def estimate_f0_fft(y, sr):
    """Estimate the dominant frequency from the peak of a zero-padded FFT.

    The mean is removed before windowing: with zero-padding, a DC offset leaks
    into the bins next to bin 0, so merely skipping bin 0 would let that
    leakage beat the real tone.

    Args:
        y: 1-D signal.
        sr: Sample rate in Hz.

    Returns:
        The peak frequency in Hz as a float, refined by parabolic interpolation
        on the log-magnitude spectrum, or None when the signal has fewer than
        1024 samples, is silent, peaks at the last bin, or yields a
        non-finite / non-positive result.
    """
    y = np.asarray(y, dtype=np.float64)
    N = len(y)
    if N < 1024:
        return None

    y = y - np.mean(y)                          # remove DC before it can leak
    win = np.hanning(N)
    n_fft = 1 << int(np.ceil(np.log2(4*N)))     # zero padding
    mag = np.abs(np.fft.rfft(win * y, n=n_fft))

    k = int(np.argmax(mag[1:])) + 1             # ignore DC bin
    if k >= len(mag) - 1 or mag[k] <= 0.0:
        return None

    # Floor the magnitudes so log() never sees an exact zero.
    tiny = np.finfo(np.float64).tiny
    alpha, beta, gamma = np.log(np.maximum(mag[k-1:k+2], tiny))
    denom = alpha - 2*beta + gamma
    if denom == 0:
        k_refined = float(k)
    else:
        k_refined = k + float(0.5 * (alpha - gamma) / denom)

    f0 = (k_refined * sr) / n_fft
    return float(f0) if (np.isfinite(f0) and f0 > 0) else None

def estimate_f0_precise_segment(y, sr, window_sec=DEFAULT_F0_WINDOW):
    """Estimate f0 on a centred segment, using autocorrelation first, then FFT.

    Args:
        y: 1-D signal at its native sample rate.
        sr: Sample rate in Hz.
        window_sec: Length in seconds of the centred segment to analyse
            (the signal is cropped or zero-padded to this length).

    Returns:
        The autocorrelation estimate when it is a non-zero finite number,
        otherwise whatever ``estimate_f0_fft`` returns (possibly None).
    """
    segment = center_crop_or_pad(y, int(sr * window_sec))
    estimate = estimate_f0_acf(segment, sr, fmin=50.0, fmax=sr/2.0)
    if estimate and np.isfinite(estimate):
        return estimate
    # Autocorrelation gave nothing usable: fall back to the spectral peak.
    return estimate_f0_fft(segment, sr)



In [11]:
# ===================== Combined inference =====================
def infer_file(path, model, cls_thresh=DEFAULT_CLS_THRESH, n_mels=64, f0_window_sec=DEFAULT_F0_WINDOW):
    """Classify one audio file as sine / not sine and, if sine, measure its f0.

    Args:
        path: Audio file to analyse.
        model: Classifier exposing ``predict`` on (1, n_mels, T, 1) input.
        cls_thresh: Minimum probability for the file to count as a sine.
        n_mels: Mel bands used for the CNN input.
        f0_window_sec: Segment length in seconds for the f0 estimate.

    Returns:
        dict with keys "file", "prob_sine", "is_sine", "freq_hz" (None when not
        a sine or when no estimate was found), "in_range", "range_min" and
        "range_max".
    """
    # Stage 1: CNN on the resampled clip -> probability of "sine".
    clip = load_for_cnn(path, target_sr=SR_CNN, dur_sec=DUR_CNN)
    logmel = to_logmel(clip, sr=SR_CNN, n_mels=n_mels)
    batch = logmel[np.newaxis, ..., np.newaxis]   # (1, n_mels, T, 1)
    prob_sine = float(model.predict(batch, verbose=0)[0,0])
    is_sine = prob_sine >= cls_thresh

    # Stage 2: only sine-like files get the f0 measurement, on the native-rate audio.
    if is_sine:
        audio, native_sr = load_native(path)
        f0 = estimate_f0_precise_segment(audio, native_sr, window_sec=f0_window_sec)
    else:
        f0 = None

    result = {
        "file": path,
        "prob_sine": prob_sine,
        "is_sine": is_sine,
        "freq_hz": f0,
        "in_range": f0 is not None and F_MIN <= f0 <= F_MAX,
        "range_min": F_MIN,
        "range_max": F_MAX,
    }
    return result

In [12]:
# ===================== Tkinter Widget =====================
def launch_inference_widget(model, default_thresh=DEFAULT_CLS_THRESH, default_f0_win=DEFAULT_F0_WINDOW):
    """Open a Tk window for picking WAV files and running inference on them.

    Inference runs in a background thread so the window stays responsive.
    Tkinter objects are only touched from the main thread: the settings are
    read before the worker starts, and the worker reports through a queue that
    the main loop drains periodically.

    The call blocks until the window is closed.

    Args:
        model: Trained classifier passed through to ``infer_file``.
        default_thresh: Initial value of the classification-threshold control.
        default_f0_win: Initial value (seconds) of the f0-window control.
    """
    import queue
    import tkinter as tk
    from tkinter import filedialog, ttk
    from tkinter.scrolledtext import ScrolledText

    LOG_POLL_MS = 50  # how often the main loop drains pending log lines

    root = tk.Tk()
    root.title("Sine Detector — Inference")
    root.geometry("820x520")

    controls = ttk.Frame(root, padding=10)
    controls.pack(fill="x")

    # Classification threshold
    ttk.Label(controls, text="Threshold (P sine):").grid(row=0, column=0, sticky="w")
    thresh_var = tk.DoubleVar(value=default_thresh)
    ttk.Scale(controls, from_=0.0, to=1.0, orient="horizontal", variable=thresh_var, length=240)\
        .grid(row=0, column=1, padx=8)
    ttk.Entry(controls, textvariable=thresh_var, width=6).grid(row=0, column=2)

    # f0 window (s)
    ttk.Label(controls, text="f0 window (s):").grid(row=0, column=3, sticky="w", padx=(16,0))
    f0win_var = tk.DoubleVar(value=default_f0_win)
    ttk.Scale(controls, from_=0.25, to=2.0, orient="horizontal", variable=f0win_var, length=200)\
        .grid(row=0, column=4, padx=8)
    ttk.Entry(controls, textvariable=f0win_var, width=6).grid(row=0, column=5)

    # Lines produced by worker threads wait here until the main thread shows them.
    log_queue = queue.Queue()

    # Buttons
    def choose_files():
        paths = filedialog.askopenfilenames(
            title="Select WAV files",
            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")]
        )
        if not paths:
            return
        run_inference(paths)

    def clear_output():
        out_text.configure(state="normal")
        out_text.delete("1.0", tk.END)
        out_text.configure(state="disabled")

    ttk.Button(controls, text="Select WAV files…", command=choose_files)\
        .grid(row=0, column=6, padx=12)
    ttk.Button(controls, text="Clear output", command=clear_output)\
        .grid(row=0, column=7)

    out_frame = ttk.Frame(root, padding=(10, 0, 10, 10))
    out_frame.pack(fill="both", expand=True)
    out_text = ScrolledText(out_frame, wrap="word", height=22)
    out_text.pack(fill="both", expand=True)
    out_text.configure(state="disabled")

    def log(line: str):
        # Safe from any thread: only enqueues, never touches a widget.
        log_queue.put(line)

    def flush_log():
        # Main thread only: move every queued line into the text widget.
        lines = []
        try:
            while True:
                lines.append(log_queue.get_nowait())
        except queue.Empty:
            pass
        if lines:
            out_text.configure(state="normal")
            out_text.insert(tk.END, "\n".join(lines) + "\n")
            out_text.see(tk.END)
            out_text.configure(state="disabled")
        root.after(LOG_POLL_MS, flush_log)

    def run_inference(paths):
        # Read the Tk variables here, on the main thread, not inside the worker.
        # An unparsable entry (e.g. empty field) is reported instead of killing the thread.
        try:
            thr = float(thresh_var.get())
            f0w = float(f0win_var.get())
        except (tk.TclError, ValueError) as e:
            log(f"[ERROR] Invalid threshold or f0 window value: {e}")
            return

        def worker():
            log(f"[INFO] Running inference on {len(paths)} file(s) | threshold={thr:.2f} | f0 window={f0w:.2f}s")
            for p in paths:
                try:
                    res = infer_file(p, model, cls_thresh=thr, n_mels=64, f0_window_sec=f0w)
                    fname = os.path.basename(p)
                    if res["is_sine"] and res["freq_hz"] is not None:
                        tag = "IN RANGE" if res["in_range"] else "OUT OF RANGE"
                        log(f"{fname:35s} → SINE (p={res['prob_sine']:.2f}), "
                            f"f0≈{res['freq_hz']:.2f} Hz  [{tag} {res['range_min']:.0f}-{res['range_max']:.0f}]")
                    elif res["is_sine"]:
                        log(f"{fname:35s} → SINE (p={res['prob_sine']:.2f}), f0: N/A")
                    else:
                        log(f"{fname:35s} → NOT sine (p={res['prob_sine']:.2f})")
                except Exception as e:
                    # Best effort per file: report the failure and continue with the next one.
                    log(f"{os.path.basename(p):35s} → ERROR: {e}")
            log("[DONE]")
        threading.Thread(target=worker, daemon=True).start()

    flush_log()   # starts the periodic drain; it reschedules itself via root.after
    root.mainloop()

In [13]:
# Seed NumPy (synthetic data generation) and TensorFlow (weight initialisation)
# so a Restart & Run All repeats the same quick training run, as far as the
# underlying ops are deterministic.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model = build_classifier(n_mels=64)
quick_train(model, steps=200, batch_size=64, n_mels=64)

In [16]:
# Open the inference window. launch_inference_widget ends in root.mainloop(),
# so this cell keeps running until the window is closed.
launch_inference_widget(model, default_thresh=DEFAULT_CLS_THRESH, default_f0_win=DEFAULT_F0_WINDOW)

This is a **hybrid prototype of sound design and AI**.
I **transform the audio** into a **spectrogram (an image of the sound)** and use a **CNN for** just one thing: **to understand whether the signal is a sine wave or not**. If it is, I switch to a classic **DSP algorithm (autocorrelation/FFT)** that **measures the exact frequency in Hertz**.
In practice: the **CNN is the "eyes" that recognize the type of signal**, and the **DSP is the "meter"** that measures precisely. It all runs in a small widget: I load a WAV file and immediately get a sine-wave yes/no answer and the f0 in Hz.