In [3]:
!pip -q install librosa soundfile tqdm

In [4]:
import os, json, math, random, shutil, time
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
from tqdm import tqdm


In [5]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
from pathlib import Path

DRIVE_ROOT = Path("/content/drive/MyDrive")
DRIVE_SNEEZE = DRIVE_ROOT / "sneeze_models"          # 드라이브에 있는 sneeze_model 폴더
ZIP_PATH = DRIVE_ROOT / "raw_data.zip"      # 효민님이 새로 만든 zip 파일명으로 맞추기

print("DRIVE_SNEEZE exists:", DRIVE_SNEEZE.exists(), DRIVE_SNEEZE)
print("ZIP exists:", ZIP_PATH.exists(), ZIP_PATH)

# 필수 체크
assert DRIVE_SNEEZE.exists(), "드라이브의 sneeze_model 폴더 경로가 틀렸습니다. DRIVE_SNEEZE를 수정하십시오."
assert ZIP_PATH.exists(), "zip 경로가 틀렸습니다. ZIP_PATH를 수정하십시오."


DRIVE_SNEEZE exists: True /content/drive/MyDrive/sneeze_models
ZIP exists: True /content/drive/MyDrive/raw_data.zip


In [7]:
import shutil, os

WORK = Path("/content/work")

# zip 풀기
!unzip -q "{ZIP_PATH}" -d "{WORK}"

print("WORK exists:", WORK.exists())
print("WORK contents:")
for p in sorted(WORK.iterdir()):
    print(" -", p, "(dir)" if p.is_dir() else "(file)")


WORK exists: True
WORK contents:
 - /content/work/MS-SNSD-NOISE (dir)
 - /content/work/esc-50 (dir)
 - /content/work/recordings (dir)
 - /content/work/sneeze (dir)


In [8]:
WORK = Path("/content/work")

ESC50_DIR   = WORK / "esc-50"
MS_SNSD_DIR = WORK / "MS-SNSD-NOISE"
SNEEZE_DIR  = WORK / "sneeze"
REC_DIR     = WORK / "recordings"

assert ESC50_DIR.exists(), ESC50_DIR
assert MS_SNSD_DIR.exists(), MS_SNSD_DIR
assert SNEEZE_DIR.exists(), SNEEZE_DIR
assert REC_DIR.exists(), REC_DIR

print("OK:", ESC50_DIR)
print("OK:", MS_SNSD_DIR)
print("OK:", SNEEZE_DIR)
print("OK:", REC_DIR)


OK: /content/work/esc-50
OK: /content/work/MS-SNSD-NOISE
OK: /content/work/sneeze
OK: /content/work/recordings


In [9]:
!ls /content/work/

esc-50	MS-SNSD-NOISE  recordings  sneeze


In [10]:
CONFIG_PATH = WORK / "config_v4.json"

config_text = r'''
{
  "version": "v4",
  "seed": 1337,

  "audio": {
    "sr": 16000,
    "clip_seconds": 2.0
  },

  "features": {
    "type": "logmel",
    "n_mels": 64,
    "n_fft": 400,
    "hop_length": 160,
    "center": false,
    "log_eps": 1e-6
  },

  "normalization": {
    "mode": "dataset_stats",
    "rms_target_range": [0.03, 0.15],
    "rms_apply_prob": 1.0
  },

  "dataset_sizes": {
    "pos_total": 12000,
    "neg_total": 30000,
    "pos_mix_ratio": 0.70
  },

  "negative_plan": {
    "event_ratio": 0.60,
    "background_ratio": 0.40,

    "event_sources": {
      "esc50_excluding_sneeze": 0.40,
      "yaho": 0.35,
      "noise1_noise2": 0.25
    },

    "background_sources": {
      "ms_snsd": 0.50,
      "talk": 0.30,
      "dish": 0.20
    }
  },

  "positive_plan": {
    "original_ratio": 0.30,
    "synthetic_ratio": 0.70,

    "background_pool": {
      "ms_snsd": 0.40,
      "talk": 0.40,
      "dish": 0.20
    },

    "snr_db_range_bg": [0.0, 20.0],

    "optional_event_on_pos": {
      "apply_prob": 0.15,
      "event_pool": {
        "yaho": 0.50,
        "esc50_excluding_sneeze": 0.30,
        "noise1_noise2": 0.20
      },
      "snr_db_range_event": [15.0, 30.0]
    }
  },

  "augment": {
    "gain_db_range": [-6.0, 6.0],
    "time_shift_ms": 200,

    "reverb": {
      "apply_prob": 0.10,
      "ir_seconds_range": [0.05, 0.25],
      "mix_range": [0.02, 0.10]
    },

    "time_stretch": {
      "apply_prob": 0.10,
      "rate_range": [0.95, 1.05]
    },

    "pitch_shift": {
      "apply_prob": 0.05,
      "semitones_range": [-0.25, 0.25]
    },

    "specaugment": {
      "apply_prob": 0.20,
      "time_masks": 1,
      "time_mask_max": 12,
      "freq_masks": 1,
      "freq_mask_max": 6
    }
  },

  "splits": {
    "train": 0.70,
    "val": 0.15,
    "test": 0.15
  },

  "training": {
    "batch_size": 64,
    "epochs": 25,
    "lr": 0.001,
    "early_stopping_patience": 5,
    "reduce_lr_patience": 2,
    "reduce_lr_factor": 0.5
  },

  "thresholding": {
    "method": "precision_target",
    "target_precision": 0.99,
    "fallback_threshold": 0.90
  },

  "export": {
    "tflite_dynamic": true
  }
}
'''
CONFIG_PATH.write_text(config_text, encoding="utf-8")

cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
print("config loaded:", cfg["version"], "at", CONFIG_PATH)


config loaded: v4 at /content/work/config_v4.json


In [11]:
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"


In [12]:
SEED = int(cfg["seed"])
random.seed(SEED)
np.random.seed(SEED)
print("seed:", SEED)


seed: 1337


In [13]:
SR = int(cfg["audio"]["sr"])
CLIP_SEC = float(cfg["audio"]["clip_seconds"])
CLIP_SAMPLES = int(SR * CLIP_SEC)

# recordings
TALK_WAV  = REC_DIR / "talk.wav"
DISH_WAV  = REC_DIR / "dish.wav"
YAHO_WAV  = REC_DIR / "yaho.wav"
NOISE1_WAV = REC_DIR / "noise1.wav"
NOISE2_WAV = REC_DIR / "noise2.wav"

for p in [TALK_WAV, DISH_WAV, YAHO_WAV, NOISE1_WAV, NOISE2_WAV]:
    assert p.exists(), f"missing: {p}"

# sneeze clips
sneeze_files = sorted([p for p in SNEEZE_DIR.rglob("*.wav")])
assert len(sneeze_files) > 0
print("sneeze clips:", len(sneeze_files))

# ESC-50: meta로 sneeze 클래스 제외
esc_meta = ESC50_DIR / "meta" / "esc50.csv"
esc_audio_dir = ESC50_DIR / "audio"
assert esc_meta.exists(), esc_meta
assert esc_audio_dir.exists(), esc_audio_dir

import csv
esc_rows = []
with open(esc_meta, "r", encoding="utf-8") as f:
    r = csv.DictReader(f)
    for row in r:
        esc_rows.append(row)

# ESC-50 라벨 문자열에 'sneeze'가 있으면 제외
esc_event_files = []
for row in esc_rows:
    label = row["category"].strip().lower()
    fname = row["filename"].strip()
    if "sneeze" in label:
        continue
    wav = esc_audio_dir / fname
    if wav.exists():
        esc_event_files.append(wav)

print("esc-50 event files(excl sneeze):", len(esc_event_files))

# MS-SNSD: wav 전부 수집
ms_snsd_files = sorted([p for p in MS_SNSD_DIR.rglob("*.wav")])
assert len(ms_snsd_files) > 0
print("ms-snsd wav files:", len(ms_snsd_files))


sneeze clips: 968
esc-50 event files(excl sneeze): 2000
ms-snsd wav files: 128


In [14]:
def rms(x):
    x = np.asarray(x, np.float32)
    return float(np.sqrt(np.mean(x*x) + 1e-8))

def fix_2s(y):
    y = np.asarray(y, np.float32)
    if len(y) >= CLIP_SAMPLES:
        return y[:CLIP_SAMPLES]
    return np.pad(y, (0, CLIP_SAMPLES - len(y))).astype(np.float32)

def rand_crop_2s(y):
    y = np.asarray(y, np.float32)
    if len(y) <= CLIP_SAMPLES:
        return fix_2s(y)
    start = np.random.randint(0, len(y) - CLIP_SAMPLES + 1)
    return y[start:start+CLIP_SAMPLES].astype(np.float32)

def load_mono(path, sr=SR):
    y, _ = librosa.load(str(path), sr=sr, mono=True)
    return y.astype(np.float32)

def apply_gain_db(y, db):
    g = 10 ** (db / 20.0)
    return np.clip(y * g, -1.0, 1.0).astype(np.float32)

def rms_randomize(y, lo, hi):
    target = float(np.random.uniform(lo, hi))
    r = rms(y)
    if r > 1e-6:
        y = y * (target / (r + 1e-8))
    return np.clip(y, -1.0, 1.0).astype(np.float32)

def mix_at_snr(signal, background, snr_db):
    s = np.asarray(signal, np.float32)
    b = np.asarray(background, np.float32)

    s = fix_2s(s)
    b = fix_2s(b)

    rs = rms(s)
    rb = rms(b)
    if rb < 1e-6:
        return s

    # 원하는 SNR: 20*log10(rs / (rb*alpha)) = snr_db  -> alpha = rs / (rb * 10^(snr/20))
    alpha = (rs + 1e-8) / ((rb + 1e-8) * (10 ** (snr_db / 20.0)))
    y = s + b * alpha
    return np.clip(y, -1.0, 1.0).astype(np.float32)


In [15]:
def time_shift(y, max_ms=200):
    max_samp = int(SR * (max_ms / 1000.0))
    if max_samp <= 0:
        return y
    k = np.random.randint(-max_samp, max_samp + 1)
    return np.roll(y, k).astype(np.float32)

def maybe_time_stretch(y, prob, r_lo, r_hi):
    if np.random.rand() > prob:
        return y
    rate = float(np.random.uniform(r_lo, r_hi))
    # librosa time_stretch는 길이가 변함 -> 다시 2초로 맞춤
    ys = librosa.effects.time_stretch(y, rate=rate).astype(np.float32)
    return fix_2s(ys)

def maybe_pitch_shift(y, prob, s_lo, s_hi):
    if np.random.rand() > prob:
        return y
    steps = float(np.random.uniform(s_lo, s_hi))
    yp = librosa.effects.pitch_shift(y, sr=SR, n_steps=steps).astype(np.float32)
    return fix_2s(yp)


In [16]:
N_MELS = int(cfg["features"]["n_mels"])
N_FFT  = int(cfg["features"]["n_fft"])
HOP    = int(cfg["features"]["hop_length"])
CENTER = bool(cfg["features"]["center"])
LOG_EPS = float(cfg["features"]["log_eps"])

def logmel(y):
    S = librosa.feature.melspectrogram(
        y=y, sr=SR, n_fft=N_FFT, hop_length=HOP,
        n_mels=N_MELS, power=2.0, center=CENTER
    )
    return np.log(S + LOG_EPS).T.astype(np.float32)  # (frames, mels)

def specaugment(f, time_masks=1, tmax=12, freq_masks=1, fmax=6):
    g = f.copy()
    T, F = g.shape
    for _ in range(time_masks):
        w = np.random.randint(0, tmax+1)
        if w == 0: 
            continue
        t0 = np.random.randint(0, max(1, T - w))
        g[t0:t0+w, :] = 0.0
    for _ in range(freq_masks):
        w = np.random.randint(0, fmax+1)
        if w == 0:
            continue
        f0 = np.random.randint(0, max(1, F - w))
        g[:, f0:f0+w] = 0.0
    return g


In [17]:
# 긴 recordings에서 랜덤 2초 크롭
talk_audio = load_mono(TALK_WAV)
dish_audio = load_mono(DISH_WAV)
yaho_audio = load_mono(YAHO_WAV)
noise1_audio = load_mono(NOISE1_WAV)
noise2_audio = load_mono(NOISE2_WAV)

def sample_from_long(y_long):
    return rand_crop_2s(y_long)

def sample_esc50_2s(path):
    y = load_mono(path)
    return rand_crop_2s(y)

def sample_ms_snsd_2s(path):
    y = load_mono(path)
    return rand_crop_2s(y)


In [18]:
# v4 OUT_DIR 준비
OUT_DIR = WORK / "out" / "v4"
OUT_DIR.mkdir(parents=True, exist_ok=True)

pos_total = int(cfg["dataset_sizes"]["pos_total"])
neg_total = int(cfg["dataset_sizes"]["neg_total"])
N_TOTAL = pos_total + neg_total

# frames를 확정(2초 고정 + center 고정이므로 frames 고정)
tmp = logmel(np.zeros(CLIP_SAMPLES, np.float32))
FRAMES = int(tmp.shape[0])
assert tmp.shape[1] == N_MELS
print("FRAMES:", FRAMES, "N_MELS:", N_MELS, "N_TOTAL:", N_TOTAL)

# memmap 경로
X_PATH = OUT_DIR / "v4_features_f32.dat"
Y_PATH = OUT_DIR / "v4_labels_i8.dat"
META_PATH = OUT_DIR / "v4_meta.json"

# memmap 할당
X_mm = np.memmap(str(X_PATH), dtype="float32", mode="w+", shape=(N_TOTAL, FRAMES, N_MELS))
y_mm = np.memmap(str(Y_PATH), dtype="int8", mode="w+", shape=(N_TOTAL,))

print("memmap X:", X_PATH)
print("memmap y:", Y_PATH)

# 통계 누적(멜 bin별, 모든 프레임/샘플 전체에 대해)
sum_m = np.zeros((N_MELS,), np.float64)
sumsq_m = np.zeros((N_MELS,), np.float64)
count_tf = 0  # total frames count (N * FRAMES)

# 구성 수치
pos_original_n = int(pos_total * float(cfg["positive_plan"]["original_ratio"]))
pos_synth_n    = pos_total - pos_original_n

neg_event_n = int(neg_total * float(cfg["negative_plan"]["event_ratio"]))
neg_bg_n    = neg_total - neg_event_n

print("pos_total:", pos_total, "original:", pos_original_n, "synth:", pos_synth_n)
print("neg_total:", neg_total, "event:", neg_event_n, "bg:", neg_bg_n)

def write_feature(i, y_audio, label):
    global sum_m, sumsq_m, count_tf
    f = logmel(y_audio)  # (frames, 64)
    # 혹시라도 파라미터 변경/라이브러리 차이로 frames가 달라지면 강제로 맞춤
    if f.shape[0] > FRAMES:
        f = f[:FRAMES, :]
    elif f.shape[0] < FRAMES:
        f = np.pad(f, ((0, FRAMES - f.shape[0]), (0, 0)), mode="constant")

    # specaugment는 저장 단계에서 하지 말고, 학습 단계에서 on-the-fly로 하는 편이 안정적입니다.
    # 여기서는 feature 고정(재현성 + 디버깅)
    X_mm[i, :, :] = f.astype(np.float32)
    y_mm[i] = np.int8(label)

    # 통계 누적(멜별로 프레임 합산)
    sum_m += f.sum(axis=0)
    sumsq_m += (f * f).sum(axis=0)
    count_tf += f.shape[0]

# 가중치 선택(기존 셀 10의 weighted_choice 그대로 사용 가능)
def weighted_choice(items, weights):
    w = np.asarray(weights, np.float64)
    w = w / w.sum()
    idx = np.random.choice(len(items), p=w)
    return items[idx]

# 풀/가중치(기존 계획 그대로)
bg_pool_items = ["ms_snsd", "talk", "dish"]
bg_pool_w = [
    cfg["positive_plan"]["background_pool"]["ms_snsd"],
    cfg["positive_plan"]["background_pool"]["talk"],
    cfg["positive_plan"]["background_pool"]["dish"],
]

pos_evt_items = ["yaho", "esc", "noise12"]
pos_evt_w = [
    cfg["positive_plan"]["optional_event_on_pos"]["event_pool"]["yaho"],
    cfg["positive_plan"]["optional_event_on_pos"]["event_pool"]["esc50_excluding_sneeze"],
    cfg["positive_plan"]["optional_event_on_pos"]["event_pool"]["noise1_noise2"],
]

neg_evt_items = ["esc", "yaho", "noise12"]
neg_evt_w = [
    cfg["negative_plan"]["event_sources"]["esc50_excluding_sneeze"],
    cfg["negative_plan"]["event_sources"]["yaho"],
    cfg["negative_plan"]["event_sources"]["noise1_noise2"],
]

neg_bg_items = ["ms_snsd", "talk", "dish"]
neg_bg_w = [
    cfg["negative_plan"]["background_sources"]["ms_snsd"],
    cfg["negative_plan"]["background_sources"]["talk"],
    cfg["negative_plan"]["background_sources"]["dish"],
]

rms_lo, rms_hi = cfg["normalization"]["rms_target_range"]
snr_bg_lo, snr_bg_hi = cfg["positive_plan"]["snr_db_range_bg"]
pos_evt_prob = float(cfg["positive_plan"]["optional_event_on_pos"]["apply_prob"])
snr_evt_lo, snr_evt_hi = cfg["positive_plan"]["optional_event_on_pos"]["snr_db_range_event"]

aug = cfg["augment"]
gain_lo, gain_hi = aug["gain_db_range"]
shift_ms = int(aug["time_shift_ms"])
ts_prob = float(aug["time_stretch"]["apply_prob"])
ts_lo, ts_hi = aug["time_stretch"]["rate_range"]
ps_prob = float(aug["pitch_shift"]["apply_prob"])
ps_lo, ps_hi = aug["pitch_shift"]["semitones_range"]

def sample_background(source_name):
    if source_name == "talk":
        return sample_from_long(talk_audio)
    if source_name == "dish":
        return sample_from_long(dish_audio)
    if source_name == "ms_snsd":
        p = random.choice(ms_snsd_files)
        return sample_ms_snsd_2s(p)
    raise ValueError(source_name)

def sample_event(source_name):
    if source_name == "yaho":
        return sample_from_long(yaho_audio)
    if source_name == "noise12":
        return sample_from_long(noise1_audio if np.random.rand() < 0.5 else noise2_audio)
    if source_name == "esc":
        p = random.choice(esc_event_files)
        return sample_esc50_2s(p)
    raise ValueError(source_name)

def apply_audio_aug(y):
    # gain
    y = apply_gain_db(y, float(np.random.uniform(gain_lo, gain_hi)))
    # shift
    y = time_shift(y, max_ms=shift_ms)
    # stretch/pitch (약하게)
    y = maybe_time_stretch(y, ts_prob, ts_lo, ts_hi)
    y = maybe_pitch_shift(y, ps_prob, ps_lo, ps_hi)
    # RMS 랜덤화(레벨 변화 학습)
    y = rms_randomize(y, rms_lo, rms_hi)
    return y

# 생성 루프
i = 0

# 포지: 원본
for _ in tqdm(range(pos_original_n), desc="pos_original"):
    p = random.choice(sneeze_files)
    y = rand_crop_2s(load_mono(p))
    y = apply_audio_aug(y)
    write_feature(i, y, 1)
    i += 1

# 포지: 합성(재채기 + 배경, 가끔 이벤트 약하게)
for _ in tqdm(range(pos_synth_n), desc="pos_synth"):
    p = random.choice(sneeze_files)
    sneeze = rand_crop_2s(load_mono(p))

    bg_src = weighted_choice(bg_pool_items, bg_pool_w)
    bg = sample_background(bg_src)

    snr_bg = float(np.random.uniform(snr_bg_lo, snr_bg_hi))
    y = mix_at_snr(sneeze, bg, snr_bg)

    if np.random.rand() < pos_evt_prob:
        evt_src = weighted_choice(pos_evt_items, pos_evt_w)
        evt = sample_event(evt_src)
        snr_evt = float(np.random.uniform(snr_evt_lo, snr_evt_hi))
        y = mix_at_snr(y, evt, snr_evt)

    y = apply_audio_aug(y)
    write_feature(i, y, 1)
    i += 1

# 네거: 배경형
for _ in tqdm(range(neg_bg_n), desc="neg_bg"):
    bg_src = weighted_choice(neg_bg_items, neg_bg_w)
    y = sample_background(bg_src)
    y = apply_audio_aug(y)
    write_feature(i, y, 0)
    i += 1

# 네거: 이벤트형(+50% 확률로 약한 배경 섞기)
for _ in tqdm(range(neg_event_n), desc="neg_event"):
    evt_src = weighted_choice(neg_evt_items, neg_evt_w)
    y = sample_event(evt_src)

    if np.random.rand() < 0.50:
        bg_src = weighted_choice(neg_bg_items, neg_bg_w)
        bg = sample_background(bg_src)
        snr = float(np.random.uniform(5.0, 25.0))
        y = mix_at_snr(y, bg, snr)

    y = apply_audio_aug(y)
    write_feature(i, y, 0)
    i += 1

# flush
X_mm.flush()
y_mm.flush()

# meta 저장
meta = {
    "version": "v4",
    "sr": SR,
    "clip_seconds": CLIP_SEC,
    "frames": FRAMES,
    "mels": N_MELS,
    "n_total": int(N_TOTAL),
    "pos_total": int(pos_total),
    "neg_total": int(neg_total),
    "features_path": str(X_PATH),
    "labels_path": str(Y_PATH),
}
META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")

print("written samples:", i, "expected:", N_TOTAL)
print("pos:", int((y_mm[:] == 1).sum()), "neg:", int((y_mm[:] == 0).sum()))
print("count_tf(frames):", count_tf, "expected:", int(N_TOTAL * FRAMES))
print("saved meta:", META_PATH)


FRAMES: 198 N_MELS: 64 N_TOTAL: 42000
memmap X: /content/work/out/v4/v4_features_f32.dat
memmap y: /content/work/out/v4/v4_labels_i8.dat
pos_total: 12000 original: 3600 synth: 8400
neg_total: 30000 event: 18000 bg: 12000


pos_original: 100%|██████████| 3600/3600 [00:32<00:00, 109.24it/s]
pos_synth: 100%|██████████| 8400/8400 [02:05<00:00, 67.12it/s] 
neg_bg: 100%|██████████| 12000/12000 [02:57<00:00, 67.72it/s] 
neg_event: 100%|██████████| 18000/18000 [04:20<00:00, 69.07it/s] 


written samples: 42000 expected: 42000
pos: 12000 neg: 30000
count_tf(frames): 8316000 expected: 8316000
saved meta: /content/work/out/v4/v4_meta.json


In [19]:
# mu/sd 계산(멜 bin별)
mu = (sum_m / max(1, count_tf)).astype(np.float32)
var = (sumsq_m / max(1, count_tf) - (mu.astype(np.float64) ** 2))
var = np.maximum(var, 1e-8).astype(np.float32)
sd = np.sqrt(var).astype(np.float32)

stats_path = OUT_DIR / "v4_norm_stats.npz"
np.savez(stats_path, mu=mu, sd=sd)

print("mu shape:", mu.shape, "sd shape:", sd.shape)
print("saved:", stats_path)


mu shape: (64,) sd shape: (64,)
saved: /content/work/out/v4/v4_norm_stats.npz


In [20]:
!pip -q install scikit-learn

from sklearn.model_selection import train_test_split

# labels 로드(작음)
y_all = np.array(y_mm[:], dtype=np.int64)

idx = np.arange(len(y_all))
idx_train, idx_tmp, y_train, y_tmp = train_test_split(
    idx, y_all, test_size=(1.0 - cfg["splits"]["train"]), random_state=SEED, stratify=y_all
)
val_ratio = cfg["splits"]["val"] / (cfg["splits"]["val"] + cfg["splits"]["test"])
idx_val, idx_test, y_val, y_test = train_test_split(
    idx_tmp, y_tmp, test_size=(1.0 - val_ratio), random_state=SEED, stratify=y_tmp
)

def counts(name, idxs):
    yy = y_all[idxs]
    print(name, "n=", len(idxs), "pos=", int(yy.sum()), "neg=", int((yy==0).sum()))

counts("train", idx_train)
counts("val  ", idx_val)
counts("test ", idx_test)


train n= 29399 pos= 8400 neg= 20999
val   n= 6300 pos= 1800 neg= 4500
test  n= 6301 pos= 1800 neg= 4501


In [21]:
import tensorflow as tf

# memmap 재오픈(안전)
meta = json.loads((OUT_DIR / "v4_meta.json").read_text(encoding="utf-8"))
FRAMES = int(meta["frames"])
N_MELS = int(meta["mels"])
N_TOTAL = int(meta["n_total"])

X_mm = np.memmap(str(OUT_DIR / "v4_features_f32.dat"), dtype="float32", mode="r", shape=(N_TOTAL, FRAMES, N_MELS))
y_all = np.array(np.memmap(str(OUT_DIR / "v4_labels_i8.dat"), dtype="int8", mode="r", shape=(N_TOTAL,)), dtype=np.int64)

st = np.load(str(OUT_DIR / "v4_norm_stats.npz"))
mu = st["mu"].astype(np.float32)
sd = st["sd"].astype(np.float32)

BATCH = int(cfg["training"]["batch_size"])

# on-the-fly specaugment(배치 단위)
sa = cfg["augment"]["specaugment"]
SA_PROB = float(sa["apply_prob"])
SA_TM = int(sa["time_masks"])
SA_TMAX = int(sa["time_mask_max"])
SA_FM = int(sa["freq_masks"])
SA_FMAX = int(sa["freq_mask_max"])

def specaugment_np(f):
    g = f.copy()
    T, F = g.shape
    for _ in range(SA_TM):
        w = np.random.randint(0, SA_TMAX+1)
        if w > 0 and T - w > 0:
            t0 = np.random.randint(0, T - w)
            g[t0:t0+w, :] = 0.0
    for _ in range(SA_FM):
        w = np.random.randint(0, SA_FMAX+1)
        if w > 0 and F - w > 0:
            f0 = np.random.randint(0, F - w)
            g[:, f0:f0+w] = 0.0
    return g

def batch_generator(idxs, shuffle=True):
    idxs = np.array(idxs, dtype=np.int64)
    n = len(idxs)
    while True:
        if shuffle:
            np.random.shuffle(idxs)
        for s in range(0, n, BATCH):
            b = idxs[s:s+BATCH]
            Xb = np.array(X_mm[b, :, :], dtype=np.float32)  # (B, frames, 64)

            # 정규화
            Xb = (Xb - mu[None, None, :]) / (sd[None, None, :] + 1e-6)

            # specaugment(확률적)
            if np.random.rand() < SA_PROB:
                for i in range(Xb.shape[0]):
                    Xb[i] = specaugment_np(Xb[i])

            # 채널 추가
            Xb = Xb[..., None]  # (B, frames, 64, 1)
            yb = y_all[b].astype(np.float32)
            yield Xb, yb

# shape sanity check
Xb0, yb0 = next(batch_generator(idx_train, shuffle=False))
print("batch X:", Xb0.shape, "batch y:", yb0.shape, "pos_in_batch:", int(yb0.sum()))

def build_model(frames, mels):
    inp = tf.keras.Input(shape=(frames, mels, 1))
    x = tf.keras.layers.Conv2D(16, (3,3), padding="same", activation="relu")(inp)
    x = tf.keras.layers.MaxPool2D((2,2))(x)
    x = tf.keras.layers.Conv2D(32, (3,3), padding="same", activation="relu")(x)
    x = tf.keras.layers.MaxPool2D((2,2))(x)
    x = tf.keras.layers.Conv2D(64, (3,3), padding="same", activation="relu")(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    out = tf.keras.layers.Dense(1, activation="sigmoid")(x)
    return tf.keras.Model(inp, out)

model = build_model(FRAMES, N_MELS)
model.summary()


batch X: (64, 198, 64, 1) batch y: (64,) pos_in_batch: 16


In [22]:
lr = float(cfg["training"]["lr"])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="prec"),
        tf.keras.metrics.Recall(name="rec"),
    ],
)

ckpt_path = OUT_DIR / "v4_model.keras"

steps_per_epoch = math.ceil(len(idx_train) / BATCH)
val_steps = math.ceil(len(idx_val) / BATCH)

train_gen = batch_generator(idx_train, shuffle=True)
val_gen   = batch_generator(idx_val, shuffle=False)

cbs = [
    tf.keras.callbacks.ModelCheckpoint(
        str(ckpt_path),
        monitor="val_auc",
        mode="max",
        save_best_only=True,
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor="val_auc",
        mode="max",
        patience=6,
        restore_best_weights=True,
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_auc",
        mode="max",
        patience=2,
        factor=0.5,
    ),
]

history = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=val_steps,
    epochs=100,
    callbacks=cbs,
    verbose=1,
)

print("saved best:", ckpt_path)
"|}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}"

Epoch 1/100
[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 695ms/step - auc: 0.7178 - loss: 0.5225 - prec: 0.6145 - rec: 0.2350 - val_auc: 0.9053 - val_loss: 0.3523 - val_prec: 0.7504 - val_rec: 0.7017 - learning_rate: 0.0010
Epoch 2/100
[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 696ms/step - auc: 0.8975 - loss: 0.3572 - prec: 0.7658 - rec: 0.6737 - val_auc: 0.9242 - val_loss: 0.3173 - val_prec: 0.8064 - val_rec: 0.6856 - learning_rate: 0.0010
Epoch 3/100
[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 703ms/step - auc: 0.9197 - loss: 0.3202 - prec: 0.7942 - rec: 0.7227 - val_auc: 0.9334 - val_loss: 0.3069 - val_prec: 0.8380 - val_rec: 0.6683 - learning_rate: 0.0010
Epoch 4/100
[1m460/460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 701ms/step - auc: 0.9308 - loss: 0.2979 - prec: 0.8140 - rec: 0.7516 - val_auc: 0.9417 - val_loss: 0.2886 - val_prec: 0.7626 - val_rec: 0.8160 - learning_rate: 0.0010
Epoch 5/100


: 

In [None]:
!ls /content/work/out

v4


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

model = tf.keras.models.load_model(str(ckpt_path))

def predict_on_idxs(idxs):
    probs = []
    ys = []
    for s in range(0, len(idxs), BATCH):
        b = np.array(idxs[s:s+BATCH], dtype=np.int64)
        Xb = np.array(X_mm[b, :, :], dtype=np.float32)
        Xb = (Xb - mu[None, None, :]) / (sd[None, None, :] + 1e-6)
        Xb = Xb[..., None]
        pb = model.predict(Xb, verbose=0).reshape(-1)
        probs.append(pb)
        ys.append(y_all[b])
    return np.concatenate(probs), np.concatenate(ys)

p_test, y_test = predict_on_idxs(idx_test)

thr = 0.5
yhat = (p_test >= thr).astype(int)

cm = confusion_matrix(y_test, yhat)
print("confusion @0.5:\n", cm)
print(classification_report(y_test, yhat, digits=4))


confusion @0.5:
 [[4387  114]
 [ 143 1657]]
              precision    recall  f1-score   support

           0     0.9684    0.9747    0.9715      4501
           1     0.9356    0.9206    0.9280      1800

    accuracy                         0.9592      6301
   macro avg     0.9520    0.9476    0.9498      6301
weighted avg     0.9591    0.9592    0.9591      6301



In [None]:
from sklearn.metrics import precision_recall_curve

p_val, y_val2 = predict_on_idxs(idx_val)

prec, rec, th = precision_recall_curve(y_val2, p_val)

target_prec = float(cfg["thresholding"]["target_precision"])
fallback = float(cfg["thresholding"]["fallback_threshold"])

cands = [(t, p, r) for t, p, r in zip(th, prec[:-1], rec[:-1]) if p >= target_prec]
if len(cands) == 0:
    best_t = fallback
    best_p = float(prec[0])
    best_r = float(rec[0])
else:
    best_t, best_p, best_r = sorted(cands, key=lambda x: x[2], reverse=True)[0]

thr_path = OUT_DIR / "v4_threshold.txt"
thr_path.write_text(f"{best_t}\n", encoding="utf-8")

print("selected threshold:", best_t, "precision:", best_p, "recall:", best_r)
print("saved:", thr_path)


NameError: name 'predict_on_idxs' is not defined

In [None]:
!cp -r /content/work/out/v4 /content/drive/MyDrive/sneeze_models_v4/

cp: cannot stat '/content/work/out': No such file or directory
