In [1]:
# Mount Google Drive at /content/drive; every dataset, model and output path
# used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install crepe

Collecting crepe
  Downloading crepe-0.0.16.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting resampy>=0.2.0 (from crepe)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting hmmlearn>=0.3.0 (from crepe)
  Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.9/165.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: crepe
  Building wheel for crepe (setup.py) ... [?25l[?25hdone
  Created wheel for crepe: filename=crepe-0.0.16-py3-none-any.whl size=134848677 sha256=2c926b865e1c1160cba5bc5dec30299a66abf01b05b987ea4ee

In [4]:
import os
import numpy as np
import pandas as pd
import torch
import librosa
import crepe
import joblib
import soundfile as sf
from music21 import pitch, stream, note, key
from scipy.signal import butter, filtfilt, resample
from tqdm import tqdm


config

In [9]:
# CSV loaded into style_df; rows are matched to audio files through a 'filename' column.
STYLE_CSV_PATH = "/content/drive/MyDrive/Datasets_For_Ai_builders/listSongDatasets - listSongDatasets(1).csv"  # path to your CSV
# Serialized gender classifier, loaded with joblib.
GENDER_MODEL_PATH = "/content/drive/MyDrive/Datasets_For_Ai_builders/Models/voice_clssification_gender_model.pkl"
# Directory scanned for input .wav files.
INPUT_AUDIO_DIR = "/content/drive/MyDrive/Datasets_For_Ai_builders/CleanVoice_v2/"
# Directory that receives one .pt file per processed input.
OUTPUT_DIR = "/content/drive/MyDrive/Datasets_For_Ai_builders/outputPipline/"
# Sample rate (Hz) that apply_filter resamples audio to; all later analysis uses it.
TARGET_SR = 16000
# Length of the style vector: 5 style attributes + 2 gender slots + 24 key slots.
STYLE_DIM = 31

setup

In [10]:
# Create the output directory if needed; exist_ok avoids an error when it is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Style annotations; the main loop looks rows up by the 'filename' column.
style_df = pd.read_csv(STYLE_CSV_PATH)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
gender_model = joblib.load(GENDER_MODEL_PATH)

24 common key signatures

In [14]:
# The 24 key labels, 12 major followed by 12 minor. A label's position in this
# list is its slot in the one-hot key section of the style vector.
# Accidentals are spelled with sharps only, so a key string that uses a flat
# spelling will not be found in KEY2IDX.
KEY_LIST = [
    # major keys (indices 0-11)
    'C major', 'G major', 'D major', 'A major', 'E major', 'B major',
    'F# major', 'C# major', 'G# major', 'D# major', 'A# major', 'F major',
    # minor keys (indices 12-23)
    'A minor', 'E minor', 'B minor', 'F# minor', 'C# minor', 'G# minor',
    'D# minor', 'A# minor', 'F minor', 'C minor', 'G minor', 'D minor',
]
# Reverse lookup: key label -> position in KEY_LIST.
KEY2IDX = dict(zip(KEY_LIST, range(len(KEY_LIST))))

def butter_filter(data, cutoff, fs, btype, order=5):
    """Run a zero-phase Butterworth filter over ``data``.

    Args:
        data: 1-D signal to filter.
        cutoff: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        btype: Filter type passed straight to ``scipy.signal.butter``
            (e.g. ``'low'`` or ``'high'``).
        order: Butterworth filter order.

    Returns:
        The filtered signal, as returned by ``scipy.signal.filtfilt``.
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    numerator, denominator = butter(order, cutoff / nyquist, btype=btype)
    # filtfilt applies the filter forwards and backwards, so no phase shift is introduced.
    return filtfilt(numerator, denominator, data)

def apply_filter(y, sr):
    """Peak-normalise, resample to ``TARGET_SR`` and filter a mono signal.

    Args:
        y: 1-D audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        The processed signal at ``TARGET_SR``, scaled so its largest absolute
        sample is 1. An all-zero (silent) input yields an all-zero output
        instead of NaNs.
    """
    # Guard the division: a silent clip has a zero peak and would turn into NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    y_rs = resample(y, int(len(y) * TARGET_SR / sr))

    # The output is the sum of a 1 kHz low-pass and a 500 Hz high-pass of the
    # same signal, so the 500-1000 Hz range is present in both terms.
    # NOTE(review): confirm this sum (rather than a band-pass) is intended.
    low = butter_filter(y_rs, 1000, TARGET_SR, 'low')
    high = butter_filter(y_rs, 500, TARGET_SR, 'high')
    combined = low + high

    out_peak = np.max(np.abs(combined))
    if out_peak > 0:
        combined = combined / out_peak
    return combined

def detect_gender(y, sr):
    """Predict the speaker gender label for an audio signal.

    Computes 17 MFCCs (the count was changed to 17), averages them along
    axis 1 to get one value per coefficient, and passes that single feature
    vector to the module-level ``gender_model``.

    Args:
        y: Audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        The first (only) prediction returned by ``gender_model.predict``.
    """
    coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=17)
    feature_vector = np.mean(coefficients, axis=1)
    # predict() takes a batch, so wrap the single vector in a list.
    predictions = gender_model.predict([feature_vector])
    return predictions[0]

def detect_key_signature(y, sr):
    """Estimate the musical key of a recording from its pitch track.

    The signal is trimmed of leading/trailing silence, pitch-tracked with
    CREPE, and every confidently voiced frame is turned into a music21 note.
    music21's key analysis is then run on the resulting stream.

    Args:
        y: Audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A string ``"<tonic> <mode>"`` (e.g. ``"F# minor"``) whose tonic always
        uses a sharp spelling so that it matches the labels in ``KEY_LIST``,
        or ``"Unknown"`` when the analysis fails (for example when no frame
        passes the confidence threshold).
    """
    # Sharp spellings indexed by pitch class (0 = C). music21 names flat
    # tonics like 'B-' or 'E-', which would never match the sharp-only labels
    # used for the one-hot key encoding, so the tonic is re-spelled from its
    # pitch class instead of using its name directly.
    sharp_names = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')

    y_trimmed, _ = librosa.effects.trim(y, top_db=40)
    _, freq, conf, _ = crepe.predict(y_trimmed, sr, viterbi=True, step_size=10)
    # Keep only frames where CREPE is confident a pitch is present.
    voiced = freq[conf > 0.8]

    s = stream.Stream()
    for f in voiced:
        try:
            p = pitch.Pitch()
            p.frequency = f
            s.append(note.Note(p))
        except Exception:
            # Best effort: skip frames music21 cannot convert to a note.
            continue

    try:
        key_sig = s.analyze('key')
        tonic = sharp_names[key_sig.tonic.pitchClass]
        return f"{tonic} {key_sig.mode}"
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt still stops the run.
        return "Unknown"

def build_style_vector(row, gender, key_sig):
    """Assemble the ``STYLE_DIM``-long conditioning vector for one clip.

    Layout:
        * ``[0:5]``  - the 'sweet', 'soft', 'clear', 'powerful' and 'high'
          values taken from ``row``.
        * ``[5:7]``  - gender one-hot: ``[1, 0]`` when ``gender == 'male'``,
          ``[0, 1]`` for any other value.
        * ``[7:]``   - key one-hot at ``7 + KEY2IDX[key_sig]``; left all-zero
          when ``key_sig`` is not a known key label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the five
            style attributes.
        gender: Gender label. NOTE(review): only the exact string 'male' is
            treated as male - confirm this matches the classifier's labels.
        key_sig: Key label such as ``"C major"``.

    Returns:
        A 1-D numpy array of length ``STYLE_DIM``.
    """
    vec = np.zeros(STYLE_DIM)

    attribute_names = ('sweet', 'soft', 'clear', 'powerful', 'high')
    vec[:5] = [row[name] for name in attribute_names]

    if gender == 'male':
        vec[5:7] = [1, 0]
    else:
        vec[5:7] = [0, 1]

    key_slot = KEY2IDX.get(key_sig)
    if key_slot is not None:
        vec[7 + key_slot] = 1

    return vec

def extract_mel(y, sr):
    """Compute a 128-band mel spectrogram in decibels as a float32 tensor.

    Args:
        y: Audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        ``torch.Tensor`` (float32) holding the dB-scaled mel spectrogram,
        referenced to the spectrogram's own maximum (``ref=np.max``).
    """
    power_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    decibel_spectrogram = librosa.power_to_db(power_spectrogram, ref=np.max)
    return torch.tensor(decibel_spectrogram, dtype=torch.float32)


main pipeline

In [None]:
# For every .wav file in INPUT_AUDIO_DIR: filter the audio, predict gender and
# key, build the style vector and mel spectrogram, and save them as one .pt file.
# Files are visited in sorted order so runs are reproducible.
for fname in tqdm(sorted(os.listdir(INPUT_AUDIO_DIR))):
    if not fname.endswith(".wav"):
        continue

    # Look the annotation up first: a file without a CSV row is reported and
    # skipped before any expensive audio analysis, instead of aborting the
    # whole run with an IndexError.
    matches = style_df[style_df['filename'] == fname]
    if matches.empty:
        tqdm.write(f"Skipping {fname}: no matching row in the style CSV")
        continue
    row = matches.iloc[0]

    filepath = os.path.join(INPUT_AUDIO_DIR, fname)
    # sr=None keeps the file's native sample rate; apply_filter does the resampling.
    y, sr = librosa.load(filepath, sr=None, mono=True)

    y_filtered = apply_filter(y, sr)
    gender = detect_gender(y_filtered, TARGET_SR)
    key_sig = detect_key_signature(y_filtered, TARGET_SR)

    style_vec = build_style_vector(row, gender, key_sig)
    mel = extract_mel(y_filtered, TARGET_SR)

    output = {
        'mel': mel,
        'style': torch.tensor(style_vec, dtype=torch.float32),
        'meta': {
            'gender': gender,
            'key_signature': key_sig,
            'filename': fname
        }
    }

    # Swap only the final extension; str.replace would also rewrite any
    # ".wav" that appears earlier in the file name.
    stem, _ = os.path.splitext(fname)
    torch.save(output, os.path.join(OUTPUT_DIR, stem + ".pt"))


  0%|          | 0/353 [00:00<?, ?it/s]

[1m201/497[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m8:25[0m 2s/step