# 用 noisereduce 進行降噪
同時保存原始聲音、降噪音檔

In [1]:
import pyaudio
import noisereduce as nr
import numpy as np
import wave

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ORIGINAL = "output_original.wav"
OUTPUT_FILENAME_REDUCED = "output_reduced.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV headers written below do not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames_original = []
frames_reduced = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames_original.append(data)

        # Streaming variant: each captured chunk is denoised as soon as it arrives.
        # NOTE(review): every call only sees CHUNK samples, so the noise estimate is
        # rebuilt from scratch per chunk; n_std_thresh_stationary also appears to
        # apply only when stationary=True — confirm against the noisereduce docs.
        audio_data = np.frombuffer(data, dtype=np.int16)
        reduced_noise = nr.reduce_noise(y=audio_data, sr=RATE, prop_decrease=1.0, n_std_thresh_stationary=1.5)
        frames_reduced.append(reduced_noise.tobytes())

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Write the original and the denoised recording with identical WAV parameters.
# The context manager closes each file even if writing raises.
for output_filename, chunks in ((OUTPUT_FILENAME_ORIGINAL, frames_original),
                                (OUTPUT_FILENAME_REDUCED, frames_reduced)):
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(SAMPLE_WIDTH)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(chunks))

print(f"原始音頻已保存到 {OUTPUT_FILENAME_ORIGINAL}")
print(f"降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED}")


開始錄音...
錄音結束...
原始音頻已保存到 output_original.wav
降噪後的音頻已保存到 output_reduced.wav


# 用 noisereduce 進行降噪
先保存原始聲音，再對聲音降噪後另存降噪音檔

In [3]:
import pyaudio
import noisereduce as nr
import numpy as np
import wave

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ORIGINAL = "output_original.wav"
OUTPUT_FILENAME_REDUCED = "output_reduced.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV headers written below do not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames_original = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames_original.append(data)

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Save the untouched recording first
with wave.open(OUTPUT_FILENAME_ORIGINAL, 'wb') as wf_original:
    wf_original.setnchannels(CHANNELS)
    wf_original.setsampwidth(SAMPLE_WIDTH)
    wf_original.setframerate(RATE)
    wf_original.writeframes(b''.join(frames_original))

print(f"原始音頻已保存到 {OUTPUT_FILENAME_ORIGINAL}")

# Denoise the complete recording in one call. The whole signal is already in
# memory, so there is no reason to process it in isolated CHUNK-sized pieces:
# doing so makes noisereduce rebuild its noise estimate for every piece.
# NOTE(review): n_std_thresh_stationary appears to apply only when
# stationary=True — confirm against the noisereduce docs.
audio_data = np.frombuffer(b''.join(frames_original), dtype=np.int16)
reduced_noise = nr.reduce_noise(y=audio_data, sr=RATE, prop_decrease=1.0, n_std_thresh_stationary=1.5)

# Save the denoised recording
with wave.open(OUTPUT_FILENAME_REDUCED, 'wb') as wf_reduced:
    wf_reduced.setnchannels(CHANNELS)
    wf_reduced.setsampwidth(SAMPLE_WIDTH)
    wf_reduced.setframerate(RATE)
    wf_reduced.writeframes(reduced_noise.tobytes())

print(f"降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED}")


開始錄音...
錄音結束...
原始音頻已保存到 output_original.wav
降噪後的音頻已保存到 output_reduced.wav


# 用 librosa 預加重（pre-emphasis）濾波進行降噪（程式碼實際未使用 scipy）

In [None]:
import pyaudio
import numpy as np
import wave
import librosa
import scipy

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ORIGINAL = "output_original.wav"
OUTPUT_FILENAME_REDUCED = "output_reduced.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV headers written below do not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames_original = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames_original.append(data)

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Save the untouched recording
with wave.open(OUTPUT_FILENAME_ORIGINAL, 'wb') as wf_original:
    wf_original.setnchannels(CHANNELS)
    wf_original.setsampwidth(SAMPLE_WIDTH)
    wf_original.setframerate(RATE)
    wf_original.writeframes(b''.join(frames_original))

# View the recorded bytes as 16-bit samples
audio_data = np.frombuffer(b''.join(frames_original), dtype=np.int16)

# Convert to float32 and apply librosa's pre-emphasis filter.
# NOTE(review): pre-emphasis is a first-order high-frequency boost rather than a
# noise-reduction algorithm, and nothing from scipy is called in this cell —
# confirm that this is the intended processing.
y = librosa.util.buf_to_float(audio_data, n_bytes=2, dtype=np.float32)
y_denoised = librosa.effects.preemphasis(y)

# The filtered signal can leave [-1, 1]; clip before scaling so that the cast to
# int16 saturates instead of wrapping around (which is audible as loud clicks).
y_denoised_int16 = (np.clip(y_denoised, -1.0, 1.0) * 32767).astype(np.int16)

# Save the processed recording
with wave.open(OUTPUT_FILENAME_REDUCED, 'wb') as wf_reduced:
    wf_reduced.setnchannels(CHANNELS)
    wf_reduced.setsampwidth(SAMPLE_WIDTH)
    wf_reduced.setframerate(RATE)
    wf_reduced.writeframes(y_denoised_int16.tobytes())

print(f"原始音頻已保存到 {OUTPUT_FILENAME_ORIGINAL}")
print(f"降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED}")


# 用 FFmpeg 進行降噪

In [None]:
# Reference only: FFmpeg command lines kept inside a bare string literal, so
# running this cell executes nothing.
# The first command records the microphone without any filter; the other two add
# the `afftdn` audio filter with nf=-25 and nf=-80 respectively and write to
# separate output files.
# NOTE(review): `-f dshow` looks like the Windows DirectShow capture input and
# `nf` like afftdn's noise-floor setting in dB — confirm against the FFmpeg docs.
"""
ffmpeg -f dshow -i audio="麥克風 (BLUE Yeti PRO)"  -acodec pcm_s16le output_original.wav
ffmpeg -f dshow -i audio="麥克風 (BLUE Yeti PRO)" -af "afftdn=nf=-25" -acodec pcm_s16le output_reduced.wav
ffmpeg -f dshow -i audio="麥克風 (BLUE Yeti PRO)" -af "afftdn=nf=-80" -acodec pcm_s16le output_reduced_2.wav
"""

# 用 pyrnnoise 進行降噪

In [4]:
import pyaudio
import numpy as np
import wave
from pyrnnoise import RNNoise

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 48000
# Capture block size. It does not have to equal the RNNoise frame length:
# process_chunk() is iterated below and yields zero or more denoised frames per
# captured block (the recorded run of this cell reported 480-sample frames).
CHUNK = 1024

# Initialise PyAudio and look up the sample width once, up front
p = pyaudio.PyAudio()
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

# Initialise RNNoise
rnnoise = RNNoise(RATE)

stream = None

try:
    # Both WAV writers are closed by the context manager, whatever happens below.
    with wave.open("raw_audio.wav", "wb") as raw_wav, \
            wave.open("denoised_audio.wav", "wb") as denoised_wav:
        # Same WAV parameters for the raw and the denoised file
        for wav_file in (raw_wav, denoised_wav):
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(SAMPLE_WIDTH)
            wav_file.setframerate(RATE)

        # Open the input stream
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

        print("開始錄音")

        try:
            # Record until the kernel is interrupted
            while True:
                # Read one block from the microphone
                data = stream.read(CHUNK)

                # View the block as 16-bit samples
                audio_chunk = np.frombuffer(data, dtype=np.int16)

                # Save the raw audio
                raw_wav.writeframes(data)

                # Denoise with RNNoise
                for vad_prob, denoised_chunk in rnnoise.process_chunk(audio_chunk):
                    print(f"VAD 概率: {vad_prob}, 降噪後的幀大小: {len(denoised_chunk)}")

                    # Force 16-bit samples so the bytes match the WAV header, as
                    # the other RNNoise cells in this notebook already do.
                    denoised_data = denoised_chunk.astype(np.int16).tobytes()

                    # Save the denoised audio
                    denoised_wav.writeframes(denoised_data)

                    # Example: play the processed audio directly
                    # (requires an additionally configured output stream)
                    # output_stream.write(denoised_data)

        except KeyboardInterrupt:
            print("錄音結束")

finally:
    # Always release the audio device, even when opening the stream or files fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

開始錄音
VAD 概率: 0.9823699593544006, 降噪後的幀大小: 480
VAD 概率: 0.9722340106964111, 降噪後的幀大小: 480
VAD 概率: 0.9961364269256592, 降噪後的幀大小: 480
VAD 概率: 0.996465265750885, 降噪後的幀大小: 480
VAD 概率: 0.9591103196144104, 降噪後的幀大小: 480
VAD 概率: 0.9941698312759399, 降噪後的幀大小: 480
VAD 概率: 0.9863289594650269, 降噪後的幀大小: 480
VAD 概率: 0.9784092903137207, 降噪後的幀大小: 480
VAD 概率: 0.9746274948120117, 降噪後的幀大小: 480
VAD 概率: 0.963286280632019, 降噪後的幀大小: 480
VAD 概率: 0.9260342121124268, 降噪後的幀大小: 480
VAD 概率: 0.7967579364776611, 降噪後的幀大小: 480
VAD 概率: 0.5767187476158142, 降噪後的幀大小: 480
VAD 概率: 0.6638088822364807, 降噪後的幀大小: 480
VAD 概率: 0.577681839466095, 降噪後的幀大小: 480
VAD 概率: 0.7058886289596558, 降噪後的幀大小: 480
VAD 概率: 0.46841952204704285, 降噪後的幀大小: 480
VAD 概率: 0.295551598072052, 降噪後的幀大小: 480
VAD 概率: 0.23215386271476746, 降噪後的幀大小: 480
VAD 概率: 0.17747309803962708, 降噪後的幀大小: 480
VAD 概率: 0.12466713786125183, 降噪後的幀大小: 480
VAD 概率: 0.04472199082374573, 降噪後的幀大小: 480
VAD 概率: 0.04530012607574463, 降噪後的幀大小: 480
VAD 概率: 0.028403520584106445, 降噪後的幀大小: 480
VAD 概率:

# 先用 pyrnnoise 再用 noisereduce 進行兩次降噪

In [6]:
import pyaudio
import noisereduce as nr
import numpy as np
import wave
from pyrnnoise import RNNoise

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ORIGINAL = "output_original.wav"
OUTPUT_FILENAME_REDUCED_1 = "output_reduced_1.wav"
OUTPUT_FILENAME_REDUCED_2 = "output_reduced_2.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV headers written below do not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames_original = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames_original.append(data)

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Save the untouched recording
with wave.open(OUTPUT_FILENAME_ORIGINAL, 'wb') as wf_original:
    wf_original.setnchannels(CHANNELS)
    wf_original.setsampwidth(SAMPLE_WIDTH)
    wf_original.setframerate(RATE)
    wf_original.writeframes(b''.join(frames_original))

print(f"原始音頻已保存到 {OUTPUT_FILENAME_ORIGINAL}")

# Initialise RNNoise
# NOTE(review): the real-time RNNoise cell in this notebook runs at 48000 Hz while
# this one passes 44100 Hz — confirm that pyrnnoise handles this rate as intended.
rnnoise = RNNoise(RATE)

# First pass: feed the recording to RNNoise block by block
denoised_parts = []
for frame in frames_original:
    audio_data = np.frombuffer(frame, dtype=np.int16)
    denoised_chunks = [denoised_chunk
                       for vad_prob, denoised_chunk in rnnoise.process_chunk(audio_data)]
    # np.concatenate raises on an empty list, so skip blocks that yielded nothing.
    if denoised_chunks:
        denoised_parts.append(np.concatenate(denoised_chunks).astype(np.int16).tobytes())

reduced_1_data = b''.join(denoised_parts)

with wave.open(OUTPUT_FILENAME_REDUCED_1, 'wb') as wf_reduced_1:
    wf_reduced_1.setnchannels(CHANNELS)
    wf_reduced_1.setsampwidth(SAMPLE_WIDTH)
    wf_reduced_1.setframerate(RATE)
    wf_reduced_1.writeframes(reduced_1_data)

print(f"第一次降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED_1}")

# The first-pass samples are still in memory, so reuse them directly instead of
# reading the file that was just written back from disk.
audio_data_reduced_1 = np.frombuffer(reduced_1_data, dtype=np.int16)

# Second pass: noisereduce over the whole first-pass signal
# NOTE(review): n_std_thresh_stationary appears to apply only when
# stationary=True — confirm against the noisereduce docs.
reduced_noise_2 = nr.reduce_noise(y=audio_data_reduced_1, sr=RATE, prop_decrease=1.0, n_std_thresh_stationary=1.5)

# Save the second-pass result
with wave.open(OUTPUT_FILENAME_REDUCED_2, 'wb') as wf_reduced_2:
    wf_reduced_2.setnchannels(CHANNELS)
    wf_reduced_2.setsampwidth(SAMPLE_WIDTH)
    wf_reduced_2.setframerate(RATE)
    wf_reduced_2.writeframes(reduced_noise_2.tobytes())

print(f"第二次降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED_2}")


開始錄音...
錄音結束...
原始音頻已保存到 output_original.wav
第一次降噪後的音頻已保存到 output_reduced_1.wav
第二次降噪後的音頻已保存到 output_reduced_2.wav


# 先用 noisereduce 再用 pyrnnoise 進行兩次降噪

In [8]:
import pyaudio
import noisereduce as nr
import numpy as np
import wave
from pyrnnoise import RNNoise

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ORIGINAL = "output_original.wav"
OUTPUT_FILENAME_REDUCED_1 = "output_reduced_1.wav"
OUTPUT_FILENAME_REDUCED_2 = "output_reduced_2.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV headers written below do not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames_original = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames_original.append(data)

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Save the untouched recording
with wave.open(OUTPUT_FILENAME_ORIGINAL, 'wb') as wf_original:
    wf_original.setnchannels(CHANNELS)
    wf_original.setsampwidth(SAMPLE_WIDTH)
    wf_original.setframerate(RATE)
    wf_original.writeframes(b''.join(frames_original))

print(f"原始音頻已保存到 {OUTPUT_FILENAME_ORIGINAL}")

# First pass: noisereduce over the complete recording in one call. The whole
# signal is already in memory, so there is no reason to process it in isolated
# CHUNK-sized pieces: doing so makes noisereduce rebuild its noise estimate for
# every piece.
# NOTE(review): n_std_thresh_stationary appears to apply only when
# stationary=True — confirm against the noisereduce docs.
audio_data = np.frombuffer(b''.join(frames_original), dtype=np.int16)
reduced_noise = nr.reduce_noise(y=audio_data, sr=RATE, prop_decrease=1.0, n_std_thresh_stationary=1.5)
reduced_1_data = reduced_noise.tobytes()

with wave.open(OUTPUT_FILENAME_REDUCED_1, 'wb') as wf_reduced_1:
    wf_reduced_1.setnchannels(CHANNELS)
    wf_reduced_1.setsampwidth(SAMPLE_WIDTH)
    wf_reduced_1.setframerate(RATE)
    wf_reduced_1.writeframes(reduced_1_data)

print(f"第一次降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED_1}")

# The first-pass samples are still in memory, so reuse them directly instead of
# reading the file that was just written back from disk.
audio_data_reduced_1 = np.frombuffer(reduced_1_data, dtype=np.int16)

# Initialise RNNoise
# NOTE(review): the real-time RNNoise cell in this notebook runs at 48000 Hz while
# this one passes 44100 Hz — confirm that pyrnnoise handles this rate as intended.
rnnoise = RNNoise(RATE)

# Second pass: RNNoise over the first-pass signal
denoised_chunks = [denoised_chunk
                   for vad_prob, denoised_chunk in rnnoise.process_chunk(audio_data_reduced_1)]
# np.concatenate raises on an empty list; fall back to an empty payload instead.
if denoised_chunks:
    denoised_data = np.concatenate(denoised_chunks).astype(np.int16).tobytes()
else:
    denoised_data = b''

with wave.open(OUTPUT_FILENAME_REDUCED_2, 'wb') as wf_reduced_2:
    wf_reduced_2.setnchannels(CHANNELS)
    wf_reduced_2.setsampwidth(SAMPLE_WIDTH)
    wf_reduced_2.setframerate(RATE)
    wf_reduced_2.writeframes(denoised_data)

print(f"第二次降噪後的音頻已保存到 {OUTPUT_FILENAME_REDUCED_2}")


開始錄音...
錄音結束...
原始音頻已保存到 output_original.wav
第一次降噪後的音頻已保存到 output_reduced_1.wav
第二次降噪後的音頻已保存到 output_reduced_2.wav


# 計算路徑下 wav 總時長

In [9]:
import wave
import os

def get_wav_duration(file_path):
    """Return the playing time of a WAV file in seconds.

    Args:
        file_path: Path of the WAV file to inspect.

    Returns:
        The number of audio frames divided by the frame rate, as a float.
    """
    with wave.open(file_path, 'rb') as wav_reader:
        # Grab the header fields in one go; the audio data itself is never read.
        params = wav_reader.getparams()
    return params.nframes / float(params.framerate)

def calculate_total_duration(folder_path):
    """Print the duration of every WAV file in a folder and return their sum.

    Only the folder itself is scanned; sub-folders are not searched.

    Args:
        folder_path: Directory containing the WAV files.

    Returns:
        Total duration of all matched files in seconds (0.0 if none match).
    """
    total_duration = 0.0
    # os.listdir returns entries in arbitrary order; sort for a stable report.
    for file_name in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so files such as "A.WAV" are counted.
        if not file_name.lower().endswith('.wav'):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory that merely ends in ".wav" cannot be opened as audio.
        if not os.path.isfile(file_path):
            continue
        duration = get_wav_duration(file_path)
        total_duration += duration
        print(f"{file_name}: {duration:.2f} seconds")
    return total_duration

# Sum up the durations of the WAV files in one folder and report the total.
# NOTE(review): the path below is an absolute, machine-specific location.
folder_path = 'C:\\Users\\User\\Documents\\work\\GPT-SoVITS-beta0217\\DATA\\Joneshong\\slicer'  # replace with your own folder path
total_duration = calculate_total_duration(folder_path)
print(f"Total duration: {total_duration:.2f} seconds")


vocal_0.wav.reformatted.wav_10.wav_0000011520_0000141760.wav: 4.07 seconds
vocal_0.wav.reformatted.wav_10.wav_0000141760_0000270720.wav: 4.03 seconds
vocal_0.wav.reformatted.wav_10.wav_0000276160_0000393280.wav: 3.66 seconds
vocal_0.wav.reformatted.wav_10.wav_0000393280_0000523520.wav: 4.07 seconds
vocal_0.wav.reformatted.wav_10.wav_0000530240_0000690560.wav: 5.01 seconds
vocal_0.wav.reformatted.wav_10.wav_0000690560_0000813760.wav: 3.85 seconds
vocal_0.wav.reformatted.wav_10.wav_0000813760_0000914560.wav: 3.15 seconds
vocal_1.wav.reformatted.wav_10.wav_0000011200_0000160640.wav: 4.67 seconds
vocal_1.wav.reformatted.wav_10.wav_0000160640_0000345280.wav: 5.77 seconds
vocal_1.wav.reformatted.wav_10.wav_0000345280_0000503680.wav: 4.95 seconds
vocal_1.wav.reformatted.wav_10.wav_0000503680_0000701120.wav: 6.17 seconds
vocal_2.wav.reformatted.wav_10.wav_0000006400_0000119040.wav: 3.52 seconds
vocal_2.wav.reformatted.wav_10.wav_0000121280_0000241280.wav: 3.75 seconds
vocal_2.wav.reformatted.w

In [11]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np

# Augmentation pipeline: four transforms, each configured with p=0.5.
transforms = [
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(p=0.5),
]
augment = Compose(transforms)

# Dummy input for the example: 32000 uniform random samples in [-0.2, 0.2),
# i.e. two seconds at the 16000 Hz sample rate passed below.
samples = np.random.uniform(-0.2, 0.2, 32000).astype(np.float32)

# Augment / transform / perturb the audio data
augmented_samples = augment(samples=samples, sample_rate=16000)

In [13]:
import pyaudio
import wave
import numpy as np  # used below; previously only available through another cell
from pyrnnoise import RNNoise

# Audio stream parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 10
OUTPUT_FILENAME_ENHANCED = "output_enhanced.wav"

p = pyaudio.PyAudio()
# Look up the sample width while the PyAudio instance is still alive, so the
# WAV header written below does not depend on `p` after it has been terminated.
SAMPLE_WIDTH = p.get_sample_size(FORMAT)

frames = []
stream = None

try:
    # Open the input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("開始錄音...")

    for i in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("錄音結束...")
finally:
    # Always release the audio device, even when opening or reading the stream fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

# Initialise RNNoise
# NOTE(review): the real-time RNNoise cell in this notebook runs at 48000 Hz while
# this one passes 44100 Hz — confirm that pyrnnoise handles this rate as intended.
rnnoise = RNNoise(RATE)

# Voice enhancement: feed the recording to RNNoise block by block
enhanced_frames = []
for frame in frames:
    audio_data = np.frombuffer(frame, dtype=np.int16)
    denoised_chunks = [denoised_chunk
                       for vad_prob, denoised_chunk in rnnoise.process_chunk(audio_data)]
    # np.concatenate raises on an empty list, so skip blocks that yielded nothing.
    if denoised_chunks:
        denoised_data = np.concatenate(denoised_chunks).astype(np.int16).tobytes()
        enhanced_frames.append(denoised_data)

# Save the enhanced audio
with wave.open(OUTPUT_FILENAME_ENHANCED, 'wb') as wf_enhanced:
    wf_enhanced.setnchannels(CHANNELS)
    wf_enhanced.setsampwidth(SAMPLE_WIDTH)
    wf_enhanced.setframerate(RATE)
    wf_enhanced.writeframes(b''.join(enhanced_frames))

print(f"增強後的音頻已保存到 {OUTPUT_FILENAME_ENHANCED}")


開始錄音...
錄音結束...
增強後的音頻已保存到 output_enhanced.wav


: 