In [2]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display

# Audio files to process, the folder they are read from, and the folder the
# rendered spectrogram PNGs are written to
audio_files = ['Lucy_a bit confident.wav', 'Lucy_confident.wav', 'Lucy_unconfident.mp4.wav']
data_dir = 'hesitation_voices'
output_dir = 'main_voices_spectrograms'

# Create the output directory if it does not exist yet. exist_ok=True makes the
# cell safe to re-run and avoids the race between a separate existence check
# and the directory creation.
os.makedirs(output_dir, exist_ok=True)

# Extract a fixed-width Mel spectrogram (in dB) from an audio file
def extract_mel_spectrogram(file_path, n_mels=128, n_fft=2048, hop_length=512, fixed_length=128, sr=None):
    """Load an audio file and return its Mel spectrogram in dB with a fixed number of frames.

    Parameters
    ----------
    file_path : str
        Path of the audio file, passed to ``librosa.load``.
    n_mels : int
        Number of Mel bands (rows of the result).
    n_fft : int
        FFT window size passed to ``librosa.feature.melspectrogram``.
    hop_length : int
        Hop length (in samples) between successive frames.
    fixed_length : int
        Number of time frames (columns) in the returned array. Shorter
        spectrograms are padded on the right, longer ones are cropped.
    sr : int or None
        Target sample rate passed to ``librosa.load``. ``None`` (the default)
        keeps the file's native sample rate.

    Returns
    -------
    numpy.ndarray
        Array with ``fixed_length`` columns holding dB values relative to the
        spectrogram's maximum (``ref=np.max``).
    """
    # Load the audio samples together with the sample rate actually used
    y, sr = librosa.load(file_path, sr=sr)

    # Compute the Mel power spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)

    # Convert power to dB (log scale), referenced to the maximum
    S_dB = librosa.power_to_db(S, ref=np.max)

    n_frames = S_dB.shape[1]
    if n_frames < fixed_length:
        # Pad short clips on the right. Because the dB values are relative to
        # the maximum, 0 dB is the loudest value; padding with 0 would add a
        # block of maximum energy. Pad with the quietest observed value so the
        # padding reads as silence.
        S_dB = np.pad(
            S_dB,
            pad_width=((0, 0), (0, fixed_length - n_frames)),
            mode='constant',
            constant_values=S_dB.min(),
        )
    elif n_frames > fixed_length:
        # Crop long clips to the first fixed_length frames
        S_dB = S_dB[:, :fixed_length]

    return S_dB

# Render and save a Mel spectrogram PNG for every listed audio file
for file_name in audio_files:
    file_path = os.path.join(data_dir, file_name)

    if not os.path.exists(file_path):
        print(f"File {file_name} not found in {data_dir}.")
        continue

    # Extract the fixed-width Mel spectrogram (dB)
    mel_spectrogram = extract_mel_spectrogram(file_path)

    # The spectrogram is computed at the file's native sample rate, so the
    # plot axes must use that same rate; a hard-coded 22050 mislabels both the
    # time and the Mel-frequency axis for files recorded at any other rate.
    native_sr = librosa.get_samplerate(file_path)

    # Draw on an explicit figure/axes pair and always close the figure, even
    # if rendering or saving fails, so figures do not pile up in memory.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        img = librosa.display.specshow(
            mel_spectrogram,
            sr=native_sr,
            hop_length=512,
            x_axis='time',
            y_axis='mel',
            ax=ax,
        )
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        ax.set_title(f'Mel-frequency spectrogram of {file_name}')

        # Build the output path: same base name as the audio file, .png extension
        output_file = os.path.join(output_dir, f'{os.path.splitext(file_name)[0]}.png')

        # Save the image
        fig.savefig(output_file)
    finally:
        plt.close(fig)

    print(f'Saved Mel spectrogram for {file_name} as {output_file}')


Saved Mel spectrogram for Lucy_a bit confident.wav as main_voices_spectrograms\Lucy_a bit confident.png
Saved Mel spectrogram for Lucy_confident.wav as main_voices_spectrograms\Lucy_confident.png
Saved Mel spectrogram for Lucy_unconfident.mp4.wav as main_voices_spectrograms\Lucy_unconfident.mp4.png


In [1]:
import librosa
from PIL import Image

# Load the audio file and measure its duration in seconds
audio_path = 'Lucy_confident.wav'
y, sr = librosa.load(audio_path)
audio_duration = librosa.get_duration(y=y, sr=sr)

# Open the Mel spectrogram image and read its size in pixels. The context
# manager closes the underlying file handle once the size has been read.
image_path = 'main_voices_spectrograms/Lucy_confident.png'
with Image.open(image_path) as image:
    image_width, image_height = image.size

# Pixels per second = full image width / full audio duration.
# NOTE(review): the PNG is a whole saved figure (title, axes, colorbar) and the
# plotted spectrogram was cropped/padded to a fixed number of frames, so this
# ratio may not match the time scale of the plotted data. Confirm intent.
pixels_per_second = image_width / audio_duration

print(f"每秒對應的像素數量：{pixels_per_second:.2f} 像素/秒")


每秒對應的像素數量：1.73 像素/秒


In [2]:
import librosa
from PIL import Image

# Load the audio file and measure its duration in seconds
audio_path = 'Lucy_unconfident.mp4.wav'
y, sr = librosa.load(audio_path)
audio_duration = librosa.get_duration(y=y, sr=sr)

# Open the Mel spectrogram image and read its size in pixels. The context
# manager closes the underlying file handle once the size has been read.
image_path = 'main_voices_spectrograms/Lucy_unconfident.mp4.png'
with Image.open(image_path) as image:
    image_width, image_height = image.size

# Pixels per second = full image width / full audio duration.
# NOTE(review): the PNG is a whole saved figure (title, axes, colorbar) and the
# plotted spectrogram was cropped/padded to a fixed number of frames, so this
# ratio may not match the time scale of the plotted data. Confirm intent.
pixels_per_second = image_width / audio_duration

print(f"每秒對應的像素數量：{pixels_per_second:.2f} 像素/秒")


每秒對應的像素數量：2.35 像素/秒


In [3]:
import librosa
from PIL import Image

# Load the audio file and measure its duration in seconds
audio_path = 'Lucy_a bit confident.wav'
y, sr = librosa.load(audio_path)
audio_duration = librosa.get_duration(y=y, sr=sr)

# Open the Mel spectrogram image and read its size in pixels. The context
# manager closes the underlying file handle once the size has been read.
image_path = 'main_voices_spectrograms/Lucy_a bit confident.png'
with Image.open(image_path) as image:
    image_width, image_height = image.size

# Pixels per second = full image width / full audio duration.
# NOTE(review): the PNG is a whole saved figure (title, axes, colorbar) and the
# plotted spectrogram was cropped/padded to a fixed number of frames, so this
# ratio may not match the time scale of the plotted data. Confirm intent.
pixels_per_second = image_width / audio_duration

print(f"每秒對應的像素數量：{pixels_per_second:.2f} 像素/秒")


每秒對應的像素數量：1.65 像素/秒
