In [22]:
import numpy as np
import librosa
import pygame
from pygame.locals import *

target_frequency = (158, 365)  # 包括中央C的頻率範圍
filename = r"D:\Temp\好聽拷貝.wav"

# 載入音頻檔案
audio, sr = librosa.load(filename, sr=None)

# 執行短時傅立葉變換    
stft = np.abs(librosa.stft(audio, hop_length=512, n_fft=2048 * 8))  

# 初始化 pygame
pygame.init()

# 設置視窗大小
screen_width = 800
screen_height = 600
screen = pygame.display.set_mode((screen_width, screen_height))

# 設定震幅的最大值和最小值
max_amplitude = np.max(stft)
min_amplitude = np.min(stft)

# 定義每一幀的長度（以毫秒為單位）
frame_duration_ms = 10

# 設置時鐘對象
clock = pygame.time.Clock()

# 設置音頻
pygame.mixer.init()
pygame.mixer.music.load(filename)

# 開始播放音頻
pygame.mixer.music.play()

time_data = []  # 時間數據
frequency_data = []  # 頻率數據
amplitude_data = []  # 振幅數據


# 设置刻度相关参数
scale_length = 20  # 刻度线长度
scale_spacing = 50  # 刻度线间距
scale_start = 50  # 刻度线起始位置
scale_end = screen_height - 50  # 刻度线结束位置（从下到上）
font = pygame.font.Font(None, 24)  # 字体和大小

# 遍历 stft 的每一帧，将其转换为视频中的一帧并播放
for i in range(stft.shape[1]):  # 使用 stft.shape[1] 作为时间轴的大小
    # 在这里添加你的 stft[:, i] 到视频中的处理代码
    # 例如，你可以根据 stft[:, i] 绘制一幅图像并显示在屏幕上
    # 这里简单地将 stft 的每一列作为一个折线图显示在屏幕上

    screen.fill((255, 255, 255))  # 清空屏幕，填充为白色

    # 缩放震幅到屏幕范围内
    scaled_stft = ((stft[:, i] - min_amplitude) / (max_amplitude - min_amplitude)) * screen_height-50
    
    # 将 stft 的每一列作为一个折线图显示在屏幕上
    points = [(j, screen_height - int(scaled_stft[j])) for j in range(stft.shape[0])]  # 构建折线图的点
    pygame.draw.lines(screen, (255, 0, 0), False, points, 4)  # 绘制折线图

    for y in range(scale_start, screen_height, scale_spacing):
        # 绘制刻度线
        pygame.draw.line(screen, (0, 0, 0), (scale_length, y), (0, y), 2)  # 左侧刻度线

        # 获取振幅值并将其转换为整数
        amplitude_value = int((scale_end - y) * (max_amplitude - min_amplitude) / (scale_end - scale_spacing) + min_amplitude)

        # 绘制振幅值文本
        text = font.render(str(amplitude_value), True, (0, 0, 0))  # 将文本渲染为图像
        text_rect = text.get_rect()  # 获取文本图像的矩形
        text_rect.center = (scale_length + 30, y)  # 设置文本图像的中心位置
        screen.blit(text, text_rect)  # 在屏幕上绘制文本图像

    # 更新屏幕
    pygame.display.flip()

    # 控制帧率
    clock.tick(1000 / frame_duration_ms)

    # 监听事件，例如点击关闭按钮等
    for event in pygame.event.get():
        if event.type == QUIT:
            pygame.quit()
            exit()

    # 找出每个窗口的最高振幅音
    max_frequency_index = np.argmax(stft[:, i])  # 找出当前窗口最高振幅的索引
    max_frequency = librosa.fft_frequencies(sr=sr, n_fft=2048*8)[max_frequency_index]  # 将索引转换为频率
    max_amplitude = stft[max_frequency_index, i]  # 获取最高振幅

    # 输出每个窗口的信息

    print(f"窗口{i+1}：频率为{max_frequency} Hz，振幅为{max_amplitude}")
    time_data.append(i+1),frequency_data.append(max_frequency),amplitude_data.append(max_amplitude)

# 结束 pygame
pygame.quit()
print(len(frequency_data))
print(amplitude_data[100])
# 使用 ffmpeg 将帧和音频合成为视频
#os.system(f"ffmpeg -y -r {fps} -f image2 -s 1920x1080 -i {output_dir}/frame%04d.png -i {filename} -c:v libx264 -pix_fmt yuv420p movie.mp4")


窗口1：频率为199.18212890625 Hz，振幅为1.4895133972167969
窗口2：频率为201.873779296875 Hz，振幅为1.6324644088745117
窗口3：频率为201.873779296875 Hz，振幅为1.7916499376296997
窗口4：频率为201.873779296875 Hz，振幅为1.9414058923721313
窗口5：频率为201.873779296875 Hz，振幅为2.0755393505096436
窗口6：频率为201.873779296875 Hz，振幅为2.1891677379608154
窗口7：频率为201.873779296875 Hz，振幅为2.2730305194854736
窗口8：频率为212.640380859375 Hz，振幅为2.4972126483917236
窗口9：频率为212.640380859375 Hz，振幅为2.717575788497925
窗口10：频率为212.640380859375 Hz，振幅为2.909600257873535
窗口11：频率为212.640380859375 Hz，振幅为3.0630271434783936
窗口12：频率为209.94873046875 Hz，振幅为3.224102735519409
窗口13：频率为209.94873046875 Hz，振幅为3.4057059288024902
窗口14：频率为209.94873046875 Hz，振幅为3.5605156421661377
窗口15：频率为209.94873046875 Hz，振幅为3.6921565532684326
窗口16：频率为209.94873046875 Hz，振幅为3.7917087078094482
窗口17：频率为209.94873046875 Hz，振幅为3.8518624305725098
窗口18：频率为209.94873046875 Hz，振幅为3.866842269897461
窗口19：频率为794.036865234375 Hz，振幅为4.243029594421387
窗口20：频率为794.036865234375 Hz，振幅为5.459777355194092
窗口21：频率为794.03686523437

In [23]:
replacement_value = 800
start_index = 250
end_index = 650
replacement_list = [replacement_value] * (end_index - start_index)
print(replacement_list)

# 确保替换列表的长度与要替换的范围的长度相同
if len(replacement_list) == end_index - start_index:
    # 使用切片赋值将替换列表中的元素替换为原始列表中的元素
    amplitude_data[start_index:end_index] = replacement_list

# 打印替换后的列表
print(amplitude_data)  # 输出：[1, 2, 10, 11, 12, 6, 7, 8, 9]

[800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800,

In [25]:
replacement_value = 440
start_index = 50
end_index = 350
replacement_list = [replacement_value] * (end_index - start_index)
print(replacement_list)


# 确保替换列表的长度与要替换的范围的长度相同
if len(replacement_list) == end_index - start_index:
    # 使用切片赋值将替换列表中的元素替换为原始列表中的元素
    frequency_data[start_index:end_index] = replacement_list

# 打印替换后的列表
print(frequency_data)  # 输出：[1, 2, 10, 11, 12, 6, 7, 8, 9]

[440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440, 440,

In [26]:
import numpy as np
import librosa
from IPython.display import Audio

def generate_audio(frequency_data, amplitude_data, time_data, sample_rate=44100):
    # 將頻率和振幅數據堆疊為二維數組
    freq_ampl_nparray = np.vstack((frequency_data, amplitude_data))
    
    # 確定時間窗口的長度和跳躍長度
    hop_length = 512
    win_length = 2048

    # 初始化音頻數據
    audio_data = np.zeros((len(time_data) * hop_length + win_length))

    # 將每個時間窗口的頻率和振幅轉換為時間域信號
    for i, (frequency, amplitude) in enumerate(zip(*freq_ampl_nparray)):
        # 計算時間窗口的起始樣本位置
        start_sample = i * hop_length
        # 計算每個時間窗口的時間軸
        time_axis = np.arange(start_sample, start_sample + win_length)
        # 計算正弦波信號
        sin_wave = amplitude * np.sin(2 * np.pi * frequency * time_axis / sample_rate)
        # 添加到總音頻數據中
        audio_data[start_sample:start_sample + win_length] += sin_wave

    # 顯示音頻
    return audio_data

# 調用函數生成音頻數據並顯示
audio_data = generate_audio(frequency_data, amplitude_data, time_data)
Audio(data=audio_data, rate=44100)




In [81]:
import numpy as np
import librosa
from IPython.display import Audio

# 假设你已经有了 frequency_data、amplitude_data、hop_length 和 win_length

hop_length = 16
win_length = 128

# 将频率和振幅数据转换为 numpy 数组
frequency_data_2d = np.array(frequency_data).reshape(-1, 1)
amplitude_data_2d = np.array(amplitude_data).reshape(-1, 1)

# 将频率和振幅数据组合成复数形式的 STFT 数据
stft_data = frequency_data_2d + 1j * amplitude_data_2d
print(stft_data.shape)
# 使用 iSTFT 将 STFT 数据转换为时域信号
istft_signal = librosa.istft(stft_data, hop_length=hop_length, win_length=win_length)

# 打印输出的时域信号的长度
print(len(istft_signal))
#Audio(istft_signal,rate=44100)

(399, 1)
0


ValueError: zero-size array to reduction operation maximum which has no identity