In [104]:
import numpy as np
import librosa
import pygame
from pygame.locals import *

target_frequency = (158, 365)  # 包括中央C的頻率範圍
filename = r"D:\Temp\鋼琴.mp3"

# 載入音頻檔案
audio, sr = librosa.load(filename, sr=None)

# 執行短時傅立葉變換    
stft = np.abs(librosa.stft(audio, hop_length=512, n_fft=2048 * 8))  

# 初始化 pygame
pygame.init()

# 設置視窗大小
screen_width = 800
screen_height = 600
screen = pygame.display.set_mode((screen_width, screen_height))

# 設定震幅的最大值和最小值
max_amplitude = np.max(stft)
min_amplitude = np.min(stft)

# 定義每一幀的長度（以毫秒為單位）
frame_duration_ms = 10

# 設置時鐘對象
clock = pygame.time.Clock()

# 設置音頻
#pygame.mixer.init()
#pygame.mixer.music.load(filename)

# 開始播放音頻
#pygame.mixer.music.play()

time_data = []  # 時間數據
frequency_data = []  # 頻率數據
amplitude_data = []  # 振幅數據


# 设置刻度相关参数
scale_length = 20  # 刻度线长度
scale_spacing = 50  # 刻度线间距
scale_start = 50  # 刻度线起始位置
scale_end = screen_height - 50  # 刻度线结束位置（从下到上）
font = pygame.font.Font(None, 24)  # 字体和大小

# 遍历 stft 的每一帧，将其转换为视频中的一帧并播放
for i in range(stft.shape[1]):  # 使用 stft.shape[1] 作为时间轴的大小
    # 在这里添加你的 stft[:, i] 到视频中的处理代码
    # 例如，你可以根据 stft[:, i] 绘制一幅图像并显示在屏幕上
    # 这里简单地将 stft 的每一列作为一个折线图显示在屏幕上

    screen.fill((255, 255, 255))  # 清空屏幕，填充为白色

    # 缩放震幅到屏幕范围内
    scaled_stft = ((stft[:, i] - min_amplitude) / (max_amplitude - min_amplitude)) * screen_height-50
    
    # 将 stft 的每一列作为一个折线图显示在屏幕上
    points = [(j, screen_height - int(scaled_stft[j])) for j in range(stft.shape[0])]  # 构建折线图的点
    pygame.draw.lines(screen, (255, 0, 0), False, points, 4)  # 绘制折线图

    for y in range(scale_start, screen_height, scale_spacing):
        # 绘制刻度线
        pygame.draw.line(screen, (0, 0, 0), (scale_length, y), (0, y), 2)  # 左侧刻度线

        # 获取振幅值并将其转换为整数
        amplitude_value = int((scale_end - y) * (max_amplitude - min_amplitude) / (scale_end - scale_spacing) + min_amplitude)

        # 绘制振幅值文本
        text = font.render(str(amplitude_value), True, (0, 0, 0))  # 将文本渲染为图像
        text_rect = text.get_rect()  # 获取文本图像的矩形
        text_rect.center = (scale_length + 30, y)  # 设置文本图像的中心位置
        screen.blit(text, text_rect)  # 在屏幕上绘制文本图像

    # 更新屏幕
    pygame.display.flip()

    # 控制帧率
    clock.tick(1000 / frame_duration_ms)

    # 监听事件，例如点击关闭按钮等
    for event in pygame.event.get():
        if event.type == QUIT:
            pygame.quit()
            exit()

    # 找出每个窗口的最高振幅音
    #max_frequency_index = np.argmax(stft[:, i])  # 找出当前窗口最高振幅的索引
    #max_frequency = librosa.fft_frequencies(sr=sr, n_fft=2048*8)[max_frequency_index]  # 将索引转换为频率
    #max_amplitude = stft[max_frequency_index, i]  # 获取最高振幅
    #if max_amplitude < 3:
    #    max_amplitude = 1
    #time_data.append(i+1),frequency_data.append(max_frequency),amplitude_data.append(max_amplitude)
    #print(f"窗口{i+1}：频率为{max_frequency} Hz，振幅为{max_amplitude}")

    f0 = librosa.yin(audio[i * 512:(i + 1) * 512], fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    f0 = f0[0] if f0.size > 0 else 0

    max_amplitude = stft[:, i].max()  # 获取当前窗口的最高振幅

    # 输出每个窗口的信息
    if max_amplitude < 3:
        max_amplitude = 1
    time_data.append(i+1)
    frequency_data.append(f0)
    amplitude_data.append(max_amplitude)
    print(f"窗口{i+1}：频率为{f0} Hz，振幅为{max_amplitude}")


# 结束 pygame
pygame.quit()
print(len(frequency_data))
print(amplitude_data[100])
print(frequency_data[100])
# 使用 ffmpeg 将帧和音频合成为视频
#os.system(f"ffmpeg -y -r {fps} -f image2 -s 1920x1080 -i {output_dir}/frame%04d.png -i {filename} -c:v libx264 -pix_fmt yuv420p movie.mp4")


窗口1：频率为2205.0 Hz，振幅为1
窗口2：频率为2205.0 Hz，振幅为1
窗口3：频率为2205.0 Hz，振幅为1
窗口4：频率为2205.0 Hz，振幅为1
窗口5：频率为2205.0 Hz，振幅为1
窗口6：频率为2205.0 Hz，振幅为1
窗口7：频率为381.61503055450044 Hz，振幅为1
窗口8：频率为234.58031960700134 Hz，振幅为1
窗口9：频率为175.30792130725234 Hz，振幅为1
窗口10：频率为76.55502833431488 Hz，振幅为1
窗口11：频率为2205.0 Hz，振幅为1
窗口12：频率为380.16788798609457 Hz，振幅为3.0249240398406982
窗口13：频率为996.1252918643668 Hz，振幅为3.266350746154785
窗口14：频率为1229.2623404265767 Hz，振幅为3.4940645694732666
窗口15：频率为322.8741517882136 Hz，振幅为3.7085747718811035
窗口16：频率为596.3656468971993 Hz，振幅为3.9087741374969482
窗口17：频率为593.2087398800065 Hz，振幅为4.093283176422119
窗口18：频率为713.8068093355571 Hz，振幅为4.261888027191162
窗口19：频率为1177.287962330955 Hz，振幅为4.413177490234375
窗口20：频率为79.5531733012645 Hz，振幅为4.543734073638916
窗口21：频率为650.1588105580321 Hz，振幅为4.651241779327393
窗口22：频率为319.0682691335656 Hz，振幅为4.733795166015625
窗口23：频率为395.33792794657836 Hz，振幅为4.791959285736084
窗口24：频率为2205.0 Hz，振幅为4.82532262802124
窗口25：频率为2205.0 Hz，振幅为4.8369927406311035
窗口26：频率为1676.348838424216

In [78]:
replacement_value = 800
start_index = 250
end_index = 650
replacement_list = [replacement_value] * (end_index - start_index)
print(replacement_list)

# 确保替换列表的长度与要替换的范围的长度相同
if len(replacement_list) == end_index - start_index:
    # 使用切片赋值将替换列表中的元素替换为原始列表中的元素
    amplitude_data[start_index:end_index] = replacement_list

# 打印替换后的列表
for i in amplitude_data:
    print('震幅為:',i)  # 输出：[1, 2, 10, 11, 12, 6, 7, 8, 9]

[800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800, 800,

In [105]:
import math
def freq_to_note(freq):
    # 基準音 A4 的頻率（在西方音樂中，A4 是440 Hz）
    A4_freq = 440.0
    # 半音間隔的比例常數
    semitone_ratio = 2 ** (1/12)
    
    # 計算音高相對於 A4 的半音數
    num_semitones = 12 * math.log2(int(freq) / A4_freq)
    # 將半音數四捨五入到最接近的整數
    num_semitones_rounded = round(num_semitones)
    
    # 定義音名列表（包括升降號）
    note_names = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
    
    # 計算音名
    note_index = num_semitones_rounded % 12
    octave = 4 + num_semitones_rounded // 12
    
    return note_names[note_index] + str(octave)

import numpy as np
note_to_semitone = {
    'C': 0,
    'C#': 1, 'Db': 1,
    'D': 2,
    'D#': 3, 'Eb': 3,
    'E': 4,
    'F': 5,
    'F#': 6, 'Gb': 6,
    'G': 7,
    'G#': 8, 'Ab': 8,
    'A': 9,
    'A#': 10, 'Bb': 10,
    'B': 11
}

def note_to_freq(note):
    # 提取音名和八度
    if len(note) == 3:  # like C#4 or Db4
        note_name = note[:2]
        octave = int(note[2])
    else:  # like C4
        note_name = note[0]
        octave = int(note[1])
    
    # 計算相對於C0的半音數
    semitone_offset = note_to_semitone[note_name]
    
    # 計算MIDI音高數
    midi_number = semitone_offset + (octave + 1) * 12
    
    return 440.0 * (2 ** ((midi_number - 69) / 12))


print(note_to_freq('B3'))
print(note_to_freq('C#4'))
print(freq_to_note(260))

246.94165062806206
277.1826309768721
C3


In [106]:
notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
note_list = [note + str(octave) for octave in range(1, 9) for note in notes]

def get_note(value):
    try:
        # 找到指定音符的索引
        index = note_list.index(value)
        # 如果索引为0，说明是第一个音符，没有前一个音符
        if index == 0:
            return None
        else:
            # 返回前一个音符
            return [note_list[index - 1],note_list[index + 1]]
    except ValueError:
        # 如果指定音符不在列表中，返回None
        return None
print(get_note('D5'))

['C#5', 'D#5']


In [107]:
replacement_value = 'E3'
value = 'C4'
def change_pitch(replacement_value, value):
    print(get_note(value))
    print(note_to_freq(get_note(value)[0]))
    print(note_to_freq(get_note(value)[1]))
    try:
        for idx in range(200):  # 假设范围是100到200
            freq = frequency_data[idx]
            if freq > note_to_freq(get_note(value)[0]) and freq < note_to_freq(get_note(value)[1]):
                print(f'把頻率{freq}的音改為{note_to_freq(replacement_value)}')
                frequency_data[idx] = note_to_freq(replacement_value)
                #print(frequency[idx])
    except ValueError:
        return None

print(change_pitch(replacement_value, value))

# 输出修改后的列表


['B3', 'C#4']
246.94165062806206
277.1826309768721
把頻率261.84557346256的音改為164.81377845643496
把頻率274.6375094761715的音改為164.81377845643496
把頻率273.3660844460788的音改為164.81377845643496
把頻率256.1668914617507的音改為164.81377845643496
把頻率264.46963080149396的音改為164.81377845643496
把頻率259.44447402268077的音改為164.81377845643496
把頻率258.43530579407314的音改為164.81377845643496
None


In [73]:
replacement_value = 'E3'
start_index = 50
end_index = 350
replacement_list = [replacement_value] * (end_index - start_index)
print(replacement_list)


 #确保替换列表的长度与要替换的范围的长度相同
if len(replacement_list) == end_index - start_index:
    # 使用切片赋值将替换列表中的元素替换为原始列表中的元素
    frequency_data[start_index:end_index] = replacement_list

# 打印替换后的列表
#print(frequency_data)  # 输出：[1, 2, 10, 11, 12, 6, 7, 8, 9]

['B2', 'C#3']


In [108]:
import numpy as np

import librosa
from IPython.display import Audio
print(frequency_data)
def generate_audio(frequency_data, amplitude_data, time_data, sample_rate=44100):
    # 將頻率和振幅數據堆疊為二維數組
    freq_ampl_nparray = np.vstack((frequency_data, amplitude_data))
    
    # 確定時間窗口的長度和跳躍長度
    hop_length = 512
    win_length = 2048

    # 初始化音頻數據
    audio_data = np.zeros((len(time_data) * hop_length + win_length))

    # 將每個時間窗口的頻率和振幅轉換為時間域信號
    for i, (frequency, amplitude) in enumerate(zip(*freq_ampl_nparray)):
        # 計算時間窗口的起始樣本位置
        start_sample = i * hop_length
        # 計算每個時間窗口的時間軸
        time_axis = np.arange(start_sample, start_sample + win_length)
        # 計算正弦波信號
        sin_wave = amplitude * np.sin(2 * np.pi * frequency * time_axis / sample_rate)
        # 添加到總音頻數據中
        audio_data[start_sample:start_sample + win_length] += sin_wave

    # 顯示音頻
    return audio_data

# 調用函數生成音頻數據並顯示
audio_data = generate_audio(frequency_data, amplitude_data, time_data) #是震幅調整的有問題
Audio(data=audio_data, rate=44100)





[2205.0, 2205.0, 2205.0, 2205.0, 2205.0, 2205.0, 381.61503055450044, 234.58031960700134, 175.30792130725234, 76.55502833431488, 2205.0, 380.16788798609457, 996.1252918643668, 1229.2623404265767, 322.8741517882136, 596.3656468971993, 593.2087398800065, 713.8068093355571, 1177.287962330955, 79.5531733012645, 650.1588105580321, 319.0682691335656, 395.33792794657836, 2205.0, 2205.0, 1676.3488384242166, 472.2900391278318, 350.3578744102934, 2205.0, 1824.6170630246884, 1879.8788692979383, 162.57862641846572, 139.893898529477, 906.5585282162087, 244.5257583347589, 118.88237670935749, 497.9836791505885, 1100.3850554671046, 148.8374356448462, 114.68534336540851, 1816.6647281996707, 206.45549649207777, 286.15446405166165, 594.3500529002371, 135.4448236881884, 598.0087354629945, 738.5208070886504, 286.5285573093305, 1701.0948049452318, 226.68777995812997, 529.9703055999397, 2205.0, 1317.8842065803383, 292.07883393648564, 159.38491449380658, 2205.0, 1116.9669635767614, 407.8787452551509, 218.40861

In [81]:
import numpy as np
import librosa
from IPython.display import Audio

# 假设你已经有了 frequency_data、amplitude_data、hop_length 和 win_length

hop_length = 16
win_length = 128

# 将频率和振幅数据转换为 numpy 数组
frequency_data_2d = np.array(frequency_data).reshape(-1, 1)
amplitude_data_2d = np.array(amplitude_data).reshape(-1, 1)

# 将频率和振幅数据组合成复数形式的 STFT 数据
stft_data = frequency_data_2d + 1j * amplitude_data_2d
print(stft_data.shape)
# 使用 iSTFT 将 STFT 数据转换为时域信号
istft_signal = librosa.istft(stft_data, hop_length=hop_length, win_length=win_length)

# 打印输出的时域信号的长度
print(len(istft_signal))
#Audio(istft_signal,rate=44100)

(399, 1)
0


ValueError: zero-size array to reduction operation maximum which has no identity