In [1]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from librosa.feature import tempogram, chroma_cqt, mfcc
from librosa.core import stft
from sklearn.model_selection import KFold
from sklearn.mixture import GaussianMixture

from scipy.io import wavfile
import matplotlib.pyplot as plt

In [2]:
# 載入音樂樣本並提取MFCC特徵
def extract_features(file_path):
    hop_length = 64
    n_fft = 256
    max_length = 100
    
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # Reduce beat feature dimensionality
    beat_features = beat_features[:12, :]
    
    # 提取音高直方圖特 徵
    pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # Apply padding or truncation to ensure fixed size
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    beat_features = pad_or_truncate(beat_features, max_length)
    pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features ,beat_features, pitch_features

# Function to pad or truncate a feature to the desired length
def pad_or_truncate(feature, max_length):
    if feature.shape[1] < max_length:
        # Pad the feature with zeros to the desired length
        feature = np.pad(feature, ((0, 0), (0, max_length - feature.shape[1])), mode='constant')
        
    else:
        # Truncate the feature to the desired length
        feature = feature[:, :max_length]
    
    return feature

# 載入音樂樣本和標籤
def load_data():
    # 設定音樂類別和資料夾路徑
    classes = ['blues', 'country','classical','disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock' ] # , 
    data = []
    labels = []

    for label, class_name in enumerate(classes):
        folder_path = f'genre/{class_name}/'
        print(folder_path)
        for file_name in os.listdir(folder_path):
            if not file_name.endswith('.wav'):
                continue
            file_path = os.path.join(folder_path, file_name)
            features = extract_features(file_path)
            data.append(features)
            labels.append(label)
    return data, labels

# 加載和準備數據
X, y = load_data()
# 初始化GMM模型
gmm = GaussianMixture(n_components=3)

# 將MFCC、節奏直方圖和音高直方圖特徵結合成一個特徵向量
combined_features = []
for i in range(len(X)):
    mfcc_features = X[i]
    mfcc_features, beat_features, pitch_features = X[i]
    mfcc_features_flatten = mfcc_features.flatten()
    beat_features_flatten = beat_features.flatten()
    pitch_features_flatten = pitch_features.flatten()
    
    combined_feature = np.concatenate((mfcc_features_flatten, beat_features_flatten, pitch_features_flatten))
    combined_features.append(combined_feature)
    
X = np.array(combined_features)
y = np.array(y)
combined_features.clear()
print(len(X), len(y))

genre/blues/
genre/country/
genre/classical/
genre/disco/
genre/hiphop/
genre/jazz/
genre/metal/
genre/pop/
genre/reggae/
genre/rock/
500 500


In [3]:
# 5 cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i+1}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # 訓練GMM模型
    gmm.fit(X_train)

    # 在測試集上進行預測
    y_pred = gmm.predict(X_test)

    # 計算分類準確率
    accuracy = np.mean(y_pred == y_test)
    accuracies.append(accuracy)

# 計算平均準確率
mean_accuracy = np.mean(accuracies)
print("平均分類準確率：", mean_accuracy)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
平均分類準確率： 0.06600000000000002


## Record first time
- label class: blues vs classical
```python
def extract_features(file_path):
    hop_length = 512
    n_fft = 2048

    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    
    # 提取節奏直方圖特徵
    beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # 提取音高直方圖特徵
    pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # Apply padding or truncation to ensure fixed size
    max_length = 10
    
    gmm = GaussianMixture(n_components=3)
```
- mfcc_features.shape, beat_features.shape, pitch_features.shape:(20, 10) (2048, 10) (12, 10)
- accuaracy : 0.33

## Record second time
- label class: all
```python
def extract_features(file_path):
    hop_length = 64
    n_fft = 128

    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # Reduce beat feature dimensionality
    beat_features = beat_features[:12, :]
    
    # 提取音高直方圖特徵
    pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    max_length = 500
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    beat_features = pad_or_truncate(beat_features, max_length)
    pitch_features = pad_or_truncate(pitch_features, max_length)
    
    gmm = GaussianMixture(n_components=3)
```
- mfcc_features.shape, beat_features.shape, pitch_features.shape:(20, 500) (12, 500) (12, 500)
- accuaracy : 0.111999999

## Record third time
- label class: all
```python
def extract_features(file_path):
    hop_length = 64
    n_fft = 256

    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    max_length = 700
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features

    gmm = GaussianMixture(n_components=3)
```
- mfcc_features.shape, beat_features.shape, pitch_features.shape:(20, 700)
- accuaracy : 0.112

## Record fourth time
- label class: all
```py
    hop_length = 64
    n_fft = 256
    max_length = 700
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features
```
- mfcc_features.shape:(20, 700) 
- accuaracy : 0.07599

## Record fifth time
- label class: all
```py
def extract_features(file_path):
    hop_length = 64
    n_fft = 256
    max_length = 500
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features
```
- mfcc_features.shape:(20, 500) 
- accuaracy : 0.11

## Record sixth time
- label class: all
```python
def extract_features(file_path):
    hop_length = 64
    n_fft = 256
    max_length = 100
    
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features
```
- mfcc_features.shape:(20, 100) 
- accuaracy : 0.158

## Record seventh time
- label class: all
```python
def extract_features(file_path):
    hop_length = 64
    n_fft = 256
    max_length = 800
    
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features
```
- mfcc_features.shape:(20, 800) 
- accuaracy : 0.088

## Record eighth time
- label class: all
```python
def extract_features(file_path):
    hop_length = 64
    n_fft = 256
    max_length = 100
    
    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    # beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # # Reduce beat feature dimensionality
    # beat_features = beat_features[:12, :]
    
    # # 提取音高直方圖特 徵
    # pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    # beat_features = pad_or_truncate(beat_features, max_length)
    # pitch_features = pad_or_truncate(pitch_features, max_length)
    
    return mfcc_features #, beat_features, pitch_features
```
- mfcc_features.shape ,beat_features, pitch_features:(20, 100) (12, 100) (12, 100) 
- accuaracy : 0.06600000000000002

## Kaggle record
- label class: all
```pyth
def extract_features(file_path):
    hop_length = 32
    n_fft = 64
    max_length = 100

    # 根據需要進行音樂樣本載入，這裡假設使用wav檔案格式
    sample_rate, signal = wavfile.read(file_path)
    # Convert audio data to floating-point format
    signal = signal.astype(np.float32) / np.iinfo(signal.dtype).max
    
    # Calculate MFCC features
    mfcc_features = mfcc(y=signal, sr=sample_rate)
    # 提取節奏直方圖特徵
    beat_features = tempogram(y=signal, sr=sample_rate, hop_length=hop_length, win_length=n_fft)
    # Reduce beat feature dimensionality
    beat_features = beat_features[:12, :]
    
    # 提取音高直方圖特徵
    pitch_features = chroma_cqt(y=signal, sr=sample_rate, hop_length=hop_length, n_chroma=12)
    
    # print(mfcc_features.shape, beat_features.shape, pitch_features.shape)
    # Apply padding or truncation to ensure fixed size
    mfcc_features = pad_or_truncate(mfcc_features, max_length)
    beat_features = pad_or_truncate(beat_features, max_length)
    pitch_features = pad_or_truncate(pitch_features, max_length)

    gmm = GaussianMixture(n_components=3)
```
- mfcc_features.shape, beat_features.shape, pitch_features.shape:(20, 100) (12, 100) (12, 100)
- accuaracy : 0.124