In [25]:
import os
import time
import librosa
import torch
import torchaudio
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import time
from torchaudio.transforms import MFCC
import torchaudio.transforms as T


In [26]:
folder_path = 'C:/Users/kevin.KEVIN/Desktop/Audio/audio_files'

In [27]:
def extract_audio_features(filepath):
    # Load with torchaudio for other features
    waveform, sample_rate = torchaudio.load(filepath)
    waveform_np = waveform.numpy().squeeze()  # Convert to numpy for librosa processing

    result = torch.tensor([])

    # Ensure waveform is mono (if stereo, take mean across channels)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # 1. Mel-frequency cepstral coefficients (MFCC) 
    # 13 for arithmetic mean and 13 for coefficient of variation
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 1024, "hop_length": 128}
    )
    mfccs = mfcc_transform(waveform)
    mfccs_mean = torch.mean(mfccs, dim=-1).squeeze()
    mfccs_std = torch.std(mfccs, dim=-1).squeeze()
    coef_variation = mfccs_std / (mfccs_mean + 1e-6)
    result = torch.cat((result, mfccs_mean, coef_variation), dim=0)

    # 2. Chroma feature (using librosa)
    # 12 for chroma mean
    stft = np.abs(librosa.stft(waveform_np, n_fft=512, hop_length=256))
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    chroma_mean = torch.tensor(np.mean(chroma, axis=1))
    result = torch.cat((result, chroma_mean), dim=0)
    
    # 3. Mel-spectrogram
    # 128 for mel-spectrogram mean
    mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)
    mel_spec = mel_transform(waveform)
    mel_mean = torch.mean(mel_spec, dim=-1).squeeze()
    result = torch.cat((result, mel_mean), dim=0)

    # 4. Zero crossing rate
    # 398 for zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(waveform_np, frame_length=2048, hop_length=512)
    zcr = zcr.flatten()
    result = torch.cat((result, torch.tensor(zcr)), dim=0)

    # 5. Root square energy
    # 398 for RMS energy
    rms = librosa.feature.rms(y=waveform_np, frame_length=2048, hop_length=512)
    rms = rms.flatten()
    rms_tensor = torch.tensor(rms, dtype=torch.float32)
    result = torch.cat((result, rms_tensor), dim=0)

    # 6. Harmonic-to-noise ratio
    # 1 for HNR mean
    harmonic = librosa.effects.harmonic(y=waveform_np)
    percussive = librosa.effects.percussive(y=waveform_np)
    hnr_mean = torch.tensor(np.mean(harmonic / (percussive + 1e-6))).unsqueeze(0)
    result = torch.cat((result, hnr_mean), dim=0)

    '''
    # 3. Zero crossing rate
    zero_crossings = (waveform[:, 1:] * waveform[:, :-1] < 0).sum(dim=1).float()
    zcr_mean = zero_crossings.mean()
    zcr_std = zero_crossings.std()
    #result = torch.cat((result, torch.tensor([zcr_mean, zcr_std])), dim=0)
    result = torch.cat((result, zero_crossings), dim=0)

    # 4. Harmonic-to-noise ratio (using librosa)
    harmonic = librosa.effects.harmonic(waveform_np)
    percussive = librosa.effects.percussive(waveform_np)
    hnr_mean = torch.tensor(np.mean(harmonic / (percussive + 1e-6))).unsqueeze(0)  # Add small constant to avoid division by zero
    hnr_std = torch.tensor(np.std(harmonic / (percussive + 1e-6))).unsqueeze(0)
    result = torch.cat((result, hnr_mean, hnr_std), dim=0)

    # 5. Mel-spectrogram
    
    '''

    return result.numpy(), os.path.basename(filepath)

# Main processing loop
folder_path = "C:/Users/kevin.KEVIN/Desktop/Audio/audio_files"
features_list = []
names_list = []

start_time = time.time()
for filename in tqdm(os.listdir(folder_path), desc="Processing audio files"):
    file_path = os.path.join(folder_path, filename)
    
    if filename.endswith('.wav'):
        features, name = extract_audio_features(file_path)
        features_list.append(features)
        names_list.append(name)

# Convert to DataFrame
final_data = pd.DataFrame(features_list)
final_data['name'] = names_list

print("Data shape:", final_data.shape)
print("Time taken:", time.time() - start_time)


Processing audio files: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s]

Data shape: (2, 964)
Time taken: 0.9608774185180664





In [28]:
final_data.shape

(2, 964)

In [29]:
mfcc_label_mean = "mfcc_arithmetic_mean_"
mfcc_coef_variation = "mfcc_coef_variation_"
chroma_label = "chroma_"
mel_spectrogram_label = "mel_spectrogram_"
zcr_label = "zero_crossing_rate_"
rms_label = "rms_"
hnr_label = "harmonic_to_noise_ratio_"

for column in range(len(final_data.columns)):
    if column < 13:
        final_data = final_data.rename(columns={column: mfcc_label_mean + str(column + 1)})
    elif column < 26:
        final_data = final_data.rename(columns={column: mfcc_coef_variation + str(column - 13 + 1)})
    elif column < 38:
        final_data = final_data.rename(columns={column: chroma_label + str(column - 26 + 1)})
    elif column < 166:
        final_data = final_data.rename(columns={column: mel_spectrogram_label + str(column - 38 + 1)})
    elif column < 564:
        final_data = final_data.rename(columns={column: zcr_label + str(column - 166 + 1)})
    elif column < 962:
        final_data = final_data.rename(columns={column: rms_label + str(column - 564 + 1)})
    else:
        final_data = final_data.rename(columns={column: hnr_label + str(column - 962 + 1)})
    

In [30]:
final_data

Unnamed: 0,mfcc_arithmetic_mean_1,mfcc_arithmetic_mean_2,mfcc_arithmetic_mean_3,mfcc_arithmetic_mean_4,mfcc_arithmetic_mean_5,mfcc_arithmetic_mean_6,mfcc_arithmetic_mean_7,mfcc_arithmetic_mean_8,mfcc_arithmetic_mean_9,mfcc_arithmetic_mean_10,...,rms_391,rms_392,rms_393,rms_394,rms_395,rms_396,rms_397,rms_398,harmonic_to_noise_ratio_1,name
0,-495.520599,79.076004,-11.591728,7.662766,-11.700242,5.068298,-16.33341,-8.82016,-17.248329,-7.643953,...,6.743496e-07,6.743496e-07,6.743496e-07,0.0,0.0,1e-06,2e-06,2e-06,13.971914,audio_1.wav
1,-569.218811,50.350269,18.299604,12.35566,1.612295,12.272793,-16.573362,2.895184,-4.377674,-11.422575,...,,,,,,,,,,audio_2.wav


In [31]:
final_data.columns

Index(['mfcc_arithmetic_mean_1', 'mfcc_arithmetic_mean_2',
       'mfcc_arithmetic_mean_3', 'mfcc_arithmetic_mean_4',
       'mfcc_arithmetic_mean_5', 'mfcc_arithmetic_mean_6',
       'mfcc_arithmetic_mean_7', 'mfcc_arithmetic_mean_8',
       'mfcc_arithmetic_mean_9', 'mfcc_arithmetic_mean_10',
       ...
       'rms_391', 'rms_392', 'rms_393', 'rms_394', 'rms_395', 'rms_396',
       'rms_397', 'rms_398', 'harmonic_to_noise_ratio_1', 'name'],
      dtype='object', length=964)

In [32]:
for column in final_data.columns:
    print(column)

mfcc_arithmetic_mean_1
mfcc_arithmetic_mean_2
mfcc_arithmetic_mean_3
mfcc_arithmetic_mean_4
mfcc_arithmetic_mean_5
mfcc_arithmetic_mean_6
mfcc_arithmetic_mean_7
mfcc_arithmetic_mean_8
mfcc_arithmetic_mean_9
mfcc_arithmetic_mean_10
mfcc_arithmetic_mean_11
mfcc_arithmetic_mean_12
mfcc_arithmetic_mean_13
mfcc_coef_variation_1
mfcc_coef_variation_2
mfcc_coef_variation_3
mfcc_coef_variation_4
mfcc_coef_variation_5
mfcc_coef_variation_6
mfcc_coef_variation_7
mfcc_coef_variation_8
mfcc_coef_variation_9
mfcc_coef_variation_10
mfcc_coef_variation_11
mfcc_coef_variation_12
mfcc_coef_variation_13
chroma_1
chroma_2
chroma_3
chroma_4
chroma_5
chroma_6
chroma_7
chroma_8
chroma_9
chroma_10
chroma_11
chroma_12
mel_spectrogram_1
mel_spectrogram_2
mel_spectrogram_3
mel_spectrogram_4
mel_spectrogram_5
mel_spectrogram_6
mel_spectrogram_7
mel_spectrogram_8
mel_spectrogram_9
mel_spectrogram_10
mel_spectrogram_11
mel_spectrogram_12
mel_spectrogram_13
mel_spectrogram_14
mel_spectrogram_15
mel_spectrogram_16
m