In [1]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import os
# np.set_printoptions(threshold=np.inf)

In [6]:
def extract_features(file_path, target_duration = 8, target_sr = 22050):
    """
    Compute the mean RMS, zero-crossing rate, spectral centroid and spectral
    bandwidth of one audio file.

    The file is read with ``librosa.load(file_path, sr=target_sr)``. When the
    recording holds more than ``target_duration`` seconds of samples, only a
    window of exactly ``int(target_duration * sr)`` samples around the middle
    of the recording is analysed; shorter recordings are analysed in full.

    Parameters
    ----------
    file_path : path of the audio file, forwarded to ``librosa.load``.
    target_duration : length of the analysis window in seconds (default 8).
    target_sr : sampling rate forwarded to ``librosa.load`` (default 22050).

    Returns
    -------
    tuple
        ``(mean_rms, mean_zcr, mean_spectral_centroid, mean_spectral_bandwidth)``,
        each the ``np.mean`` of the corresponding per-frame feature.
    """
    FRAME_LENGTH = 1024
    HOP_LENGTH = 512

    audio, sr = librosa.load(file_path, sr=target_sr)

    # Centre-crop long recordings. The window end is derived from the start so
    # that the crop is exactly `target_samples` long even when that count is
    # odd (taking `target_samples // 2` on each side would drop one sample).
    target_samples = int(target_duration * sr)
    if len(audio) > target_samples:
        start_sample = max(0, len(audio) // 2 - target_samples // 2)
        audio = audio[start_sample:start_sample + target_samples]

    # Each librosa feature call returns a 2-D array; row 0 is the per-frame series.
    rms = librosa.feature.rms(y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    zcr = librosa.feature.zero_crossing_rate(y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    sc = librosa.feature.spectral_centroid(y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    sb = librosa.feature.spectral_bandwidth(y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

    return np.mean(rms), np.mean(zcr), np.mean(sc), np.mean(sb)

In [8]:
def extract_mean_features(file_path, target_duration=8, target_sr=22050):
    """
    Extract mean values of RMS, ZCR, Spectral Centroid, and Spectral Bandwidth from the audio file.

    The file is read with ``librosa.load(file_path, sr=target_sr)``. Recordings
    holding more than ``target_duration`` seconds of samples are reduced to a
    window of exactly ``int(target_duration * sr)`` samples around their
    midpoint before analysis; shorter recordings are analysed in full.

    Parameters
    ----------
    file_path : path of the audio file, forwarded to ``librosa.load``.
    target_duration : length of the analysis window in seconds (default 8).
    target_sr : sampling rate forwarded to ``librosa.load`` (default 22050).

    Returns
    -------
    tuple
        ``(mean_rms, mean_zcr, mean_spectral_centroid, mean_spectral_bandwidth)``.
    """
    FRAME_LENGTH = 1024
    HOP_LENGTH = 512

    audio, sr = librosa.load(file_path, sr=target_sr)

    # Keep only the central window of long recordings. Computing the end from
    # the start guarantees the full `target_samples` length for odd counts too.
    target_samples = int(target_duration * sr)
    if len(audio) > target_samples:
        start_sample = max(0, len(audio) // 2 - target_samples // 2)
        audio = audio[start_sample:start_sample + target_samples]

    # Shared keyword arguments: time-domain features take `frame_length`,
    # spectral features take the same window size as `n_fft`.
    framing = dict(frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    spectral = dict(sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)

    # Row 0 of each returned 2-D array is the per-frame feature series.
    per_frame = (
        librosa.feature.rms(y=audio, **framing)[0],
        librosa.feature.zero_crossing_rate(y=audio, **framing)[0],
        librosa.feature.spectral_centroid(y=audio, **spectral)[0],
        librosa.feature.spectral_bandwidth(y=audio, **spectral)[0],
    )
    return tuple(np.mean(series) for series in per_frame)


def process_audio_files_mean(folder_path, csv_file_path, f_name):
    """
    Process audio files and save mean feature values (RMS, ZCR, SC, SB) to CSV.

    Every ``.mp3`` / ``.wav`` file found under ``folder_path`` (recursively) is
    passed to ``extract_mean_features``. The four resulting values are added to
    the table read from ``csv_file_path`` as the columns ``MeanRMS``,
    ``MeanZCR``, ``MeanSpectralCentroid`` and ``MeanSpectralBandwidth``, and
    the table is written to ``result/<f_name>``.

    Row alignment:
      * If the CSV has a ``file_name`` column and the audio base names are
        unique, each row receives the features of the audio file with the same
        base name, independent of directory traversal order. Audio files with
        no matching row are ignored.
      * Otherwise features are assigned by position in traversal order, which
        requires exactly one audio file per CSV row.

    Parameters
    ----------
    folder_path : directory searched for audio files.
    csv_file_path : CSV file to extend with the feature columns.
    f_name : file name of the CSV written inside the ``result`` directory.

    Raises
    ------
    ValueError
        If a CSV row has no matching audio file, or (positional mode) the
        number of audio files differs from the number of CSV rows.
    """
    feature_columns = ('MeanRMS', 'MeanZCR', 'MeanSpectralCentroid', 'MeanSpectralBandwidth')

    df = pd.read_csv(csv_file_path)
    file_names = []
    rows = []  # one (rms, zcr, sc, sb) tuple per audio file, in traversal order

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(('.mp3', '.wav')):
                file_names.append(file)
                rows.append(extract_mean_features(os.path.join(root, file)))

    print("Number of processed files: ", len(rows))

    if 'file_name' in df.columns and len(set(file_names)) == len(file_names):
        # os.walk yields files in arbitrary order, so match on the name
        # rather than trusting that it coincides with the CSV row order.
        row_of = {name: index for index, name in enumerate(file_names)}
        missing = [name for name in df['file_name'] if name not in row_of]
        if missing:
            raise ValueError(
                f"No audio file found under {folder_path!r} for CSV rows: {missing}"
            )
        order = [row_of[name] for name in df['file_name']]
    else:
        if len(rows) != len(df):
            raise ValueError(
                f"Found {len(rows)} audio files under {folder_path!r} "
                f"but {csv_file_path!r} has {len(df)} rows"
            )
        order = range(len(rows))

    # Add mean values as new columns in the CSV
    for position, column in enumerate(feature_columns):
        df[column] = [rows[index][position] for index in order]

    # Save the updated DataFrame to a new CSV, creating the output directory if needed
    output_path = os.path.join("result", f_name)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Successfully added mean feature values to {f_name}")

    print(df)


# Folders holding the two audio collections.
human_audio_folder = "AudioData/human"
synthetic_audio_folder = "AudioData/synthetic"

# Only the synthetic collection is processed here. The human collection uses the same call:
# process_audio_files_mean(human_audio_folder, "result/human_mfcc_features.csv", "human_mean_features.csv")
process_audio_files_mean(
    folder_path=synthetic_audio_folder,
    csv_file_path="result/synthetic_mfcc_features.csv",
    f_name="synthetic_mean_features.csv",
)


Number of processed files:  10
Successfully added mean feature values to synthetic_mean_features.csv
               file_name  mfcc_feature1  mfcc_feature2  mfcc_feature3  \
0   spanish86_cloned.mp3    -253.368515     135.128464      -7.459697   
1    swedish4_cloned.mp3    -166.629471      89.696144       0.042984   
2   tagalog12_cloned.mp3    -226.903900     105.014870     -12.404805   
3  taiwanese6_cloned.mp3    -242.969666     140.078857      -0.760939   
4     tajiki1_cloned.mp3    -171.743820      81.141678     -36.365517   
5      thai10_cloned.mp3    -193.134186      84.791306      -3.990597   
6   tigrigna5_cloned.mp3    -245.046997     126.281441     -15.192507   
7    turkish6_cloned.mp3    -253.869415     138.880753     -10.747830   
8      urdo10_cloned.mp3    -196.868088     144.456467     -16.745396   
9     yidish3_cloned.mp3    -188.208832     163.903702     -39.247017   

   mfcc_feature4  mfcc_feature5  mfcc_feature6  mfcc_feature7  mfcc_feature8  \
0      32.95785