# 1. Define Imports

In [1]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
import os

from tqdm import tqdm
from datasets import load_dataset

# 2. Data Cleaning

In [2]:
# Convert stereo to mono (two or more channels to sigle channel)
def pre_process(audio_file):
    # conert to single hz
    target_sr = 22050 
    # Load the data with resampling
    signal, sr = librosa.load(audio_file, sr=target_sr, mono=False, res_type='kaiser_best')
    if signal.ndim > 1:  # Check if the signal is stereo
        print(True)
        # Convert stereo to mono by averaging the channels
        signal = np.mean(signal, axis=0)
    return signal, target_sr

In [3]:
# NOTE: For training. No need to copy
def pre_process_hugging_face(audio_array, original_sr):
    # Target sampling rate for resampling
    target_sr = 22050 
    
    # Resample the audio array to target sampling rate
    signal = librosa.resample(audio_array.astype(np.float32), orig_sr=original_sr, target_sr=target_sr)
    
    # Check if it's stereo (more than 1 channel)
    if signal.ndim > 1:
        # Convert stereo to mono by averaging the channels
        signal = np.mean(signal, axis=0)
    
    return signal, target_sr

# 3. Extract MFCC

In [4]:
def extract_MFCC(signal, sr):
    signal.shape # The output is 1 dimensional array with the following value
    # Sample rate
    # print("Signal: ", signal)
    # print("Sample Rate: ", sr)

    # Extract MFCC
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    # print(mfccs.shape) # First value is the number of rows, second value is the number of columns or frames
    # print(mfccs)

    # First Derivative -  Capture the temporal dynamics of the speech signal, providing information about how the MFCCs are changing.
    delta_mfccs = librosa.feature.delta(mfccs)
    # Second Derivative - Capture the dynamics of the delta features, providing additional information about the speech signalâ€™s temporal characteristics.
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)

    comprehensive_mfcc = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    # print(comprehensive_mfcc)
    return comprehensive_mfcc

# 4. Extract RMS, ZCR, SC, SB

### Note: 
#### If you want the mean switch the return value

In [5]:
def extract_features(audio, sr, target_duration = 2, target_sr = 22050):
    # exctract RMS, ZCR, SC, SB
    # audio, sr = librosa.load(file_path, sr=target_sr)  # Load audio file
    duration = librosa.get_duration(y=audio, sr=sr)  # Get duration in seconds
    
    # print(duration)

    FRAME_LENGTH = 1024
    HOP_LENGTH = 512
    
    if duration >= target_duration:
        # print(True)
        # Convert target duration to samples
        target_samples = int(target_duration * sr)
        
        # Get the center point of the audio
        center = len(audio) // 2
        # print("Center: " , center)
        # Calculate the start and end points for cropping
        start_sample = max(0, center - target_samples // 2)
        end_sample = min(len(audio), center + target_samples // 2)
        
        # Crop the audio around the center
        cropped_audio = audio[start_sample:end_sample]
        # print(cropped_audio.shape)

        # Check if the audio was cropped
        # ipd.display(ipd.Audio(cropped_audio, rate=sr))

        # Extract RMS from the cropped audio
        rms_cropped = librosa.feature.rms(y=cropped_audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # Extract ZCR from the cropped audio
        zcr_cropped = librosa.feature.zero_crossing_rate(y=cropped_audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # Extract spectral centroid from the cropped audio
        sc_cropped = librosa.feature.spectral_centroid(y=cropped_audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # Extract spectral bandwidth from the cropped audio
        sb_cropped = librosa.feature.spectral_bandwidth(y=cropped_audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        
        # return rms_cropped, zcr_cropped, sc_cropped, sb_cropped
        return np.mean(rms_cropped), np.mean(zcr_cropped), np.mean(sc_cropped), np.mean(sb_cropped) # for mean
    else:
        # If duration is less than target, process the entire audio
        # print(False)
        rms_full = librosa.feature.rms(y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        zcr_full = librosa.feature.zero_crossing_rate(y=audio, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        sc_full = librosa.feature.spectral_centroid(y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        sb_full = librosa.feature.spectral_bandwidth(y=audio, sr=sr, n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
        # return rms_full, zcr_full, sc_full, sb_full
        return np.mean(rms_full), np.mean(zcr_full), np.mean(sc_full), np.mean(sb_full) # for mean

# 5. Summarize

In [6]:
# Function to process all files in a given folder and accumulate features
# README: For complete values
def process_audio_files(folder_path, output_file_name):
    mfcc_features = []
    rms_values = []
    zcr_values = []
    sc_values = []
    sb_values = []
    file_names = []  # List to store file names
    for root, dirs, files in tqdm(os.walk(folder_path)):
        for file in files:
            if file.endswith('.mp3') or file.endswith('.wav'):
                file_path = os.path.join(root, file)
                print(f"Processing {file}")
                
                # Load the audio signal
                # signal, sr = librosa.load(file_path, sr=22050)
                signal, sr = pre_process(file_path)
                
                # Extract MFCC features
                mfcc = extract_MFCC(signal, sr)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                mfcc_features.append(mfcc_mean)
                
                # Extract RMS, ZCR, SC, SB features
                rms, zcr, sc, sb = extract_features(signal, sr)
                rms_values.append(rms)
                zcr_values.append(zcr)
                sc_values.append(sc)
                sb_values.append(sb)

                file_names.append(file)
            else:
                continue
                
    print("Number of processed files: ", len(rms_values))
    # Create dataframes for MFCC and other features
    mfcc_df = pd.DataFrame(mfcc_features, columns=[f'mfcc_feature{i+1}' for i in range(mfcc_features[0].shape[0])])
    rms_df = pd.DataFrame(rms_values, columns=[f'RMS{i+1}' for i in range(len(rms_values[0]))])
    zcr_df = pd.DataFrame(zcr_values, columns=[f'ZCR{i+1}' for i in range(len(zcr_values[0]))])
    sc_df = pd.DataFrame(sc_values, columns=[f'SpectralCentroid{i+1}' for i in range(len(sc_values[0]))])
    sb_df = pd.DataFrame(sb_values, columns=[f'SpectralBandwidth{i+1}' for i in range(len(sb_values[0]))])

    # Combine all features into a single DataFrame
    combined_df = pd.concat([pd.DataFrame(file_names, columns=['file_name']), mfcc_df, rms_df, zcr_df, sc_df, sb_df], axis=1)

    create_csv(combined_df, output_file_name)

In [7]:
# README: for mean values
def process_audio_files_mean(folder_path, output_file_name):
    mfcc_features = []
    rms_values = []
    zcr_values = []
    sc_values = []
    sb_values = []
    file_names = []  # To keep track of file names

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.mp3') or file.endswith('.wav') or file.endswith('.flac'):
                file_path = os.path.join(root, file)
                print(f"Processing {file}")
                # Load the audio signal
                signal, sr = pre_process(file_path)

                # Extract MFCC features and calculate the mean over time
                mfcc = extract_MFCC(signal, sr)
                mfcc_mean = np.mean(mfcc.T, axis=0)
                mfcc_features.append(mfcc_mean)

                # Extract mean values of RMS, ZCR, SC, SB for the audio file
                rms, zcr, sc, sb = extract_features(signal, sr)
                rms_values.append(rms)
                zcr_values.append(zcr)
                sc_values.append(sc)
                sb_values.append(sb)

                # Append file name for reference
                file_names.append(file)

    print("Number of processed files: ", len(rms_values))

    # Create DataFrame for MFCC features
    mfcc_df = pd.DataFrame(mfcc_features, columns=[f'mfcc_feature{i+1}' for i in range(mfcc_features[0].shape[0])])

    # Create a DataFrame for the mean values of additional features
    df = pd.DataFrame({
        'MeanRMS': rms_values,
        'MeanZCR': zcr_values,
        'MeanSpectralCentroid': sc_values,
        'MeanSpectralBandwidth': sb_values
    })

    # Combine MFCC features, additional mean features, and file names
    combined_df = pd.concat([pd.DataFrame(file_names, columns=['file_name']), mfcc_df, df], axis=1)

    create_csv(combined_df, output_file_name)

In [10]:
# For hugging face dataset only
# Full values
def process_audio_files_hugging_face(dataset, output_file_name):
    mfcc_features = []
    rms_values = []
    zcr_values = []
    sc_values = []
    sb_values = []
    file_names = []  # List to store file names

    # Loop through the dataset
    for i, item in tqdm(enumerate(dataset['train'])):
        # Access the audio path and file name
        file_name = item['audio']['path']
        arr_audio = item['audio']['array']
        sampling_rate = item['audio']['sampling_rate']
        print(f"Processing {file_name}")
        
        # Load the audio signal
        signal, sr = pre_process_hugging_face(arr_audio, sampling_rate)

        # Extract MFCC features
        mfcc = extract_MFCC(signal, sr)
        mfcc_mean = np.mean(mfcc.T, axis=0)
        mfcc_features.append(mfcc_mean)

        # Extract RMS, ZCR, SC, SB features
        rms, zcr, sc, sb = extract_features(signal, sr)
        rms_values.append(rms)
        zcr_values.append(zcr)
        sc_values.append(sc)
        sb_values.append(sb)

        file_names.append(file_name)

    # Create dataframes for MFCC and other features
    mfcc_df = pd.DataFrame(mfcc_features, columns=[f'mfcc_feature{i+1}' for i in range(mfcc_features[0].shape[0])])
    rms_df = pd.DataFrame(rms_values, columns=[f'RMS{i+1}' for i in range(len(rms_values[0]))])
    zcr_df = pd.DataFrame(zcr_values, columns=[f'ZCR{i+1}' for i in range(len(zcr_values[0]))])
    sc_df = pd.DataFrame(sc_values, columns=[f'SpectralCentroid{i+1}' for i in range(len(sc_values[0]))])
    sb_df = pd.DataFrame(sb_values, columns=[f'SpectralBandwidth{i+1}' for i in range(len(sb_values[0]))])
    

    # Combine all features into a single DataFrame
    combined_df = pd.concat([pd.DataFrame(file_names, columns=['file_name']), mfcc_df, rms_df, zcr_df, sc_df, sb_df], axis=1)

    # Save the DataFrame to CSV
    create_csv(combined_df, output_file_name)

# 6. Save to CSV file

In [11]:
# Create a annotated dataset
def create_csv(combined_df, output_file_name):
    # Save to CSV
    combined_df.to_csv(f"result/{output_file_name}", index=False)
    print(f"Successfully saved combined features to {output_file_name}")

# 7. Extract Training Features

In [14]:
# --------------------------------------------------------Deep fake detection------------------------------------------------------
human_audio_folder = "AudioData/human"
synthetic_audio_folder = "AudioData/synthetic/Speechify"

# Human
# process_audio_files(human_audio_folder, "deepfake_detection/Human_full_official.csv")

# AI 
# process_audio_files(synthetic_audio_folder, "deepfake_detection/Synthetic_full_official.csv")
process_audio_files(synthetic_audio_folder, "deepfake_detection/Synthetic_full_speechify.csv") # Speechify

# AI Hugging Face
# ds = load_dataset("saahith/synthetic_with_val")
# ds = load_dataset("birgermoell/synthetic_compassion_wav")
# process_audio_files_hugging_face(ds, "deepfake_detection/Synthetic_full_hf_birgermoell.csv")

# -----------------------------------------------------Speaker identification-------------------------------------------------------

speaker_audio_folder = "AudioData/speaker_audio/speaker_audio"
other_audio_folder = "AudioData/speaker_audio/other_human_audio"

# for all
# process_audio_files(speaker_audio_folder, "voice_recognition/vr_speaker_all_features.csv")
# process_audio_files(other_audio_folder, "voice_recognition/vr_other_all_features.csv")

# for mean
# process_audio_files_mean(speaker_audio_folder, "voice_recognition/training_mean/vr_speaker_mean_features2.csv")
# process_audio_files_mean(other_audio_folder, "voice_recognition/training_mean/vr_other_mean_features2.csv")\

0it [00:00, ?it/s]

Processing spanish86_cloned.mp3
True
Processing speechify_cloned_voice_file107_2024-10-08_11-56-11.mp3
True
Processing speechify_cloned_voice_file108_2024-10-08_12-11-52.mp3
True
Processing speechify_cloned_voice_file110_2024-10-08_12-12-18.mp3
True
Processing speechify_cloned_voice_file112_2024-10-08_12-13-16.mp3
True
Processing speechify_cloned_voice_file115_2024-10-08_12-13-48.mp3
True
Processing speechify_cloned_voice_file116_2024-10-08_12-16-58.mp3
True
Processing speechify_cloned_voice_file119_2024-10-08_12-18-59.mp3
True
Processing speechify_cloned_voice_file122_2024-10-08_12-19-22.mp3
True
Processing speechify_cloned_voice_file125_2024-10-08_12-20-51.mp3
True
Processing speechify_cloned_voice_file127_2024-10-08_12-21-13.mp3
True
Processing speechify_cloned_voice_file12_2024-10-06_18-55-26.mp3
True
Processing speechify_cloned_voice_file130_2024-10-08_12-22-14.mp3
True
Processing speechify_cloned_voice_file134_2024-10-08_12-24-46.mp3
True
Processing speechify_cloned_voice_file137

1it [00:20, 20.77s/it]

True
Number of processed files:  68
Successfully saved combined features to deepfake_detection/Synthetic_full_speechify.csv





# 8. For training only

In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcript', 'duration'],
        num_rows: 405
    })
    validation: Dataset({
        features: ['audio', 'transcript', 'duration'],
        num_rows: 86
    })
    test: Dataset({
        features: ['audio', 'transcript', 'duration'],
        num_rows: 88
    })
})

# For Testing Extract 1 Audio

In [28]:
#  For testing. Extracting 1 audio file

loc = "AudioData/test/voice_recognition_test"
# FULL
# process_audio_files(loc, "voice_recognition/test_full/vr_same_test4.csv") #Change the file name

# MEAN
process_audio_files_mean(loc, "voice_recognition//test_mean//vr_notsame_mean_test3.csv") #Change the file name

[[-5.0998273e+02 -4.8956113e+02 -4.8937140e+02 ... -4.9340005e+02
  -4.9539423e+02 -5.0779730e+02]
 [ 5.0587605e+01  6.6524628e+01  6.6727997e+01 ...  5.9833664e+01
   6.1890678e+01  5.5993782e+01]
 [ 2.4690987e+01  2.5347982e+01  2.4821363e+01 ...  3.2736610e+01
   3.3107082e+01  3.4006363e+01]
 ...
 [-6.4265317e-01 -6.4265317e-01 -6.4265317e-01 ... -6.3449788e-01
  -6.3449788e-01 -6.3449788e-01]
 [-1.3168605e-01 -1.3168605e-01 -1.3168605e-01 ... -5.3732771e-01
  -5.3732771e-01 -5.3732771e-01]
 [ 4.8490107e-01  4.8490107e-01  4.8490107e-01 ... -5.6581754e-02
  -5.6581754e-02 -5.6581754e-02]]
True
Number of processed files:  1
Successfully saved combined features to voice_recognition//test_mean//vr_notsame_mean_test3.csv
