In [1]:
%pip install librosa pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import librosa
import numpy as np
import pandas as pd

def extract_mfcc_features(audio_path, n_mfcc=13):
    """Summarise one audio file as a vector of time-averaged MFCCs.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from
            ``librosa.feature.mfcc`` (default 13).

    Returns:
        A NumPy array holding the mean of each coefficient, obtained by
        averaging the transposed MFCC matrix along its first axis.
    """
    # sr=None: per the librosa docs this keeps the file's native sampling
    # rate instead of resampling to librosa's default.
    signal, sample_rate = librosa.load(audio_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Transpose first, then average over axis 0, so each coefficient is
    # collapsed to a single mean value.
    per_frame = coefficients.T
    return per_frame.mean(axis=0)

def process_directory(directory, label):
    """Build labelled MFCC feature rows for every FLAC file under a directory.

    The tree rooted at ``directory`` is walked recursively. Each file whose
    name ends in ``.flac`` (compared case-insensitively, so ``.FLAC`` is also
    accepted) is passed to ``extract_mfcc_features``; the resulting values are
    converted to a plain list and ``label`` is appended as the last element.

    Args:
        directory: Root directory to search.
        label: Value appended to every row produced from this directory
            (e.g. ``'FAKE'`` or ``'REAL'``).

    Returns:
        A list of rows, each ``[feature_1, ..., feature_n, label]``. The list
        is empty when no matching file is found (including when ``directory``
        does not exist, because ``os.walk`` then yields nothing).
    """
    features = []
    for root, dirs, files in os.walk(directory):
        # os.walk returns entries in arbitrary order; sorting ``dirs`` in place
        # (top-down walk) and iterating sorted file names makes the row order
        # reproducible from run to run.
        dirs.sort()
        for file in sorted(files):
            # Lower-case before comparing so upper/mixed-case extensions are
            # not silently skipped.
            if not file.lower().endswith('.flac'):
                continue
            file_path = os.path.join(root, file)
            mfcc_features = extract_mfcc_features(file_path)
            features.append([*mfcc_features.tolist(), label])
    return features

def main(base_dir='c:\\Users\\MAWEL\\Desktop\\Final Project\\Dataset', output_csv=None):
    """Extract MFCC features for both audio classes and save them as one CSV.

    Rows from ``<base_dir>/ai_generated`` are labelled ``'FAKE'`` and rows from
    ``<base_dir>/human_generated`` are labelled ``'REAL'``. The combined table
    has columns ``mfcc_1 .. mfcc_n`` followed by ``label``.

    Args:
        base_dir: Dataset root containing the ``ai_generated`` and
            ``human_generated`` sub-directories. Defaults to the original
            hard-coded location so a bare ``main()`` call behaves as before.
        output_csv: Destination CSV path. When ``None`` the file is written to
            ``audio_features.csv`` inside ``base_dir``.

    Raises:
        ValueError: If neither directory produced any feature rows, so that a
            wrong path is reported clearly instead of failing on an empty list.
    """
    ai_generated_dir = os.path.join(base_dir, 'ai_generated')
    human_generated_dir = os.path.join(base_dir, 'human_generated')
    if output_csv is None:
        output_csv = os.path.join(base_dir, 'audio_features.csv')

    # Extract features for AI-generated audio
    ai_features = process_directory(ai_generated_dir, 'FAKE')

    # Extract features for human-generated audio
    human_features = process_directory(human_generated_dir, 'REAL')

    # Combine the features and create a DataFrame
    all_features = ai_features + human_features
    if not all_features:
        # Fail before touching the output file; indexing all_features[0]
        # below would otherwise raise an uninformative IndexError.
        raise ValueError(
            f'No audio features extracted from {ai_generated_dir!r} or '
            f'{human_generated_dir!r}; check that the directories exist and '
            'contain .flac files.'
        )

    # The last element of every row is the label; the rest are coefficients.
    n_coefficients = len(all_features[0]) - 1
    column_names = [f'mfcc_{i+1}' for i in range(n_coefficients)] + ['label']
    df = pd.DataFrame(all_features, columns=column_names)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)
    print(f'MFCC features saved to {output_csv}')

# Entry-point guard: run the feature-extraction pipeline only when this code is
# executed as the main module, not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

MFCC features saved to c:\Users\MAWEL\Desktop\Final Project\Dataset\audio_features.csv
