# Audio Tracks Processing and Dataset Creation
For convenience, all audio files from the GTZAN dataset, originally divided by genre into separate folders, have been placed together in a single directory called "GTZAN_30s". In this notebook, the 30 s tracks are first split into 3 s pieces. Spectral and other music-specific features are then extracted from each piece, and the final dataset is created: one row of features per piece, labeled with its musical genre.

In [None]:
# Imports

import os
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment
from tqdm.notebook import tqdm

In [None]:
# 30s .wav files are split into 3s .wav files

# Build the paths from individual segments so they resolve on every OS:
# a backslash is a path separator only on Windows, so a raw "..\dataset\..."
# string would be treated as a single file name elsewhere.
input_folder = Path("..") / "dataset" / "GTZAN_30s"
output_folder = Path("..") / "dataset" / "GTZAN_3s"
output_folder.mkdir(parents=True, exist_ok=True)

chunk_length_ms = 3 * 1000  # 3s

# sorted() keeps the processing order stable between runs
for wav_file in tqdm(sorted(input_folder.glob("*.wav"))):
    audio = AudioSegment.from_wav(wav_file)
    file_name = wav_file.stem

    # Only full-length chunks are produced; a trailing piece shorter than
    # chunk_length_ms is dropped by the integer division.
    num_chunks = len(audio) // chunk_length_ms

    for i in range(num_chunks):
        start_ms = i * chunk_length_ms
        end_ms = start_ms + chunk_length_ms
        chunk = audio[start_ms:end_ms]

        # Chunk files are named "<original stem>_<two-digit index>.wav"
        chunk_name = f"{file_name}_{i:02d}.wav"
        chunk.export(output_folder / chunk_name, format="wav")

In [None]:
# Audio tracks analysis and feature extraction

def extract_features(filepath):
    """Extract a flat dictionary of summary audio features from one audio file.

    The file is loaded with librosa at a 22.05 kHz sampling rate. Frame-wise
    features are reduced over time to scalars: mean and standard deviation for
    each MFCC, and the mean for every other feature.

    Parameters
    ----------
    filepath : str or path-like
        Location of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    dict
        Feature name -> numeric value, inserted in this order:
        ``mfcc_<k>_mean`` / ``mfcc_<k>_std`` (k = 1..15),
        ``chroma_<k>_mean``, ``contrast_<k>_mean``,
        ``spec_centroid_mean``, ``spec_bandwidth_mean``, ``spec_rolloff_mean``,
        ``zcr_mean``, ``rms_mean`` and ``tempo`` (a plain Python ``float``).
    """
    # 22.05 kHz sampling
    y, sr = librosa.load(filepath, sr=22050)

    features = {}

    # Mel Frequency Cepstral Coefficients: one row per coefficient
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=15)
    for idx, coeff in enumerate(mfcc, start=1):
        features[f'mfcc_{idx}_mean'] = np.mean(coeff)
        features[f'mfcc_{idx}_std'] = np.std(coeff)

    # Chroma: one row per chroma bin
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    for idx, chroma_bin in enumerate(chroma, start=1):
        features[f'chroma_{idx}_mean'] = np.mean(chroma_bin)

    # Spectral contrast: one row per band
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    for idx, band in enumerate(contrast, start=1):
        features[f'contrast_{idx}_mean'] = np.mean(band)

    # Other spectral features
    features['spec_centroid_mean'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    features['spec_bandwidth_mean'] = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    features['spec_rolloff_mean'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    features['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y))
    features['rms_mean'] = np.mean(librosa.feature.rms(y=y))

    # Tempo
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa version, tempo is either a scalar or a
    # 1-element array; normalise to a plain float so the resulting DataFrame
    # column is numeric rather than a column of array objects.
    features['tempo'] = float(np.ravel(tempo)[0])

    return features

In [None]:
# Using the extract_features function to create the dataset

audio_dir = "../dataset/GTZAN_3s"
output_csv = Path("../df/project_features.csv")
dataset = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting dataset reproducible between runs and machines.
for filename in tqdm(sorted(os.listdir(audio_dir)), desc="Processing audio files"):
    if not filename.lower().endswith(".wav"):
        continue

    filepath = os.path.join(audio_dir, filename)

    # Extract genre from filename: the text before the first dot
    genre = filename.split('.')[0]

    try:
        features = extract_features(filepath)
    except Exception as e:
        # Best effort: report the unreadable file and keep going
        print(f"Error processing {filename}: {e}")
        continue

    features['genre'] = genre
    dataset.append(features)

# Fail with a clear message instead of an opaque KeyError on 'tempo' below
if not dataset:
    raise RuntimeError(f"No features were extracted from .wav files in {audio_dir!r}")

df = pd.DataFrame(dataset)

# Ensure tempo is a numeric column even if it was stored as 1-element arrays
df['tempo'] = df['tempo'].astype(float)

# Create the destination folder if needed so to_csv does not fail
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)

df.columns

Processing audio files:   0%|          | 0/9991 [00:00<?, ?it/s]

Index(['mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std', 'mfcc_3_mean',
       'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std', 'mfcc_5_mean', 'mfcc_5_std',
       'mfcc_6_mean', 'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean',
       'mfcc_8_std', 'mfcc_9_mean', 'mfcc_9_std', 'mfcc_10_mean',
       'mfcc_10_std', 'mfcc_11_mean', 'mfcc_11_std', 'mfcc_12_mean',
       'mfcc_12_std', 'mfcc_13_mean', 'mfcc_13_std', 'mfcc_14_mean',
       'mfcc_14_std', 'mfcc_15_mean', 'mfcc_15_std', 'chroma_1_mean',
       'chroma_2_mean', 'chroma_3_mean', 'chroma_4_mean', 'chroma_5_mean',
       'chroma_6_mean', 'chroma_7_mean', 'chroma_8_mean', 'chroma_9_mean',
       'chroma_10_mean', 'chroma_11_mean', 'chroma_12_mean', 'contrast_1_mean',
       'contrast_2_mean', 'contrast_3_mean', 'contrast_4_mean',
       'contrast_5_mean', 'contrast_6_mean', 'contrast_7_mean',
       'spec_centroid_mean', 'spec_bandwidth_mean', 'spec_rolloff_mean',
       'zcr_mean', 'rms_mean', 'tempo', 'genre'],
      