In [1]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.utils import to_categorical
import warnings
from librosa.feature.rhythm import tempo
from tqdm import tqdm




2024-05-10 09:20:37.274180: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 09:20:37.274311: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 09:20:37.368123: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-10 09:20:37.601287: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def extract_audio_features(input_dir, output_file_X, output_file_y):
    audio_data = []
    genre_labels = []

    for genre_dir in os.listdir(input_dir):
        genre_label = genre_dir
        subdirectory_path = os.path.join(input_dir, genre_dir)

        for filename in tqdm(os.listdir(subdirectory_path), desc=f"Processing files in {genre_dir}"):
            filepath = os.path.join(subdirectory_path, filename)
            try:
                audio_features = load_audio_and_preprocess(filepath)
                audio_data.append(audio_features)
                genre_labels.append(genre_label)
            except UserWarning as e:
                if "Trying to estimate tuning from empty frequency set" in str(e):
                    print(f"Error processing file {filename} in {genre_dir}: {e}")
                    print(filepath)

    audio_data_np = np.array(audio_data)
    genre_labels_np = np.array(genre_labels)

    encoder = LabelEncoder()
    genre_labels_encoded = encoder.fit_transform(genre_labels_np)
    genre_labels_encoded_categorical = to_categorical(genre_labels_encoded)

    np.save(output_file_X, audio_data_np)
    np.save(output_file_y, genre_labels_encoded_categorical)

    return audio_data_np, genre_labels_encoded_categorical



In [5]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    return mel_spectrogram

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="genres_segment",
    output_file_X="extracted_features/X_mel.npy",
    output_file_y="extracted_features/y_mel.npy"
)

## Load the validation data
#X_val, y_val = extract_audio_features(
#    input_dir="devided_data/Validation",
#    output_file_X="extracted_features/X_val_mel.npy",
#    output_file_y="extracted_features/y_val_mel.npy"
#)
#
## Load the test data
#X_test, y_test = extract_audio_features(
#    input_dir="devided_data/Test",
#    output_file_X="extracted_features/X_test_mel.npy",
#    output_file_y="extracted_features/y_test_mel.npy"
#)

Processing files in hiphop: 100%|██████████| 98/98 [00:10<00:00,  9.14it/s]
Processing files in classical: 100%|██████████| 98/98 [00:10<00:00,  9.33it/s]
Processing files in blues: 100%|██████████| 100/100 [00:10<00:00,  9.28it/s]
Processing files in metal: 100%|██████████| 100/100 [00:10<00:00,  9.25it/s]
Processing files in jazz: 100%|██████████| 100/100 [00:10<00:00,  9.58it/s]
Processing files in country: 100%|██████████| 97/97 [00:10<00:00,  9.34it/s]
Processing files in pop: 100%|██████████| 100/100 [00:10<00:00,  9.47it/s]
Processing files in rock: 100%|██████████| 99/99 [00:10<00:00,  9.47it/s]
Processing files in disco: 100%|██████████| 99/99 [00:10<00:00,  9.35it/s]
Processing files in reggae: 100%|██████████| 100/100 [00:10<00:00,  9.15it/s]


In [None]:
tempogram = librosa.feature.tempogram(y=y, sr=sr)


In [None]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    tempogram = librosa.feature.tempogram(y=y, sr=sr)
    return tempogram

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="genres_segment",
    output_file_X="extracted_features/X_tempo.npy",
    output_file_y="extracted_features/y_tempo.npy"
)

Processing files in hiphop: 100%|███████████████████████████████████████████████████████| 98/98 [00:15<00:00,  6.18it/s]
Processing files in classical: 100%|████████████████████████████████████████████████████| 98/98 [00:14<00:00,  6.82it/s]
Processing files in blues: 100%|██████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.81it/s]
Processing files in metal: 100%|██████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.83it/s]
Processing files in jazz: 100%|███████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.76it/s]
Processing files in country:  19%|██████████                                            | 18/97 [00:02<00:11,  6.93it/s]

In [8]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    MFCC_features = librosa.feature.mfcc(y=y, sr=sr)
    return MFCC_features

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="genres_segment",
    output_file_X="extracted_features/X_mfcc.npy",
    output_file_y="extracted_features/y_mfcc.npy"
)

Processing files in hiphop: 100%|██████████| 98/98 [00:11<00:00,  8.68it/s]
Processing files in classical: 100%|██████████| 98/98 [00:11<00:00,  8.61it/s]
Processing files in blues: 100%|██████████| 100/100 [00:11<00:00,  8.72it/s]
Processing files in metal: 100%|██████████| 100/100 [00:11<00:00,  9.07it/s]
Processing files in jazz: 100%|██████████| 100/100 [00:10<00:00,  9.29it/s]
Processing files in country: 100%|██████████| 97/97 [00:10<00:00,  9.49it/s]
Processing files in pop: 100%|██████████| 100/100 [00:10<00:00,  9.45it/s]
Processing files in rock: 100%|██████████| 99/99 [00:12<00:00,  8.05it/s]
Processing files in disco: 100%|██████████| 99/99 [00:12<00:00,  7.84it/s]
Processing files in reggae: 100%|██████████| 100/100 [00:12<00:00,  7.77it/s]


In [4]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    MFCC_features = librosa.feature.mfcc(y=y, sr=sr)

    MFCC_mean = np.mean(MFCC_features)
    MFCC_std = np.std(MFCC_features)

    MFCC_features_standardized = (MFCC_features - MFCC_mean) / MFCC_std

    return MFCC_features_standardized
# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="mix3/train",
    output_file_X="mix3/features/X_train.npy",
    output_file_y="mix3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="mix3/val",
    output_file_X="mix3/features/X_val.npy",
    output_file_y="mix3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="mix3/test",
    output_file_X="mix3/features/X_test.npy",
    output_file_y="mix3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 948/948 [00:28<00:00, 32.83it/s]
Processing files in classical: 100%|██████████| 948/948 [00:33<00:00, 27.96it/s]
Processing files in blues: 100%|██████████| 950/950 [00:31<00:00, 30.14it/s]
Processing files in metal: 100%|██████████| 950/950 [00:24<00:00, 38.36it/s]
Processing files in jazz: 100%|██████████| 940/940 [00:25<00:00, 36.63it/s]
Processing files in country: 100%|██████████| 949/949 [00:29<00:00, 31.87it/s]
Processing files in pop: 100%|██████████| 950/950 [00:25<00:00, 37.78it/s]
Processing files in rock: 100%|██████████| 949/949 [00:24<00:00, 38.58it/s]
Processing files in disco: 100%|██████████| 950/950 [00:28<00:00, 33.35it/s]
Processing files in reggae: 100%|██████████| 950/950 [00:24<00:00, 39.50it/s]
Processing files in hiphop: 100%|██████████| 119/119 [00:03<00:00, 34.84it/s]
Processing files in classical: 100%|██████████| 119/119 [00:03<00:00, 31.86it/s]
Processing files in blues: 100%|██████████| 119/119 [00:02<00:00, 4

In [6]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    chroma_features = librosa.feature.chroma_stft(y=y, sr=sr)
    return chroma_features

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="genres_segment",
    output_file_X="extracted_features/X_chroma.npy",
    output_file_y="extracted_features/y_chroma.npy"
)

# Load the validation data
#X_val, y_val = extract_audio_features(
#    input_dir="mix3/val",
#    output_file_X="mix3/features/X_val.npy",
#    output_file_y="mix3/features/y_val.npy"
#)
#
## Load the test data
#X_test, y_test = extract_audio_features(
#    input_dir="mix3/test",
#    output_file_X="mix3/features/X_test.npy",
#    output_file_y="mix3/features/y_test.npy"
#)

Processing files in hiphop: 100%|██████████| 98/98 [00:15<00:00,  6.50it/s]
Processing files in classical: 100%|██████████| 98/98 [00:14<00:00,  6.81it/s]
Processing files in blues: 100%|██████████| 100/100 [00:15<00:00,  6.40it/s]
Processing files in metal: 100%|██████████| 100/100 [00:15<00:00,  6.62it/s]
Processing files in jazz: 100%|██████████| 100/100 [00:14<00:00,  6.76it/s]
Processing files in country: 100%|██████████| 97/97 [00:14<00:00,  6.64it/s]
Processing files in pop: 100%|██████████| 100/100 [00:16<00:00,  6.22it/s]
Processing files in rock: 100%|██████████| 99/99 [00:15<00:00,  6.30it/s]
Processing files in disco: 100%|██████████| 99/99 [00:15<00:00,  6.44it/s]
Processing files in reggae: 100%|██████████| 100/100 [00:16<00:00,  6.20it/s]


In [7]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    return tonnetz

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="genres_segment",
    output_file_X="extracted_features/X_tonnetz.npy",
    output_file_y="extracted_features/y_tonnetz.npy"
)

Processing files in hiphop: 100%|██████████| 98/98 [00:49<00:00,  1.99it/s]
Processing files in classical: 100%|██████████| 98/98 [00:49<00:00,  1.98it/s]
Processing files in blues: 100%|██████████| 100/100 [00:49<00:00,  2.04it/s]
Processing files in metal: 100%|██████████| 100/100 [00:49<00:00,  2.03it/s]
Processing files in jazz: 100%|██████████| 100/100 [00:47<00:00,  2.10it/s]
Processing files in country: 100%|██████████| 97/97 [00:46<00:00,  2.10it/s]
Processing files in pop: 100%|██████████| 100/100 [00:47<00:00,  2.10it/s]
Processing files in rock: 100%|██████████| 99/99 [00:49<00:00,  2.02it/s]
Processing files in disco: 100%|██████████| 99/99 [00:49<00:00,  2.02it/s]
Processing files in reggae: 100%|██████████| 100/100 [00:48<00:00,  2.05it/s]


In [5]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

    spectral_centroid_mean = np.mean(spectral_centroid)
    spectral_centroid_std = np.std(spectral_centroid)

    spectral_centroid_standardized = (spectral_centroid - spectral_centroid_mean) / spectral_centroid_std

    return spectral_centroid_standardized

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 798/798 [00:06<00:00, 124.83it/s]
Processing files in classical: 100%|██████████| 798/798 [00:06<00:00, 125.95it/s]
Processing files in blues: 100%|██████████| 800/800 [00:06<00:00, 128.53it/s]
Processing files in metal: 100%|██████████| 800/800 [00:06<00:00, 126.64it/s]
Processing files in jazz: 100%|██████████| 790/790 [00:06<00:00, 126.62it/s]
Processing files in country: 100%|██████████| 800/800 [00:06<00:00, 128.29it/s]
Processing files in pop: 100%|██████████| 800/800 [00:06<00:00, 127.46it/s]
Processing files in rock: 100%|██████████| 799/799 [00:06<00:00, 129.69it/s]
Processing files in disco: 100%|██████████| 800/800 [00:06<00:00, 129.71it/s]
Processing files in reggae: 100%|██████████| 800/800 [00:06<00:00, 130.50it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:00<00:00, 127.95it/s]
Processing files in classical: 100%|██████████| 100/100 [00:00<00:00, 123.95it/s]
Processing files in blues: 100%|██████████| 100/100 [00

In [6]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    zero_crossing_rate_features = librosa.feature.zero_crossing_rate(y)

    zero_crossing_rate_mean = np.mean(zero_crossing_rate_features)
    zero_crossing_rate_std = np.std(zero_crossing_rate_features)

    zero_crossing_rate_features_standardized = (zero_crossing_rate_features - zero_crossing_rate_mean) / zero_crossing_rate_std

    return zero_crossing_rate_features_standardized

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 798/798 [00:02<00:00, 383.23it/s]
Processing files in classical: 100%|██████████| 798/798 [00:02<00:00, 389.23it/s]
Processing files in blues: 100%|██████████| 800/800 [00:02<00:00, 396.36it/s]
Processing files in metal: 100%|██████████| 800/800 [00:02<00:00, 397.39it/s]
Processing files in jazz: 100%|██████████| 790/790 [00:02<00:00, 377.71it/s]
Processing files in country: 100%|██████████| 800/800 [00:02<00:00, 391.66it/s]
Processing files in pop: 100%|██████████| 800/800 [00:02<00:00, 380.17it/s]
Processing files in rock: 100%|██████████| 799/799 [00:02<00:00, 397.18it/s]
Processing files in disco: 100%|██████████| 800/800 [00:02<00:00, 381.82it/s]
Processing files in reggae: 100%|██████████| 800/800 [00:02<00:00, 336.52it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:00<00:00, 328.41it/s]
Processing files in classical: 100%|██████████| 100/100 [00:00<00:00, 324.45it/s]
Processing files in blues: 100%|██████████| 100/100 [00

In [7]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    spectral_bandwidth_features = librosa.feature.spectral_bandwidth(y=y, sr=sr)

    spectral_bandwidth_mean = np.mean(spectral_bandwidth_features)
    spectral_bandwidth_std = np.std(spectral_bandwidth_features)

    spectral_bandwidth_features_standardized = (spectral_bandwidth_features - spectral_bandwidth_mean) / spectral_bandwidth_std

    return spectral_bandwidth_features_standardized

# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 798/798 [00:09<00:00, 84.43it/s] 
Processing files in classical: 100%|██████████| 798/798 [00:07<00:00, 100.43it/s]
Processing files in blues: 100%|██████████| 800/800 [00:07<00:00, 102.81it/s]
Processing files in metal: 100%|██████████| 800/800 [00:07<00:00, 101.12it/s]
Processing files in jazz: 100%|██████████| 790/790 [00:07<00:00, 102.32it/s]
Processing files in country: 100%|██████████| 800/800 [00:07<00:00, 101.17it/s]
Processing files in pop: 100%|██████████| 800/800 [00:07<00:00, 103.33it/s]
Processing files in rock: 100%|██████████| 799/799 [00:07<00:00, 103.27it/s]
Processing files in disco: 100%|██████████| 800/800 [00:07<00:00, 106.25it/s]
Processing files in reggae: 100%|██████████| 800/800 [00:07<00:00, 102.47it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:00<00:00, 100.48it/s]
Processing files in classical: 100%|██████████| 100/100 [00:00<00:00, 102.93it/s]
Processing files in blues: 100%|██████████| 100/100 [00

In [8]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    energy_features = librosa.feature.rms(y=y)

    energy_mean = np.mean(energy_features)
    energy_std = np.std(energy_features)

    energy_features_standardized = (energy_features - energy_mean) / energy_std

    return energy_features_standardized


# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 798/798 [00:04<00:00, 165.52it/s]
Processing files in classical: 100%|██████████| 798/798 [00:04<00:00, 162.42it/s]
Processing files in blues: 100%|██████████| 800/800 [00:04<00:00, 161.18it/s]
Processing files in metal: 100%|██████████| 800/800 [00:04<00:00, 164.33it/s]
Processing files in jazz: 100%|██████████| 790/790 [00:04<00:00, 163.05it/s]
Processing files in country: 100%|██████████| 800/800 [00:04<00:00, 170.46it/s]
Processing files in pop: 100%|██████████| 800/800 [00:04<00:00, 169.29it/s]
Processing files in rock: 100%|██████████| 799/799 [00:04<00:00, 169.83it/s]
Processing files in disco: 100%|██████████| 800/800 [00:04<00:00, 163.55it/s]
Processing files in reggae: 100%|██████████| 800/800 [00:04<00:00, 174.44it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:00<00:00, 155.79it/s]
Processing files in classical: 100%|██████████| 100/100 [00:00<00:00, 164.90it/s]
Processing files in blues: 100%|██████████| 100/100 [00

In [3]:
def load_audio_and_preprocess(filepath):
 
    y, sr = librosa.load(filepath)
    spectral_contrast_features = librosa.feature.spectral_contrast(y=y, sr=sr)
    
    spectral_contrast_mean = np.mean(spectral_contrast_features, axis=1)
    spectral_contrast_std = np.std(spectral_contrast_features, axis=1)
    
    spectral_contrast_features_standardized = (spectral_contrast_features - spectral_contrast_mean[:, None]) / spectral_contrast_std[:, None]
    
    return spectral_contrast_features_standardized


# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)    

Processing files in hiphop: 100%|██████████| 798/798 [00:11<00:00, 72.44it/s]
Processing files in classical: 100%|██████████| 798/798 [00:09<00:00, 82.61it/s]
Processing files in blues: 100%|██████████| 800/800 [00:09<00:00, 85.32it/s]
Processing files in metal: 100%|██████████| 800/800 [00:08<00:00, 90.12it/s]
Processing files in jazz: 100%|██████████| 790/790 [00:09<00:00, 85.16it/s]
Processing files in country: 100%|██████████| 800/800 [00:09<00:00, 83.08it/s]
Processing files in pop: 100%|██████████| 800/800 [00:09<00:00, 82.97it/s]
Processing files in rock: 100%|██████████| 799/799 [00:09<00:00, 84.91it/s]
Processing files in disco: 100%|██████████| 800/800 [00:09<00:00, 85.93it/s]
Processing files in reggae: 100%|██████████| 800/800 [00:09<00:00, 86.01it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:01<00:00, 83.12it/s]
Processing files in classical: 100%|██████████| 100/100 [00:01<00:00, 86.72it/s]
Processing files in blues: 100%|██████████| 100/100 [00:01<00:00, 8

In [11]:
def load_audio_and_preprocess(filepath):

    y, sr = librosa.load(filepath)
    
    # Extract rhythm pattern features
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    rhythm_pattern_features = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr)
    
    # Standardize the features
    rhythm_pattern_mean = np.mean(rhythm_pattern_features, axis=1)
    rhythm_pattern_std = np.std(rhythm_pattern_features, axis=1)
    rhythm_pattern_std[rhythm_pattern_std == 0] = 1  # Avoid division by zero
    rhythm_pattern_features_standardized = (rhythm_pattern_features - rhythm_pattern_mean[:, None]) / rhythm_pattern_std[:, None]
    
    return rhythm_pattern_features_standardized


# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop: 100%|██████████| 798/798 [01:38<00:00,  8.08it/s]
Processing files in classical: 100%|██████████| 798/798 [01:36<00:00,  8.25it/s]
Processing files in blues: 100%|██████████| 800/800 [01:39<00:00,  8.06it/s]
Processing files in metal: 100%|██████████| 800/800 [01:37<00:00,  8.21it/s]
Processing files in jazz: 100%|██████████| 790/790 [01:33<00:00,  8.43it/s]
Processing files in country: 100%|██████████| 800/800 [01:37<00:00,  8.23it/s]
Processing files in pop: 100%|██████████| 800/800 [01:39<00:00,  8.04it/s]
Processing files in rock: 100%|██████████| 799/799 [01:40<00:00,  7.98it/s]
Processing files in disco: 100%|██████████| 800/800 [01:47<00:00,  7.46it/s]
Processing files in reggae: 100%|██████████| 800/800 [01:52<00:00,  7.13it/s]
Processing files in hiphop: 100%|██████████| 100/100 [00:14<00:00,  6.75it/s]
Processing files in classical: 100%|██████████| 100/100 [00:14<00:00,  7.04it/s]
Processing files in blues: 100%|██████████| 100/100 [00:17<00:00,  

In [3]:
def load_audio_and_preprocess(filepath):

    y, sr = librosa.load(filepath)
    
    # Extract harmonic features
    harmonic, percussive = librosa.effects.hpss(y)
    pitch, mag = librosa.core.piptrack(y=y, sr=sr)
    harmonic_features = np.concatenate([pitch, mag], axis=0)
    
    # Standardize the features
    harmonic_features_mean = np.mean(harmonic_features, axis=1)
    harmonic_features_std = np.std(harmonic_features, axis=1)
    harmonic_features_std[harmonic_features_std == 0] = 1  # Avoid division by zero
    harmonic_features_standardized = (harmonic_features - harmonic_features_mean[:, None]) / harmonic_features_std[:, None]
    
    return harmonic_features_standardized


# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try3/train",
    output_file_X="gtzan_try3/features/X_train.npy",
    output_file_y="gtzan_try3/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try3/val",
    output_file_X="gtzan_try3/features/X_val.npy",
    output_file_y="gtzan_try3/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try3/test",
    output_file_X="gtzan_try3/features/X_test.npy",
    output_file_y="gtzan_try3/features/y_test.npy"
)

Processing files in hiphop:   0%|          | 0/798 [00:00<?, ?it/s]Processing files in hiphop: 100%|██████████| 798/798 [02:15<00:00,  5.87it/s]
Processing files in classical: 100%|██████████| 798/798 [02:14<00:00,  5.93it/s]
Processing files in blues: 100%|██████████| 800/800 [02:15<00:00,  5.91it/s]
Processing files in metal: 100%|██████████| 800/800 [02:20<00:00,  5.68it/s]
Processing files in jazz: 100%|██████████| 790/790 [02:13<00:00,  5.93it/s]
Processing files in country: 100%|██████████| 800/800 [02:14<00:00,  5.94it/s]
Processing files in pop: 100%|██████████| 800/800 [02:15<00:00,  5.91it/s]
Processing files in rock: 100%|██████████| 799/799 [02:22<00:00,  5.60it/s]
Processing files in disco: 100%|██████████| 800/800 [02:21<00:00,  5.66it/s]
Processing files in reggae:  56%|█████▋    | 450/800 [08:21<1:22:14, 14.10s/it]

In [13]:
# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try_segment/train",
    output_file_X="gtzan_try_segment/features/X_train.npy",
    output_file_y="gtzan_try_segment/features/y_train.npy"
)

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try_segment/val",
    output_file_X="gtzan_try_segment/features/X_val.npy",
    output_file_y="gtzan_try_segment/features/y_val.npy"
)

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try_segment/test",
    output_file_X="gtzan_try_segment/features/X_test.npy",
    output_file_y="gtzan_try_segment/features/y_test.npy"
)


Processing files in hiphop: 100%|█████████████████████████████████████████████████████| 478/478 [00:04<00:00, 98.68it/s]
Processing files in classical: 100%|█████████████████████████████████████████████████| 478/478 [00:04<00:00, 102.58it/s]
Processing files in blues: 100%|██████████████████████████████████████████████████████| 480/480 [00:04<00:00, 99.13it/s]
Processing files in metal: 100%|█████████████████████████████████████████████████████| 480/480 [00:04<00:00, 100.11it/s]
Processing files in jazz: 100%|██████████████████████████████████████████████████████| 474/474 [00:04<00:00, 100.15it/s]
Processing files in country: 100%|████████████████████████████████████████████████████| 480/480 [00:04<00:00, 97.87it/s]
Processing files in pop: 100%|███████████████████████████████████████████████████████| 480/480 [00:04<00:00, 101.16it/s]
Processing files in rock: 100%|██████████████████████████████████████████████████████| 479/479 [00:04<00:00, 102.80it/s]
Processing files in disco: 100%|

In [2]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tqdm import tqdm

def extract_audio_features(input_dir, output_dir):
    mel_spectrogram = []
    mfcc = []
    chroma = []
    spectral_contrast = []
    zero_crossing_rate = []
    spectral_centroid = []
    spectral_rolloff = []
    tonnetz = []
    spectral_bandwidth = []
    spectral_flatness = []
    poly_features = []
    rms = []
    tempogram = []
    genre_labels = []

    for genre_dir in os.listdir(input_dir):
        genre_label = genre_dir
        subdirectory_path = os.path.join(input_dir, genre_dir)

        for filename in tqdm(os.listdir(subdirectory_path), desc=f"Processing files in {genre_dir}"):
            filepath = os.path.join(subdirectory_path, filename)
            try:
                (mel_spectrogram_data, mfcc_data, chroma_data, spectral_contrast_data, zero_crossing_rate_data,
                 spectral_centroid_data, spectral_rolloff_data, tonnetz_data, spectral_bandwidth_data,
                 spectral_flatness_data, poly_features_data, rms_data, tempogram_data) = load_audio_and_preprocess(filepath)
                mel_spectrogram.append(mel_spectrogram_data)
                mfcc.append(mfcc_data)
                chroma.append(chroma_data)
                spectral_contrast.append(spectral_contrast_data)
                zero_crossing_rate.append(zero_crossing_rate_data)
                spectral_centroid.append(spectral_centroid_data)
                spectral_rolloff.append(spectral_rolloff_data)
                tonnetz.append(tonnetz_data)
                spectral_bandwidth.append(spectral_bandwidth_data)
                spectral_flatness.append(spectral_flatness_data)
                poly_features.append(poly_features_data)
                rms.append(rms_data)
                tempogram.append(tempogram_data)
                genre_labels.append(genre_label)
            except UserWarning as e:
                if "Trying to estimate tuning from empty frequency set" in str(e):
                    print(f"Error processing file {filename} in {genre_dir}: {e}")
                    print(filepath)

    # Convert lists to NumPy arrays
    mel_spectrogram_np = np.array(mel_spectrogram)
    mfcc_np = np.array(mfcc)
    chroma_np = np.array(chroma)
    spectral_contrast_np = np.array(spectral_contrast)
    zero_crossing_rate_np = np.array(zero_crossing_rate)
    spectral_centroid_np = np.array(spectral_centroid)
    spectral_rolloff_np = np.array(spectral_rolloff)
    tonnetz_np = np.array(tonnetz)
    spectral_bandwidth_np = np.array(spectral_bandwidth)
    spectral_flatness_np = np.array(spectral_flatness)
    poly_features_np = np.array(poly_features)
    rms_np = np.array(rms)
    tempogram_np = np.array(tempogram)
    genre_labels_np = np.array(genre_labels)

    # Encode and categorize the genre labels
    encoder = LabelEncoder()
    genre_labels_encoded = encoder.fit_transform(genre_labels_np)
    genre_labels_encoded_categorical = to_categorical(genre_labels_encoded)

    # Save the individual feature sets and labels
    np.save(os.path.join(output_dir, "X_train_mel_spectrogram.npy"), mel_spectrogram_np)
    np.save(os.path.join(output_dir, "X_train_mfcc.npy"), mfcc_np)
    np.save(os.path.join(output_dir, "X_train_chroma.npy"), chroma_np)
    np.save(os.path.join(output_dir, "X_train_spectral_contrast.npy"), spectral_contrast_np)
    np.save(os.path.join(output_dir, "X_train_zero_crossing_rate.npy"), zero_crossing_rate_np)
    np.save(os.path.join(output_dir, "X_train_spectral_centroid.npy"), spectral_centroid_np)
    np.save(os.path.join(output_dir, "X_train_spectral_rolloff.npy"), spectral_rolloff_np)
    np.save(os.path.join(output_dir, "X_train_tonnetz.npy"), tonnetz_np)
    np.save(os.path.join(output_dir, "X_train_spectral_bandwidth.npy"), spectral_bandwidth_np)
    np.save(os.path.join(output_dir, "X_train_spectral_flatness.npy"), spectral_flatness_np)
    np.save(os.path.join(output_dir, "X_train_poly_features.npy"), poly_features_np)
    np.save(os.path.join(output_dir, "X_train_rms.npy"), rms_np)
    np.save(os.path.join(output_dir, "X_train_tempogram.npy"), tempogram_np)
    np.save(os.path.join(output_dir, "y_train.npy"), genre_labels_encoded_categorical)

    return (mel_spectrogram_np, mfcc_np, chroma_np, spectral_contrast_np, zero_crossing_rate_np,
            spectral_centroid_np, spectral_rolloff_np, tonnetz_np, spectral_bandwidth_np,
            spectral_flatness_np, poly_features_np, rms_np, tempogram_np), genre_labels_encoded_categorical

def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)

    # Mel-Spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)

    # MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    # Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)

    # Spectral Contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    # Zero Crossing Rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)

    # Spectral Centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

    # Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

    # Tonnetz
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)

    # Spectral Bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)

    # Spectral Flatness
    spectral_flatness = librosa.feature.spectral_flatness(S=mel_spectrogram)

    # Poly Features
    poly_features = librosa.feature.poly_features(y=y, sr=sr)

    # RMS
    rms = librosa.feature.rms(y=y)

    # Tempogram
    tempogram = librosa.feature.tempogram(y=y, sr=sr)

    return mel_spectrogram, mfcc, chroma, spectral_contrast, zero_crossing_rate, \
           spectral_centroid, spectral_rolloff, tonnetz, spectral_bandwidth, \
           spectral_flatness, poly_features, rms, tempogram

# Load data function
def load_data(input_dir, output_dir, function):
    features, labels = function(input_dir=input_dir, output_dir=output_dir)

    # Unpack the features tuple
    X_mel_spectrogram, X_mfcc, X_chroma, X_spectral_contrast, X_zero_crossing_rate, \
    X_spectral_centroid, X_spectral_rolloff, X_tonnetz, X_spectral_bandwidth, \
    X_spectral_flatness, X_poly_features, X_rms, X_tempogram = features

    # Unpack the labels
    y = labels

    return X_mel_spectrogram, X_mfcc, X_chroma, X_spectral_contrast, X_zero_crossing_rate, \
           X_spectral_centroid, X_spectral_rolloff, X_tonnetz, X_spectral_bandwidth, \
           X_spectral_flatness, X_poly_features, X_rms, X_tempogram, y



In [None]:
# Load the training data
X_train_mel_spectrogram, X_train_mfcc, X_train_chroma, X_train_spectral_contrast, X_train_zero_crossing_rate, \
X_train_spectral_centroid, X_train_spectral_rolloff, X_train_tonnetz, X_train_spectral_bandwidth, \
X_train_spectral_flatness, X_train_poly_features, X_train_rms, X_train_tempogram, y_train = \
    load_data(input_dir="gtzan_try_segment/train", output_dir="gtzan_try_segment/features", function=extract_audio_features)

# Load the validation data
X_val_mel_spectrogram, X_val_mfcc, X_val_chroma, X_val_spectral_contrast, X_val_zero_crossing_rate, \
X_val_spectral_centroid, X_val_spectral_rolloff, X_val_tonnetz, X_val_spectral_bandwidth, \
X_val_spectral_flatness, X_val_poly_features, X_val_rms, X_val_tempogram, y_val = \
    load_data(input_dir="gtzan_try_segment/val", output_dir="gtzan_try_segment/features", function=extract_audio_features)

# Load the test data
X_test_mel_spectrogram, X_test_mfcc, X_test_chroma, X_test_spectral_contrast, X_test_zero_crossing_rate, \
X_test_spectral_centroid, X_test_spectral_rolloff, X_test_tonnetz, X_test_spectral_bandwidth, \
X_test_spectral_flatness, X_test_poly_features, X_test_rms, X_test_tempogram, y_test = \
    load_data(input_dir="gtzan_try_segment/test", output_dir="gtzan_try_segment/features", function=extract_audio_features)

Processing files in hiphop: 100%|█████████████████████████████████████████████████████| 478/478 [03:47<00:00,  2.10it/s]
Processing files in classical: 100%|██████████████████████████████████████████████████| 478/478 [03:46<00:00,  2.11it/s]
Processing files in blues: 100%|██████████████████████████████████████████████████████| 480/480 [03:46<00:00,  2.12it/s]
Processing files in metal: 100%|██████████████████████████████████████████████████████| 480/480 [03:47<00:00,  2.11it/s]
Processing files in jazz: 100%|███████████████████████████████████████████████████████| 474/474 [03:43<00:00,  2.12it/s]
Processing files in country: 100%|████████████████████████████████████████████████████| 480/480 [03:46<00:00,  2.11it/s]
Processing files in pop: 100%|████████████████████████████████████████████████████████| 480/480 [03:49<00:00,  2.09it/s]
Processing files in rock: 100%|███████████████████████████████████████████████████████| 479/479 [03:48<00:00,  2.10it/s]
Processing files in disco: 100%|

In [21]:
def load_audio_and_preprocess(filepath):
    y, sr = librosa.load(filepath)
    
    # Set the hop length to 256 samples
    hop_length = 256
    
    # Extract the existing features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length)
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr, hop_length=hop_length)
    energy = librosa.feature.rms(y=y)
    zcr = librosa.feature.zero_crossing_rate(y)
    
    # Extract additional features
    beat_hist = librosa.feature.fourier_tempogram(y=y, sr=sr, hop_length=hop_length)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    tempo = librosa.feature.rhythm.tempo(y=y, sr=sr)[0]
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)
    spectral_flatness = librosa.feature.spectral_flatness(y=y, hop_length=hop_length)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop_length)
    loudness = librosa.feature.rms(y=y)
    
    # Standardize the features
    mfcc_mean = np.mean(mfcc)
    mfcc_std = np.std(mfcc)
    mfcc_standardized = (mfcc - mfcc_mean) / mfcc_std
    
    mel_spectrogram_mean = np.mean(mel_spectrogram)
    mel_spectrogram_std = np.std(mel_spectrogram)
    mel_spectrogram_standardized = (mel_spectrogram - mel_spectrogram_mean) / mel_spectrogram_std
    
    chroma_stft_mean = np.mean(chroma_stft)
    chroma_stft_std = np.std(chroma_stft)
    chroma_stft_standardized = (chroma_stft - chroma_stft_mean) / chroma_stft_std
    
    spec_cent_mean = np.mean(spec_cent)
    spec_cent_std = np.std(spec_cent)
    spec_cent_standardized = (spec_cent - spec_cent_mean) / spec_cent_std
    
    spec_bw_mean = np.mean(spec_bw)
    spec_bw_std = np.std(spec_bw)
    spec_bw_standardized = (spec_bw - spec_bw_mean) / spec_bw_std
    
    energy_mean = np.mean(energy)
    energy_std = np.std(energy)
    energy_standardized = (energy - energy_mean) / energy_std
    
    zcr_mean = np.mean(zcr)
    zcr_std = np.std(zcr)
    zcr_standardized = (zcr - zcr_mean) / zcr_std
    
    beat_hist_mean = np.mean(beat_hist)
    beat_hist_std = np.std(beat_hist)
    beat_hist_standardized = (beat_hist - beat_hist_mean) / beat_hist_std
    
    onset_env_mean = np.mean(onset_env)
    onset_env_std = np.std(onset_env)
    onset_env_standardized = (onset_env - onset_env_mean) / onset_env_std
    
    tempo_std = np.std(tempo)
    if tempo_std != 0:
        tempo_standardized = (tempo - np.mean(tempo)) / tempo_std
    else:
        tempo_standardized = tempo - np.mean(tempo)
    
    chroma_mean = np.mean(chroma)
    chroma_std = np.std(chroma)
    chroma_standardized = (chroma - chroma_mean) / chroma_std
    
    spectral_flatness_mean = np.mean(spectral_flatness)
    spectral_flatness_std = np.std(spectral_flatness)
    spectral_flatness_standardized = (spectral_flatness - spectral_flatness_mean) / spectral_flatness_std
    
    spectral_contrast_mean = np.mean(spectral_contrast)
    spectral_contrast_std = np.std(spectral_contrast)
    spectral_contrast_standardized = (spectral_contrast - spectral_contrast_mean) / spectral_contrast_std
    
    loudness_mean = np.mean(loudness)
    loudness_std = np.std(loudness)
    loudness_standardized = (loudness - loudness_mean) / loudness_std
    
    return {
        "mfcc": mfcc_standardized,
        "mel_spectrogram": mel_spectrogram_standardized,
        "chroma_stft": chroma_stft_standardized,
        "spec_cent": spec_cent_standardized,
        "spec_bw": spec_bw_standardized,
        "energy": energy_standardized,
        "zcr": zcr_standardized,
        "beat_hist": beat_hist_standardized,
        "onset_env": onset_env_standardized,
        "tempo": tempo_standardized,
        "chroma": chroma_standardized,
        "spectral_flatness": spectral_flatness_standardized,
        "spectral_contrast": spectral_contrast_standardized,
        "loudness": loudness_standardized
    }

In [22]:
# Load the training data
X_train, y_train = extract_audio_features(
    input_dir="gtzan_try_segment/train",
    output_file_X="gtzan_try_segment/features/X_train.npy",
    output_file_y="gtzan_try_segment/features/y_train.npy"
)

# Access the individual features from the X_train data
mfcc_train = X_train["mfcc"]
mel_spectrogram_train = X_train["mel_spectrogram"]
chroma_stft_train = X_train["chroma_stft"]
spec_cent_train = X_train["spec_cent"]
spec_bw_train = X_train["spec_bw"]
energy_train = X_train["energy"]
zcr_train = X_train["zcr"]
beat_hist_train = X_train["beat_hist"]
onset_env_train = X_train["onset_env"]
tempo_train = X_train["tempo"]
chroma_train = X_train["chroma"]
spectral_flatness_train = X_train["spectral_flatness"]
spectral_contrast_train = X_train["spectral_contrast"]
loudness_train = X_train["loudness"]

# Load the validation data
X_val, y_val = extract_audio_features(
    input_dir="gtzan_try_segment/val",
    output_file_X="gtzan_try_segment/features/X_val.npy",
    output_file_y="gtzan_try_segment/features/y_val.npy"
)

# Access the individual features from the X_val data
mfcc_val = X_val["mfcc"]
mel_spectrogram_val = X_val["mel_spectrogram"]
chroma_stft_val = X_val["chroma_stft"]
spec_cent_val = X_val["spec_cent"]
spec_bw_val = X_val["spec_bw"]
energy_val = X_val["energy"]
zcr_val = X_val["zcr"]
beat_hist_val = X_val["beat_hist"]
onset_env_val = X_val["onset_env"]
tempo_val = X_val["tempo"]
chroma_val = X_val["chroma"]
spectral_flatness_val = X_val["spectral_flatness"]
spectral_contrast_val = X_val["spectral_contrast"]
loudness_val = X_val["loudness"]

# Load the test data
X_test, y_test = extract_audio_features(
    input_dir="gtzan_try_segment/test",
    output_file_X="gtzan_try_segment/features/X_test.npy",
    output_file_y="gtzan_try_segment/features/y_test.npy"
)

# Access the individual features from the X_test data
mfcc_test = X_test["mfcc"]
mel_spectrogram_test = X_test["mel_spectrogram"]
chroma_stft_test = X_test["chroma_stft"]
spec_cent_test = X_test["spec_cent"]
spec_bw_test = X_test["spec_bw"]
energy_test = X_test["energy"]
zcr_test = X_test["zcr"]
beat_hist_test = X_test["beat_hist"]
onset_env_test = X_test["onset_env"]
tempo_test = X_test["tempo"]
chroma_test = X_test["chroma"]
spectral_flatness_test = X_test["spectral_flatness"]
spectral_contrast_test = X_test["spectral_contrast"]
loudness_test = X_test["loudness"]

Processing files in hiphop:   0%|          | 0/478 [00:00<?, ?it/s]Processing files in hiphop: 100%|██████████| 478/478 [06:30<00:00,  1.22it/s]
Processing files in classical: 100%|██████████| 478/478 [06:55<00:00,  1.15it/s]
Processing files in blues: 100%|██████████| 480/480 [06:28<00:00,  1.23it/s]
Processing files in metal: 100%|██████████| 480/480 [06:53<00:00,  1.16it/s]
Processing files in jazz: 100%|██████████| 474/474 [06:41<00:00,  1.18it/s]
Processing files in country: 100%|██████████| 480/480 [06:22<00:00,  1.26it/s]
Processing files in pop: 100%|██████████| 480/480 [06:21<00:00,  1.26it/s]
Processing files in rock: 100%|██████████| 479/479 [06:20<00:00,  1.26it/s]
Processing files in disco: 100%|██████████| 480/480 [06:14<00:00,  1.28it/s]
Processing files in reggae: 100%|██████████| 480/480 [06:14<00:00,  1.28it/s]


: 