Task 6: Music Genre Classification Description

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Path to the zipped dataset on the mounted Google Drive.
# Adjust the folder ('Colab Notebooks') and file name ('Data.zip') to match your own Drive layout.
# NOTE(review): the spectrogram cell later reassigns zip_path to '/content/genres_original.zip',
# so this Drive path is not the one read there -- confirm whether it is still needed.
zip_path = '/content/drive/MyDrive/Colab Notebooks/Data.zip'

In [13]:
# Directory holding the extracted GTZAN audio, one sub-folder per genre.
# NOTE(review): the data-loading cell assigns the same value via
# os.path.join(extract_path, 'genres_original'); this standalone assignment looks redundant -- confirm.
dataset_path = '/content/GTZAN/genres_original'

Step 1: Setup and Data Loading

In [8]:
import os
import zipfile
import warnings
import librosa
import numpy as np

# Suppress the specific warnings from librosa
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- STEP 1: DOWNLOAD AND UNZIP FROM KAGGLE ---
# Install the Kaggle API
print("Installing Kaggle API...")
os.system("pip install kaggle")
os.system("pip install --upgrade --force-reinstall --no-deps kaggle")

# Authenticate with Kaggle API token
# A file upload dialog will appear. Upload the 'kaggle.json' file you downloaded.
print("\nPlease upload your kaggle.json file. Click 'Choose Files' below.")
from google.colab import files
files.upload()
print("\nAuthentication complete.")

# Create a directory to store the Kaggle API key
!mkdir ~/.kaggle
# Move the uploaded kaggle.json to the correct directory
!mv kaggle.json ~/.kaggle/
# Set permissions
!chmod 600 ~/.kaggle/kaggle.json

# Define the Kaggle dataset and paths
kaggle_dataset = 'andradaolteanu/gtzan-dataset-music-genre-classification'
dataset_zip_name = 'genres_original.zip'
extract_path = '/content/GTZAN'

# Download the dataset from Kaggle
print(f"\nDownloading dataset from Kaggle: {kaggle_dataset}")
!kaggle datasets download -d {kaggle_dataset}

# Unzip the downloaded file
print(f"\nUnzipping {dataset_zip_name}...")
with zipfile.ZipFile(dataset_zip_name, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Unzipping complete.")


# --- STEP 2: PROCESS THE DATA ---
dataset_path = os.path.join(extract_path, 'genres_original')

# Check if the folder was created
if not os.path.exists(dataset_path):
    print(f"Error: The directory {dataset_path} was not created. Please check the unzipping step.")
else:
    genres = sorted(os.listdir(dataset_path))
    print(f"Found genres: {genres}")

    features = []
    labels = []

    # Loop through each genre
    for genre in genres:
        genre_path = os.path.join(dataset_path, genre)
        if os.path.isdir(genre_path):
            print(f"Processing genre: {genre}")
            for filename in os.listdir(genre_path):
                if filename.endswith('.wav'):
                    file_path = os.path.join(genre_path, filename)
                    try:
                        y, sr = librosa.load(file_path, mono=True, duration=30)
                        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
                        mfccs_mean = np.mean(mfccs.T, axis=0)

                        features.append(mfccs_mean)
                        labels.append(genre)
                    except Exception as e:
                        if "jazz.00054.wav" in file_path:
                            print(f"Skipping known corrupted file: {file_path}")
                        else:
                            print(f"Error processing {file_path}: {e}")

    # Continue with the rest of your data processing and model training
    # ... (e.g., train_test_split, scaling, model training)
    print("\nFeature extraction complete. You can now proceed with model training.")

Installing Kaggle API...

Please upload your kaggle.json file. Click 'Choose Files' below.


Saving kaggle.json to kaggle.json

Authentication complete.

Downloading dataset from Kaggle: andradaolteanu/gtzan-dataset-music-genre-classification
Dataset URL: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification
License(s): other
Downloading gtzan-dataset-music-genre-classification.zip to /content
100% 1.21G/1.21G [00:10<00:00, 58.3MB/s]
100% 1.21G/1.21G [00:10<00:00, 119MB/s] 

Unzipping genres_original.zip...
Unzipping complete.
Found genres: ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
Processing genre: blues
Processing genre: classical
Processing genre: country
Processing genre: disco
Processing genre: hiphop
Processing genre: jazz
Skipping known corrupted file: /content/GTZAN/genres_original/jazz/jazz.00054.wav
Processing genre: metal
Processing genre: pop
Processing genre: reggae
Processing genre: rock

Feature extraction complete. You can now proceed with model training.


Step 2: Data Preprocessing and Model Training

In [9]:
# scikit-learn names used below; imported here so the cell runs on a fresh
# kernel instead of relying on an import left over from an earlier session.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

# Convert lists to NumPy arrays
X = np.array(features)
y = np.array(labels)

# Sanity check: one label per feature vector, and at least one sample
assert X.shape[0] == y.shape[0] and X.shape[0] > 0, "no features were extracted"

# Encode the labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data into training and testing sets
# (stratified so each genre keeps the same share in both sets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Standardize the features
# (the scaler is fitted on the training set only and then applied to the test set)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a classifier (e.g., SVM)
model = SVC(kernel='rbf', C=10)
model.fit(X_train_scaled, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print a classification report for detailed metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Model Accuracy: 67.00%

Classification Report:
              precision    recall  f1-score   support

       blues       0.81      0.85      0.83        20
   classical       0.81      0.85      0.83        20
     country       0.71      0.60      0.65        20
       disco       0.50      0.65      0.57        20
      hiphop       0.61      0.55      0.58        20
        jazz       0.75      0.75      0.75        20
       metal       0.93      0.70      0.80        20
         pop       0.78      0.70      0.74        20
      reggae       0.52      0.60      0.56        20
        rock       0.43      0.45      0.44        20

    accuracy                           0.67       200
   macro avg       0.68      0.67      0.67       200
weighted avg       0.68      0.67      0.67       200



Approach 2: Image-based Classification (Spectrograms) with CNNs

In [10]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import zipfile

# --- FIX 1: UNZIP THE DATASET ---
# Define the path to your zipped audio data
zip_path = '/content/genres_original.zip'

# Define the directory where you want to extract the contents
# This will be your new dataset_path
extracted_path = '/content/GTZAN_genres'

# Create the extraction directory if it doesn't exist
os.makedirs(extracted_path, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    print(f"Extracting {zip_path} to {extracted_path}...")
    zip_ref.extractall(extracted_path)
    print("Extraction complete.")


# --- FIX 2: CORRECT THE DATASET AND SPECTROGRAM PATHS ---
# Now, dataset_path points to the extracted folder, not the .zip file
# The unzipped folder is likely named 'genres_original' inside your new extracted_path
dataset_path = os.path.join(extracted_path, 'genres_original')
spectrogram_path = '/content/images_original'

# Create the output directory for spectrograms if it doesn't exist
os.makedirs(spectrogram_path, exist_ok=True)

# Loop through each genre and file, writing one PNG per .wav clip into
# <spectrogram_path>/<genre>/
for genre in os.listdir(dataset_path):
    genre_audio_path = os.path.join(dataset_path, genre)
    genre_image_path = os.path.join(spectrogram_path, genre)

    if not os.path.isdir(genre_audio_path):
        continue

    os.makedirs(genre_image_path, exist_ok=True)

    print(f"Generating spectrograms for genre: {genre}")
    for filename in os.listdir(genre_audio_path):
        if not filename.endswith('.wav'):
            continue

        audio_file = os.path.join(genre_audio_path, filename)
        # Swap only the extension; str.replace would rewrite every '.wav' in the name
        image_file = os.path.join(genre_image_path, os.path.splitext(filename)[0] + '.png')

        try:
            # Load audio file
            y, sr = librosa.load(audio_file, mono=True, duration=30)

            # Generate Mel-spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Save the spectrogram as an image
            fig = plt.figure(figsize=(10, 4), frameon=False)
            try:
                librosa.display.specshow(mel_spectrogram_db, x_axis='time', y_axis='mel')
                plt.axis('off') # Remove axes and border
                plt.tight_layout(pad=0)
                plt.savefig(image_file, bbox_inches='tight', pad_inches=0)
            finally:
                # Always release the figure, even when drawing or saving fails,
                # so failed clips do not accumulate open figures in memory
                plt.close(fig)
        except Exception as e:
            # repr() keeps the exception type visible when its message is empty
            print(f"Error processing {audio_file}: {e!r}")

Extracting /content/genres_original.zip to /content/GTZAN_genres...
Extraction complete.
Generating spectrograms for genre: country
Generating spectrograms for genre: pop
Generating spectrograms for genre: rock
Generating spectrograms for genre: reggae
Generating spectrograms for genre: jazz
Error processing /content/GTZAN_genres/genres_original/jazz/jazz.00054.wav: 
Generating spectrograms for genre: disco
Generating spectrograms for genre: hiphop
Generating spectrograms for genre: metal
Generating spectrograms for genre: blues
Generating spectrograms for genre: classical


Step 2: Build and Train the CNN Model

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing import image_dataset_from_directory

# Target (height, width) every spectrogram image is resized to when loaded
img_height, img_width = 288, 432 # Default dimensions for GTZAN spectrograms

# Options shared by the training and validation loaders: both use the same
# split fraction, seed, image size and batch size
split_options = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=32,
)

# Load the two subsets from the per-genre image folders
train_ds = image_dataset_from_directory(spectrogram_path, subset="training", **split_options)
val_ds = image_dataset_from_directory(spectrogram_path, subset="validation", **split_options)

# One output unit per class folder found by the loader
num_classes = len(train_ds.class_names)

# Assemble the CNN layer by layer
model = Sequential()
# Rescale pixel values from [0, 255] to [0, 1]
model.add(tf.keras.layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)))
# Three convolution + max-pooling stages with 32, 64 and 128 filters
for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
# Classifier head: flatten -> dense -> dropout -> softmax over the classes
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile with integer-label cross-entropy and track accuracy
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train, monitoring the validation subset after each epoch
epochs = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

# Final evaluation; note that it runs on val_ds, the same validation subset
# that was monitored during training
loss, accuracy = model.evaluate(val_ds)
print(f"Test accuracy: {accuracy * 100:.2f}%")

Found 999 files belonging to 10 classes.
Using 800 files for training.
Found 999 files belonging to 10 classes.
Using 199 files for validation.
Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 15s/step - accuracy: 0.1259 - loss: 3.8768 - val_accuracy: 0.1709 - val_loss: 2.1189
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 15s/step - accuracy: 0.2168 - loss: 2.1042 - val_accuracy: 0.1960 - val_loss: 2.0242
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 14s/step - accuracy: 0.2747 - loss: 1.9771 - val_accuracy: 0.3518 - val_loss: 1.7993
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 14s/step - accuracy: 0.3763 - loss: 1.7434 - val_accuracy: 0.4372 - val_loss: 1.6419
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 14s/step - accuracy: 0.3884 - loss: 1.6676 - val_accuracy: 0.4372 - val_loss: 1.6045
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━