In [7]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import math, json, os
from tensorflow.keras.models import load_model

In [2]:
model = load_model('model_cnn2.h5')
model.summary()



In [3]:
def predict_audio(file_path, fs=22050, n_mfcc=11, n_fft=2048, hop_length=512, segment_duration=30):
    """
    Predict the genre of an audio file, ensuring the correct input shape for the model.

    Parameters:
        file_path (str): Path to the audio file.
        fs (int): Sampling rate for audio.
        n_mfcc (int): Number of MFCC coefficients (features).
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.
        segment_duration (int): Duration of each segment in seconds.

    Returns:
        predicted_genre (str): Predicted genre of the audio.
    """
    # Load the audio file
    audio, _ = librosa.load(file_path, sr=fs)

    # Calculate segment length in samples
    segment_length = fs * segment_duration

    # Handle short audio: Pad to 30 seconds
    if len(audio) < segment_length:
        audio = np.pad(audio, (0, segment_length - len(audio)), mode='constant')

    # Handle long audio: Split into segments
    num_segments = len(audio) // segment_length
    predictions = []

    for i in range(num_segments):
        start = i * segment_length
        end = start + segment_length
        segment = audio[start:end]

        # Extract MFCCs for the segment
        mfcc = librosa.feature.mfcc(
            y=segment,
            sr=fs,
            n_mfcc=n_mfcc,  # Match the expected number of features
            n_fft=n_fft,
            hop_length=hop_length
        )

        # Ensure we have exactly 128 time steps
        if mfcc.shape[1] < 128:  # Pad if shorter
            mfcc = np.pad(mfcc, ((0, 0), (0, 128 - mfcc.shape[1])), mode='constant')
        else:  # Truncate if longer
            mfcc = mfcc[:, :128]

        # Transpose to match the model's input shape (time_steps, features)
        mfcc = mfcc.T  # Now shape is (128, 11)

        # Add batch and channel dimensions
        # Final shape should be (batch_size, time_steps, features, channels)
        mfcc = mfcc[np.newaxis, ..., np.newaxis]  # Reshape to (1, 128, 11, 1)

        # Predict the genre for this segment
        pred_probs = model.predict(mfcc, verbose=0)
        pred_class = np.argmax(pred_probs, axis=1)
        predictions.append(pred_class[0])

    # Aggregate predictions across all segments
    from collections import Counter
    most_common_genre = Counter(predictions).most_common(1)[0][0]

    # Map to genre name
    genre_map = {0: 'Blues', 1: 'Classical', 2: 'Country', 3: 'Disco', 4: 'Hip Hop',
                 5: 'Jazz', 6: 'Metal', 7: 'Pop', 8: 'Reggae', 9: 'Rock'}
    predicted_genre = genre_map[most_common_genre]

    return predicted_genre

In [4]:
def predict_audio(file_path, fs=22050, n_mfcc=13, n_fft=2048, hop_length=512):
    """
    Predict the genre of an audio file by splitting it into 3-second segments.

    Parameters:
        file_path (str): Path to the audio file.
        fs (int): Sampling rate for audio.
        n_mfcc (int): Number of MFCC coefficients (features).
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.

    Returns:
        predicted_genre (str): Predicted genre of the audio.
    """
    # Load the audio file
    audio, _ = librosa.load(file_path, sr=fs)

    # Get total duration of the audio in seconds
    total_duration = librosa.get_duration(y=audio, sr=fs)
    print(f"Duration of audio: {total_duration:.2f} seconds")

    # Calculate the number of 3-second segments
    segment_duration = 3  # seconds
    num_segments = int(total_duration // segment_duration)
    print(f"Number of 3-second segments: {num_segments}")

    # Calculate segment length in samples
    segment_length = fs * segment_duration  # Samples per segment
    predictions = []

    # Process each 3-second segment
    for segment in range(num_segments):
        start = segment * segment_length
        end = start + segment_length
        segment_audio = audio[start:end]

        # Extract MFCCs
        mfcc = librosa.feature.mfcc(
            y=segment_audio,
            sr=fs,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )

        # Ensure MFCC has the expected shape
        if mfcc.shape[1] < 128:  # Pad if shorter
            mfcc = np.pad(mfcc, ((0, 0), (0, 128 - mfcc.shape[1])), mode='constant')
        else:  # Truncate if longer
            mfcc = mfcc[:, :128]

        # Transpose MFCC to match the model's input shape (time_steps, features)
        mfcc = mfcc.T  # Shape: (128, n_mfcc)

        # Add batch and channel dimensions
        mfcc = mfcc[np.newaxis, ..., np.newaxis]  # Shape: (1, 128, n_mfcc, 1)

        # Predict genre for the segment
        pred_probs = model.predict(mfcc, verbose=0)
        pred_class = np.argmax(pred_probs, axis=1)
        predictions.append(pred_class[0])

    # Aggregate predictions across all 3-second segments
    from collections import Counter
    most_common_genre = Counter(predictions).most_common(1)[0][0]

    # Map to genre name
    genre_map = {0: 'Blues', 1: 'Classical', 2: 'Country', 3: 'Disco', 4: 'Hip Hop',
                 5: 'Jazz', 6: 'Metal', 7: 'Pop', 8: 'Reggae', 9: 'Rock'}
    predicted_genre = genre_map[most_common_genre]

    return predicted_genre


In [5]:
def extract_mfcc_for_song(file_path, fs=22050, duration=30, n_fft=2048, hop_length=512, n_mfcc=13, num_segments=10):
    """
    Extract MFCCs from a single audio file.
    Splits the audio into smaller segments for better analysis and training data.

    Parameters:
        file_path (str): Path to the audio file.
        fs (int): Sampling rate for the audio.
        duration (int): Duration of the audio in seconds (default is 30).
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.
        n_mfcc (int): Number of MFCC coefficients.
        num_segments (int): Number of segments to split the audio into.

    Returns:
        mfcc_data (list): List of MFCCs for each segment of the audio file.
    """
    samples_per_track = fs * duration
    samples_per_segment = int(samples_per_track / num_segments)
    mfccs_per_segment = math.ceil(samples_per_segment / hop_length)

    print("Starting MFCC extraction for file...")
    mfcc_data = []

    try:
        # Load the audio file
        audio, _ = librosa.load(file_path, sr=fs)

        for segment in range(num_segments):
            start = segment * samples_per_segment
            end = start + samples_per_segment
            mfcc = librosa.feature.mfcc(
                y=audio[start:end], sr=fs, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length
            ).T

            # Validate segment shape
            if len(mfcc) == mfccs_per_segment:
                mfcc_data.append(mfcc.tolist())

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

    #print(f"Finished extracting MFCCs for: {os.path.basename(file_path)}")
    return mfcc_data


In [6]:
path1 = "./data/genres_original/blues/blues.00002.wav"
mfccs = extract_mfcc_for_song(
    file_path=path1,
    fs=22050,
    duration=30,
    n_fft=2048,
    hop_length=512,
    n_mfcc=13,
    num_segments=10
)

predictions = []
for mfcc in mfccs:
    mfcc_array = np.array(mfcc)  # Convert list to numpy array
    mfcc_array = mfcc_array[np.newaxis, ..., np.newaxis]  # Add batch and channel dimensions

    # Predict the genre for this MFCC
    pred_probs = model.predict(mfcc_array, verbose=0)
    pred_class = np.argmax(pred_probs, axis=1)
    predictions.append(pred_class[0])

# Aggregate predictions across all segments
from collections import Counter
most_common_genre = Counter(predictions).most_common(1)[0][0]

# Map prediction to genre name
genre_map = {0: 'Blues', 1: 'Classical', 2: 'Country', 3: 'Disco', 4: 'Hip Hop',
             5: 'Jazz', 6: 'Metal', 7: 'Pop', 8: 'Reggae', 9: 'Rock'}
predicted_genre = genre_map[most_common_genre]

# Print the result
print(f"Predicted Genre: {predicted_genre}")

NameError: name 'math' is not defined

In [None]:
path1 = wav_files["blues1"]  # Example: Blues song 1
path1 = "./data/genres_original/blues/blues.00001.wav"
path2 = "./data/genres_original/blues/blues.00002.wav"
path3 = "./data/genres_original/blues/blues.00003.wav"
path4 = "./data/genres_original/blues/blues.00004.wav"
path5 = "./data/genres_original/blues/blues.00005.wav"
path6 = "./data/genres_original/blues/blues.00006.wav"
path7 = "./data/genres_original/blues/blues.00007.wav"
path8 = "./data/genres_original/blues/blues.00008.wav"
path9 = "./data/genres_original/blues/blues.00009.wav"
path10 = "./data/genres_original/blues/blues.00010.wav"

mfccs = extract_mfcc_for_song(
    file_path=path1,
    fs=22050,
    duration=30,
    n_fft=2048,
    hop_length=512,
    n_mfcc=13,
    num_segments=10
)

genre1 = predict_audio(path1)
print(f"Predicted Genre (Short Audio): {genre1}")
genre2 = predict_audio(path2)
print(f"Predicted Genre (Short Audio): {genre2}")
genre3 = predict_audio(path3)
print(f"Predicted Genre (Short Audio): {genre3}")
genre4 = predict_audio(path4)
print(f"Predicted Genre (Short Audio): {genre4}")
genre5 = predict_audio(path5)
print(f"Predicted Genre (Short Audio): {genre5}")
genre6 = predict_audio(path6)
print(f"Predicted Genre (Short Audio): {genre6}")
genre7 = predict_audio(path7)
print(f"Predicted Genre (Short Audio): {genre7}")
genre8 = predict_audio(path8)
print(f"Predicted Genre (Short Audio): {genre8}")
genre9 = predict_audio(path9)
print(f"Predicted Genre (Short Audio): {genre9}")
genre10 = predict_audio(path10)
print(f"Predicted Genre (Short Audio): {genre10}")

Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Metal
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Metal
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Disco
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Disco
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Disco
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Hip Hop
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Disco
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Disco
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre (Short Audio): Rock
Duration of audio: 30.01 seconds
Number of 3-second segments: 10
Predicted Genre 

In [None]:
from pydub import AudioSegment

def convert_to_wav(path, target_path):
    """
    Convert an audio file to WAV format using PyDub.
    """
    audio = AudioSegment.from_file(path)
    audio.export(target_path, format="wav")
    print(f"Converted {path} to {target_path}")

In [None]:
# file paths
blues1 = "./tests/blues_crossroads.mp3"
blues2 = "./tests/blues_thrillisgone.mp3"

classical1 = "./tests/classical_beethoven9.mp3"
classical2 = "./tests/classical_mozart.mp3"

country1 = "./tests/country_takemehome.mp3"
country2 = "./tests/country_jolene.mp3"

disco1 = "./tests/disco_stayingalive.mp3"
disco2 = "./tests/disco_ymca.mp3"

hiphop1 = "./tests/hiphop_sickomode.mp3"
hiphop2 = "./tests/hiphop_luciddreams.mp3"

jazz1 = "./tests/jazz_flymetothemoon.mp3"
jazz2 = "./tests/jazz_whatawonderfulworld.mp3"

metal1 = "./tests/metal_masterofpuppets.mp3"
metal2 = "./tests/metal_ironman.mp3"

pop1 = "./tests/pop_shapeofyou.mp3"
pop2 = "./tests/pop_baby.mp3"

reggae1 = "./tests/reggae_nowomannocry.mp3"
reggae2 = "./tests/reggae_badboys.mp3"

rock1 = "./tests/rock_bohemianrhapsody.mp3"
rock2 = "./tests/rock_stairwaytoheaven.mp3"

# wav file paths
wav_files = {
    "blues1": "./tests/blues_crossroads.wav",
    "blues2": "./tests/blues_thrillisgone.wav",
    "classical1": "./tests/classical_beethoven9.wav",
    "classical2": "./tests/classical_mozart.wav",
    "country1": "./tests/country_takemehome.wav",
    "country2": "./tests/country_jolene.wav",
    "disco1": "./tests/disco_stayingalive.wav",
    "disco2": "./tests/disco_ymca.wav",
    "hiphop1": "./tests/hiphop_sickomode.wav",
    "hiphop2": "./tests/hiphop_luciddreams.wav",
    "jazz1": "./tests/jazz_flymetothemoon.wav",
    "jazz2": "./tests/jazz_whatawonderfulworld.wav",
    "metal1": "./tests/metal_masterofpuppets.wav",
    "metal2": "./tests/metal_ironman.wav",
    "pop1": "./tests/pop_shapeofyou.wav",
    "pop2": "./tests/pop_baby.wav",
    "reggae1": "./tests/reggae_nowomannocry.wav",
    "reggae2": "./tests/reggae_badboys.wav",
    "rock1": "./tests/rock_bohemianrhapsody.wav",
    "rock2": "./tests/rock_stairwaytoheaven.wav",
}

In [None]:
mp3_files = [blues1, blues2, classical1, classical2, country1, country2,
             disco1, disco2, hiphop1, hiphop2, jazz1, jazz2,
             metal1, metal2, pop1, pop2, reggae1, reggae2, rock1, rock2]

for key, wav_path in wav_files.items():
    convert_to_wav(eval(key), wav_path)

Converted ./tests/blues_crossroads.mp3 to ./tests/blues_crossroads.wav
Converted ./tests/blues_thrillisgone.mp3 to ./tests/blues_thrillisgone.wav
Converted ./tests/classical_beethoven9.mp3 to ./tests/classical_beethoven9.wav
Converted ./tests/classical_mozart.mp3 to ./tests/classical_mozart.wav
Converted ./tests/country_takemehome.mp3 to ./tests/country_takemehome.wav
Converted ./tests/country_jolene.mp3 to ./tests/country_jolene.wav
Converted ./tests/disco_stayingalive.mp3 to ./tests/disco_stayingalive.wav
Converted ./tests/disco_ymca.mp3 to ./tests/disco_ymca.wav
Converted ./tests/hiphop_sickomode.mp3 to ./tests/hiphop_sickomode.wav
Converted ./tests/hiphop_luciddreams.mp3 to ./tests/hiphop_luciddreams.wav
Converted ./tests/jazz_flymetothemoon.mp3 to ./tests/jazz_flymetothemoon.wav
Converted ./tests/jazz_whatawonderfulworld.mp3 to ./tests/jazz_whatawonderfulworld.wav
Converted ./tests/metal_masterofpuppets.mp3 to ./tests/metal_masterofpuppets.wav
Converted ./tests/metal_ironman.mp3 t

In [None]:
import math

path1 = wav_files["blues1"]  # Example: Blues song 1
genre1 = predict_audio(path1)
print(f"Predicted Genre (Short Audio): {genre1}")

path2 = wav_files["rock2"]  # Example: Rock song 2
genre2 = predict_audio(path2)
print(f"Predicted Genre (Long Audio): {genre2}")

Predicted Genre (Short Audio): Rock
Predicted Genre (Long Audio): Reggae


In [None]:
new_audio_path = './data/genres_original/reggae/reggae.00000.wav'
genre = predict_audio(new_audio_path)
print(f"Predicted Genre (New Audio): {genre}")

Predicted Genre (New Audio): Country
