In [None]:
#pip installs
!pip3 install -q ipython-autotime
!pip3 install h5py -q
!mkdir /kaggle/tmp

In [None]:
%reload_ext autotime
import numpy as np
from pathlib import Path
import librosa
import h5py
import gc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import torch
import torchaudio
from tqdm.notebook import tqdm

In [None]:
# --- Audio framing ---------------------------------------------------------
# Recordings are loaded at `sampling_rate` Hz and cut into consecutive,
# non-overlapping chunks of `chunk_duration` seconds (130880 samples each).
sampling_rate = 16000
chunk_duration = 8.18
chunk_samples = int(sampling_rate * chunk_duration)

# --- Locations -------------------------------------------------------------
# Root of the input dataset and the HDF5 file that receives the spectrograms.
base_path = Path('/kaggle/input/thaat-and-raga-forest-dataset-ieee/Thaat and Raga Forest (TRF) Dataset')
output_file = '/kaggle/working/processed_spectrograms.h5'

In [None]:
# Count every entry found two levels below the dataset root
# (base_path/<thaat dir>/<raga dir>/<entry>); the result is used as the
# progress-bar total when the dataset is processed.
total_files = sum(
    len(list(raga_dir.glob('*')))
    for thaat_dir in base_path.iterdir()
    if thaat_dir.is_dir()
    for raga_dir in thaat_dir.iterdir()
    if raga_dir.is_dir()
)
print(total_files)

In [None]:
def generate_spectrogram(audio_clip, n_mels=256, n_fft=2048, hop_length=512, device='cuda'):
    """Turn one audio chunk into an 8-bit, min-max scaled log-mel spectrogram.

    Parameters
    ----------
    audio_clip : array-like
        Waveform samples of a single chunk. Converted to a float32 tensor.
        The mel transform is configured with the module-level
        ``sampling_rate``, so the samples are assumed to be at that rate.
    n_mels : int
        Number of mel bands.
    n_fft : int
        FFT size passed to the mel transform.
    hop_length : int
        Hop between successive frames, in samples.
    device : str or torch.device
        Device the transform runs on. A CUDA device is replaced by the CPU
        when CUDA is not available, so the notebook still runs without a GPU.

    Returns
    -------
    numpy.ndarray
        Host-side ``uint8`` array with values in [0, 255]. It is all zeros
        when the dB spectrogram has no dynamic range (e.g. a silent chunk),
        where min-max scaling would otherwise divide by zero.
    """
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')

    # float32 matches the transform's internal buffers regardless of the
    # dtype the caller hands in.
    audio_tensor = torch.as_tensor(audio_clip, dtype=torch.float32, device=device)

    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sampling_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    ).to(device)

    # Pure feature extraction: no autograd graph is needed.
    with torch.no_grad():
        mel_spect = mel_transform(audio_tensor)

        # Convert power to decibels (log scale).
        mel_spect_db = torchaudio.transforms.AmplitudeToDB()(mel_spect)

        # Min-max scale to [0, 255]. A constant spectrogram has zero range;
        # dividing would produce NaN and an undefined uint8 cast, so emit
        # zeros instead. (A NaN range also fails the `> 0` test.)
        db_min = mel_spect_db.min()
        db_range = mel_spect_db.max() - db_min
        if db_range > 0:
            mel_uint8 = ((mel_spect_db - db_min) / db_range * 255).to(torch.uint8)
        else:
            mel_uint8 = torch.zeros_like(mel_spect_db, dtype=torch.uint8)

    mel_spect_db_cpu = mel_uint8.cpu().numpy()  # single device-to-host copy

    # Drop the device tensors before asking the allocator to release its cache.
    del mel_uint8, mel_spect_db, mel_spect, audio_tensor
    if device.type == 'cuda':
        torch.cuda.empty_cache()
    return mel_spect_db_cpu

In [None]:
def process_audio_dataset(batch_size=100):
    """Convert every audio file under ``base_path`` into rows of an HDF5 file.

    The tree walked is ``base_path/<thaat dir>/<raga dir>/<file>``. Each file
    is loaded with librosa at ``sampling_rate``, cut into consecutive chunks
    of ``chunk_samples`` samples (a trailing partial chunk is dropped), and
    each chunk is turned into a spectrogram by ``generate_spectrogram``.
    Spectrograms and one-hot thaat labels are appended to the resizable
    datasets ``'spectrograms'`` and ``'labels'`` in ``output_file``.

    Reads the module-level names ``base_path``, ``output_file``,
    ``sampling_rate``, ``chunk_samples`` and ``total_files``.

    Parameters
    ----------
    batch_size : int
        Number of buffered spectrograms that triggers a write to disk.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``base_path`` contains no thaat directories.

    Notes
    -----
    A file that fails to load or process is reported and skipped. Class names
    are not stored in the HDF5 file; label column ``k`` corresponds to
    ``label_encoder.classes_[k]``, which is the order printed at start-up.
    """
    # The thaat name is the first space-separated token of the directory name
    # (this drops a trailing "(thaat)"-style suffix).
    all_thaats = [
        thaat_path.name.split(" ")[0]
        for thaat_path in base_path.iterdir()
        if thaat_path.is_dir()
    ]
    if not all_thaats:
        raise ValueError(f"No thaat directories found under {base_path}")

    # Fit the encoders: names -> integer ids -> one-hot rows.
    label_encoder = LabelEncoder()
    label_encoder.fit(all_thaats)
    onehot_encoder = OneHotEncoder(sparse_output=False)
    onehot_encoder.fit(label_encoder.transform(all_thaats).reshape(-1, 1))
    n_classes = len(label_encoder.classes_)

    print(f"Found {n_classes} thaat classes: {', '.join(label_encoder.classes_)}")
    print(f"Total audio files to process: {total_files}")

    with h5py.File(output_file, 'w') as hf:
        # Both datasets start empty and grow along axis 0 as batches arrive.
        spect_dset = hf.create_dataset(
            'spectrograms',
            shape=(0, 256, 256),
            maxshape=(None, 256, 256),  # each row is one 256 x 256 spectrogram
            dtype='uint8',
            compression='gzip',
            compression_opts=9,
            chunks=(5, 256, 256)
        )
        label_dset = hf.create_dataset(
            'labels',
            shape=(0, n_classes),
            maxshape=(None, n_classes),
            dtype='float32',
            compression='gzip',
            chunks=(5, n_classes)
        )

        current_position = 0  # number of rows already written
        temp_spects = []
        temp_labels = []

        def flush_buffers(position):
            """Append the buffered rows at ``position``; return the new row count."""
            spect_batch = np.array(temp_spects, dtype='uint8')
            label_batch = np.array(temp_labels, dtype='float32')
            new_size = position + len(spect_batch)

            spect_dset.resize((new_size, 256, 256))
            label_dset.resize((new_size, n_classes))
            spect_dset[position:new_size] = spect_batch
            label_dset[position:new_size] = label_batch

            # Clear in place so the enclosing loop keeps using the same lists.
            temp_spects.clear()
            temp_labels.clear()
            gc.collect()
            return new_size

        # The context manager closes the bar even if an error escapes.
        with tqdm(total=total_files, desc="Processing files") as pbar:
            for thaat_path in base_path.iterdir():
                if not thaat_path.is_dir():
                    continue

                thaat_name = thaat_path.name.split(" ")[0]
                print(f"\nProcessing thaat: {thaat_name}")

                # Every chunk below this directory shares one label, so encode
                # it once instead of once per chunk.
                label_index = label_encoder.transform([thaat_name])[0]
                label_onehot = onehot_encoder.transform([[label_index]])[0]

                for raga_path in thaat_path.iterdir():
                    if not raga_path.is_dir():
                        continue

                    for audio_file in raga_path.glob('*'):
                        try:
                            y, _ = librosa.load(str(audio_file), mono=True, sr=sampling_rate)

                            # The range stops early enough that every slice is
                            # a complete chunk of `chunk_samples` samples.
                            for start in range(0, len(y) - chunk_samples + 1, chunk_samples):
                                spect = generate_spectrogram(y[start:start + chunk_samples])

                                # Append both together so the buffers never
                                # get out of step.
                                temp_spects.append(spect)
                                temp_labels.append(label_onehot)

                                if len(temp_spects) >= batch_size:
                                    current_position = flush_buffers(current_position)

                            # Release the waveform before loading the next file.
                            del y
                            gc.collect()

                        except Exception as e:
                            # Best effort: report the file and move on.
                            print(f"\nError processing file {audio_file}: {str(e)}")
                        finally:
                            pbar.update(1)

        # Save any remaining data.
        if temp_spects:
            current_position = flush_buffers(current_position)

        # `current_position` is always defined, including when nothing was
        # generated at all.
        print(f"\nProcessing complete. Total spectrograms generated: {current_position}")

In [None]:
# Run the full conversion; buffered spectrograms are written to the HDF5 file
# at `output_file` each time 100 of them have accumulated.
process_audio_dataset(batch_size=100)

In [None]:
# with h5py.File('/kaggle/working/processed_spectrograms.h5', 'r') as hf:
#     # Load spectrograms and labels datasets
#     spectrograms = hf['spectrograms'][:]
#     labels = hf['labels'][:]
# # Check loaded data shapes
# print("Spectrograms shape:", spectrograms.shape)  # Expected: (total_samples, 256, 256)
# print("Labels shape:", labels.shape)              # Expected: (total_samples, n_classes)

# # Example: Access the first spectrogram and label
# first_spectrogram = spectrograms[0]
# first_label = labels[0]
# print("First spectrogram shape:", first_spectrogram.shape)
# print("First label (one-hot):", first_label)


In [None]:
# import matplotlib.pyplot as plt

# # Plot the Mel spectrogram
# def plot_mel_spectrogram(mel_spectrogram, sr=16000, hop_length=512, title="Mel Spectrogram"):
#     plt.figure(figsize=(10, 4))
#     librosa.display.specshow(mel_spectrogram, 
#                              sr=sr, 
#                              hop_length=hop_length, 
#                              x_axis='time', 
#                              y_axis='mel')
#     plt.colorbar(format='%+2.0f dB')
#     plt.title(title)
#     plt.tight_layout()
#     plt.show()

# # Example usage with the first spectrogram
# plot_mel_spectrogram(spectrograms[657], sr=sampling_rate, hop_length=512)


In [None]:
# !zip data.zip /kaggle/working/processed_spectrograms.h5