In [1]:
# !pip install opendatasets --quiet
# import opendatasets as od
# od.download("https://www.kaggle.com/datasets/mohammedalrajeh/quran-recitations-for-audio-classification")

In [2]:
# !pip install pandas numpy librosa plotly scikit-learn tqdm

In [1]:
import torch # Main PyTorch Library
from torch import nn # Used for creating the layers and loss function
from torch.optim import Adam # Adam Optimizer
import librosa # Library that is used to read and process audio files
from torch.utils.data import Dataset, DataLoader # Dataset class and DataLoader for creating the objects
from sklearn.preprocessing import LabelEncoder # Label Encoder to encode the classes from strings to numbers
import matplotlib.pyplot as plt # Used for visualizing the images and plotting the training progress
import pandas as pd # Used to read/create dataframes (csv) and process tabular data
import numpy as np # preprocessing and numerical/mathematical operations
import os # Used to read the images path from the directory
import time # Used to calculate time for each epoch or any processing time in seconds
from sklearn.manifold import TSNE # Used to visualize the embeddings in 2D
from tqdm.notebook import tqdm # Used to visualize the progress of the training loop
import plotly.express as px 
# from skimage.transform import resize # Used to resize the images and we will use it to resize audio waves

# detect the GPU if any, if not use CPU, change "cuda" to "mps" if you have a mac
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
data_df = pd.read_csv("files_paths.csv")

data_df['FilePath'] = 'PyTorch_Exc/Audio_Classification' + data_df['FilePath'].str[1:]

data_df

Unnamed: 0,FilePath,Class
0,PyTorch_Exc/Audio_Classification/Dataset/Moham...,Mohammed_Aluhaidan
1,PyTorch_Exc/Audio_Classification/Dataset/Moham...,Mohammed_Aluhaidan
2,PyTorch_Exc/Audio_Classification/Dataset/Moham...,Mohammed_Aluhaidan
3,PyTorch_Exc/Audio_Classification/Dataset/Moham...,Mohammed_Aluhaidan
4,PyTorch_Exc/Audio_Classification/Dataset/Moham...,Mohammed_Aluhaidan
...,...,...
6682,PyTorch_Exc/Audio_Classification/Dataset/Abdul...,Abdullah_Albuaijan
6683,PyTorch_Exc/Audio_Classification/Dataset/Abdul...,Abdullah_Albuaijan
6684,PyTorch_Exc/Audio_Classification/Dataset/Abdul...,Abdullah_Albuaijan
6685,PyTorch_Exc/Audio_Classification/Dataset/Abdul...,Abdullah_Albuaijan


In [3]:
import warnings
warnings.filterwarnings('ignore')

# === STEP 1: DEFINE DATASET PATH AND CREATE METADATA DATAFRAME ===
# --- IMPORTANT: Change this path to where your 'Dataset' folder is located ---
DATASET_ROOT_PATH = 'Dataset'
print(f"Scanning for audio files in: {DATASET_ROOT_PATH}")

# Check if the path exists
if not os.path.exists(DATASET_ROOT_PATH):
    print(f"Error: The directory '{DATASET_ROOT_PATH}' was not found.")
    print("Please make sure you have set the DATASET_ROOT_PATH variable correctly.")

else:
    file_list = []
    # Walk through the directory structure
    for root, dirs, files in os.walk(DATASET_ROOT_PATH):
        for file in files:
            if file.endswith('.wav'):  # Check if the file is an audio file
                # Get the full file path
                file_path = os.path.join(root, file)
                # The reciter's name is the name of the parent folder
                reciter_name = os.path.basename(root)
                file_list.append({'reciter_name': reciter_name ,'file_path': file_path})

    # Create a pandas DataFrame
    metadata_df = pd.DataFrame(file_list)
    print(f"\nSuccessfully created DataFrame with {len(metadata_df)} audio files ")
    print("Here's a sample of the data:")
    print(metadata_df.head())


# === STEP 2: DURATION ANAYLSIS ===
# Let's verify that the audio files are short, consistent chunks.
print("\n--- Starting Step 2: Duration Analysis ---")

# This function is safe and returns None if a file is corrupt
def get_duration(file_path):
    try:
        return librosa.get_duration(path=file_path)
    except Exception as e:
        print(f"Could nor process {file_path}: {e}")
        return None
    
# Use tqdm to show a progress bar as this can take a moment
tqdm.pandas(desc="Calculating durations")
metadata_df['duration_s'] = metadata_df['file_path'].progress_apply(get_duration)

# Plot the results
fig_duration = px.box(metadata_df, x='reciter_name', y='duration_s', 
                      title='<b>Audio Chunk Duration by Reciter</b>',
                      labels={'reciter_name': 'Reciter', 'duration_s': 'Duration (seconds)'})

fig_duration.update_xaxes(tickangle=45)
print("Displaying interactive duration plot...")
fig_duration.show()


# === STEP 3: VOCAL FEATURE EXTRACTION ===
# This is the most intensive step. We will extract Pitch, Timbre, and MFCCs.
print("\n--- Starting Step 3: Vocal Feature Extraction (This will take several minutes)---")

def extract_features(file_path):
    try:
        # Load the audio file
        y, sr = librosa.load(file_path, sr=None)

        # 1. Spectral Centroid (Timbre/Brightness)
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)[0])

        # 2. Pitch (Fundamental Frequency) - using a robust method
        pitches, _ = librosa.core.piptrack(y=y, sr=sr)
        # get the mean pitch of non-zero pitches
        valid_pitches = pitches[pitches > 0]
        avg_pitch = np.mean(valid_pitches) if len(valid_pitches) > 0 else 0.0

        # 3. MFCCs (for t-SNE) - we take the mean across time
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
        # ???? -  za ovo mi treba skripta

        return pd.Series([spectral_centroid, avg_pitch, mfccs])
    
    except Exception as e:
        print(f"Could not process {file_path}: {e}")
        return pd.Series([None, None, None])
    
# Apply the feature extraction function
tqdm.pandas(desc="Extracting Vocal Features")
metadata_df[['spectral_centroid', 'avg_pitch', 'mfccs']] = metadata_df['file_path'].progress_apply(extract_features)

print("\nFeature extraction complete.")
print("Here's the DataFrame with the new features:")
print(metadata_df.head())


Scanning for audio files in: Dataset

Successfully created DataFrame with 6688 audio files 
Here's a sample of the data:
         reciter_name                                    file_path
0  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_207.wav
1  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_204.wav
2  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_213.wav
3  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_092.wav
4  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_017.wav

--- Starting Step 2: Duration Analysis ---


Calculating durations:   0%|          | 0/6688 [00:00<?, ?it/s]

Displaying interactive duration plot...



--- Starting Step 3: Vocal Feature Extraction (This will take several minutes)---


Extracting Vocal Features:   0%|          | 0/6688 [00:00<?, ?it/s]


Feature extraction complete.
Here's the DataFrame with the new features:
         reciter_name                                    file_path  \
0  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_207.wav   
1  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_204.wav   
2  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_213.wav   
3  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_092.wav   
4  Mohammed_Aluhaidan  Dataset/Mohammed_Aluhaidan/lohaidan_017.wav   

   duration_s  spectral_centroid    avg_pitch  \
0         2.5         781.396061   668.051208   
1         2.5         950.175422   783.456787   
2         2.5         906.323377   849.936035   
3         2.5        3222.642243  1713.367065   
4         2.5        3345.552239  1135.475464   

                                               mfccs  
0  [-251.52544, 236.43234, 17.117867, 4.023563, 8...  
1  [-221.94675, 220.82773, 9.017719, 2.2040062, 7...  
2  [-245.28973, 232.22041, 18.541721, -17.858

In [4]:
# --- ADD THIS CLEANING STEP AFTER FEATURE EXTRACTION (AFTER STEP 3) ---

print("\n--- Starting Data Cleaning Step ---")

# Let's see how many of these files we have
is_noise_red = metadata_df['file_path'].str.contains('_noiseRed', na=False)
is_long_duration = metadata_df['duration_s'] > 4 # A safe threshold for 5s clips

print(f"Found {is_noise_red.sum()} files with '_noiseRed'.")
print(f"Found {is_long_duration.sum()} files with duration > 4 seconds.")

# Create a cleaned DataFrame by excluding these files
# We use the ~ symbol, which means NOT
cleaned_df = metadata_df[~is_noise_red & ~is_long_duration].copy()

print(f"\nOriginal DataFrame size: {len(metadata_df)}")
print(f"Cleaned DataFrame size: {len(cleaned_df)}")
print(f"Removed {len(metadata_df) - len(cleaned_df)} non-standard files.")

# --- FROM THIS POINT ON, USE 'cleaned_df' FOR ALL SUBSEQUENT STEPS ---
# For example, when creating the t-SNE plot or performing the data split.


--- Starting Data Cleaning Step ---
Found 886 files with '_noiseRed'.
Found 1268 files with duration > 4 seconds.

Original DataFrame size: 6688
Cleaned DataFrame size: 5420
Removed 1268 non-standard files.


In [5]:
# === STEP 4: COMPARATIVE VISUALIZATION OF VOCAL FEATURES ===
print("\n--- Starting Step 4: Comparative Visualization of Vocal Features ---")

# Plot 1: Timbre / Brightness
fig_timbre = px.box(cleaned_df, x='reciter_name', y='spectral_centroid',
                    title='<b>Timbre/Brightness by Reciter</b>',
                    labels={'reciter_name': 'Reciter', 'spectral_centroid': 'Spectral Centroid (Higher is Brighter)'})
fig_timbre.update_xaxes(tickangle=45)
print("Displaying interactive timbre/brightness plot...")
fig_timbre.show()

# Plot 2: Average pitch
fig_pitch = px.box(cleaned_df, x='reciter_name', y='avg_pitch',
                   title='<b>Average Pitch by Reciter</b>',
                   labels={'reciter_name': 'Reciter', 'avg_pitch': 'Average Pitch (Hz)'})
fig_pitch.update_xaxes(tickangle=45)
print("Displaying interactive pitch plot...")
fig_pitch.show()



--- Starting Step 4: Comparative Visualization of Vocal Features ---
Displaying interactive timbre/brightness plot...


Displaying interactive pitch plot...


In [6]:
# === STEP 5: t-SNE VISUALIZATION OF THE ENTIRE DATASET ===
print("\n--- Starting Step 5: Creating 2D Embedding with t-SNE (This may take a moment) ---")

# Prepare the data for t-SNE
# We need to stack the MFCC arrays into 2D numpy array
mfcc_features = np.stack(cleaned_df['mfccs'].values) #stackaj horizontalno
reciter_labels = cleaned_df['reciter_name'].values

# Run t-SNE
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(mfcc_features)

# Create a new DataFrame for plotting the t-SNE results
df_tsne = pd.DataFrame(tsne_results, columns=['tsne1', 'tsne2'])
df_tsne['reciter'] = reciter_labels

# Plot the t-SNE results
fig_tsne = px.scatter(df_tsne, x='tsne1', y='tsne2', color='reciter',
                      title='<b>t-SNE Visualization of Reciter Vocal Similarities</b>',
                      labels={'tsne_1': 't-SNE Dimension 1', 'tsne_2': 't-SNE Dimension 2'},
                      hover_data=['reciter'])
fig_tsne.update_layout(legend_title_text='Reciter')
# fig_tsne.update_traces(marker=dict(size=8, opacity=0.8))
print("Displaying interactive t-SNE plot...")
fig_tsne.show()

print("\n\nData investigation is complete!")


--- Starting Step 5: Creating 2D Embedding with t-SNE (This may take a moment) ---
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 5420 samples in 0.006s...
[t-SNE] Computed neighbors for 5420 samples in 0.374s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5420
[t-SNE] Computed conditional probabilities for sample 2000 / 5420
[t-SNE] Computed conditional probabilities for sample 3000 / 5420
[t-SNE] Computed conditional probabilities for sample 4000 / 5420
[t-SNE] Computed conditional probabilities for sample 5000 / 5420
[t-SNE] Computed conditional probabilities for sample 5420 / 5420
[t-SNE] Mean sigma: 20.239442
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.043381
[t-SNE] KL divergence after 300 iterations: 1.692988
Displaying interactive t-SNE plot...




Data investigation is complete!


In [7]:
import plotly.express as px
import librosa
import numpy as np

# Assuming 'audio_path' and 'sampling_rate' are loaded
y, sr = librosa.load('Dataset/AbdulBari_Althubaity/abdulbari_004.wav')
fig = px.line(x=np.arange(len(y)) / sr, y=y, labels={'x': 'Time (s)', 'y': 'Amplitude'})
fig.update_layout(title='Interactive Waveform')
fig.show()

# For an interactive spectrogram
D = librosa.stft(y)
DB = librosa.amplitude_to_db(np.abs(D), ref=np.max)
fig = px.imshow(DB, origin='lower', aspect='auto', 
                labels=dict(x="Time Frame", y="Frequency Bin", color="Decibels"))
fig.show()

<h1>5. Time-Series Cross Validation</h1>

<h4>5.1. Imports and Configuration</h4>

In [8]:
import random
import torchaudio

# --- Comfiguration ---
TARGET_SAMPLE_RATE = 22050
# Set the audio duration you want to use for the model (e.g., 2.5 seconds)
NUM_SAMPLES = int(TARGET_SAMPLE_RATE * 2.5)

# Gaps to test for the optimal split (in number of chunks)
GAPS_TO_TEST = [0, 5, 10, 15]

# Use fewer epochs for the gap search to make it faster
MODEL_TRAINING_EPOCHS_FOR_SEARCH = 3
BATCH_SIZE = 32

print("Configuration set.")

Configuration set.


<h4>5.2. Helper Functions</h4>

In [9]:
def extract_chunk_number(filename):
    """Extracts the sequential number from a filename like 'Yasser_047.wav'."""
    try:
        base_name = os.path.basename(filename)
        number_str = base_name.split('.')[0].split('_')[-1]
        return int(number_str)
    except:
        return -1
    
def get_df_from_filepaths(file_paths, source_df):
    """Filters the main DataFrame based on a list of the file paths."""
    return source_df[source_df['file_path'].isin(file_paths)].copy()

<h4>5.3. PyTorch Dataset and Model Classes</h4>

In [10]:
class AudioDataset(Dataset):
    """A PyTorch Dataset class with data augmentation and dB conversion."""
    def __init__(self, dataframe, target_sample_rate, num_samples, is_train=True):
        self.dataframe = dataframe
        self.device = device
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.is_train = is_train
        self.label_to_int = {label: i for i, label in enumerate(sorted(dataframe['reciter_name'].unique()))}
        self.int_to_label = {i: label for label, i in self.label_to_int.items()}
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=target_sample_rate,
            n_fft=1024,
            hop_length=512,
            n_mels=64
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        audio_path = self.dataframe.iloc[index]['file_path']   
        label = self.label_to_int[self.dataframe.iloc[index]['reciter_name']]
        signal, sr = torchaudio.load(audio_path)
        if sr!= self.target_sample_rate:
            signal = torchaudio.functional.resample(signal, sr, self.target_sample_rate)
        if signal.shape[0] > 1:
            signal = torch.mean(signal, axis=0, keepdim=True)
        if signal.shape[1] > self.num_samples:
            if self.is_train:
                start = np.random.randint(0, signal.shape[1] - self.num_samples)
                signal = signal[:, start : start + self.num_samples]
            else:
                start = (signal.shape[1] - self.num_samples) // 2
                signal = signal[:, :self.num_samples]
        elif signal.shape[1] < self.num_samples:
            padding = (0, self.num_samples - signal.shape[1])
            signal = torch.nn.functional.pad(signal, padding)
        mel_spec = self.mel_spectrogram(signal)
        db_mel_spec = self.amplitude_to_db(mel_spec)
        return db_mel_spec, label


class ReciterCNN(nn.Module):
    """A simple Convolutional Neural Network for audio classification"""
    def __init__(self, num_classes):
        super().__init__()
        self.conv_stack = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
            # nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            # nn.ReLU(),
            # nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.flatten = nn.Flatten()
        # Input features to the linear layer depends on spectrogram size after convolutions.
        # For a (64, 216) spectrogram from a 2.5s clip, this becomes (8, 13) after 3 pools.
        self.linear_stack = nn.Sequential(
            nn.Linear(64 * 8 * 13, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.conv_stack(x)
        # print(f"Input after convolution: {x.shape}")
        x = self.flatten(x)
        # print(f"Input after flattening: {x.shape}")
        logits = self.linear_stack(x)
        # print(f"Output logits: {logits.shape}")
        return logits
    

print("PyTorch Dataset and Model classes defined.")

PyTorch Dataset and Model classes defined.


<h4>5.4. The Experiment Engine Function</h4>

In [11]:
def create_and_train_model(train_df, val_df, num_epochs, batch_size):
    """Orchestrates a single training and validation run, 
    returns validation accuracy"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"  -> Training on {device} for {num_epochs} epochs...")

    train_dataset = AudioDataset(train_df, TARGET_SAMPLE_RATE, NUM_SAMPLES, is_train=True)
    val_dataset = AudioDataset(val_df, TARGET_SAMPLE_RATE, NUM_SAMPLES, is_train=False)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    num_classes = len(train_dataset.label_to_int)
    model = ReciterCNN(num_classes=num_classes).to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    best_accuracy = 0.0

    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} Training", leave=False)
        # Using tqdm for a progress bar within the notebook
        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, labels in val_dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = 100 * correct / total
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), "best_model.pth")
    
    print(f"  -> Run finished. Best validation accuracy: {best_accuracy:.2f}%")
    return best_accuracy

print("Experiment engine function 'create_and_train_model' defined.")


Experiment engine function 'create_and_train_model' defined.


<h4>5.5. Optimal Gap Search</h4>

In [12]:
# --- Prepare the cleaned DataFrame by adding the chunk_id
tqdm.pandas()
print("Preparing DataFrame for splitting by extracting chunk numbers...")
cleaned_df['chunk_id'] = cleaned_df['file_path'].progress_apply(extract_chunk_number)
cleaned_df = cleaned_df[cleaned_df['chunk_id'] != -1].copy()

Preparing DataFrame for splitting by extracting chunk numbers...


  0%|          | 0/5420 [00:00<?, ?it/s]

In [None]:
# # ---Run the Optimal Gap Search Experiment ---
# print("\n --- Starting Optimal Gap Search Experiment ---")
# performance_results = {}
# for gap in GAPS_TO_TEST:
#     print(f"\nTesting with gap_size = {gap}...")
#     train_list, val_list = [], []
#     for reciter, group in cleaned_df.groupby('reciter_name'):
#         group = group.sort_values('chunk_id')
#         files = group['file_path'].tolist()
#         n_files = len(files)

#         train_end = int(n_files * 0.7)
#         val_start = train_end + gap
#         val_end = val_start + int(n_files * 0.15)

#         # Ensure there are enough files in this group for the split
#         if val_end > n_files: continue

#         train_list.extend(files[:train_end])
#         val_list.extend(files[val_start:val_end])

#     train_df_gapped = get_df_from_filepaths(train_list, cleaned_df)
#     val_df_gapped = get_df_from_filepaths(val_list, cleaned_df)

#     # Run the training and validation for this gap
#     accuracy = create_and_train_model(train_df_gapped, val_df_gapped, MODEL_TRAINING_EPOCHS_FOR_SEARCH, BATCH_SIZE)
#     performance_results[gap] = accuracy

# # --- Analyze results ---
# print("\n--- Optimal Gap Search Results ---")
# for gap, acc in performance_results.items():
#     print(f"Gap Size: {gap}, Validation Accuracy: {acc:.2f}%")

# # Simple logic to find optimal gap: where performance stabilizes
# # We choose the gap that gives the highest score after the initial drop (if any)
# temp_results = performance_results.copy()
# if 0 in temp_results: del temp_results[0] # Exclude gap=0 as it's likely inflated

# if not temp_results:
#     optimal_gap = 10 # Default to 10 if no other gaps were tested
#     print("\nWarning: Only gap=0 was tested. Defaulting optimal_gap to 10.")
    
# else:
#     optimal_gap = max(temp_results, key=temp_results.get)
#     print(f"\nOptimal gap found: {optimal_gap}")

<h4>5.6. Final Data Split and Saving</h4>

In [None]:
# print(f"\n--- Performing Final Split with Optimal Gap ({optimal_gap}) and Saving ---")
# train_list, val_list, test_list = [], [], []
# for reciter, group in cleaned_df.groupby('reciter_name'):
#     group = group.sort_values(by='chunk_id')
#     files = group['file_path'].tolist()
#     n_files = len(files)

#     train_end = int(n_files * 0.7)
#     val_start = train_end + optimal_gap
#     val_end = val_start + int(n_files * 0.15)
#     test_start = val_end + optimal_gap

#     # Ensure indices are valid
#     if val_end > n_files or test_start > n_files: continue

#     train_list.extend(files[:train_end])
#     val_list.extend(files[val_start:val_end])
#     test_list.extend(files[test_start:])

# train_df_final = get_df_from_filepaths(train_list, cleaned_df)
# val_df_final = get_df_from_filepaths(val_list, cleaned_df)
# test_df_final = get_df_from_filepaths(test_list, cleaned_df)

# # Save the splits to CSV files for the final training stage
# train_df_final.to_csv('train_split.csv', index=False)
# val_df_final.to_csv('val_split.csv', index=False)
# test_df_final.to_csv('test_split.csv', index=False)

# print("\nProcess Complete!")
# print(f"Final training set size: {len(train_df_final)}")
# print(f"Final validation set size: {len(val_df_final)}")
# print(f"Final test set size: {len(test_df_final)}")
# print("Data split information saved to train_split.csv, val_split.csv, and test_split.csv")

In [None]:
# cleaned_df.to_csv('cleaned_df.csv', index=False)
# df_tsne.to_csv('tsne.csv', index=False)

<h4>ADD-ON: Saving important parameters</h4>

In [None]:
# import json

# # We can save all key parameters in one file
# project_parameters = {
#     'optimal_gap': optimal_gap,
#     'target_sample_rate': TARGET_SAMPLE_RATE,
#     'num_samples': NUM_SAMPLES,
#     'n_mels': 64,
#     'hop_length': 512 # The hop_length used in your MelSpectrogram
    
# }

# with open('project_parameters.json', 'w') as f:
#     json.dump(project_parameters, f, indent=4)
    
# print("Project parameters (including optimal gap) saved to project_parameters.json")

In [None]:
# # Create a temporary dataset just to get the mapping
# temp_dataset = AudioDataset(cleaned_df, TARGET_SAMPLE_RATE, NUM_SAMPLES)
# project_parameters['label_to_int'] = temp_dataset.label_to_int

# with open('project_parameters.json', 'w') as f:
#     json.dump(project_parameters, f, indent=4)

# print("Updated project_parameters.json with label mapping.")

# performance_results_serializable = {str(k): float(v) for k, v in performance_results.items()}

# project_parameters['gap_search_experiment'] = {
#     'gaps_tested': GAPS_TO_TEST,
#     'epochs_per_run': MODEL_TRAINING_EPOCHS_FOR_SEARCH,
#     'batch_size': BATCH_SIZE,
#     'performance_results': performance_results_serializable
# }

# # --- Save the updated dictionary back to the file ---
# with open('project_parameters.json', 'w') as f:
#     json.dump(project_parameters, f, indent=4)

# print("Successfully added gap search hyperparameters to project_parameters.json")

In [None]:
import json
import plotly.io as pio

# --- Load Data Splits ---
train_df = pd.read_csv('train_split.csv')
val_df = pd.read_csv('val_split.csv')
test_df = pd.read_csv('test_split.csv')

cleaned_df = pd.read_csv('cleaned_df.csv')
df_tsne = pd.read_csv('tsne.csv')
print("Successfully loaded data splits.")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Testing samples: {len(test_df)}")

# --- Load Project Parameters ---
with open('project_parameters.json', 'r') as f:
    params = json.load(f)

# You can now access the parameters like this:
optimal_gap = params['optimal_gap']
label_to_int = params['label_to_int']
# Create the reverse mapping if needed
int_to_label = {int(i): label for label, i in label_to_int.items()}

print(f"\nSuccessfully loaded project parameters. Optimal gap is {optimal_gap}.")
print("Label mapping loaded:", label_to_int)

# --- Load and Display a Saved Plot (Optional) ---
# You can simply open the 'tsne_visualization.html' file in your browser
# to view the interactive plot.
# Data Analysis, Data Augmentation, Optimal Gap Search, Saving and Loading Parameters

Successfully loaded data splits.
Training samples: 3791
Validation samples: 808
Testing samples: 581

Successfully loaded project parameters. Optimal gap is 10.
Label mapping loaded: {'AbdulBari_Althubaity': 0, 'AbdulRahman_Alsudais': 1, 'Abdullah_Albuaijan': 2, 'Ali_Alhothaify': 3, 'Bander_Balilah': 4, 'Maher_Almuaiqly': 5, 'Mohammed_Aluhaidan': 6, 'Mohammed_Ayoub': 7, 'Nasser_Alqutami': 8, 'Saad_Alghamdi': 9, 'Saud_Alshuraim': 10, 'Yasser_Aldossary': 11}


<h5>Ovo sa t-SNE htmlom i jpg-om ne valja. Trebat ću nač drugi način.</h5>