In [1]:
import os
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import silhouette_score

# UMAP hyper-parameters used for every dataset in this run.
n_neighbors = 50
min_dist = 0.5

# NOTE: UMAP emits
#   "UserWarning: n_jobs value -1 overridden to 1 by setting random_state.
#    Use no seed for parallelism."
# i.e. fixing the seed means UMAP runs single-threaded.

# Top-level folders to evaluate; each one is expected to contain an
# 'Embeddings' subfolder holding the .npy files.
main_folders = [
    'npy_alex',
    'npy_rn18',
    'npy_rn50',
    'npy_rn152',
    'npy_rn18p',
    'npy_rn50p',
    'npy_rn152p',
    'npy_vggish',
    'npy_swin',
    'npy_aves_all',
    'npy_aves',
]

# Dataset prefixes (the part of a file name before the first '_'), listed in
# the order in which their scores are reported.
species_order = [
    'watkins',
    'bats',
    'cbi',
    'humbugdb',
    'dogs',
    'dcase',
    'enabirds',
    'hiceas',
    'rfcx',
    'hainan-gibbons',
]

# For every model folder and every dataset inside it: project the embeddings to
# 2-D with UMAP, cluster the projection with KMeans (one cluster per distinct
# ground-truth label) and report ARI, NMI and silhouette. The per-folder score
# lists are printed cumulatively after each dataset.
for folder in main_folders:
    # Construct the path to the "Embeddings" subfolder
    folder_path = os.path.join(folder, 'Embeddings')
    print(folder_path)

    # os.listdir returns entries in arbitrary order; sort first so that ties in
    # the (stable) dataset-order sort below are resolved deterministically.
    files = sorted(os.listdir(folder_path))

    # Order feature and label files by the position of their dataset prefix
    # (text before the first '_') in species_order. An unknown prefix raises
    # ValueError from list.index.
    features_files = sorted(
        (f for f in files if 'features.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )
    labels_files = sorted(
        (f for f in files if 'labels.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )

    # The two lists are paired positionally, so a missing or extra file would
    # silently pair features with another dataset's labels. Fail loudly instead.
    feature_keys = [f.split('_')[0] for f in features_files]
    label_keys = [f.split('_')[0] for f in labels_files]
    if feature_keys != label_keys:
        raise ValueError(
            f"Feature/label files in {folder_path} do not pair up by dataset: "
            f"features={features_files}, labels={labels_files}"
        )

    ARI_all = []
    NMI_all = []
    Sil_all = []
    for feature_file, label_file in zip(features_files, labels_files):
        # Construct the full path to the files
        feature_path = os.path.join(folder_path, feature_file)
        label_path = os.path.join(folder_path, label_file)

        X = np.load(feature_path)
        X_labels = np.load(label_path)

        if X_labels.ndim > 1:
            # 2-D labels are treated as one-hot rows (the original author notes
            # these come from detection tasks). Rows that are entirely zero get
            # class 0; every other row gets (index of its first maximum) + 1, so
            # real classes 0,1,2,... become 1,2,3,...
            all_zeros = np.all(X_labels == 0, axis=1)
            X_labels = np.where(all_zeros, 0, np.argmax(X_labels, axis=1) + 1)

        # Catch a sample/label mismatch before the expensive UMAP fit rather
        # than inside the metric functions afterwards.
        if X.shape[0] != len(X_labels):
            raise ValueError(
                f"{feature_path} has {X.shape[0]} samples but "
                f"{label_path} has {len(X_labels)} labels"
            )

        N_class = len(np.unique(X_labels))

        # A fixed random_state makes the layout reproducible; UMAP then runs
        # single-threaded (it warns that n_jobs=-1 is overridden to 1 when a
        # seed is set), so n_jobs=1 is passed explicitly.
        umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                               n_components=2, n_jobs=1, random_state=42)
        umap_results = umap_model.fit_transform(X)

        # NOTE(review): n_init is left at the library default, which differs
        # between scikit-learn releases; pin it if scores must be comparable
        # across environments.
        kmeans = KMeans(n_clusters=N_class, random_state=42)
        labels_pred = kmeans.fit_predict(umap_results)
        labels_true = X_labels

        ari_score = adjusted_rand_score(labels_true, labels_pred)
        ARI_all.append(round(ari_score, 3))
        print('ARI_all=', ARI_all)

        nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
        NMI_all.append(round(nmi_score, 3))
        print('NMI_all=', NMI_all)

        # NOTE(review): silhouette is evaluated on the original features X, not
        # on the 2-D UMAP projection that produced the clusters -- confirm this
        # is intended.
        silhouette_avg = silhouette_score(X, labels_pred, n_jobs=-1)
        Sil_all.append(round(silhouette_avg, 3))
        print('Sil_all=', Sil_all)

  from .autonotebook import tqdm as notebook_tqdm


npy_alex/Embeddings
ARI_all= [0.242]
NMI_all= [0.529]
Sil_all= [0.008]
ARI_all= [0.242, 0.172]
NMI_all= [0.529, 0.296]
Sil_all= [0.008, -0.046]
ARI_all= [0.242, 0.172, 0.052]
NMI_all= [0.529, 0.296, 0.441]
Sil_all= [0.008, -0.046, -0.139]
ARI_all= [0.242, 0.172, 0.052, 0.117]
NMI_all= [0.529, 0.296, 0.441, 0.313]
Sil_all= [0.008, -0.046, -0.139, 0.162]
ARI_all= [0.242, 0.172, 0.052, 0.117, 0.195]
NMI_all= [0.529, 0.296, 0.441, 0.313, 0.358]
Sil_all= [0.008, -0.046, -0.139, 0.162, -0.063]
ARI_all= [0.242, 0.172, 0.052, 0.117, 0.195, 0.012]
NMI_all= [0.529, 0.296, 0.441, 0.313, 0.358, 0.153]
Sil_all= [0.008, -0.046, -0.139, 0.162, -0.063, 0.134]
ARI_all= [0.242, 0.172, 0.052, 0.117, 0.195, 0.012, 0.1]
NMI_all= [0.529, 0.296, 0.441, 0.313, 0.358, 0.153, 0.33]
Sil_all= [0.008, -0.046, -0.139, 0.162, -0.063, 0.134, -0.016]
ARI_all= [0.242, 0.172, 0.052, 0.117, 0.195, 0.012, 0.1, -0.007]
NMI_all= [0.529, 0.296, 0.441, 0.313, 0.358, 0.153, 0.33, 0.002]
Sil_all= [0.008, -0.046, -0.139, 0.162, 

ARI_all= [0.802, 0.362, 0.271, 0.252, 0.708, 0.034, 0.235, 0.203, 0.013]
NMI_all= [0.887, 0.464, 0.671, 0.494, 0.807, 0.3, 0.591, 0.28, 0.258]
Sil_all= [0.219, 0.097, -0.013, 0.167, 0.222, 0.071, 0.041, 0.402, -0.011]
ARI_all= [0.802, 0.362, 0.271, 0.252, 0.708, 0.034, 0.235, 0.203, 0.013, 0.365]
NMI_all= [0.887, 0.464, 0.671, 0.494, 0.807, 0.3, 0.591, 0.28, 0.258, 0.51]
Sil_all= [0.219, 0.097, -0.013, 0.167, 0.222, 0.071, 0.041, 0.402, -0.011, 0.143]
npy_rn152p/Embeddings
ARI_all= [0.776]
NMI_all= [0.895]
Sil_all= [0.207]
ARI_all= [0.776, 0.394]
NMI_all= [0.895, 0.502]
Sil_all= [0.207, 0.087]
ARI_all= [0.776, 0.394, 0.353]
NMI_all= [0.895, 0.502, 0.727]
Sil_all= [0.207, 0.087, 0.011]
ARI_all= [0.776, 0.394, 0.353, 0.322]
NMI_all= [0.895, 0.502, 0.727, 0.611]
Sil_all= [0.207, 0.087, 0.011, 0.148]
ARI_all= [0.776, 0.394, 0.353, 0.322, 0.608]
NMI_all= [0.895, 0.502, 0.727, 0.611, 0.747]
Sil_all= [0.207, 0.087, 0.011, 0.148, 0.159]
ARI_all= [0.776, 0.394, 0.353, 0.322, 0.608, 0.041]
NMI_a

In [1]:
import os
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import silhouette_score

# UMAP hyper-parameters used for every dataset in this run.
n_neighbors = 50
min_dist = 0.1

# NOTE: UMAP emits
#   "UserWarning: n_jobs value -1 overridden to 1 by setting random_state.
#    Use no seed for parallelism."
# i.e. fixing the seed means UMAP runs single-threaded.

# Top-level folders to evaluate; each one is expected to contain an
# 'Embeddings' subfolder holding the .npy files.
main_folders = [
    'npy_alex',
    'npy_rn18',
    'npy_rn50',
    'npy_rn152',
    'npy_rn18p',
    'npy_rn50p',
    'npy_rn152p',
    'npy_vggish',
    'npy_swin',
    'npy_aves_all',
    'npy_aves',
]

# Dataset prefixes (the part of a file name before the first '_'), listed in
# the order in which their scores are reported.
species_order = [
    'watkins',
    'bats',
    'cbi',
    'humbugdb',
    'dogs',
    'dcase',
    'enabirds',
    'hiceas',
    'rfcx',
    'hainan-gibbons',
]

# For every model folder and every dataset inside it: project the embeddings to
# 2-D with UMAP, cluster the projection with KMeans (one cluster per distinct
# ground-truth label) and report ARI, NMI and silhouette. The per-folder score
# lists are printed cumulatively after each dataset.
for folder in main_folders:
    # Construct the path to the "Embeddings" subfolder
    folder_path = os.path.join(folder, 'Embeddings')
    print(folder_path)

    # os.listdir returns entries in arbitrary order; sort first so that ties in
    # the (stable) dataset-order sort below are resolved deterministically.
    files = sorted(os.listdir(folder_path))

    # Order feature and label files by the position of their dataset prefix
    # (text before the first '_') in species_order. An unknown prefix raises
    # ValueError from list.index.
    features_files = sorted(
        (f for f in files if 'features.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )
    labels_files = sorted(
        (f for f in files if 'labels.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )

    # The two lists are paired positionally, so a missing or extra file would
    # silently pair features with another dataset's labels. Fail loudly instead.
    feature_keys = [f.split('_')[0] for f in features_files]
    label_keys = [f.split('_')[0] for f in labels_files]
    if feature_keys != label_keys:
        raise ValueError(
            f"Feature/label files in {folder_path} do not pair up by dataset: "
            f"features={features_files}, labels={labels_files}"
        )

    ARI_all = []
    NMI_all = []
    Sil_all = []
    for feature_file, label_file in zip(features_files, labels_files):
        # Construct the full path to the files
        feature_path = os.path.join(folder_path, feature_file)
        label_path = os.path.join(folder_path, label_file)

        X = np.load(feature_path)
        X_labels = np.load(label_path)

        if X_labels.ndim > 1:
            # 2-D labels are treated as one-hot rows (the original author notes
            # these come from detection tasks). Rows that are entirely zero get
            # class 0; every other row gets (index of its first maximum) + 1, so
            # real classes 0,1,2,... become 1,2,3,...
            all_zeros = np.all(X_labels == 0, axis=1)
            X_labels = np.where(all_zeros, 0, np.argmax(X_labels, axis=1) + 1)

        # Catch a sample/label mismatch before the expensive UMAP fit rather
        # than inside the metric functions afterwards.
        if X.shape[0] != len(X_labels):
            raise ValueError(
                f"{feature_path} has {X.shape[0]} samples but "
                f"{label_path} has {len(X_labels)} labels"
            )

        N_class = len(np.unique(X_labels))

        # A fixed random_state makes the layout reproducible; UMAP then runs
        # single-threaded (it warns that n_jobs=-1 is overridden to 1 when a
        # seed is set), so n_jobs=1 is passed explicitly.
        umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                               n_components=2, n_jobs=1, random_state=42)
        umap_results = umap_model.fit_transform(X)

        # NOTE(review): n_init is left at the library default, which differs
        # between scikit-learn releases; pin it if scores must be comparable
        # across environments.
        kmeans = KMeans(n_clusters=N_class, random_state=42)
        labels_pred = kmeans.fit_predict(umap_results)
        labels_true = X_labels

        ari_score = adjusted_rand_score(labels_true, labels_pred)
        ARI_all.append(round(ari_score, 3))
        print('ARI_all=', ARI_all)

        nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
        NMI_all.append(round(nmi_score, 3))
        print('NMI_all=', NMI_all)

        # NOTE(review): silhouette is evaluated on the original features X, not
        # on the 2-D UMAP projection that produced the clusters -- confirm this
        # is intended.
        silhouette_avg = silhouette_score(X, labels_pred, n_jobs=-1)
        Sil_all.append(round(silhouette_avg, 3))
        print('Sil_all=', Sil_all)

  from .autonotebook import tqdm as notebook_tqdm


npy_alex/Embeddings
ARI_all= [0.279]
NMI_all= [0.555]
Sil_all= [0.031]
ARI_all= [0.279, 0.167]
NMI_all= [0.555, 0.299]
Sil_all= [0.031, -0.027]
ARI_all= [0.279, 0.167, 0.078]
NMI_all= [0.555, 0.299, 0.467]
Sil_all= [0.031, -0.027, -0.175]
ARI_all= [0.279, 0.167, 0.078, 0.121]
NMI_all= [0.555, 0.299, 0.467, 0.317]
Sil_all= [0.031, -0.027, -0.175, 0.16]
ARI_all= [0.279, 0.167, 0.078, 0.121, 0.185]
NMI_all= [0.555, 0.299, 0.467, 0.317, 0.354]
Sil_all= [0.031, -0.027, -0.175, 0.16, -0.02]
ARI_all= [0.279, 0.167, 0.078, 0.121, 0.185, 0.013]
NMI_all= [0.555, 0.299, 0.467, 0.317, 0.354, 0.159]
Sil_all= [0.031, -0.027, -0.175, 0.16, -0.02, 0.111]
ARI_all= [0.279, 0.167, 0.078, 0.121, 0.185, 0.013, 0.102]
NMI_all= [0.555, 0.299, 0.467, 0.317, 0.354, 0.159, 0.344]
Sil_all= [0.031, -0.027, -0.175, 0.16, -0.02, 0.111, -0.014]
ARI_all= [0.279, 0.167, 0.078, 0.121, 0.185, 0.013, 0.102, -0.006]
NMI_all= [0.555, 0.299, 0.467, 0.317, 0.354, 0.159, 0.344, 0.001]
Sil_all= [0.031, -0.027, -0.175, 0.16, -0

ARI_all= [0.779, 0.385, 0.484, 0.257, 0.821, 0.033, 0.284, 0.191, 0.014]
NMI_all= [0.897, 0.475, 0.773, 0.498, 0.841, 0.308, 0.626, 0.273, 0.27]
Sil_all= [0.215, 0.094, 0.041, 0.164, 0.132, 0.088, 0.056, 0.4, 0.016]
ARI_all= [0.779, 0.385, 0.484, 0.257, 0.821, 0.033, 0.284, 0.191, 0.014, 0.382]
NMI_all= [0.897, 0.475, 0.773, 0.498, 0.841, 0.308, 0.626, 0.273, 0.27, 0.548]
Sil_all= [0.215, 0.094, 0.041, 0.164, 0.132, 0.088, 0.056, 0.4, 0.016, 0.147]
npy_rn152p/Embeddings
ARI_all= [0.808]
NMI_all= [0.911]
Sil_all= [0.212]
ARI_all= [0.808, 0.428]
NMI_all= [0.911, 0.527]
Sil_all= [0.212, 0.088]
ARI_all= [0.808, 0.428, 0.605]
NMI_all= [0.911, 0.527, 0.837]
Sil_all= [0.212, 0.088, 0.076]
ARI_all= [0.808, 0.428, 0.605, 0.363]
NMI_all= [0.911, 0.527, 0.837, 0.629]
Sil_all= [0.212, 0.088, 0.076, 0.147]
ARI_all= [0.808, 0.428, 0.605, 0.363, 0.659]
NMI_all= [0.911, 0.527, 0.837, 0.629, 0.776]
Sil_all= [0.212, 0.088, 0.076, 0.147, 0.135]
ARI_all= [0.808, 0.428, 0.605, 0.363, 0.659, 0.045]
NMI_all=

In [2]:
import os
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import silhouette_score

# UMAP hyper-parameters used for every dataset in this run.
n_neighbors = 100
min_dist = 0.5

# NOTE: UMAP emits
#   "UserWarning: n_jobs value -1 overridden to 1 by setting random_state.
#    Use no seed for parallelism."
# i.e. fixing the seed means UMAP runs single-threaded.

# Top-level folders to evaluate; each one is expected to contain an
# 'Embeddings' subfolder holding the .npy files.
main_folders = [
    'npy_alex',
    'npy_rn18',
    'npy_rn50',
    'npy_rn152',
    'npy_rn18p',
    'npy_rn50p',
    'npy_rn152p',
    'npy_vggish',
    'npy_swin',
    'npy_aves_all',
    'npy_aves',
]

# Dataset prefixes (the part of a file name before the first '_'), listed in
# the order in which their scores are reported.
species_order = [
    'watkins',
    'bats',
    'cbi',
    'humbugdb',
    'dogs',
    'dcase',
    'enabirds',
    'hiceas',
    'rfcx',
    'hainan-gibbons',
]

# For every model folder and every dataset inside it: project the embeddings to
# 2-D with UMAP, cluster the projection with KMeans (one cluster per distinct
# ground-truth label) and report ARI, NMI and silhouette. The per-folder score
# lists are printed cumulatively after each dataset.
for folder in main_folders:
    # Construct the path to the "Embeddings" subfolder
    folder_path = os.path.join(folder, 'Embeddings')
    print(folder_path)

    # os.listdir returns entries in arbitrary order; sort first so that ties in
    # the (stable) dataset-order sort below are resolved deterministically.
    files = sorted(os.listdir(folder_path))

    # Order feature and label files by the position of their dataset prefix
    # (text before the first '_') in species_order. An unknown prefix raises
    # ValueError from list.index.
    features_files = sorted(
        (f for f in files if 'features.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )
    labels_files = sorted(
        (f for f in files if 'labels.npy' in f),
        key=lambda name: species_order.index(name.split('_')[0]),
    )

    # The two lists are paired positionally, so a missing or extra file would
    # silently pair features with another dataset's labels. Fail loudly instead.
    feature_keys = [f.split('_')[0] for f in features_files]
    label_keys = [f.split('_')[0] for f in labels_files]
    if feature_keys != label_keys:
        raise ValueError(
            f"Feature/label files in {folder_path} do not pair up by dataset: "
            f"features={features_files}, labels={labels_files}"
        )

    ARI_all = []
    NMI_all = []
    Sil_all = []
    for feature_file, label_file in zip(features_files, labels_files):
        # Construct the full path to the files
        feature_path = os.path.join(folder_path, feature_file)
        label_path = os.path.join(folder_path, label_file)

        X = np.load(feature_path)
        X_labels = np.load(label_path)

        if X_labels.ndim > 1:
            # 2-D labels are treated as one-hot rows (the original author notes
            # these come from detection tasks). Rows that are entirely zero get
            # class 0; every other row gets (index of its first maximum) + 1, so
            # real classes 0,1,2,... become 1,2,3,...
            all_zeros = np.all(X_labels == 0, axis=1)
            X_labels = np.where(all_zeros, 0, np.argmax(X_labels, axis=1) + 1)

        # Catch a sample/label mismatch before the expensive UMAP fit rather
        # than inside the metric functions afterwards.
        if X.shape[0] != len(X_labels):
            raise ValueError(
                f"{feature_path} has {X.shape[0]} samples but "
                f"{label_path} has {len(X_labels)} labels"
            )

        N_class = len(np.unique(X_labels))

        # A fixed random_state makes the layout reproducible; UMAP then runs
        # single-threaded (it warns that n_jobs=-1 is overridden to 1 when a
        # seed is set), so n_jobs=1 is passed explicitly.
        umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                               n_components=2, n_jobs=1, random_state=42)
        umap_results = umap_model.fit_transform(X)

        # NOTE(review): n_init is left at the library default, which differs
        # between scikit-learn releases; pin it if scores must be comparable
        # across environments.
        kmeans = KMeans(n_clusters=N_class, random_state=42)
        labels_pred = kmeans.fit_predict(umap_results)
        labels_true = X_labels

        ari_score = adjusted_rand_score(labels_true, labels_pred)
        ARI_all.append(round(ari_score, 3))
        print('ARI_all=', ARI_all)

        nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
        NMI_all.append(round(nmi_score, 3))
        print('NMI_all=', NMI_all)

        # NOTE(review): silhouette is evaluated on the original features X, not
        # on the 2-D UMAP projection that produced the clusters -- confirm this
        # is intended.
        silhouette_avg = silhouette_score(X, labels_pred, n_jobs=-1)
        Sil_all.append(round(silhouette_avg, 3))
        print('Sil_all=', Sil_all)

npy_alex/Embeddings
ARI_all= [0.236]
NMI_all= [0.521]
Sil_all= [0.009]
ARI_all= [0.236, 0.148]
NMI_all= [0.521, 0.274]
Sil_all= [0.009, -0.03]
ARI_all= [0.236, 0.148, 0.048]
NMI_all= [0.521, 0.274, 0.436]
Sil_all= [0.009, -0.03, -0.107]
ARI_all= [0.236, 0.148, 0.048, 0.137]
NMI_all= [0.521, 0.274, 0.436, 0.315]
Sil_all= [0.009, -0.03, -0.107, 0.159]
ARI_all= [0.236, 0.148, 0.048, 0.137, 0.169]
NMI_all= [0.521, 0.274, 0.436, 0.315, 0.322]
Sil_all= [0.009, -0.03, -0.107, 0.159, -0.083]
ARI_all= [0.236, 0.148, 0.048, 0.137, 0.169, 0.012]
NMI_all= [0.521, 0.274, 0.436, 0.315, 0.322, 0.15]
Sil_all= [0.009, -0.03, -0.107, 0.159, -0.083, 0.119]
ARI_all= [0.236, 0.148, 0.048, 0.137, 0.169, 0.012, 0.098]
NMI_all= [0.521, 0.274, 0.436, 0.315, 0.322, 0.15, 0.329]
Sil_all= [0.009, -0.03, -0.107, 0.159, -0.083, 0.119, -0.01]
ARI_all= [0.236, 0.148, 0.048, 0.137, 0.169, 0.012, 0.098, -0.006]
NMI_all= [0.521, 0.274, 0.436, 0.315, 0.322, 0.15, 0.329, 0.002]
Sil_all= [0.009, -0.03, -0.107, 0.159, -0.08

ARI_all= [0.687, 0.34, 0.232, 0.25, 0.714, 0.029, 0.235, 0.242, 0.013]
NMI_all= [0.839, 0.453, 0.643, 0.495, 0.781, 0.286, 0.587, 0.302, 0.251]
Sil_all= [0.19, 0.073, -0.018, 0.169, 0.195, 0.089, 0.044, 0.403, -0.006]
ARI_all= [0.687, 0.34, 0.232, 0.25, 0.714, 0.029, 0.235, 0.242, 0.013, 0.362]
NMI_all= [0.839, 0.453, 0.643, 0.495, 0.781, 0.286, 0.587, 0.302, 0.251, 0.517]
Sil_all= [0.19, 0.073, -0.018, 0.169, 0.195, 0.089, 0.044, 0.403, -0.006, 0.16]
npy_rn152p/Embeddings
ARI_all= [0.736]
NMI_all= [0.872]
Sil_all= [0.184]
ARI_all= [0.736, 0.392]
NMI_all= [0.872, 0.501]
Sil_all= [0.184, 0.089]
ARI_all= [0.736, 0.392, 0.296]
NMI_all= [0.872, 0.501, 0.694]
Sil_all= [0.184, 0.089, -0.006]
ARI_all= [0.736, 0.392, 0.296, 0.327]
NMI_all= [0.872, 0.501, 0.694, 0.62]
Sil_all= [0.184, 0.089, -0.006, 0.145]
ARI_all= [0.736, 0.392, 0.296, 0.327, 0.611]
NMI_all= [0.872, 0.501, 0.694, 0.62, 0.752]
Sil_all= [0.184, 0.089, -0.006, 0.145, 0.16]
ARI_all= [0.736, 0.392, 0.296, 0.327, 0.611, 0.038]
NMI_a