<a href="https://colab.research.google.com/github/Jerry086/SALSA/blob/Shiqing_audioset_embedding_checking/Hierarchical_and_dbscan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so the CSV files read by the later cells
# (under /content/drive/MyDrive/...) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Read the clip metadata table and the per-clip embedding table from Drive.
metadata = pd.read_csv('/content/drive/MyDrive/audioset/audio_metadata.csv')
embeddings = pd.read_csv('/content/drive/MyDrive/audioset/audio_embeddings.csv')

# Join the two tables on `video_id` (default inner join: only ids present in
# both tables are kept), giving one frame with metadata and feature columns.
merged_df = metadata.merge(embeddings, on='video_id')

# print(metadata.head())
# print(embeddings.head())
# print(merged_df.head())

In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Extract the embedding columns as a NumPy array. `.loc` slices by label, so
# both endpoints (feature_0 and feature_1279) are included.
embeddings = merged_df.loc[:, 'feature_0':'feature_1279'].values

# Standardize each feature (zero mean, unit variance) before PCA so that no
# dimension dominates the principal components purely because of its scale.
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)

# Initialize PCA.
# random_state is pinned for reproducibility: with an input this large and
# n_components far below the number of features, scikit-learn's default
# svd_solver='auto' may select the randomized solver, whose output (and hence
# every downstream clustering metric) would otherwise change between runs.
pca = PCA(n_components=128, random_state=42)

# Fit PCA on the standardized embeddings and project them to 128 dimensions
reduced_embeddings = pca.fit_transform(embeddings_scaled)

# Check the shape of the reduced embeddings: (n_samples, 128)
print(reduced_embeddings.shape)

(21782, 128)


Try hierarchical clustering, which builds a hierarchy of clusters agglomeratively (bottom-up): every sample starts as its own cluster, and the closest clusters are merged step by step.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering with Ward linkage, cut at a fixed number
# of clusters. `.fit` returns the estimator itself, so `cluster` is the fitted
# model and `cluster.labels_` holds one cluster id per row.
cluster = AgglomerativeClustering(
    n_clusters=1450,
    linkage='ward',
).fit(reduced_embeddings)

# Store the cluster id of every row next to its metadata and features.
# NOTE: the DBSCAN cell further down writes to this same `cluster_label` column.
merged_df['cluster_label'] = cluster.labels_

# Quick sanity check of the first rows
print(merged_df.head())

      video_id  start_time_seconds  end_time_seconds          labels  \
0  wqoOX8K8DEU                30.0              40.0      [396, 397]   
1  wqH6Sj_h948               120.0             130.0   [0, 441, 443]   
2  wq1098my4zA               130.0             140.0  [27, 137, 271]   
3  wqR7LHho-WE                10.0              20.0     [0, 22, 25]   
4  wq6Me-UUbSc               360.0             370.0           [413]   

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  ...  \
0         89        255         19         54        240        199  ...   
1         89         74        221        113         99        254  ...   
2        224        124        142        123        113        144  ...   
3        147        207        173        147        216         98  ...   
4         70         90        168         90        220         90  ...   

   feature_1271  feature_1272  feature_1273  feature_1274  feature_1275  \
0           152           255      

In [None]:
# Size of every hierarchical cluster, largest first.
# The labels are read from the fitted model (`cluster.labels_`), like the metric
# cells below, instead of from merged_df['cluster_label']: the DBSCAN cell
# overwrites that column, so reading it here would silently report DBSCAN
# clusters whenever this cell is re-run after the DBSCAN cell.
cluster_size = pd.Series(cluster.labels_, name='cluster_label').value_counts()

# Number of clusters that contain exactly one sample (singletons)
single_value_groups = int((cluster_size == 1).sum())

print(cluster_size)
print(f"Number of clusters with only one member: {single_value_groups}")

864     98
943     76
2       61
198     61
80      53
        ..
1115     1
1370     1
1036     1
848      1
1281     1
Name: cluster_label, Length: 1450, dtype: int64
Number of clusters with only one member: 7


In [None]:
# Analyze the `labels` values found inside one specific hierarchical cluster
# (cluster id 17 is an arbitrary example; change it to inspect another one).
# Rows are selected with the fitted model's `cluster.labels_` rather than the
# merged_df['cluster_label'] column, because the DBSCAN cell overwrites that
# column and would make this selection refer to a DBSCAN cluster instead.
specific_cluster = merged_df[cluster.labels_ == 17]
print(specific_cluster['labels'].value_counts())

In [None]:
# Silhouette score: a high value indicates that samples are well matched to
# their own cluster and poorly matched to neighboring clusters.
from sklearn.metrics import silhouette_score

# Evaluate the hierarchical clustering on the PCA-reduced embeddings
score = silhouette_score(X=reduced_embeddings, labels=cluster.labels_)
print('Silhouette Score: %.3f' % score)

Silhouette Score: 0.031


In [None]:
# Davies-Bouldin index: lower values correspond to better separation between
# the clusters.
from sklearn.metrics import davies_bouldin_score

# Evaluate the hierarchical clustering on the PCA-reduced embeddings
db_index = davies_bouldin_score(X=reduced_embeddings, labels=cluster.labels_)
print('Davies-Bouldin Index:', db_index)

Davies-Bouldin Index: 2.3947606526146283


In [None]:
# Calinski-Harabasz index (Variance Ratio Criterion): higher scores indicate
# clusters that are dense and well separated.
from sklearn.metrics import calinski_harabasz_score

# Evaluate the hierarchical clustering on the PCA-reduced embeddings
ch_index = calinski_harabasz_score(X=reduced_embeddings, labels=cluster.labels_)
print('Calinski-Harabasz Index:', ch_index)

Calinski-Harabasz Index: 18.74815020940819


DBSCAN (Density-Based Spatial Clustering of Applications with Noise) determines the clusters from two parameters: `eps` (the maximum distance between two samples for one to be considered as in the neighborhood of the other) and `min_samples` (the number of samples in a neighborhood for a point to be considered a core point). Samples that do not belong to any cluster are labeled as noise (`-1`).

In [84]:
from sklearn.cluster import DBSCAN

# Configure DBSCAN; `eps` (neighborhood radius) and `min_samples` (neighbors
# needed for a core point) are the two values to tune.
dbscan = DBSCAN(min_samples=28, eps=22.5)

# Cluster the PCA-reduced embeddings; after fitting, `labels_` holds one
# cluster id per row, with -1 marking noise points.
cluster_labels = dbscan.fit(reduced_embeddings).labels_

# Attach the DBSCAN labels to the DataFrame.
# NOTE: this replaces the `cluster_label` column previously written by the
# hierarchical-clustering cell.
merged_df['cluster_label'] = cluster_labels

# Quick sanity check of the first rows
print(merged_df.head())

      video_id  start_time_seconds  end_time_seconds          labels  \
0  wqoOX8K8DEU                30.0              40.0      [396, 397]   
1  wqH6Sj_h948               120.0             130.0   [0, 441, 443]   
2  wq1098my4zA               130.0             140.0  [27, 137, 271]   
3  wqR7LHho-WE                10.0              20.0     [0, 22, 25]   
4  wq6Me-UUbSc               360.0             370.0           [413]   

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  ...  \
0         89        255         19         54        240        199  ...   
1         89         74        221        113         99        254  ...   
2        224        124        142        123        113        144  ...   
3        147        207        173        147        216         98  ...   
4         70         90        168         90        220         90  ...   

   feature_1271  feature_1272  feature_1273  feature_1274  feature_1275  \
0           152           255      

In [85]:
import numpy as np

# Number of real clusters: distinct label values, not counting the noise
# label -1 when it is present.
num_clusters = int(np.count_nonzero(np.unique(cluster_labels) != -1))

# Number of samples DBSCAN marked as noise (label -1)
num_noise_points = int(np.count_nonzero(cluster_labels == -1))

print(f"Number of clusters: {num_clusters}")
print(f"Number of noise points: {num_noise_points}")

Number of clusters: 4
Number of noise points: 5156


In [86]:
import pandas as pd

# Tally how many rows carry each value of the `cluster_label` column
# (the DBSCAN labels at this point; -1 is the noise label), most frequent first.
cluster_counts = merged_df.loc[:, 'cluster_label'].value_counts()
print(cluster_counts)

 0    16547
-1     5156
 1       35
 2       24
 3       20
Name: cluster_label, dtype: int64
