In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [3]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
import faiss
from collections import defaultdict
from scipy.spatial import distance
import time

In [5]:
metadata = pd.read_csv('/content/test_metadata_rootclass.csv')
embeddings = pd.read_csv('/content/test_embeddings_meanpool.csv')

# Merge the metadata and embeddings on 'video_id'
merged_df = pd.merge(metadata, embeddings, on='video_id')
merged_df

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels,root_classes,feature_0,feature_1,feature_2,feature_3,feature_4,...,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127
0,iZcPr3wgJCw,30.0,40.0,['Engine knocking'],['Engine'],64.9,221.3,85.4,7.4,215.8,...,124.0,191.2,144.8,65.8,87.1,153.0,112.2,216.3,187.1,173.0
1,iZmn0dUXP9E,40.0,50.0,"['Toothbrush', 'Electric toothbrush']","['Domestic sounds, home sounds']",93.7,112.2,100.1,20.7,166.8,...,54.9,181.4,106.5,192.7,50.5,67.4,95.2,123.9,190.5,132.4
2,iZ2K_GPtc6g,30.0,40.0,"['Sine wave', 'Chirp tone']",['Other sourceless'],170.6,72.8,107.5,83.9,118.0,...,96.5,204.1,215.9,80.7,77.4,20.7,80.1,190.3,42.6,146.3
3,iZtF1lUPbEQ,20.0,30.0,"['Fart', 'Buzz', 'Music']","['Wild animals', 'Digestive', 'Music', 'Onomat...",170.6,118.4,102.6,128.2,134.6,...,120.7,27.1,137.5,150.0,90.3,94.5,100.1,136.5,156.4,119.3
4,iZFRfh1Wjt8,420.0,430.0,"['Vehicle', 'Vehicle horn, car horn, honking',...","['Alarm', 'Vehicle']",73.9,166.3,82.6,17.5,230.2,...,91.4,149.5,149.7,134.8,124.3,62.7,146.1,166.0,52.4,85.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21777,2Cmn2lplGfc,30.0,40.0,"['Propeller, airscrew']",['Vehicle'],59.7,189.7,13.3,18.3,179.3,...,255.0,214.7,253.7,112.6,123.7,90.4,110.9,138.4,58.5,113.1
21778,2ClQcAd-wJg,30.0,40.0,"['Speech', 'Buzzer']","['Alarm', 'Human voice']",55.0,147.8,237.5,223.2,126.1,...,65.6,122.0,178.2,192.2,198.1,175.9,237.6,61.6,182.2,153.4
21779,-Sclnq22t7o,30.0,40.0,"['Burping, eructation']",['Digestive'],82.0,77.8,171.4,86.6,112.6,...,147.6,129.8,32.3,148.6,101.8,54.0,85.2,107.8,183.9,112.1
21780,-SD9DkKyOrY,30.0,40.0,"['Percussion', 'Drum kit', 'Drum', 'Snare drum...",['Music'],180.4,107.9,81.7,80.9,99.4,...,42.1,33.7,0.7,233.8,150.8,93.2,255.0,128.7,137.1,230.7


In [6]:
# Simplify the root_classes column for easier processing
merged_df['root_classes'] = merged_df['root_classes'].apply(lambda x: eval(x)[0] if eval(x) else None)
root_class_distribution = merged_df['root_classes'].value_counts()
print(root_class_distribution.head(20))

Music                                       7198
Domestic sounds, home sounds                1806
Wild animals                                1175
Alarm                                       1068
Vehicle                                      904
Human voice                                  790
Livestock, farm animals, working animals     674
Miscellaneous sources                        615
Onomatopoeia                                 590
Noise                                        546
Mechanisms                                   539
Respiratory sounds                           527
Engine                                       508
Water                                        495
Acoustic environment                         476
Liquid                                       383
Explosion                                    379
Digestive                                    365
Generic impact sounds                        357
Domestic animals, pets                       328
Name: root_classes, 

In [105]:
# Select 50 audios from each of the top 20 root classes
evaluation_set = pd.DataFrame()
for root_class in root_class_distribution.head(20).index:
    class_subset = merged_df[merged_df['root_classes'] == root_class]
    # Randomly select 50 audios if possible, or all audios if less than 100 are available
    sample_subset = class_subset.sample(n=min(50, len(class_subset)), random_state=42)
    evaluation_set = pd.concat([evaluation_set, sample_subset], axis=0)

# Separate the dataset into evaluation set and the rest for training the models
remaining_set = merged_df[~merged_df.index.isin(evaluation_set.index)]

In [106]:
# Extract embeddings from dataframe
train_embeddings = remaining_set.iloc[:, 5:].values
test_embeddings = evaluation_set.iloc[:, 5:].values
print(train_embeddings.shape)
print(test_embeddings.shape)

(20782, 128)
(1000, 128)


In [107]:
def perform_kmeans_clustering(embeddings, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    model = kmeans.fit(embeddings)
    prediction = model.predict(embeddings)
    return model, prediction

In [108]:
def perform_gmm_clustering(embeddings, n_clusters):
    model = GaussianMixture(n_components=n_clusters, random_state=42)
    model.fit(embeddings)
    prediction = model.predict(embeddings)
    return model, prediction

In [109]:
def perform_faiss_indexing(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

In [110]:
experiment_log = {}
experiments = ['Linear', 'KMeans', 'GMM', 'Faiss']

for experiment in experiments:
  experiment_log[experiment] = {}

In [111]:
# perform KMeans clustering
model, prediction = perform_kmeans_clustering(train_embeddings, 30)

# create index based on cluster label
clusters = defaultdict(list)
for index, label in enumerate(prediction):
    clusters[label].append(index)

experiment_log['KMeans']['model'] = model
experiment_log['KMeans']['prediction'] = prediction
experiment_log['KMeans']['clusters'] = clusters



In [112]:
# perform GMM clustering
model, prediction = perform_gmm_clustering(train_embeddings, 30)

# create index based on cluster label
clusters = defaultdict(list)
for index, label in enumerate(prediction):
    clusters[label].append(index)

experiment_log['GMM']['model'] = model
experiment_log['GMM']['prediction'] = prediction
experiment_log['GMM']['clusters'] = clusters

In [113]:
# perform Faiss indexing
model = perform_faiss_indexing(train_embeddings)
experiment_log['Faiss']['model'] = model

In [114]:
def accuracy_measure(linear_results, cluster_based_results):
  assert len(linear_results) == len(cluster_based_results), "The number of query results must match"
  accuracy = 0
  for linear_result, cluster_based_result in zip(linear_results, cluster_based_results):
    overlap = set(linear_result) & set(cluster_based_result)
    accuracy += len(overlap) / len(linear_result)
  return accuracy / len(linear_results)

In [115]:
def linear_evaluation(k=100):
    begin = time.time()
    distances = distance.cdist(test_embeddings, train_embeddings, 'euclidean')
    closest_indices = np.argsort(distances)[:,:k]
    experiment_log['Linear']['time'] = time.time() - begin
    experiment_log['Linear']['query_results'] = closest_indices
    experiment_log['Linear']['accuracy'] = 1.0
    print(f"Linear search query time: {experiment_log['Linear']['time']}")

In [117]:
# collect data for linear search
linear_evaluation()

Linear search query time: 15.30975079536438


In [120]:
# qeury evaluation for clustering models
def query_evaluation(method, k=100):
  begin = time.time()

  # retrive index from dict
  clusters = experiment_log[method]['clusters']

  closest_cluster = experiment_log[method]['model'].predict(test_embeddings)
  query_results = []

  for label, query_embedding in zip(closest_cluster, test_embeddings):
    # Extract the embeddings of the closest cluster
    cluster_embeddings_indices = clusters[label]
    cluster_embeddings = train_embeddings[cluster_embeddings_indices]
    # Calculate distances from the query to each embedding in the cluster
    distances = distance.cdist([query_embedding], cluster_embeddings, 'euclidean')[0]
    # Get the indices of the closest embeddings
    closest_indices = np.argsort(distances)[:k]
    # Retrieve the original indices of the closest embeddings in the dataset
    closest_embeddings_original_indices = [cluster_embeddings_indices[i] for i in closest_indices]
    # Append query result to result list
    query_results.append(closest_embeddings_original_indices)
  experiment_log[method]['time'] = time.time() - begin
  experiment_log[method]['query_results'] = np.array(query_results)
  # print(type(experiment_log[method]['query_results']))
  print(f"{method} query time: {experiment_log[method]['time']}")

In [129]:
# collect data for KMeans and GMM
for method in ['KMeans', 'GMM']:
  query_evaluation(method)
  experiment_log[method]['accuracy'] = accuracy_measure(
      experiment_log['Linear']['query_results'],
      experiment_log[method]['query_results'])
  print(method, 'accuracy:',experiment_log[method]['accuracy'])

KMeans query time: 1.6938183307647705
KMeans accuracy: 0.5341300000000005
GMM query time: 2.0203702449798584
GMM accuracy: 0.4952800000000003


In [130]:
def faiss_evaluation(k=100):
  begin = time.time()
  D, I = experiment_log['Faiss']['model'].search(test_embeddings, k)
  experiment_log['Faiss']['time'] = time.time() - begin
  experiment_log['Faiss']['query_results'] = I
  print(f"Faiss query time: {experiment_log['Faiss']['time']}")

In [146]:
# collect data for faiss
faiss_evaluation()
experiment_log['Faiss']['accuracy'] = accuracy_measure(
    experiment_log['Linear']['query_results'],
    experiment_log['Faiss']['query_results'])
print('Faiss accuracy:',experiment_log['Faiss']['accuracy'])

Faiss query time: 0.19264435768127441
Faiss accuracy: 0.99995


In [147]:
methods = ['Linear', 'KMeans', 'GMM', 'Faiss']
summary_table = {
    'Method': methods,
    'Accuracy (%)': [round(experiment_log[method]['accuracy'], 3) * 100 for method in methods],
    'Efficiency (ms)': [int(experiment_log[method]['time'] * 100) for method in methods]
}

summary_df = pd.DataFrame(summary_table)
summary_df

Unnamed: 0,Method,Accuracy (%),Efficiency (ms)
0,Linear,100.0,1530
1,KMeans,53.4,169
2,GMM,49.5,202
3,Faiss,100.0,19
