In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training label table from the Kaggle dataset and preview its first rows.
df= pd.read_csv("/kaggle/input/audio-files/Dataset/train_labels.csv")
df.head()

In [None]:
import librosa

# Read training clips 1.wav ... 1500.wav at their native sampling rate
# (sr=None) and without down-mixing channels (mono=False).
train_dir = "/kaggle/input/audio-files/Dataset/train_folder/"
loaded_clips = [
    librosa.load(train_dir + str(file_no) + ".wav", sr=None, mono=False)
    for file_no in range(1, 1501)
]
time_series = [wave for wave, _rate in loaded_clips]
sample_rate = [rate for _wave, rate in loaded_clips]

# Attach the waveforms and their sampling rates to the label table.
df["time_series"] = time_series
df["sample_rate"] = sample_rate
df.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the 'category' column of the training table into a new
# 'cluster' column.
encoder = OrdinalEncoder()

df['cluster'] = encoder.fit_transform(df[['category']])
df.head()

In [None]:
import librosa

# Read test clips 1.wav ... 500.wav at their native sampling rate (sr=None)
# and without down-mixing channels (mono=False) into a fresh DataFrame.
comp_df = pd.DataFrame()
test_dir = "/kaggle/input/audio-files/Dataset/test_folder/"
loaded_clips = [
    librosa.load(test_dir + str(file_no) + ".wav", sr=None, mono=False)
    for file_no in range(1, 501)
]
time_series = [wave for wave, _rate in loaded_clips]
sample_rate = [rate for _wave, rate in loaded_clips]

comp_df["time_series"] = time_series
comp_df["sample_rate"] = sample_rate
comp_df.head()

## Feature extraction and clustering

In [None]:
# Extracting mfcc features

# MFCC features for every clip in comp_df: the coefficient matrix, its first-
# and second-order deltas, and the mean/variance of each along axis 1.
n_mfcc = 13

# One list per output column; dict insertion order fixes the column order.
mfcc_columns = {
    name: []
    for name in (
        "mfcc", "delta_mfcc", "delta2_mfcc",
        "mfcc_mean", "mfcc_var",
        "delta_mfcc_mean", "delta_mfcc_var",
        "delta2_mfcc_mean", "delta2_mfcc_var",
    )
}

# Iterate over the rows actually present in comp_df rather than a hard-coded
# count, and read both columns once instead of one .iloc lookup per value.
for y, sr in zip(comp_df["time_series"], comp_df["sample_rate"]):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    per_clip = {
        "mfcc": mfcc,
        "delta_mfcc": librosa.feature.delta(mfcc),
        "delta2_mfcc": librosa.feature.delta(mfcc, order=2),
    }
    for base_name, matrix in per_clip.items():
        mfcc_columns[base_name].append(matrix)
        mfcc_columns[base_name + "_mean"].append(np.mean(matrix, axis=1))
        mfcc_columns[base_name + "_var"].append(np.var(matrix, axis=1))

for column_name, values in mfcc_columns.items():
    comp_df[column_name] = values
comp_df.head()

In [None]:
# Extracting time domain features

# Time-domain features per clip: RMS energy and zero-crossing rate as returned
# by librosa.
rms_energy = []
ZCR = []
# Only the waveform is needed here (neither call takes a sampling rate), and
# iterating the column directly avoids the hard-coded clip count.
for y in comp_df["time_series"]:
    rms_energy.append(librosa.feature.rms(y=y))
    ZCR.append(librosa.feature.zero_crossing_rate(y))

comp_df["rms_energy"] = rms_energy
comp_df["zcr"] = ZCR
comp_df.head()

In [None]:
# Extracting spectral features

# Spectral features per clip: spectral centroid and spectral bandwidth.
spectral_centroid = []
spectral_bandwidth = []
# Iterate over the rows actually present in comp_df rather than a hard-coded
# count of 500.
for y, sr in zip(comp_df["time_series"], comp_df["sample_rate"]):
    spectral_centroid.append(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_bandwidth.append(librosa.feature.spectral_bandwidth(y=y, sr=sr))

comp_df["spectral_centroid"] = spectral_centroid
comp_df["spectral_bandwidth"] = spectral_bandwidth
comp_df.head()

In [None]:
# Extracting rhythmic features

# Rhythm-related features per clip.
# Note on naming: the "onset_rate" column stores the output of
# librosa.onset.onset_strength and "pulse_clarity" stores the output of
# librosa.beat.plp. The column names are kept because later cells refer to them.
onset_rate = []
pulse_clarity = []
# Iterate over the rows actually present in comp_df rather than a hard-coded
# count of 500.
for y, sr in zip(comp_df["time_series"], comp_df["sample_rate"]):
    onset_rate.append(librosa.onset.onset_strength(y=y, sr=sr))
    pulse_clarity.append(librosa.beat.plp(y=y, sr=sr))

comp_df["pulse_clarity"] = pulse_clarity
comp_df["onset_rate"] = onset_rate
comp_df.head()

In [None]:
# Every column except the raw waveform and its sampling rate is a feature.
features = comp_df.columns.tolist()
for raw_column in ('time_series', 'sample_rate'):
    features.remove(raw_column)
features

In [None]:
# Normalising features using mean and variance

import numpy as np

# Standardise each feature element-wise across clips (statistics over axis 0)
# and store the result as nested lists in a "<feature>_normalised" column.
for feat in features:
    stacked = np.stack(comp_df[feat])
    centred = stacked - np.mean(stacked, axis=0)
    scaled = centred / np.std(stacked, axis=0)
    # Positions with zero spread produce NaN; replace those with 0.
    comp_df[feat + "_normalised"] = np.nan_to_num(scaled, nan=0).tolist()


comp_df.head()

In [None]:
# Flattening feature vectors

import numpy as np
import pandas as pd


# Features stored as one vector per clip.
features_1d = ['mfcc_mean_normalised', 'mfcc_var_normalised',
       'delta_mfcc_mean_normalised', 'delta_mfcc_var_normalised',
       'delta2_mfcc_mean_normalised', 'delta2_mfcc_var_normalised',
        'pulse_clarity_normalised', 'onset_rate_normalised']
# Features stored as one matrix per clip.
features_2d = ['mfcc_normalised', 'delta_mfcc_normalised',
       'delta2_mfcc_normalised', 'zcr_normalised',
        'rms_energy_normalised','spectral_centroid_normalised', 'spectral_bandwidth_normalised']

flat_columns = {}

# Vector features: one scalar column per position, named "<feature>_<i>".
for feat in features_1d:
    matrix = np.vstack(comp_df[feat].values)
    flat_columns.update(
        {f'{feat}_{i}': matrix[:, i] for i in range(matrix.shape[1])}
    )

# Matrix features: one scalar column per (i, j) entry, named "<feature>_<i>_<j>".
for feat in features_2d:
    cube = np.array(comp_df[feat].tolist())
    n_rows, n_cols = cube.shape[1], cube.shape[2]
    flat = cube.reshape(cube.shape[0], -1)
    for i in range(n_rows):
        for j in range(n_cols):
            # Row-major reshape puts entry (i, j) at flat position i * n_cols + j.
            flat_columns[f'{feat}_{i}_{j}'] = flat[:, i * n_cols + j]


comp_df_exp = pd.DataFrame(flat_columns)

comp_df_exp.head()

In [None]:
# PCA on the features

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fit PCA with n_components=0.95 on the flattened features and wrap the
# transformed data in a DataFrame with columns PC1, PC2, ...
pca = PCA(n_components=0.95)
pca_scores = pca.fit_transform(comp_df_exp)

component_names = [f'PC{k}' for k in range(1, pca_scores.shape[1] + 1)]
comp_df_pca = pd.DataFrame(pca_scores, columns=component_names)

explained_variance = pca.explained_variance_ratio_

comp_df_pca.head()

In [None]:
# Keep the first 37 principal-component columns. Positional slicing returns
# fewer columns without raising if the frame has fewer than 37.
# NOTE(review): 37 is hard-coded; the rationale for this cut-off is not shown here.
comp_df_pca_red = comp_df_pca.iloc[:, :37]
comp_df_pca_red.head()

In [None]:
# Self-implemented KMeans

import numpy as np
import pandas as pd

class DIY_KMeans:
    """Minimal k-means clustering on a numeric pandas DataFrame.

    Parameters
    ----------
    k : int
        Number of clusters. Must not exceed the number of rows passed to
        ``fit`` (the initial centroids are sampled without replacement).
    max_iters : int, default 100
        Maximum number of assign/update iterations.
    tol : float, default 1e-4
        Iteration stops once the Frobenius norm of the centroid shift
        between two iterations falls below this value.
    random_state : int or None, default None
        Seed for the initial centroid draw. ``None`` uses NumPy's global
        random state (``np.random.choice``), as before.

    Attributes
    ----------
    centroids : pandas.DataFrame or None
        One row per cluster after ``fit``; ``None`` before.
    """

    def __init__(self, k, max_iters=100, tol=1e-4, random_state=None):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state
        self.centroids = None

    def fit(self, X: pd.DataFrame):
        """Estimate the centroids from ``X`` and return ``self``.

        Raises
        ------
        ValueError
            From NumPy, when ``k`` is larger than the number of rows in ``X``.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        n_samples = X.shape[0]
        if self.random_state is None:
            random_indices = np.random.choice(n_samples, self.k, replace=False)
        else:
            rng = np.random.RandomState(self.random_state)
            random_indices = rng.choice(n_samples, self.k, replace=False)
        self.centroids = X.iloc[random_indices].copy().reset_index(drop=True)

        for _ in range(self.max_iters):
            labels = self._assign_clusters(X)
            cluster_means = X.groupby(labels).mean()

            # Start from the previous centroids so a cluster that received no
            # points keeps its old position instead of aborting the whole fit.
            previous = self.centroids.to_numpy(dtype=float)
            updated = previous.copy()
            updated[cluster_means.index.to_numpy()] = cluster_means.to_numpy(dtype=float)

            shift = np.linalg.norm(previous - updated)
            self.centroids = pd.DataFrame(updated, columns=X.columns)
            if shift < self.tol:
                break

        return self

    def _assign_clusters(self, X: pd.DataFrame):
        """Return, for each row of ``X``, the index of the nearest centroid."""
        if self.centroids is None:
            raise RuntimeError("DIY_KMeans must be fitted before clusters can be assigned")
        points = np.asarray(X)
        # Broadcast to (n_samples, k, n_features) and take the Euclidean norm
        # over the feature axis.
        distances = np.linalg.norm(points[:, np.newaxis] - self.centroids.values, axis=2)
        return np.argmin(distances, axis=1)

    def predict(self, X: pd.DataFrame):
        """Assign each row of ``X`` to its nearest fitted centroid."""
        return self._assign_clusters(X)

In [None]:
# Generating predicted labels using self-implemented KMeans

from sklearn.metrics import adjusted_rand_score

# Cluster the PCA-reduced hand-crafted features into 50 groups.
# Seed NumPy's global RNG first: DIY_KMeans draws its initial centroids with
# np.random.choice, so without a seed the labels change on every run.
np.random.seed(42)
kmeans = DIY_KMeans(k=50)
kmeans.fit(comp_df_pca_red)
final_labels = pd.DataFrame({"cluster": kmeans.predict(comp_df_pca_red)})
final_labels.head()

In [None]:
# Attach a file id to every prediction and use it as the index.
# The ids are assigned by position and without in-place mutation, so running
# this cell a second time gives the same frame instead of an all-NaN index.
# NOTE(review): the ids are the first 500 'filename' values of the training
# label table `df`; this presumably relies on them matching the test clips
# in order — confirm against the dataset.
final_labels = final_labels.assign(id=df['filename'].iloc[:500].to_numpy()).set_index('id')
final_labels.head()

In [None]:
# Write the cluster assignments, including the index, to output_1.csv.
final_labels.to_csv("output_1.csv", index= True)

## YAMNet to generate embeddings, followed by clustering

In [None]:
!pip install tensorflow tensorflow_hub librosa numpy

In [None]:
# Loading YAMNET model

import tensorflow as tf
import tensorflow_hub as hub
import librosa
import numpy as np

# Load the YAMNet model (version 1) from TensorFlow Hub.
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
# Generating embeddings using YAMNET

from tqdm import tqdm

# Run YAMNet on every test clip and keep its three outputs per clip.
YAMNET_SR = 16000  # YAMNet expects mono 16 kHz float32 waveforms

scores = []
embeddings = []
spectrogram = []

clips = zip(comp_df['time_series'], comp_df['sample_rate'])
for wave, native_sr in tqdm(clips, total=len(comp_df), desc="Processing Audio"):
    # The clips were loaded with mono=False, so down-mix multi-channel audio.
    if np.ndim(wave) > 1:
        wave = librosa.to_mono(wave)
    # The clips were loaded at their native rate (sr=None); feeding them to
    # YAMNet unchanged would make it misread the time/frequency scale.
    if native_sr != YAMNET_SR:
        wave = librosa.resample(wave, orig_sr=native_sr, target_sr=YAMNET_SR)

    score, embed, spect = yamnet_model(np.asarray(wave, dtype=np.float32))  # Get YAMNet outputs
    scores.append(score)
    embeddings.append(embed)
    spectrogram.append(spect)

comp_df['scores'] = scores
comp_df['embeddings'] = embeddings
comp_df['spectrogram'] = spectrogram

comp_df.head()


In [None]:
# Select only the YAMNet embeddings and scores columns for the next steps.
embed_df= comp_df[['embeddings', 'scores']]
embed_df.head()

In [None]:
# Stack the per-clip embedding tensors into one 3-D array (clips on axis 0).
arr = np.array([tensor.numpy() for tensor in embed_df['embeddings']])

# Standardise every position across clips.
centre = arr.mean(axis=0, keepdims=True)
spread = arr.std(axis=0, keepdims=True)
normal_arr = (arr - centre) / spread

# One scalar column per (i, j) position, named "embeddings_<i>_<j>".
n_rows, n_cols = arr.shape[1], arr.shape[2]
col_names = [f'embeddings_{i}_{j}' for i in range(n_rows) for j in range(n_cols)]
reshaped_arr = normal_arr.reshape(arr.shape[0], -1)

# Positions with zero spread produce NaN; replace those with 0.
embed_df_exp = pd.DataFrame(dict(zip(col_names, reshaped_arr.T))).fillna(0)
embed_df_exp.head()

### Component reduction by PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fit PCA with n_components=0.95 on the flattened embeddings and wrap the
# transformed data in a DataFrame with columns PC1, PC2, ...
pca = PCA(n_components=0.95)
embedding_scores = pca.fit_transform(embed_df_exp)

component_names = [f'PC{k}' for k in range(1, embedding_scores.shape[1] + 1)]
embed_pca = pd.DataFrame(embedding_scores, columns=component_names)

explained_variance = pca.explained_variance_ratio_

embed_pca

In [None]:
# K-Means clustering using YAMNET embeddings post PCA

# Cluster the first 25 principal components of the embeddings into 50 groups.
# Seed NumPy's global RNG first: DIY_KMeans draws its initial centroids with
# np.random.choice, so without a seed the labels change on every run.
np.random.seed(42)
embed_pca_red = embed_pca.iloc[:, :25]
kmeans = DIY_KMeans(k=50)
kmeans.fit(embed_pca_red)
final_labels = pd.DataFrame({"cluster": kmeans.predict(embed_pca_red)})
final_labels.head()

In [None]:
# Attach a file id to every prediction and use it as the index.
# The ids are assigned by position and without in-place mutation, so running
# this cell a second time gives the same frame instead of an all-NaN index.
# NOTE(review): the ids are the first 500 'filename' values of the training
# label table `df`; this presumably relies on them matching the test clips
# in order — confirm against the dataset.
final_labels = final_labels.assign(id=df['filename'].iloc[:500].to_numpy()).set_index('id')
final_labels.head()

In [None]:
# Write the cluster assignments, including the index, to output_2.csv.
final_labels.to_csv("output_2.csv", index= True)

### Component reduction using t-SNE

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Project the flattened embeddings to two dimensions with t-SNE
# (random_state is fixed) and label the columns TSNE1 / TSNE2.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
embed_tsne = tsne.fit_transform(embed_df_exp)

embed_tsne = pd.DataFrame(embed_tsne, columns=['TSNE1', 'TSNE2'])

embed_tsne

In [None]:
# K-Means clustering using YAMNET embeddings post t-SNE

# Cluster the 2-D t-SNE projection of the embeddings into 50 groups.
# Seed NumPy's global RNG first: DIY_KMeans draws its initial centroids with
# np.random.choice, so without a seed the labels change on every run.
np.random.seed(42)
kmeans = DIY_KMeans(k=50)
kmeans.fit(embed_tsne)
final_labels = pd.DataFrame({"cluster": kmeans.predict(embed_tsne)})
final_labels.head()

In [None]:
# Attach a file id to every prediction and use it as the index.
# The ids are assigned by position and without in-place mutation, so running
# this cell a second time gives the same frame instead of an all-NaN index.
# NOTE(review): the ids are the first 500 'filename' values of the training
# label table `df`; this presumably relies on them matching the test clips
# in order — confirm against the dataset.
final_labels = final_labels.assign(id=df['filename'].iloc[:500].to_numpy()).set_index('id')
final_labels.head()

In [None]:
# Write the cluster assignments, including the index, to output_3.csv.
final_labels.to_csv("output_3.csv", index= True)

## CLAP Model

In [None]:
!pip install laion-clap torch librosa tqdm numpy pandas

In [None]:
# Loading pre-trained CLAP model and using it to classify test data

from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm

# Zero-shot audio classification of the test clips with a pre-trained CLAP
# model, using the training categories as candidate labels.
# Sort the unique categories: plain set iteration order for strings can differ
# between interpreter runs, which made the candidate order non-reproducible.
class_labels = sorted(set(df['category']), key=str)
results = []
audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/larger_clap_general")
for idx in tqdm(range(500), desc="Processing Audio Files"):
    audio_path = f"/kaggle/input/audio-files/Dataset/test_folder/{idx+1}.wav"
    output = audio_classifier(audio_path, candidate_labels=class_labels)

    # Assign the label with the maximum similarity score
    predicted_label = max(output, key=lambda candidate: candidate["score"])["label"]

    results.append({"audio_file": audio_path, "predicted_label": predicted_label})

results_df = pd.DataFrame(results)
results_df.head()

In [None]:
# Attach a file id to every prediction.
# NOTE(review): the ids are the first 500 'filename' values of the training
# label table `df`; this presumably relies on them matching the test clips
# in order — confirm against the dataset.
results_df['id']= df['filename'][0:500]
results_df.head()

In [None]:
# Drop the file-path column and index the predictions by id.
results_df = results_df.drop(columns=['audio_file']).set_index('id')
results_df.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the predicted label strings into a 'cluster' column, then
# discard the string column.
encoder = OrdinalEncoder()
label_codes = encoder.fit_transform(results_df[['predicted_label']])

results_df['cluster'] = label_codes
results_df = results_df.drop(columns=['predicted_label'])
results_df.head()

In [None]:
# Cast the encoded cluster codes to integers.
results_df['cluster']= results_df['cluster'].astype(int)
results_df.head()

In [None]:
# Write the CLAP-based cluster codes, including the index, to output_4.csv.
results_df.to_csv('output_4.csv', index= True)

## AST Model (Best ARI score submission on Kaggle)

In [None]:
# Loading pre-trained AST model and using it to classify test data

import torch
import torchaudio
import pandas as pd
from transformers import ASTFeatureExtractor, AutoModelForAudioClassification
from tqdm import tqdm

# Defining an audio feature extractor based on config.json file on git repo of the pretrained model

TARGET_SR = 16000  # sampling rate the feature extractor below is configured for

extractor = ASTFeatureExtractor(
    sampling_rate=TARGET_SR,
    num_mel_bins=128,
    max_length=1024,
    padding="max_length",
    return_attention_mask=True,
    do_normalize=True,
    feature_size=128
)

# Loading the model

model = AutoModelForAudioClassification.from_pretrained("Evan-Lin/ast-esc50")
model.eval()

class_labels = list(model.config.id2label.values())

results = []

# One Resample transform per source rate, built on first use, instead of
# constructing a new transform for every file.
resamplers = {}

# Generating label predictions

for idx in tqdm(range(500), desc="Processing with AST"):
    audio_path = f"/kaggle/input/audio-files/Dataset/test_folder/{idx+1}.wav"

    try:
        waveform, sample_rate = torchaudio.load(audio_path)

        if sample_rate != TARGET_SR:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=TARGET_SR
                )
            waveform = resamplers[sample_rate](waveform)

        # Average the channels of multi-channel audio into a single channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        inputs = extractor(
            waveform.squeeze().numpy(),
            sampling_rate=TARGET_SR,
            return_tensors="pt"
        )

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            pred_id = torch.argmax(logits, dim=-1).item()
            predicted_label = model.config.id2label[pred_id]

    except Exception as e:
        # Best-effort: report the failure and keep going.
        # NOTE(review): any "ERROR" row is later ordinal-encoded like a real
        # label and ends up as its own cluster — check the printed messages
        # before using the output.
        print(f"Error processing {audio_path}: {e}")
        predicted_label = "ERROR"

    results.append({"audio_file": audio_path, "predicted_label": predicted_label})

results_df = pd.DataFrame(results)
results_df.head()


In [None]:
# Attach a file id to every prediction.
# NOTE(review): the ids are the first 500 'filename' values of the training
# label table `df`; this presumably relies on them matching the test clips
# in order — confirm against the dataset.
results_df['id']= df['filename'][0:500]
results_df.head()

In [None]:
# Ordinal encoding of labels

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the predicted label strings into a 'cluster' column, then
# discard the string column.
encoder = OrdinalEncoder()
label_codes = encoder.fit_transform(results_df[['predicted_label']])

results_df['cluster'] = label_codes
results_df = results_df.drop(columns=['predicted_label'])
results_df.head()

In [None]:
# Cast the encoded cluster codes to integers.
results_df['cluster']= results_df['cluster'].astype(int)
results_df.head()

In [None]:
# Drop the file-path column and index the predictions by id.
results_df = results_df.drop(columns=['audio_file']).set_index('id')
results_df.head()

In [None]:
# Write the AST-based cluster codes, including the index, to output_5.csv.
results_df.to_csv('output_5.csv', index= True)