In [28]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from torch.utils.data import random_split, DataLoader
from sklearn.model_selection import train_test_split



from customDatasets.audioDataset import AudioDataset

In [29]:
# free gpu
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [30]:
import librosa
import numpy as np

def extract_mfccs(audio_path, n_mfcc=13, sr=16000, hop_length=512, n_fft=1024):
    # Load audio file
    y, sr = librosa.load(audio_path, sr=sr)
    
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)
    
    # Compute mean and variance for each MFCC coefficient over all frames
    mfccs_mean = np.mean(mfccs, axis=1)
    mfccs_var = np.var(mfccs, axis=1)
    
    # Combine mean and variance into a single feature vector
    mfccs_features = np.concatenate((mfccs_mean, mfccs_var))
    
    return mfccs_features

# Example usage
audio_path = "./data/train/normal_id_00_00000000.wav"
mfcc_features = extract_mfccs(audio_path)
print(mfcc_features)
print(mfcc_features.shape)

[-3.5131357e+02  1.0086341e+02 -1.0051440e+01  2.8362326e+01
  5.0546038e-01  1.4227257e+01 -4.3528104e+00  3.0120912e+00
 -3.3442781e+00  3.2717636e+00 -2.8724821e+00 -3.4457836e-01
  4.8515433e-01  5.5663044e+01  2.6887306e+01  2.2144478e+01
  1.6949223e+01  1.6398485e+01  1.7218737e+01  1.6001156e+01
  1.6644236e+01  1.8200521e+01  1.7958870e+01  1.5473613e+01
  1.5308606e+01  1.5064983e+01]
(26,)


In [31]:
#Load training and test dataset

def set_seed(seed = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(42)

data_path = "./data/train/"
data_path_test = "./data/test/"


meta_train_df = pd.read_csv("./data/train.csv")
meta_test_df = pd.read_csv("./data/test.csv")

train_df = meta_train_df[['filename', 'is_normal', 'machine_id']]
range_train, range_test = train_test_split(range(len(train_df)), test_size=0.2, train_size=0.8, random_state=None, shuffle=True, stratify=meta_train_df['machine_id'])

val_df = train_df.iloc[range_test].reset_index(drop=True)

train_df = train_df.iloc[range_train].reset_index(drop=True)

test_df = meta_test_df[['filename', 'is_normal', 'machine_id']]

train_audios = []
val_audios = []
test_audios = []
test_labels = []


for idx in range(len(train_df)):
    audio_file = data_path + train_df.loc[idx, 'filename']
    aud = extract_mfccs(audio_file)
    train_audios.append(aud)

for idx in range(len(val_df)):
    audio_file = data_path + val_df.loc[idx, 'filename']
    aud = extract_mfccs(audio_file)
    val_audios.append(aud)

for idx in range(len(test_df)):
    audio_file = data_path_test + test_df.loc[idx, 'filename']
    aud = extract_mfccs(audio_file)
    test_audios.append(aud)
    test_labels.append(test_df.loc[idx, 'is_normal'])

In [32]:
train_dataset = np.array(train_audios)
val_dataset = np.array(val_audios)
test_dataset = np.array(test_audios)
test_labels = np.array(test_labels)

print(train_dataset.shape)
print(val_dataset.shape)
print(test_dataset.shape)

(1896, 26)
(474, 26)
(1101, 26)


In [70]:
gmm = GaussianMixture(n_components=4)
gmm.fit(train_dataset)

scores = gmm.predict(test_dataset)

print(scores.shape)

fpr, tpr, _ = roc_curve(test_labels, scores, pos_label=0)
roc_auc = auc(fpr, tpr)
print(roc_auc)

(1101,)
0.7974469413233458
