In [3]:
%reset -f
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_curve, auc
from torch.utils.data import random_split, DataLoader


from customDatasets.audioDataset import AudioDataset


In [4]:
# free gpu
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [5]:
def set_seed(seed = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

CONFIG = {
    "seed": 42,
    "epochs": 20,
    "num_classes": 2,
    "learning_rate": 0.01,
    "train_batch_size": 32,
    "val_batch_size": 16,
    "test_batch_size": 128,
    "criterion": nn.MSELoss(),
    "device":
        torch.device(
            "cuda:0" if torch.cuda.is_available()
            else "mps" if torch.backends.mps.is_available()
            else "cpu"
        )
}

set_seed(CONFIG['seed'])

data_path = "./data/train/"
data_path_test = "./data/test/"


meta_train_df = pd.read_csv("./data/train.csv")
meta_test_df = pd.read_csv("./data/test.csv")

train_df = meta_train_df[['filename', 'is_normal', 'machine_id']]
train_dataset = AudioDataset(train_df, data_path)
test_df = meta_test_df[['filename', 'is_normal', 'machine_id']]
test_dataset = AudioDataset(test_df, data_path_test)

num_items = len(train_dataset)
num_train = int(0.8 * num_items)
num_val = num_items-num_train

train_ds, val_ds = random_split(train_dataset, [num_train, num_val])
test_ds = test_dataset


train_dl = DataLoader(train_ds, batch_size=CONFIG['train_batch_size'], shuffle=True)
val_dl = DataLoader(val_ds, batch_size=CONFIG['val_batch_size'], shuffle=False)
test_dl = DataLoader(test_ds, batch_size=CONFIG["test_batch_size"], shuffle=False)

In [6]:
# compute metrics
inputs_cat=[]
for inputs, labels in train_dl:
    inputs_cat.append(inputs)
inputs_cat = torch.cat([input for input in inputs_cat])
print(inputs_cat.shape)
val_cat=[]
for inputs, labels in val_dl:
    val_cat.append(inputs)
val_cat = torch.cat([input for input in val_cat])
print(val_cat.shape)
test_cat=[]
for inputs, labels in test_dl:
    test_cat.append(inputs)
test_cat = torch.cat([input for input in test_cat])
print(test_cat.shape)

torch.Size([1896, 1, 320, 128])
torch.Size([474, 1, 320, 128])
torch.Size([1101, 1, 320, 128])


In [7]:
# flatten it
inputs_cat = inputs_cat.view(inputs_cat.shape[0], -1)
val_cat = val_cat.view(val_cat.shape[0], -1)
test_cat = test_cat.view(test_cat.shape[0], -1)
print(inputs_cat.shape)
print(val_cat.shape)
print(test_cat.shape)

torch.Size([1896, 40960])
torch.Size([474, 40960])
torch.Size([1101, 40960])


In [10]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import roc_curve, auc

values=[]
for n_components, n_clusters in [(5,2),(5,3),(5,4),(10,2),(10,3),(10,4),(15,2),(15,3),(15,4),(20,2),(20,3),(20,4)]:
    # do pca
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    pca.fit(inputs_cat)
    inputs_pca = pca.transform(inputs_cat)
    val_pca = pca.transform(val_cat)
    test_pca = pca.transform(test_cat)
    print(inputs_pca.shape)
    
    # use a clustering method hierarchical
    from sklearn.cluster import AgglomerativeClustering
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    clustering.fit(inputs_pca)
    
    # Calculate centroids
    centroids = []
    for cluster_label in range(n_clusters):
        cluster_points = inputs_pca[clustering.labels_ == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)
    centroids = np.array(centroids)
    
    # Calculate distances to centroids
    distances_to_centroids = euclidean_distances(test_pca, centroids)
    
    fpr, tpr, _ = roc_curve(test_df['is_normal'], distances_to_centroids[:, 0])
    roc_auc = auc(fpr, tpr)
    print(f"n_components: {n_components}, n_clusters: {n_clusters}, auc: {roc_auc}")
    values.append((n_components, n_clusters, roc_auc))

best=values[np.argmax([v[2] for v in values])]
print(f"best: {best}")

(1896, 5)
n_components: 5, n_clusters: 2, auc: 0.2721223470661673
(1896, 5)
n_components: 5, n_clusters: 3, auc: 0.5882313774448605
(1896, 5)
n_components: 5, n_clusters: 4, auc: 0.517191011235955
(1896, 10)
n_components: 10, n_clusters: 2, auc: 0.27590928006658344
(1896, 10)
n_components: 10, n_clusters: 3, auc: 0.5762380357885977
(1896, 10)
n_components: 10, n_clusters: 4, auc: 0.492267998335414
(1896, 15)
n_components: 15, n_clusters: 2, auc: 0.2580982105701207
(1896, 15)
n_components: 15, n_clusters: 3, auc: 0.5906367041198503
(1896, 15)
n_components: 15, n_clusters: 4, auc: 0.4990470245526425
(1896, 20)
n_components: 20, n_clusters: 2, auc: 0.2757761131918436
(1896, 20)
n_components: 20, n_clusters: 3, auc: 0.5785892634207241
(1896, 20)
n_components: 20, n_clusters: 4, auc: 0.48675405742821476
best: (15, 3, 0.5906367041198503)


In [11]:
# using the LOF
from sklearn.neighbors import LocalOutlierFactor
for n_components in [2,5,10,15,20]:
    # do pca
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    pca.fit(inputs_cat)
    inputs_pca = pca.transform(inputs_cat)
    val_pca = pca.transform(val_cat)
    test_pca = pca.transform(test_cat)
    print(inputs_pca.shape)
    
    # use a clustering method hierarchical
    clf = LocalOutlierFactor(novelty=True)
    clf.fit(inputs_pca)
    
    # Calculate distances to centroids
    distances_to_centroids = clf.decision_function(test_pca)
    
    fpr, tpr, _ = roc_curve(test_df['is_normal'], distances_to_centroids)
    roc_auc = auc(fpr, tpr)
    print(f"n_components: {n_components}, auc: {roc_auc}")

(1896, 2)
n_components: 2, auc: 0.5933104452767374
(1896, 5)
n_components: 5, auc: 0.6353225135247609
(1896, 10)
n_components: 10, auc: 0.6130711610486892
(1896, 15)
n_components: 15, auc: 0.5766708281315024
(1896, 20)
n_components: 20, auc: 0.5984519350811486
