In [75]:
%reset -f
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_curve, auc
from torch.utils.data import random_split, DataLoader


from customDatasets.audioDataset import AudioDataset


In [76]:
# free gpu
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [77]:
class ConvolutionalAE(nn.Module):
    def __init__(self, encoding_dim):
        super(ConvolutionalAE, self).__init__()
        self.encoding_dim = encoding_dim
        self.encoder = nn.Sequential(
            # (320, 128)
            nn.Conv2d(1, 32, kernel_size=5, stride=(1,2), padding=(2,2)),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            # (320, 64)
            nn.Conv2d(32, 64, kernel_size=5, stride=(1,2), padding=(2,2)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            # (320, 32)
            nn.Conv2d(64, 128, kernel_size=5, stride=(2,2), padding=(2,2)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            # (160, 16)
            nn.Conv2d(128, 256, kernel_size=3, stride=(2,2), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            # (80, 8)
            nn.Conv2d(256, 512, kernel_size=3, stride=(2,2), padding=(1,1)),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            # (40, 4, 512)
        )
               
        # inflates the latent space to the shape of the last layer of the encoder
        self.fc=nn.Sequential(
            nn.Linear(512*40*4, self.encoding_dim),
            nn.ReLU(),
            nn.Linear(self.encoding_dim, 512*40*4),
            nn.ReLU()
        )
        
        self.decoder=nn.Sequential(
            # (512, 40, 4)
            nn.ConvTranspose2d(512, 256, kernel_size=3, stride=(2,2), padding=(1,1),output_padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            # (256, 80, 8)
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=(2,2), padding=(1,1),output_padding=(1,1)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            # (128, 160, 16)
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=(2,2), padding=(1,1),output_padding=(1,1)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            # (64, 320, 32)
            nn.ConvTranspose2d(64, 32, kernel_size=5, stride=(1,2), padding=(2,2),output_padding=(0,1)),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            # (32, 320, 64)
            nn.ConvTranspose2d(32, 1, kernel_size=5, stride=(1,2), padding=(2,2),output_padding=(0,1)),
            # (1, 320, 128)
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        encoded = encoded.view(-1, 512*40*4)
        x=self.fc(encoded)
        x = x.view(-1, 512, 40, 4)
        decoded = self.decoder(x)
        
        return decoded

In [78]:
def train_model(model, train_dl, val_dl, test_dl, criterion, optimizer, device, epochs=5, step_size=5):
    lr_scheduler=torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)
    for epoch in range(epochs):
        train_losses = []
        val_losses = []

        for inputs, labels in train_dl:
            model.train()
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, inputs)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        lr_scheduler.step()
        print(f'Epoch[{epoch + 1}/{epochs}], Train loss: {np.average(train_losses): .4f}')
        
        
        for inputs, labels in val_dl:
            model.eval()
            with torch.no_grad():
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, inputs)
                val_losses.append(loss.item())
        print(f'Epoch[{epoch + 1}/{epochs}], Val loss: {np.average(val_losses): .4f}')
 
        scores = []
        full_labels = []
        for inputs, labels in test_dl:
            model.eval()
            with torch.no_grad():
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                mse = torch.sum((outputs-inputs)**2,dim=(1,2,3))/(inputs.shape[1]*inputs.shape[2]*inputs.shape[3])            
                scores.append(mse)
                full_labels.append(labels)
        
        full_labels = torch.cat([label for label in full_labels])
        scores = torch.cat([score for score in scores])
        fpr, tpr, _ = roc_curve(full_labels.cpu().detach(), scores.cpu().detach(), pos_label=0)
        roc_auc = auc(fpr, tpr)
        print(roc_auc)
        
    return np.average(train_losses),np.average(val_losses),roc_auc

In [79]:
def set_seed(seed = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

CONFIG = {
    "seed": 42,
    "epochs": 10,
    "num_classes": 2,
    "learning_rate": 0.01,
    "train_batch_size": 32,
    "val_batch_size": 16,
    "test_batch_size": 128,
    "criterion": nn.MSELoss(),
    "device":
        torch.device(
            "cuda:0" if torch.cuda.is_available()
            else "mps" if torch.backends.mps.is_available()
            else "cpu"
        )
}

set_seed(CONFIG['seed'])

data_path = "./data/train/"
data_path_test = "./data/test/"


meta_train_df = pd.read_csv("./data/train.csv")
meta_test_df = pd.read_csv("./data/test.csv")

train_df = meta_train_df[['filename', 'is_normal', 'machine_id']]
train_dataset = AudioDataset(train_df, data_path)
test_df = meta_test_df[['filename', 'is_normal', 'machine_id']]
test_dataset = AudioDataset(test_df, data_path_test)

num_items = len(train_dataset)
num_train = int(0.8 * num_items)
num_val = num_items-num_train

train_ds, val_ds = random_split(train_dataset, [num_train, num_val])
test_ds = test_dataset


train_dl = DataLoader(train_ds, batch_size=CONFIG['train_batch_size'], shuffle=True)
val_dl = DataLoader(val_ds, batch_size=CONFIG['val_batch_size'], shuffle=False)
test_dl = DataLoader(test_ds, batch_size=CONFIG["test_batch_size"], shuffle=True)

In [80]:
model = ConvolutionalAE(encoding_dim=128)
model = model.to(CONFIG["device"])
optimizer = optim.Adam(model.parameters(), lr=CONFIG["learning_rate"])
for batch in train_dl:
    inputs, labels = batch
    print(inputs.shape)
    inputs = inputs.to(CONFIG["device"])
    outputs = model(inputs)
    print(outputs.shape)
    break

torch.Size([32, 1, 320, 128])
torch.Size([32, 1, 320, 128])


In [81]:
# compute metrics
inputs_cat=[]
for inputs, labels in train_dl:
    inputs_cat.append(inputs)
inputs_cat = torch.cat([input for input in inputs_cat])
print(inputs_cat.shape)

torch.Size([1896, 1, 320, 128])


In [82]:
# compute the min and max value for each frequency of the batch_sizexchannelxtimexfrequecy
min = torch.min(inputs_cat, dim=0).values
max = torch.max(inputs_cat, dim=0).values
print(max.shape)
print(min.shape)
train_dataset.min = min
train_dataset.max = max
test_dataset.min = min
test_dataset.max = max
measures = []

torch.Size([1, 320, 128])
torch.Size([1, 320, 128])


In [83]:
training=True

# testing emb space size
if training:
    for emb_space_size in [32, 64, 128, 256, 512]:
        model = ConvolutionalAE(encoding_dim=emb_space_size)
        model = model.to(CONFIG["device"])
        optimizer = optim.Adam(model.parameters(), lr=CONFIG["learning_rate"])
        measures.append(train_model(model, train_dl, val_dl, test_dl, CONFIG["criterion"], optimizer, CONFIG["device"], CONFIG["epochs"]))
    for emb_space_size, measure in zip([32, 64, 128, 256, 512], measures):
        print(f"Emb space size: {emb_space_size}, Train loss: {measure[0]}, Val loss: {measure[1]}, ROC AUC: {measure[2]}")

Epoch[1/10], Train loss:  0.6116
Epoch[1/10], Val loss:  0.0290
0.7491427382438619
Epoch[2/10], Train loss:  0.0292
Epoch[2/10], Val loss:  0.0289
0.6680482729920931
Epoch[3/10], Train loss:  0.0255
Epoch[3/10], Val loss:  0.0250
0.7231710362047441
Epoch[4/10], Train loss:  0.0243
Epoch[4/10], Val loss:  0.0242
0.7303662089055347
Epoch[5/10], Train loss:  0.0236
Epoch[5/10], Val loss:  0.0240
0.7391593841032044
Epoch[6/10], Train loss:  0.0235
Epoch[6/10], Val loss:  0.0240
0.7411652101539743
Epoch[7/10], Train loss:  0.0234
Epoch[7/10], Val loss:  0.0239
0.7405493133583022
Epoch[8/10], Train loss:  0.0234
Epoch[8/10], Val loss:  0.0240
0.7454140657511444
Epoch[9/10], Train loss:  0.0234
Epoch[9/10], Val loss:  0.0240
0.7421098626716605
Epoch[10/10], Train loss:  0.0234
Epoch[10/10], Val loss:  0.0240
0.7313607990012485
Epoch[1/10], Train loss:  0.3134
Epoch[1/10], Val loss:  0.0338
0.565364128173117
Epoch[2/10], Train loss:  0.0334
Epoch[2/10], Val loss:  0.0266
0.6946025801081981
Epo

In [84]:
# take the best one and train it for more epochs
if training:
    emb_space_measures=[32, 64, 128, 256, 512]
    model = ConvolutionalAE(encoding_dim=emb_space_measures[np.argmax([measure[2] for measure in measures])])
    model = model.to(CONFIG["device"])

In [85]:
if training:
    optimizer = optim.Adam(model.parameters(), lr=+0.01)
    train_model(model, train_dl, val_dl, test_dl, CONFIG["criterion"], optimizer, CONFIG["device"], 50, 20)

Epoch[1/50], Train loss:  0.6693
Epoch[1/50], Val loss:  0.0452
0.7473366625052018
Epoch[2/50], Train loss:  0.0291
Epoch[2/50], Val loss:  0.0268
0.663866000832293
Epoch[3/50], Train loss:  0.0270
Epoch[3/50], Val loss:  0.0278
0.7280316271327507
Epoch[4/50], Train loss:  0.0254
Epoch[4/50], Val loss:  0.0264
0.7251560549313358
Epoch[5/50], Train loss:  0.0248
Epoch[5/50], Val loss:  0.0241
0.6803537245110278
Epoch[6/50], Train loss:  0.0240
Epoch[6/50], Val loss:  0.0241
0.7525301706200582
Epoch[7/50], Train loss:  0.0238
Epoch[7/50], Val loss:  0.0238
0.7207199334165627
Epoch[8/50], Train loss:  0.0233
Epoch[8/50], Val loss:  0.0237
0.7310944652517687
Epoch[9/50], Train loss:  0.0229
Epoch[9/50], Val loss:  0.0240
0.7332417811069497
Epoch[10/50], Train loss:  0.0229
Epoch[10/50], Val loss:  0.0232
0.7316229712858927
Epoch[11/50], Train loss:  0.0228
Epoch[11/50], Val loss:  0.0233
0.7249438202247191
Epoch[12/50], Train loss:  0.0228
Epoch[12/50], Val loss:  0.0230
0.6936870578443612

In [86]:
#save weights into weights/weights.pth
if training:
    torch.save(model.state_dict(), "./weights/weights.pth")

In [87]:
best_index=np.argmax([measure[2] for measure in measures]) if training else 1
model=ConvolutionalAE(encoding_dim=[32, 64, 128, 256, 512][best_index])
model.load_state_dict(torch.load("./weights/weights.pth"))
model=model.to(CONFIG["device"])
train_dataset.with_id=True
test_dataset.with_filename=True

In [88]:
# compute the average mse for each id in the val_dl
mse_dict = {}
for inputs, labels, ids in val_dl:
    inputs, labels = inputs.to(CONFIG["device"]), labels.to(CONFIG["device"])
    with torch.no_grad():
        outputs = model(inputs)
        diff=outputs-inputs
        for id,diff in zip(ids,diff):
            if id in mse_dict:
                mse_dict[id.item()].append((torch.sum(diff**2)/(inputs.shape[1]*inputs.shape[2]*inputs.shape[3])).item())
            else:
                mse_dict[id.item()]=[(torch.sum(diff**2)/(inputs.shape[1]*inputs.shape[2]*inputs.shape[3])).item()]

print({key:np.average(value) for key,value in mse_dict.items()})

{0: 0.02267526462674141, 4: 0.020125363022089005, 2: 0.018952330574393272}


In [89]:
for inputs, labels, ids in test_dl:
    inputs, labels = inputs.to(CONFIG["device"]), labels.to(CONFIG["device"])
    with torch.no_grad():
        outputs = model(inputs)
        mse = torch.sum((outputs-inputs)**2,dim=(1,2,3))/(inputs.shape[1]*inputs.shape[2]*inputs.shape[3])
        for name,mse in zip(ids,mse):
            print(name,mse)
        break

anomaly_id_00_00000079.wav tensor(0.0424, device='cuda:0')
anomaly_id_04_00000012.wav tensor(0.0242, device='cuda:0')
anomaly_id_04_00000070.wav tensor(0.0310, device='cuda:0')
anomaly_id_00_00000219.wav tensor(0.0299, device='cuda:0')
normal_id_00_00000020.wav tensor(0.0235, device='cuda:0')
anomaly_id_04_00000105.wav tensor(0.0332, device='cuda:0')
anomaly_id_04_00000029.wav tensor(0.0309, device='cuda:0')
anomaly_id_02_00000254.wav tensor(0.0150, device='cuda:0')
anomaly_id_04_00000073.wav tensor(0.0244, device='cuda:0')
normal_id_04_00000030.wav tensor(0.0265, device='cuda:0')
anomaly_id_04_00000102.wav tensor(0.0206, device='cuda:0')
anomaly_id_00_00000086.wav tensor(0.0354, device='cuda:0')
normal_id_00_00000091.wav tensor(0.0146, device='cuda:0')
anomaly_id_02_00000128.wav tensor(0.0279, device='cuda:0')
anomaly_id_02_00000105.wav tensor(0.0170, device='cuda:0')
anomaly_id_00_00000336.wav tensor(0.0384, device='cuda:0')
anomaly_id_02_00000033.wav tensor(0.0227, device='cuda:0')
