# Fully connected Auto-Encoder for Tabular Data
In order to compare the classical and quantum methods we will implement several methods including the deep learning method, fully connected auto-encoder. 

In [22]:
# TODO: Make this an automated file, not copying code from "SVM on Iris and arrhythmia Datasets.ipynb"
# Data processing
import pandas as pd
import numpy as np
from collections import Counter
# Visualization
import matplotlib.pyplot as plt

# Data Sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer

#Auto-encoder imports
import torchvision
from torchvision import datasets
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#DataSets that we can test
liver = 'https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data'
arrhythmia = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
vowel = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/vowel/vowel-context.data'
iris = load_iris()
breast_cancer = load_breast_cancer()

name_datasets_csv = {'arrhythmia': arrhythmia}
name_datasets_sklearn = {'iris_data': iris, 'breast_cancer_data': breast_cancer}

In [23]:
vowel_data = pd.read_csv(vowel, index_col=False, delim_whitespace=True, header=None)
vowel_data.head()
# TODO: Should I train on one class and consider the other ones outliers?

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,-3.639,0.418,-0.67,1.779,-0.168,1.627,-0.388,0.529,-0.874,-0.814,0
1,0,0,0,-3.327,0.496,-0.694,1.365,-0.265,1.933,-0.363,0.51,-0.621,-0.488,1
2,0,0,0,-2.12,0.894,-1.576,0.147,-0.707,1.559,-0.579,0.676,-0.809,-0.049,2
3,0,0,0,-2.287,1.809,-1.498,1.012,-1.053,1.06,-0.567,0.235,-0.091,-0.795,3
4,0,0,0,-2.598,1.938,-0.846,1.062,-1.633,0.764,0.394,-0.15,0.277,-0.396,4


In [32]:
normal_data = vowel_data.copy()
anomaly_data = vowel_data.copy()
#isolate vowel 5 and 6
normal_data.iloc[:,-1] = np.where((normal_data.iloc[:, -1] == 5)|(normal_data.iloc[:,-1] == 6), 0, 1)
#normal_data.iloc[:,-1] = np.where((normal_data.iloc[:, -1] == 5), 0, 1)
normal_data = normal_data[normal_data.iloc[:,-1] == 0].to_numpy()


#isolate anomaly class
anomaly_data.iloc[:,-1] = np.where((anomaly_data.iloc[:, -1] == 10),1,0)
anomaly_data = anomaly_data[anomaly_data.iloc[:,-1] == 1].to_numpy()

In [33]:
n_zeros = normal_data.shape[0]
train_size = int(n_zeros * 0.9)
test_size = n_zeros - train_size
#select only collumn 6
train_data,test_data = torch.utils.data.random_split(normal_data[:,:], [train_size, test_size], generator=torch.Generator().manual_seed(42))
test_data = np.concatenate([test_data, anomaly_data[:,:]])

In [34]:
# Settings the dtype the first time didn't work, don't ask me why, only god knows :)
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)

In [35]:
train_data.shape, test_data.shape

((162, 14), (108, 14))

In [36]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)
# TODO: Does it make sense to include both 0 and 1 target values at training?

In [37]:
# Define the autoencoder class
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size//4)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size//4, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size)
        )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Set up the autoencoder model
input_size = 13  # The size of the input data
hidden_size = 12  # The number of neurons in the hidden layer
autoencoder = AutoEncoder(input_size, hidden_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.005)

# Train the autoencoder
num_epochs = 50

for epoch in range(num_epochs):
    for batch in train_loader:

        X = batch[:, :-1] # drop the label

        # Forward pass
        output = autoencoder(X)
        loss = criterion(output, X)
        
        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        
    # Print the loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/50], Loss: 5.6910
Epoch [2/50], Loss: 5.6006
Epoch [3/50], Loss: 5.4805
Epoch [4/50], Loss: 5.3120
Epoch [5/50], Loss: 5.0876
Epoch [6/50], Loss: 4.7917
Epoch [7/50], Loss: 4.4129
Epoch [8/50], Loss: 3.9353
Epoch [9/50], Loss: 3.3635
Epoch [10/50], Loss: 2.7376
Epoch [11/50], Loss: 2.1389
Epoch [12/50], Loss: 1.5727
Epoch [13/50], Loss: 1.1133
Epoch [14/50], Loss: 0.8746
Epoch [15/50], Loss: 0.8036
Epoch [16/50], Loss: 0.7930
Epoch [17/50], Loss: 0.7565
Epoch [18/50], Loss: 0.6963
Epoch [19/50], Loss: 0.6329
Epoch [20/50], Loss: 0.5848
Epoch [21/50], Loss: 0.5579
Epoch [22/50], Loss: 0.5439
Epoch [23/50], Loss: 0.5385
Epoch [24/50], Loss: 0.5322
Epoch [25/50], Loss: 0.5247
Epoch [26/50], Loss: 0.5157
Epoch [27/50], Loss: 0.5066
Epoch [28/50], Loss: 0.4974
Epoch [29/50], Loss: 0.4888
Epoch [30/50], Loss: 0.4807
Epoch [31/50], Loss: 0.4732
Epoch [32/50], Loss: 0.4660
Epoch [33/50], Loss: 0.4595
Epoch [34/50], Loss: 0.4534
Epoch [35/50], Loss: 0.4474
Epoch [36/50], Loss: 0.4414
E

In [38]:
for batch in test_loader:
    X, y = batch[:, :-1], batch[:, -1]
    output = autoencoder(X)
    diffs = np.sum((X - output).detach().numpy()**2, axis=1)
    print(diffs, y)
    print("This is the result for batch:",np.mean(diffs[y == 0]), np.mean(diffs[y == 1])) 

[ 4.300988    1.1255577   5.6136627   1.4237684   6.2069845  11.279987
  2.9297712   4.823722    3.0409627   1.2942362   1.735055    4.3141537
  2.7314155   1.5265138   1.9763727   3.630903    3.983611    1.5947287
 10.167311    9.919837    9.840106    9.863388    8.99854     8.9606695
  4.769708    4.7922616   5.9377804   6.152851    5.539493    5.460432
  5.454151    4.4658318   4.1031313   4.0999255   4.312312    4.1760793
  0.8690279   0.82465017  0.9352773   1.014418    1.0985583   1.1871036
  4.198247    4.2605658   4.254881    4.2568035   4.253447    3.7785842
  3.6562867   3.4218397   3.114182    3.1306355   3.1710644   3.1230412
  3.3991344   3.6826997   4.1798835   3.9953818   3.759509    3.5422862
  5.2707734   5.5920625   5.4371395   5.334218  ] tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [3]:
ANOMALY_CLASS = 10
vowel_data.iloc[:, -1] = np.select([vowel_data.iloc[:, -1] != ANOMALY_CLASS, vowel_data.iloc[:, -1] == ANOMALY_CLASS], [0, 1])
vowel_data = vowel_data.values

n_zeros = sum(vowel_data[:, -1] == 0)
train_size = int(n_zeros * 0.9)
test_size = n_zeros - train_size
train_data, test_data = torch.utils.data.random_split(vowel_data[vowel_data[:, -1] == 0], [train_size, test_size], generator=torch.Generator().manual_seed(42))
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)
test_data = np.concatenate([test_data, vowel_data[vowel_data[:, -1] == 1]])



# Settings the dtype the first time didn't work, don't ask me why, only god knows :)
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)



train_loader = torch.utils.data.DataLoader(train_data, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)
# TODO: Does it make sense to include both 0 and 1 target values at training?

# Define the autoencoder class
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size//4)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size//4, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size)
        )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Set up the autoencoder model
input_size = 13  # The size of the input data
hidden_size = 12  # The number of neurons in the hidden layer
autoencoder = AutoEncoder(input_size, hidden_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.005)

# Train the autoencoder
num_epochs = 50

for epoch in range(num_epochs):
    for batch in train_loader:

        X = batch[:, :-1] # drop the label

        # Forward pass
        output = autoencoder(X)
        loss = criterion(output, X)
        
        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        
    # Print the loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

for batch in test_loader:
    X, y = batch[:, :-1], batch[:, -1]
    output = autoencoder(X)
    diffs = np.sum((X - output).detach().numpy()**2, axis=1)
    print(diffs, y)
    print("This is the result for batch:",np.mean(diffs[y == 0]), np.mean(diffs[y == 1])) 


Epoch [1/50], Loss: 6.4804
Epoch [2/50], Loss: 5.4681
Epoch [3/50], Loss: 2.6730
Epoch [4/50], Loss: 0.9643
Epoch [5/50], Loss: 0.7098
Epoch [6/50], Loss: 0.6367
Epoch [7/50], Loss: 0.6067
Epoch [8/50], Loss: 0.5713
Epoch [9/50], Loss: 0.5542
Epoch [10/50], Loss: 0.5481
Epoch [11/50], Loss: 0.5383
Epoch [12/50], Loss: 0.5311
Epoch [13/50], Loss: 0.5212
Epoch [14/50], Loss: 0.5085
Epoch [15/50], Loss: 0.4677
Epoch [16/50], Loss: 0.4496
Epoch [17/50], Loss: 0.4262
Epoch [18/50], Loss: 0.4177
Epoch [19/50], Loss: 0.4162
Epoch [20/50], Loss: 0.4105
Epoch [21/50], Loss: 0.4004
Epoch [22/50], Loss: 0.3858
Epoch [23/50], Loss: 0.3584
Epoch [24/50], Loss: 0.3365
Epoch [25/50], Loss: 0.3183
Epoch [26/50], Loss: 0.3039
Epoch [27/50], Loss: 0.2909
Epoch [28/50], Loss: 0.2821
Epoch [29/50], Loss: 0.2778
Epoch [30/50], Loss: 0.2720
Epoch [31/50], Loss: 0.2682
Epoch [32/50], Loss: 0.2650
Epoch [33/50], Loss: 0.2627
Epoch [34/50], Loss: 0.2610
Epoch [35/50], Loss: 0.2590
Epoch [36/50], Loss: 0.2573
E

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
