# Fully connected Auto-Encoder for Tabular Data
In order to compare the classical and quantum methods we will implement several methods including the deep learning method, fully connected auto-encoder. 

In [12]:
# TODO: Make this an automated file, not copying code from "SVM on Iris and arrhythmia Datasets.ipynb"
# Data processing
import pandas as pd
import numpy as np
from collections import Counter
# Visualization
import matplotlib.pyplot as plt

# Data Sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer

#Auto-encoder imports
import torchvision
from torchvision import datasets
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#DataSets that we can test
liver = 'https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data'
arrhythmia = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
vowel = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/vowel/vowel-context.data'
iris = load_iris()
breast_cancer = load_breast_cancer()

name_datasets_csv = {'arrhythmia': arrhythmia}
name_datasets_sklearn = {'iris_data': iris, 'breast_cancer_data': breast_cancer}

In [13]:
vowel_data = pd.read_csv(vowel, index_col=False, delim_whitespace=True, header=None)
vowel_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,0,-3.639,0.418,-0.67,1.779,-0.168,1.627,-0.388,0.529,-0.874,-0.814,0
1,0,0,0,-3.327,0.496,-0.694,1.365,-0.265,1.933,-0.363,0.51,-0.621,-0.488,1
2,0,0,0,-2.12,0.894,-1.576,0.147,-0.707,1.559,-0.579,0.676,-0.809,-0.049,2
3,0,0,0,-2.287,1.809,-1.498,1.012,-1.053,1.06,-0.567,0.235,-0.091,-0.795,3
4,0,0,0,-2.598,1.938,-0.846,1.062,-1.633,0.764,0.394,-0.15,0.277,-0.396,4


In [14]:
ANOMALY_CLASS = 10
vowel_data.iloc[:, -1] = np.select([vowel_data.iloc[:, -1] != ANOMALY_CLASS, vowel_data.iloc[:, -1] == ANOMALY_CLASS], [0, 1])
vowel_data = vowel_data.values

In [15]:
n_zeros = sum(vowel_data[:, -1] == 0)
train_size = int(n_zeros * 0.9)
test_size = n_zeros - train_size
train_data, test_data = torch.utils.data.random_split(vowel_data[vowel_data[:, -1] == 0], [train_size, test_size], generator=torch.Generator().manual_seed(42))
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)
test_data = np.concatenate([test_data, vowel_data[vowel_data[:, -1] == 1]])

In [16]:
# Settings the dtype the first time didn't work, don't ask me why, only god knows :)
train_data = np.array(train_data, dtype=np.float32)
test_data = np.array(test_data, dtype=np.float32)

In [17]:
train_data.shape, test_data.shape

((810, 14), (180, 14))

In [18]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)
# TODO: Does it make sense to include both 0 and 1 target values at training?

In [36]:
# Define the autoencoder class
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size//4)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size//4, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size)
        )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Set up the autoencoder model
input_size = 13  # The size of the input data
hidden_size = 6  # The number of neurons in the hidden layer
autoencoder = AutoEncoder(input_size, hidden_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.005)

# Train the autoencoder
num_epochs = 50

for epoch in range(num_epochs):
    for batch in train_loader:

        X = batch[:, :-1] # drop the label

        # Forward pass
        output = autoencoder(X)
        loss = criterion(output, X)
        
        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        
    # Print the loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/50], Loss: 6.0896
Epoch [2/50], Loss: 5.7114
Epoch [3/50], Loss: 5.0051
Epoch [4/50], Loss: 3.7868
Epoch [5/50], Loss: 1.9854
Epoch [6/50], Loss: 0.8134
Epoch [7/50], Loss: 0.6837
Epoch [8/50], Loss: 0.6212
Epoch [9/50], Loss: 0.5679
Epoch [10/50], Loss: 0.5491
Epoch [11/50], Loss: 0.5422
Epoch [12/50], Loss: 0.5302
Epoch [13/50], Loss: 0.5221
Epoch [14/50], Loss: 0.5144
Epoch [15/50], Loss: 0.5062
Epoch [16/50], Loss: 0.4993
Epoch [17/50], Loss: 0.4927
Epoch [18/50], Loss: 0.4865
Epoch [19/50], Loss: 0.4806
Epoch [20/50], Loss: 0.4751
Epoch [21/50], Loss: 0.4701
Epoch [22/50], Loss: 0.4653
Epoch [23/50], Loss: 0.4609
Epoch [24/50], Loss: 0.4567
Epoch [25/50], Loss: 0.4529
Epoch [26/50], Loss: 0.4494
Epoch [27/50], Loss: 0.4463
Epoch [28/50], Loss: 0.4436
Epoch [29/50], Loss: 0.4411
Epoch [30/50], Loss: 0.4388
Epoch [31/50], Loss: 0.4369
Epoch [32/50], Loss: 0.4353
Epoch [33/50], Loss: 0.4338
Epoch [34/50], Loss: 0.4326
Epoch [35/50], Loss: 0.4314
Epoch [36/50], Loss: 0.4305
E

In [37]:
for batch in test_loader:
    X, y = batch[:, :-1], batch[:, -1]
    output = autoencoder(X)
    diffs = np.sum((X - output).detach().numpy()**2, axis=1)
    print(diffs, y)

[ 4.9841685  2.9071488  1.224896   2.648544   8.325684   3.01578
  3.7916331  3.0987551  5.7304134 18.538197  12.983862   2.537234
  2.4113963  3.7200096  3.7787695  5.8969417 13.383757   1.5270464
  6.721251   9.421338   2.4836333  3.0538905  4.252852   5.242676
  5.4035854  2.784776  11.937885   4.368555   3.2112134  8.675455
  2.9345293  9.867014   9.1859665 18.34227    5.290482   3.4943962
  2.3236642  9.955638   4.0791273  4.542345  14.605998  14.166662
  2.7430813  9.911175   3.2271683  2.9139569  3.2237694  3.466642
  2.790471   6.8004327  7.4699945  6.225895   3.8466902  4.0382376
  4.5171185  5.708927   7.451555   1.9712598  6.708394   4.879606
  3.5643928  6.6052294  5.207177   6.3115172] tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
[ 2.4910655