In [None]:
import torch
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.io import arff
from io import BytesIO

## Preprocess Datasets - Adolescent
### Read data file

In [None]:
# Read file
data = arff.loadarff('Autism-Adolescent-Data.arff')
df_adol = pd.DataFrame(data[0])

df_adol = df_adol.drop(['ethnicity', 'contry_of_res', 'age_desc'], axis=1)

for key in df_adol.columns.values.tolist():                
    if (type(df_adol[key][0])==bytes):                     
        df_adol[key] = df_adol[key].str.decode('utf-8')

df_adol['gender'] = df_adol['gender'].map({'m': 0, 'f': 1})
df_adol['jundice'] = df_adol['jundice'].map({'no': 0, 'yes': 1})
df_adol['austim'] = df_adol['austim'].map({'no': 0, 'yes': 1})
df_adol['used_app_before'] = df_adol['used_app_before'].map({'no': 0, 'yes': 1})
df_adol['relation'] = df_adol['relation'].map({'Parent': 0, 'Self': 1, 'Relative': 2, 'Health care professional': 3, 'Others':4, '?': 0})
df_adol['Class/ASD'] = df_adol['Class/ASD'].map({'NO': 0, 'YES': 1})

for key in df_adol.columns.values.tolist():        # loop though all entries           
    if (type(df_adol[key][0]) != int):             # Find non int data
        df_adol[key] = df_adol[key].astype(int) 

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'0',b'0',b'0',b'1',b'1',b'1',b'1',b'1',b'1',b'0',...,b'm',b'Hispanic',b'yes',b'yes',b'Austria',b'no',6.0,b'12-16 years',b'Parent',b'NO'
1,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',...,b'm',b'Black',b'no',b'no',b'Austria',b'no',2.0,b'12-16 years',b'Relative',b'NO'
2,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',...,b'f',b'?',b'no',b'no',b'AmericanSamoa',b'no',2.0,b'12-16 years',b'?',b'NO'
3,b'0',b'1',b'1',b'1',b'1',b'1',b'0',b'1',b'1',b'0',...,b'f',b'White-European',b'no',b'no',b'United Kingdom',b'no',7.0,b'12-16 years',b'Self',b'YES'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'0',b'0',b'0',...,b'f',b'?',b'no',b'no',b'Albania',b'no',7.0,b'12-16 years',b'?',b'YES'


In [None]:
class CustomDataset(Dataset):
    def __init__(self, X, y, max_age=16, normalize=False):
        if normalize:
            if 'age' in X.index:
                X['age'] /= max_age;
            X *= 2
            X -= 1
        self.data = torch.from_numpy(X.to_numpy()).type(torch.float)
        self.label = torch.from_numpy(y.to_numpy()).type(torch.long)
    
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.label[idx]
        return data, label

In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim=17, hidden_dim=100):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [None]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        data, label = next(iter(loader))
        scores = model(data)
        _, preds = scores.max(1)
        num_correct = (preds == label).sum()
        num_samples = preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

def train(model, optimizer, criterion, dataloader, epochs=100, print_every=10):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    for e in range(epochs):
        data, label = next(iter(dataloader))
        model.train() 

        scores = model(data)
        loss = criterion(scores, label)

        # Zero out all of the gradients for the variables which the optimizer
        # will update.
        optimizer.zero_grad()

        # This is the backwards pass: compute the gradient of the loss with
        # respect to each  parameter of the model.
        loss.backward()

        # Actually update the parameters of the model using the gradients
        # computed by the backwards pass.
        optimizer.step()

        if (e+1) % print_every == 0:
            print('Iteration %d, loss = %.4f' % (e, loss.item()))
            check_accuracy(dataloader, model)
            print()

def test_accuracy(model, x, y, max_age=16, normalize=False):
    if normalize:
        if 'age' in x.index:
            x['age'] /= max_age
        x *= 2
        x -= 1
    data = torch.from_numpy(x.to_numpy()).type(torch.float)
    label = torch.from_numpy(y.to_numpy()).type(torch.long)
    scores = model(data)
    _, preds = scores.max(1)
    num_correct = (preds == label).sum()
    num_samples = preds.size(0)
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f) on test set' % (num_correct, num_samples, 100 * acc))

In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']
labels_to_drop = []
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X = X.drop(labels=labels_to_drop, axis=1)

# train-test-split
normalize = True
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

dataset = CustomDataset(X_train, y_train,normalize=normalize)

learning_rate = 0.01
batch_size = 64
epochs = 100
input_dim = len(X_train.columns.values)
hidden_dim = 100

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=epochs)
test_accuracy(model, X_test, y_test, normalize=normalize)
    

Iteration 9, loss = 0.5467
Got 41 / 64 correct (64.06)

Iteration 19, loss = 0.5393
Got 41 / 64 correct (64.06)

Iteration 29, loss = 0.5357
Got 43 / 64 correct (67.19)

Iteration 39, loss = 0.4509
Got 43 / 64 correct (67.19)

Iteration 49, loss = 0.4203
Got 47 / 64 correct (73.44)

Iteration 59, loss = 0.4455
Got 51 / 64 correct (79.69)

Iteration 69, loss = 0.4285
Got 52 / 64 correct (81.25)

Iteration 79, loss = 0.3784
Got 53 / 64 correct (82.81)

Iteration 89, loss = 0.3697
Got 52 / 64 correct (81.25)

Iteration 99, loss = 0.3272
Got 55 / 64 correct (85.94)

Got 28 / 32 correct (87.50) on test set


In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
#['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X_train = X_train.drop(labels=labels_to_drop, axis=1)
dataset = CustomDataset(X_train, y_train,normalize=False)


learning_rate = 0.1
batch_size = 64
num_batchs = 100
input_dim = len(X_train.columns.values)
hidden_dim = 100

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=201, print_every=20)
    

Iteration 0, loss = 0.7307
Got 29 / 64 correct (45.31)

Iteration 20, loss = 0.4462
Got 56 / 64 correct (87.50)

Iteration 40, loss = 0.2382
Got 60 / 64 correct (93.75)

Iteration 60, loss = 0.1254
Got 63 / 64 correct (98.44)

Iteration 80, loss = 0.0706
Got 63 / 64 correct (98.44)

Iteration 100, loss = 0.0555
Got 64 / 64 correct (100.00)

Iteration 120, loss = 0.0398
Got 64 / 64 correct (100.00)

Iteration 140, loss = 0.0276
Got 64 / 64 correct (100.00)

Iteration 160, loss = 0.0228
Got 64 / 64 correct (100.00)

Iteration 180, loss = 0.0166
Got 64 / 64 correct (100.00)

Iteration 200, loss = 0.0140
Got 64 / 64 correct (100.00)



In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation', 'A2_Score', 'A1_Score', 'A7_Score', 'A8_Score']
#['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X_train = X_train.drop(labels=labels_to_drop, axis=1)
dataset = CustomDataset(X_train, y_train,normalize=False)


learning_rate = 0.01
batch_size = 64
epochs = 5000
print_every = epochs/10
input_dim = len(X_train.columns.values)
hidden_dim = 200

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=epochs, print_every=print_every)
    

Iteration 499, loss = 0.2117
Got 58 / 64 correct (90.62)

Iteration 999, loss = 0.1945
Got 59 / 64 correct (92.19)

Iteration 1499, loss = 0.1593
Got 60 / 64 correct (93.75)

Iteration 1999, loss = 0.1454
Got 60 / 64 correct (93.75)

Iteration 2499, loss = 0.1486
Got 60 / 64 correct (93.75)

Iteration 2999, loss = 0.1180
Got 60 / 64 correct (93.75)

Iteration 3499, loss = 0.1449
Got 60 / 64 correct (93.75)

Iteration 3999, loss = 0.1405
Got 60 / 64 correct (93.75)

Iteration 4499, loss = 0.1390
Got 61 / 64 correct (95.31)

Iteration 4999, loss = 0.1318
Got 60 / 64 correct (93.75)

