In [1]:
import torch
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.io import arff
from io import BytesIO

## Preprocess Datasets - Adolescent
### Read data file

In [2]:
# Read file
data = arff.loadarff('Autism-Adolescent-Data.arff')
df_adol = pd.DataFrame(data[0])

df_adol = df_adol.drop(['ethnicity', 'contry_of_res', 'age_desc'], axis=1)

for key in df_adol.columns.values.tolist():                
    if (type(df_adol[key][0])==bytes):                     
        df_adol[key] = df_adol[key].str.decode('utf-8')

df_adol['gender'] = df_adol['gender'].map({'m': 0, 'f': 1})
df_adol['jundice'] = df_adol['jundice'].map({'no': 0, 'yes': 1})
df_adol['austim'] = df_adol['austim'].map({'no': 0, 'yes': 1})
df_adol['used_app_before'] = df_adol['used_app_before'].map({'no': 0, 'yes': 1})
df_adol['relation'] = df_adol['relation'].map({'Parent': 0, 'Self': 1, 'Relative': 2, 'Health care professional': 3, 'Others':4, '?': 0})
df_adol['Class/ASD'] = df_adol['Class/ASD'].map({'NO': 0, 'YES': 1})

for key in df_adol.columns.values.tolist():        # loop though all entries           
    if (type(df_adol[key][0]) != int):             # Find non int data
        df_adol[key] = df_adol[key].astype(int) 
df_adol.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,used_app_before,result,relation,Class/ASD
0,0,0,0,1,1,1,1,1,1,0,15,0,1,1,0,6,0,0
1,0,0,0,0,0,0,0,0,1,1,15,0,0,0,0,2,2,0
2,0,0,0,0,0,0,0,0,1,1,12,1,0,0,0,2,0,0
3,0,1,1,1,1,1,0,1,1,0,14,1,0,0,0,7,1,1
4,1,1,1,1,1,1,1,0,0,0,16,1,0,0,0,7,0,1


In [3]:
data = arff.loadarff('Autism-Child-Data.arff')
df_child = pd.DataFrame(data[0])
for key in df_child.columns.tolist():
    if(type(df_child[key][0]) == bytes):
        df_child[key] = df_child[key].str.decode('utf-8')
df_child = df_child.drop(['ethnicity','contry_of_res', 'age_desc'], axis=1)
df_child['gender'] = df_child['gender'].map({'m': 0, 'f': 1})
df_child['jundice'] = df_child['jundice'].map({'no': 0, 'yes': 1})
df_child['austim'] = df_child['austim'].map({'no': 0, 'yes': 1})
df_child['used_app_before'] = df_child['used_app_before'].map({'no': 0, 'yes': 1})
df_child['relation'] = df_child['relation'].map({'Parent': 0, 'Self': 1, 'self': 1, 'Relative': 2, 'Health care professional': 3})
df_child['Class/ASD'] = df_child['Class/ASD'].map({'NO': 0, 'YES': 1})
df_child['relation'].fillna(value = 0, inplace=True)
df_child['age'].fillna(round(df_child['age'].mean()), inplace=True)
for key in df_child.columns.tolist():
    if(type(df_child[key][0]) != int):
        df_child[key] = df_child[key].astype(int)
df_child.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,used_app_before,result,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6,0,0,0,0,5,0,0
1,1,1,0,0,1,1,0,1,0,0,6,0,0,0,0,5,0,0
2,1,1,0,0,0,1,1,1,0,0,6,0,0,0,1,5,0,0
3,0,1,0,0,1,1,0,0,0,1,5,1,1,0,0,4,0,0
4,1,1,1,1,1,1,1,1,1,1,5,0,1,0,0,10,0,1


In [4]:
df_all = df_child.append(df_adol)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 396 entries, 0 to 103
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   A1_Score         396 non-null    int32
 1   A2_Score         396 non-null    int32
 2   A3_Score         396 non-null    int32
 3   A4_Score         396 non-null    int32
 4   A5_Score         396 non-null    int32
 5   A6_Score         396 non-null    int32
 6   A7_Score         396 non-null    int32
 7   A8_Score         396 non-null    int32
 8   A9_Score         396 non-null    int32
 9   A10_Score        396 non-null    int32
 10  age              396 non-null    int32
 11  gender           396 non-null    int32
 12  jundice          396 non-null    int32
 13  austim           396 non-null    int32
 14  used_app_before  396 non-null    int32
 15  result           396 non-null    int32
 16  relation         396 non-null    int32
 17  Class/ASD        396 non-null    int32
dtypes: int32(1

  df_all = df_child.append(df_adol)


In [14]:
df_norm = df_all.copy().drop(['Class/ASD'], axis=1)
for key in df_all.columns.values.tolist():
    if np.any(df_all[key].to_numpy()>1):
        min = np.min(df_all[key])
        max = np.max(df_all[key])
        df_norm[key] = (df_all[key]-min)/(max-min)
df_norm *= 2
df_norm -= 1
df_norm.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,used_app_before,result,relation
0,1,1,-1,-1,1,1,-1,1,-1,-1,-0.666667,-1,-1,-1,-1,0.0,-1.0
1,1,1,-1,-1,1,1,-1,1,-1,-1,-0.666667,-1,-1,-1,-1,0.0,-1.0
2,1,1,-1,-1,-1,1,1,1,-1,-1,-0.666667,-1,-1,-1,1,0.0,-1.0
3,-1,1,-1,-1,1,1,-1,-1,-1,1,-0.833333,1,1,-1,-1,-0.2,-1.0
4,1,1,1,1,1,1,1,1,1,1,-0.833333,-1,1,-1,-1,1.0,-1.0


In [15]:
class CustomDataset(Dataset):
    def __init__(self, X, y, max_age=16, normalize=False):
        if normalize:
            if 'age' in X.index:
                X['age'] /= max_age;
            X *= 2
            X -= 1
        self.data = torch.from_numpy(X.to_numpy()).type(torch.float)
        self.label = torch.from_numpy(y.to_numpy()).type(torch.long)
    
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.label[idx]
        return data, label

In [16]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim=17, hidden_dim=100):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [17]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        data, label = next(iter(loader))
        scores = model(data)
        _, preds = scores.max(1)
        num_correct = (preds == label).sum()
        num_samples = preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

def train(model, optimizer, criterion, dataloader, epochs=100, print_every=10):
    """
    Train a model on CIFAR-10 using the PyTorch Module API.
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    for e in range(epochs):
        data, label = next(iter(dataloader))
        model.train() 

        scores = model(data)
        loss = criterion(scores, label)

        # Zero out all of the gradients for the variables which the optimizer
        # will update.
        optimizer.zero_grad()

        # This is the backwards pass: compute the gradient of the loss with
        # respect to each  parameter of the model.
        loss.backward()

        # Actually update the parameters of the model using the gradients
        # computed by the backwards pass.
        optimizer.step()

        if (e+1) % print_every == 0:
            print('Iteration %d, loss = %.4f' % (e, loss.item()))
            check_accuracy(dataloader, model)
            print()

def test_accuracy(model, x, y, max_age=16, normalize=False):
    if normalize:
        if 'age' in x.index:
            x['age'] /= max_age
        x *= 2
        x -= 1
    data = torch.from_numpy(x.to_numpy()).type(torch.float)
    label = torch.from_numpy(y.to_numpy()).type(torch.long)
    scores = model(data)
    _, preds = scores.max(1)
    num_correct = (preds == label).sum()
    num_samples = preds.size(0)
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f) on test set' % (num_correct, num_samples, 100 * acc))

In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']
labels_to_drop = []
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X = X.drop(labels=labels_to_drop, axis=1)

# train-test-split
normalize = True
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

dataset = CustomDataset(X_train, y_train,normalize=normalize)

learning_rate = 0.01
batch_size = 64
epochs = 100
input_dim = len(X_train.columns.values)
hidden_dim = 100

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=epochs)
test_accuracy(model, X_test, y_test, normalize=normalize)
    

Iteration 9, loss = 0.5467
Got 41 / 64 correct (64.06)

Iteration 19, loss = 0.5393
Got 41 / 64 correct (64.06)

Iteration 29, loss = 0.5357
Got 43 / 64 correct (67.19)

Iteration 39, loss = 0.4509
Got 43 / 64 correct (67.19)

Iteration 49, loss = 0.4203
Got 47 / 64 correct (73.44)

Iteration 59, loss = 0.4455
Got 51 / 64 correct (79.69)

Iteration 69, loss = 0.4285
Got 52 / 64 correct (81.25)

Iteration 79, loss = 0.3784
Got 53 / 64 correct (82.81)

Iteration 89, loss = 0.3697
Got 52 / 64 correct (81.25)

Iteration 99, loss = 0.3272
Got 55 / 64 correct (85.94)

Got 28 / 32 correct (87.50) on test set


In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
#['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X_train = X_train.drop(labels=labels_to_drop, axis=1)
dataset = CustomDataset(X_train, y_train,normalize=False)


learning_rate = 0.1
batch_size = 64
num_batchs = 100
input_dim = len(X_train.columns.values)
hidden_dim = 100

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=201, print_every=20)
    

Iteration 0, loss = 0.7307
Got 29 / 64 correct (45.31)

Iteration 20, loss = 0.4462
Got 56 / 64 correct (87.50)

Iteration 40, loss = 0.2382
Got 60 / 64 correct (93.75)

Iteration 60, loss = 0.1254
Got 63 / 64 correct (98.44)

Iteration 80, loss = 0.0706
Got 63 / 64 correct (98.44)

Iteration 100, loss = 0.0555
Got 64 / 64 correct (100.00)

Iteration 120, loss = 0.0398
Got 64 / 64 correct (100.00)

Iteration 140, loss = 0.0276
Got 64 / 64 correct (100.00)

Iteration 160, loss = 0.0228
Got 64 / 64 correct (100.00)

Iteration 180, loss = 0.0166
Got 64 / 64 correct (100.00)

Iteration 200, loss = 0.0140
Got 64 / 64 correct (100.00)



In [None]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_adol.copy().drop(['Class/ASD'], axis=1)
y = df_adol.copy()['Class/ASD']

# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation', 'A2_Score', 'A1_Score', 'A7_Score', 'A8_Score']
#['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X_train = X_train.drop(labels=labels_to_drop, axis=1)
dataset = CustomDataset(X_train, y_train,normalize=False)


learning_rate = 0.01
batch_size = 64
epochs = 5000
print_every = epochs/10
input_dim = len(X_train.columns.values)
hidden_dim = 200

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=epochs, print_every=print_every)
    

Iteration 499, loss = 0.2117
Got 58 / 64 correct (90.62)

Iteration 999, loss = 0.1945
Got 59 / 64 correct (92.19)

Iteration 1499, loss = 0.1593
Got 60 / 64 correct (93.75)

Iteration 1999, loss = 0.1454
Got 60 / 64 correct (93.75)

Iteration 2499, loss = 0.1486
Got 60 / 64 correct (93.75)

Iteration 2999, loss = 0.1180
Got 60 / 64 correct (93.75)

Iteration 3499, loss = 0.1449
Got 60 / 64 correct (93.75)

Iteration 3999, loss = 0.1405
Got 60 / 64 correct (93.75)

Iteration 4499, loss = 0.1390
Got 61 / 64 correct (95.31)

Iteration 4999, loss = 0.1318
Got 60 / 64 correct (93.75)



In [44]:
# train_data = torch.from_numpy(X_train)
# test_data = torch.from_numpy(X_test)
# Split to data and labels
X = df_norm.copy()
y = df_all.copy()['Class/ASD']
labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation', 'A1_Score']#, 'A2_Score']#, 'A9_Score', 'A7_Score']
# ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
X = X.drop(labels=labels_to_drop, axis=1)

# train-test-split
normalize = False
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dataset = CustomDataset(X_train, y_train, normalize=normalize)
# test_dataset = CustomDataset(X_test, y_test, normalize=normalize)

learning_rate = 0.0004
batch_size = 64
epochs = 10000
input_dim = len(X_train.columns.values)
hidden_dim = 200

model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)

train(model, optimizer, criterion, train_dataloader, epochs=epochs)
test_accuracy(model, X_test, y_test, normalize=normalize)
    

Iteration 9, loss = 0.6942
Got 30 / 64 correct (46.88)

Iteration 19, loss = 0.6949
Got 39 / 64 correct (60.94)

Iteration 29, loss = 0.7055
Got 28 / 64 correct (43.75)

Iteration 39, loss = 0.6902
Got 35 / 64 correct (54.69)

Iteration 49, loss = 0.6905
Got 32 / 64 correct (50.00)

Iteration 59, loss = 0.6761
Got 31 / 64 correct (48.44)

Iteration 69, loss = 0.6981
Got 31 / 64 correct (48.44)

Iteration 79, loss = 0.6981
Got 24 / 64 correct (37.50)

Iteration 89, loss = 0.6812
Got 33 / 64 correct (51.56)

Iteration 99, loss = 0.6916
Got 39 / 64 correct (60.94)

Iteration 109, loss = 0.7147
Got 34 / 64 correct (53.12)

Iteration 119, loss = 0.6807
Got 31 / 64 correct (48.44)

Iteration 129, loss = 0.6846
Got 34 / 64 correct (53.12)

Iteration 139, loss = 0.6947
Got 34 / 64 correct (53.12)

Iteration 149, loss = 0.6813
Got 32 / 64 correct (50.00)

Iteration 159, loss = 0.6902
Got 34 / 64 correct (53.12)

Iteration 169, loss = 0.6819
Got 39 / 64 correct (60.94)

Iteration 179, loss = 0.6