In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.io import arff
from io import BytesIO

## Preprocess Datasets - Adolescent
### 1. Read data file

In [None]:
# Read file
# loadarff returns a (records, metadata) pair; only the records (element 0)
# are turned into the DataFrame.
data = arff.loadarff('Autism-Adolescent-Data.arff')
df_adol = pd.DataFrame(data[0])
df_adol.head()

In [None]:
# Check data type of each feature in the dataset
# (info() also lists the non-null count per column)
df_adol.info()

### 2. Decode

In [None]:
# Collect every column whose first entry is a bytes object, then decode those
# columns to text in place.
bytes_columns = [
    key for key in df_adol.columns.values.tolist()
    if type(df_adol[key][0]) == bytes
]
for key in bytes_columns:
    df_adol[key] = df_adol[key].str.decode('utf-8')
df_adol.head()   # check the decoded values

### 3. Visualization

In [None]:
# Display the number of Non-ASD/ASD adolescents per age and per gender.
CrosstabResult_age = pd.crosstab(index=df_adol['age'], columns=df_adol['Class/ASD'], rownames=['Age'])
CrosstabResult_gender = pd.crosstab(index=df_adol['gender'], columns=df_adol['Class/ASD'], rownames=['Gender'])


def annotate_bar_counts(axis, space=1):
    """Write the height of every bar on `axis` just above that bar.

    axis  -- matplotlib Axes whose bar patches are labelled
    space -- vertical gap, in points, between the bar top and its label
    """
    for bar in axis.patches:
        y_value = bar.get_height()
        x_value = bar.get_x() + bar.get_width() / 2
        axis.annotate(format(y_value), (x_value, y_value), xytext=(0, space),
                      textcoords="offset points", ha='center', va='bottom')


fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: class counts per age.
CrosstabResult_age.plot.bar(ax=ax[0])
ax[0].set_ylabel("Counts")
ax[0].set_title("Non-ASD/ASD Adolescents in Different Age Groups")
# Take the tick labels from the crosstab's own row index so they always line up
# with the plotted bars (and never touch rows that were excluded from the table).
age_labels = CrosstabResult_age.index.astype(int)
ax[0].set_xticks(range(len(age_labels)))
ax[0].set_xticklabels(age_labels, rotation=0)
ax[0].set_yticks([0, 5, 10, 15, 20])
ax[0].set_yticklabels([0, 5, 10, 15, 20])
annotate_bar_counts(ax[0])

# Right panel: class counts per gender.
CrosstabResult_gender.plot.bar(ax=ax[1])
ax[1].set_ylabel("Counts")
ax[1].set_title("Non-ASD/ASD Adolescents in Different Gender Groups")
# Translate the raw codes found in the index instead of assuming their order.
gender_names = {'f': 'Female', 'm': 'Male'}
gender_labels = [gender_names.get(code, code) for code in CrosstabResult_gender.index]
ax[1].set_xticks(range(len(gender_labels)))
ax[1].set_xticklabels(gender_labels, rotation=0)
annotate_bar_counts(ax[1])

plt.show()

### 4. Remove the columns we are not interested in

In [None]:
# Discard the columns that are not of interest for the rest of the notebook.
unused_columns = ['ethnicity', 'contry_of_res', 'age_desc']
df_adol = df_adol.drop(columns=unused_columns)
df_adol.head()

### 5. Label Encoding (map categories to integer codes)

In [None]:
# List the distinct values of every column, to see which ones need encoding.
for column_name in df_adol.columns.values.tolist():
    distinct_values = df_adol[column_name].unique()
    print(column_name + " has value: ", distinct_values)

In [None]:
# Integer code for every categorical value, per column.
# Values missing from a column's table become NaN after .map().
# Note that '?' in 'relation' receives code 0, the same code as 'Parent'.
category_codes = {
    'gender': {'m': 0, 'f': 1},
    'jundice': {'no': 0, 'yes': 1},
    'austim': {'no': 0, 'yes': 1},
    'used_app_before': {'no': 0, 'yes': 1},
    'relation': {'Parent': 0, 'Self': 1, 'Relative': 2, 'Health care professional': 3, 'Others':4, '?': 0},
    'Class/ASD': {'NO': 0, 'YES': 1},
}
for column_name, codes in category_codes.items():
    df_adol[column_name] = df_adol[column_name].map(codes)
df_adol.head()

### 6. Handle missing values

In [None]:
df_adol[df_adol.isna().sum(axis=1) > 0] # Show the rows that contain at least one missing value

In [None]:
# Some columns still hold strings or floats rather than integers; convert them here.
# The decision is made on each column's dtype: comparing type(value) with the
# builtin `int` never matches NumPy integer scalars, so it cannot tell which
# columns are already integer.
non_integer_columns = [
    key for key in df_adol.columns
    if not pd.api.types.is_integer_dtype(df_adol[key])
]
# astype(int) raises if a column still contains NaN, so missing values must be
# resolved before this cell runs.
df_adol = df_adol.astype({key: int for key in non_integer_columns})

# Check dtype again
df_adol.info()

### 7. Check the size and balance of processed dataset

In [None]:
# Class balance: count the rows carrying each label value (0 = Non-ASD, 1 = ASD).
class_labels = df_adol['Class/ASD']
non_asd_count = len(df_adol[class_labels == 0])
asd_count = len(df_adol[class_labels == 1])
print("Number of Non_ASD samples in the dataset: {}".format(non_asd_count))
print("Number of ASD samples in the dataset: {}".format(asd_count))

## Build Random Forest Model

### 1. Split the data into training and testing set

In [None]:
# Split into features and labels.
# drop() already returns a new frame, so copying the whole frame first is unnecessary.
X = df_adol.drop(columns=['Class/ASD'])
y = df_adol['Class/ASD'].copy()

# train-test-split
# random_state fixes the shuffle so a re-run reproduces the same split;
# stratify=y keeps the ASD / Non-ASD ratio the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Size of Training Set: ", len(X_train))
print("Size of Testing Set: ", len(X_test))


### 2. Train the model

In [None]:
# Create a random forest model
# random_state fixes the forest's internal randomness so re-runs build the same model.
model = RandomForestClassifier(n_estimators=500, random_state=42)

In [None]:
# Train
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

## Evaluate RF Model

### 1. Accuracy

In [None]:
# Predict the held-out split and report the fraction of correct predictions.
y_pred = model.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print("Accuracy on testing set:", test_accuracy)

In [None]:
# Sanity check: score the model's training-set predictions against random labels.
# Random 0/1 labels carry no signal, so the accuracy should come out near 0.5.
rng = np.random.default_rng(42)                    # seeded so the check is reproducible
y_fake = rng.choice([0, 1], size=y_train.shape)    # one random label per training sample
# accuracy_score expects (y_true, y_pred): the fake labels play the role of the truth.
print("Accuracy with fake results:", sklearn.metrics.accuracy_score(y_fake, model.predict(X_train)))

### 2. F1-Score

In [None]:
# F1 score on the test split; with the default settings label 1 (ASD) is the positive class.
f1 = sklearn.metrics.f1_score(y_test, y_pred)
print('F1 Score: ', f1)

### 3. Confusion Matrix

In [None]:
# Confusion matrix on the test split.
# labels=[0, 1] pins the matrix to 2x2 (No ASD, ASD) even when one class does not
# occur in y_test / y_pred; a smaller matrix would not match the two display labels.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No ASD', 'ASD'])
disp.plot()
plt.show()

### 4. Feature Ranking

In [None]:
importance = model.feature_importances_
# Feature positions ordered from most to least important.
# list.reverse has to be *called*; a bare `index.reverse` only names the method
# and leaves the list in ascending order.
index = np.argsort(importance).tolist()
index.reverse()
# Spread of every feature's importance across the individual trees,
# arranged in the same order as `index`.
std = np.std([tree.feature_importances_[index] for tree in model.estimators_], axis=0)
for idx in index:
    print(X_train.columns[idx], ": ", importance[idx])

In [None]:
# Horizontal bar chart of the feature importances, in the order given by `index`.
ranked_features = X_train.columns[index]
ranked_importance = model.feature_importances_[index]
plt.barh(ranked_features, ranked_importance, color=['green'])
# barh draws its first entry at the bottom; inverting the y-axis moves it to the top.
plt.gca().invert_yaxis()
plt.xlabel("RF Feature Importance")
plt.ylabel("Feature Name")


## Train Models with limited features

### 1. Use only "result" to predict

In [None]:
# Drop the ten question scores and the background attributes before training.
labels_to_drop = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 
'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
n_estimator = 100
X_drop = X.drop(labels=labels_to_drop, axis=1)
# random_state makes the split and the forest reproducible; stratify=y keeps the
# class ratio the same in the training and testing subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_drop, y, test_size=0.3, random_state=42, stratify=y
)
model = RandomForestClassifier(n_estimators=n_estimator, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Dropped features: ", labels_to_drop)
print("Used features", X_drop.columns.values.tolist())
print("Accuracy:",sklearn.metrics.accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the most recent model on its test split.
# labels=[0, 1] pins the matrix to 2x2 (No ASD, ASD) even when one class does not
# occur in y_test / y_pred; a smaller matrix would not match the two display labels.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No ASD', 'ASD'])
disp.plot()
plt.show()

### 2. Use the 10 questions A1 to A10 to predict

In [None]:
# Drop the aggregate 'result' column and the background attributes before training.
labels_to_drop = ['result', 'austim', 'used_app_before', 'age', 'gender', 'jundice', 'relation']
n_estimator = 500
X_drop = X.drop(labels=labels_to_drop, axis=1)
# random_state makes the split and the forest reproducible; stratify=y keeps the
# class ratio the same in the training and testing subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_drop, y, test_size=0.2, random_state=42, stratify=y
)
model = RandomForestClassifier(n_estimators=n_estimator, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Dropped features: ", labels_to_drop)
print("Used features", X_drop.columns.values.tolist())
print("Accuracy:",sklearn.metrics.accuracy_score(y_test, y_pred))

importance = model.feature_importances_
# Descending order combined with the inverted y-axis places the most important
# feature at the top of the chart.
index = np.argsort(importance)[::-1].tolist()
fig, ax = plt.subplots()
# Draw on the axes object directly so `ax` keeps referring to the axes.
ax.barh(X_train.columns[index], importance[index], color="green")
ax.invert_yaxis()
ax.set_xlabel("RF Feature Importance")
ax.set_ylabel("Feature Name")
plt.show()

In [None]:
# Confusion matrix of the most recent model on its test split.
# labels=[0, 1] pins the matrix to 2x2 (No ASD, ASD) even when one class does not
# occur in y_test / y_pred; a smaller matrix would not match the two display labels.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No ASD', 'ASD'])
disp.plot()
plt.show()

### 3. Use features unrelated to AQ10

In [None]:
# Drop the ten question scores and the aggregate 'result' column before training.
labels_to_drop = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'result']
n_estimator = 500
X_drop = X.drop(labels=labels_to_drop, axis=1)
# random_state makes the split and the forest reproducible; stratify=y keeps the
# class ratio the same in the training and testing subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_drop, y, test_size=0.2, random_state=42, stratify=y
)
model = RandomForestClassifier(n_estimators=n_estimator, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Dropped features: ", labels_to_drop)
print("Used features", X_drop.columns.values.tolist())
print("Accuracy:",sklearn.metrics.accuracy_score(y_test, y_pred))

importance = model.feature_importances_
# Descending order combined with the inverted y-axis places the most important
# feature at the top of the chart.
index = np.argsort(importance)[::-1].tolist()
fig, ax = plt.subplots()
# Draw on the axes object directly so `ax` keeps referring to the axes.
ax.barh(X_train.columns[index], importance[index], color="green")
ax.invert_yaxis()
ax.set_xlabel("RF Feature Importance")
ax.set_ylabel("Feature Name")
plt.show()

In [None]:
# Confusion matrix of the most recent model on its test split.
# labels=[0, 1] pins the matrix to 2x2 (No ASD, ASD) even when one class does not
# occur in y_test / y_pred; a smaller matrix would not match the two display labels.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No ASD', 'ASD'])
disp.plot()
plt.show()

In [None]:
from scipy import stats

# Exact binomial test of the number of correct test predictions against pure
# guessing (success probability 0.5).
k = np.count_nonzero(y_test==y_pred)   # number of correct predictions
n = len(y_pred)                        # number of test samples
alpha = 0.05                           # significance level used for the decision below
print("Null hypothsis: The model trained on features unrelated to AQ10 is only guessing the result.")
print("Altenative hypothsis: The modle is not only guessing the result")
# A plain '%' is correct here: '\%' is not a valid escape and would print the backslash.
print("If the model is only guessing, for each result, it has 50% chance to get the correct answer.")
# Report the actual number of trials instead of a hard-coded test-set size.
print("The number of correct answer follows a binomial distribution B({},0.5)".format(n))
p = stats.binomtest(k,n,1/2).pvalue
print("The model achieved an accuracy of {:.1f}%.".format(sklearn.metrics.accuracy_score(y_test, y_pred)*100))
print("P-Value of the result is {}.".format(p))
# State the conclusion that the p-value actually supports.
if p < alpha:
    print("We reject the null hypothesis that the model is only guessing.")
else:
    print("We can not reject the null hypothesis that the model is only guessing.")