In [None]:
import math
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from scipy import stats
from scipy.io import arff
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Process Datasets - Children

## 1. Create Data Frame

In [None]:
# Load the ARFF file; loadarff returns a tuple whose first item holds the records
arff_file = 'Autism-Child-Data.arff'
data = arff.loadarff(arff_file)
records = data[0]
df_child = pd.DataFrame(records)
df_child

In [None]:
# Check the data type of each feature in the dataset.
# info() prints a per-column summary of the frame.
df_child.info()

## 2. Decode Bytes Object

In [None]:
# Columns whose first value is a raw bytes object are decoded to str (UTF-8)
bytes_columns = [name for name in df_child.columns.tolist()
                 if type(df_child[name][0]) is bytes]
for name in bytes_columns:
    df_child[name] = df_child[name].str.decode('utf-8')

df_child.head()

## 3. Visualization

In [None]:
def annotate_bar_counts(axis, space=1):
    """Write each bar's height just above the bar on the given Axes.

    axis  -- matplotlib Axes whose bar patches are annotated
    space -- vertical offset of the label above the bar, in points
    """
    for bar in axis.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        axis.annotate(format(height), (x_center, height), xytext=(0, space),
                      textcoords="offset points", ha='center', va='bottom')


# Count Non-ASD/ASD children per age and per gender
CrosstabResult_age = pd.crosstab(index=df_child['age'], columns=df_child['Class/ASD'], rownames=['Age'])
CrosstabResult_gender = pd.crosstab(index=df_child['gender'], columns=df_child['Class/ASD'], rownames=['Gender'])

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Display the number of Non-ASD/ASD children in different age groups
CrosstabResult_age.plot.bar(ax=ax[0])
ax[0].set_title("Non-ASD/ASD Children in Different Age Groups")
ax[0].set_xlabel('Age')
ax[0].set_ylabel('Counts')
# Tick labels are derived from the crosstab index so they always match the bars,
# whatever ages are actually present in the data
age_labels = ['{:g}'.format(age) for age in CrosstabResult_age.index]
ax[0].set_xticks(range(len(age_labels)))
ax[0].set_xticklabels(age_labels, rotation=0)
annotate_bar_counts(ax[0])

# Display the number of Non-ASD/ASD children in different gender groups
CrosstabResult_gender.plot.bar(ax=ax[1])
ax[1].set_title("Non-ASD/ASD Children in Different Gender Groups")
ax[1].set_xlabel('Gender')
ax[1].set_ylabel('Counts')
# Translate the raw gender codes to readable names; unknown codes are shown as-is
gender_names = {'f': 'Female', 'm': 'Male'}
gender_labels = [gender_names.get(code, code) for code in CrosstabResult_gender.index]
ax[1].set_xticks(range(len(gender_labels)))
ax[1].set_xticklabels(gender_labels, rotation=0)
annotate_bar_counts(ax[1])

plt.tight_layout()
plt.show()





## 4. Drop Unrelated Features

In [None]:
# Remove the three feature columns that are not used in the analysis
unrelated_columns = ['ethnicity', 'contry_of_res', 'age_desc']
df_child = df_child.drop(columns=unrelated_columns)

# Display the frame to confirm the columns are gone
df_child

## 5. One-Hot-Encoding
#### Convert non-numeric data to numeric codes for analysis purposes (each category is mapped to an integer)

In [None]:
# List the distinct values found in every feature column
for column_name, column_values in df_child.items():
    print(column_name + " has value: ", column_values.unique())

In [None]:
# Map every categorical value to an integer code (integer label mapping,
# one code per category). Values absent from a mapping become NaN.
yes_no_codes = {'no': 0, 'yes': 1}
category_codes = {
    'gender': {'m': 0, 'f': 1},
    'jundice': yes_no_codes,
    'austim': yes_no_codes,
    'used_app_before': yes_no_codes,
    'relation': {'Parent': 0, 'Self': 1, 'self': 1, 'Relative': 2, 'Health care professional': 3},
    'Class/ASD': {'NO': 0, 'YES': 1},
}
for column_name, codes in category_codes.items():
    df_child[column_name] = df_child[column_name].map(codes)
df_child.head()

## 6. Handle Missing Value

In [None]:
# Show the samples (rows) that contain at least one missing value
rows_with_missing = df_child.isna().any(axis=1)
df_child[rows_with_missing]

In [None]:
# Fill missing 'relation' with the code for 'Parent' (0), since most questions
# are answered by the children's parents.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and has no effect on the frame under pandas copy-on-write.
PARENT_CODE = 0
df_child['relation'] = df_child['relation'].fillna(PARENT_CODE)

# Fill missing 'age' with the rounded mean age
mean_age = round(df_child['age'].mean())
df_child['age'] = df_child['age'].fillna(mean_age)

In [None]:
# Confirm that every missing value was handled: the displayed frame should be empty
still_missing = df_child.isna().any(axis=1)
df_child[still_missing]

In [None]:
# Convert every feature column to int.
# The previous per-column check `type(value) != int` was always true (the
# first value is a NumPy scalar or a string, never a built-in int), so every
# column was cast anyway; the cast is now done explicitly for the whole frame.
assert not df_child.isna().any().any(), "missing values must be handled before casting to int"
df_child = df_child.astype(int)

# Check the data types after converting
df_child.info()

## 7. Check the Size and Balance of Processed Dataset 

In [None]:
# Count the samples of each class to check the size and balance of the processed dataset
class_labels = df_child['Class/ASD']
non_asd_count = int((class_labels == 0).sum())
asd_count = int((class_labels == 1).sum())
print("Number of Non_ASD samples: ", non_asd_count)
print("Number of ASD samples: ", asd_count)

# Build Random Forest Model

## 1. Split df_child into train and test sets

In [None]:
# Every column except the last is a feature; the last column is the label
feature_columns = df_child.columns[:-1]
label_column = df_child.columns[-1]
X = df_child[feature_columns]  # feature columns
y = df_child[label_column]  # label column

In [None]:
# Split the dataset into training (80%) and test (20%) sets.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 2. Train the model

In [None]:
# Build the RF model with 500 estimators.
# A fixed random_state makes the trained forest reproducible between runs.
model = RandomForestClassifier(n_estimators=500, random_state=42)

In [None]:
# Train the RF model on the training set.
# fit() is the last expression in the cell, so its return value is shown as the cell output.
model.fit(X_train, y_train)

# Evaluate RF Model

## 1. Accuracy

In [None]:
# Get the model's predicted labels for the test set
y_pred = model.predict(X_test)

In [None]:
# Compute the accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

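### Since the accuracy is 100%, we recompute the accuracy against a shuffled copy of the test labels to make sure the model is trained correctly: a much lower score on the shuffled labels shows that the predictions really do match the true labels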

In [None]:
# Sanity check: score the predictions against a random permutation of the test labels.
# A seeded NumPy generator keeps the check reproducible and avoids importing
# another module in the middle of the notebook.
rng = np.random.default_rng(42)
y_shuffle = rng.permutation(y_test.to_numpy())

print("Accuracy of shuffled:", accuracy_score(y_shuffle, y_pred))

## 2. F1 Score

In [None]:
# Compute the F1 score of the predictions against the true test labels
f1 = f1_score(y_test, y_pred)
print('F1 Score: ', f1)

## 3. Confusion Matrix

In [None]:
# Plot the confusion matrix of the full-feature model on the test set
class_names = ['No ASD', 'ASD']
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot()
plt.show()

## 4. Feature Ranking

In [None]:
# Order the feature positions from most to least important.
# np.argsort sorts ascending, so the result is reversed with a slice.
# (The previous `index.reverse` lacked call parentheses and therefore did nothing.)
importances = model.feature_importances_
index = np.argsort(importances)[::-1].tolist()
# Pair each feature name with its importance score, most important first
feature = [(X_train.columns[idx], importances[idx]) for idx in index]
feature

In [None]:
# Horizontal bar chart of the feature importances, in the order given by `index`
ranked_names = X_train.columns[index]
ranked_scores = model.feature_importances_[index]
axes = plt.gca()
axes.barh(ranked_names, ranked_scores, color=['green'])
# Flip the y-axis so the first entry of `index` is drawn at the top
axes.invert_yaxis()
axes.set_xlabel("RF Feature Importance")
axes.set_ylabel("Feature Name")

# Train Models with Limited Features

## 1. Use only "result" to predict

In [None]:
# Keep only the 'result' feature, reshaped to the 2-D (n_samples, 1) layout
# that the estimator expects
X_train_result = X_train['result'].values.reshape(-1, 1)
X_test_result = X_test['result'].values.reshape(-1, 1)

# A fixed random_state makes the trained forest reproducible between runs
model_result = RandomForestClassifier(n_estimators=500, random_state=42)
model_result.fit(X_train_result, y_train)

y_pred_result = model_result.predict(X_test_result)

print("Used features: ['result']")
print("Accuracy:", accuracy_score(y_test, y_pred_result))

In [None]:
# Plot the confusion matrix of the 'result'-only model on the test set
class_names = ['No ASD', 'ASD']
cm_result = confusion_matrix(y_test, y_pred_result)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_result, display_labels=class_names)
disp.plot()
plt.show()

## 2. Use the 10 questions A1 to A10 to predict

In [None]:
# Keep only the A1 to A10 question features by dropping every other feature.
# The column list is defined once so train and test always drop the same columns.
non_question_columns = ['used_app_before', 'age', 'gender', 'jundice', 'austim', 'result', 'relation']
X_train_10Q = X_train.drop(columns=non_question_columns)
X_test_10Q = X_test.drop(columns=non_question_columns)

# A fixed random_state makes the trained forest reproducible between runs
model_10Q = RandomForestClassifier(n_estimators=500, random_state=42)
model_10Q.fit(X_train_10Q, y_train)

y_pred_10Q = model_10Q.predict(X_test_10Q)

print("Used features:", X_train_10Q.columns.tolist())
print("Accuracy:", accuracy_score(y_test, y_pred_10Q))

In [None]:
# Feature ranking for the A1-A10 model.
# np.argsort sorts ascending, so it is reversed to get most-important-first;
# together with the inverted y-axis the most important feature is drawn on top.
index_10Q = np.argsort(model_10Q.feature_importances_)[::-1].tolist()

plt.barh(X_train_10Q.columns[index_10Q], model_10Q.feature_importances_[index_10Q], color=['green'])
plt.gca().invert_yaxis()
plt.title("Feature Importance (A1-A10 Model)")
plt.xlabel("RF Feature Importance")
plt.ylabel("Feature Name")
plt.show()

In [None]:
# Plot the confusion matrix of the A1-A10 model on the test set
class_names = ['No ASD', 'ASD']
cm_10Q = confusion_matrix(y_test, y_pred_10Q)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_10Q, display_labels=class_names)
disp.plot()
plt.show()

## 3. Use features unrelated to AQ 10

In [None]:
# Keep only the 6 features unrelated to the AQ-10 questionnaire.
# The column list is defined once so train and test always select the same columns.
unrelated_features = ['age', 'gender', 'jundice', 'relation', 'austim', 'used_app_before']
X_train_unrelated = X_train[unrelated_features]
X_test_unrelated = X_test[unrelated_features]

# A fixed random_state makes the trained forest reproducible between runs
model_unrelated = RandomForestClassifier(n_estimators=500, random_state=42)
model_unrelated.fit(X_train_unrelated, y_train)

y_pred_unrelated = model_unrelated.predict(X_test_unrelated)

print("Used features:", X_train_unrelated.columns.tolist())
print("Accuracy:", accuracy_score(y_test, y_pred_unrelated))

In [None]:
# Feature ranking for the model trained on the non-questionnaire features.
# np.argsort sorts ascending, so it is reversed to get most-important-first;
# together with the inverted y-axis the most important feature is drawn on top.
index_unrelated = np.argsort(model_unrelated.feature_importances_)[::-1].tolist()

plt.barh(X_train_unrelated.columns[index_unrelated], model_unrelated.feature_importances_[index_unrelated], color=['green'])
plt.gca().invert_yaxis()
plt.title("Feature Importance (Non-Questionnaire Model)")
plt.xlabel("RF Feature Importance")
plt.ylabel("Feature Name")
plt.show()

In [None]:
# Plot the confusion matrix of the non-questionnaire model on the test set
class_names = ['No ASD', 'ASD']
cm_unrelated = confusion_matrix(y_test, y_pred_unrelated)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_unrelated, display_labels=class_names)
disp.plot()
plt.show()

In [None]:
# Explain the accuracy of this model with a binomial test against pure
# guessing (success probability 1/2 for each prediction).
ALPHA = 0.05  # significance level used to draw the conclusion below
k = np.count_nonzero(y_test == y_pred_unrelated)  # number of correct predictions
n = len(y_pred_unrelated)  # number of test samples
print("Null hypothsis: The model trained on features unrelated to AQ10 is only guessing the result.")
print("Altenative hypothsis: The modle is not only guessing the result")
print("If the model is only guessing, for each result, it has 50% chance to get the correct answer.")
# Report the actual test-set size instead of a hard-coded sample count
print("The number of correct answer follows a binomial distribution B({},0.5)".format(n))
p = stats.binomtest(k, n, 1/2).pvalue
print("The model achieved an accuracy of {:.1f}%.".format(accuracy_score(y_test, y_pred_unrelated)*100))
print("P-Value of the result is {}.".format(p))
# State the conclusion that the computed p-value actually supports
if p < ALPHA:
    print("We reject the null hypothesis that the model is only guessing.")
else:
    print("We can not reject the null hypothesis that the model is only guessing.")