In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
from io import BytesIO
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split

# Process Datasets

## 1. Create Data Frame

In [2]:
#create data frame for Child
data = arff.loadarff('Autism-Child-Data.arff')
df_child = pd.DataFrame(data[0])

In [3]:
df_child

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Others',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
1,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Middle Eastern ',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
2,b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'1',b'0',b'0',...,b'm',b'?',b'no',b'no',b'Jordan',b'yes',5.0,b'4-11 years',b'?',b'NO'
3,b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',...,b'f',b'?',b'yes',b'no',b'Jordan',b'no',4.0,b'4-11 years',b'?',b'NO'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',...,b'm',b'Others',b'yes',b'no',b'United States',b'no',10.0,b'4-11 years',b'Parent',b'YES'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',...,b'f',b'White-European',b'yes',b'yes',b'United Kingdom',b'no',10.0,b'4-11 years',b'Parent',b'YES'
288,b'1',b'0',b'0',b'0',b'1',b'0',b'1',b'0',b'0',b'1',...,b'f',b'White-European',b'yes',b'yes',b'Australia',b'no',4.0,b'4-11 years',b'Parent',b'NO'
289,b'1',b'0',b'1',b'1',b'1',b'1',b'1',b'0',b'0',b'1',...,b'm',b'Latino',b'no',b'no',b'Brazil',b'no',7.0,b'4-11 years',b'Parent',b'YES'
290,b'1',b'1',b'1',b'0',b'1',b'1',b'1',b'1',b'1',b'1',...,b'm',b'South Asian',b'no',b'no',b'India',b'no',9.0,b'4-11 years',b'Parent',b'YES'


In [4]:
#data type of each feature in the dataset
df_child.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A1_Score         292 non-null    object 
 1   A2_Score         292 non-null    object 
 2   A3_Score         292 non-null    object 
 3   A4_Score         292 non-null    object 
 4   A5_Score         292 non-null    object 
 5   A6_Score         292 non-null    object 
 6   A7_Score         292 non-null    object 
 7   A8_Score         292 non-null    object 
 8   A9_Score         292 non-null    object 
 9   A10_Score        292 non-null    object 
 10  age              288 non-null    float64
 11  gender           292 non-null    object 
 12  ethnicity        292 non-null    object 
 13  jundice          292 non-null    object 
 14  austim           292 non-null    object 
 15  contry_of_res    292 non-null    object 
 16  used_app_before  292 non-null    object 
 17  result          

## 2. Drop Unrelated Features

In [5]:
df_child_drop = df_child.drop(['ethnicity','contry_of_res', 'age_desc'], axis=1)

In [6]:
df_child_drop

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,used_app_before,result,relation,Class/ASD
0,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',6.0,b'm',b'no',b'no',b'no',5.0,b'Parent',b'NO'
1,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',6.0,b'm',b'no',b'no',b'no',5.0,b'Parent',b'NO'
2,b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'1',b'0',b'0',6.0,b'm',b'no',b'no',b'yes',5.0,b'?',b'NO'
3,b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',5.0,b'f',b'yes',b'no',b'no',4.0,b'?',b'NO'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',5.0,b'm',b'yes',b'no',b'no',10.0,b'Parent',b'YES'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',7.0,b'f',b'yes',b'yes',b'no',10.0,b'Parent',b'YES'
288,b'1',b'0',b'0',b'0',b'1',b'0',b'1',b'0',b'0',b'1',7.0,b'f',b'yes',b'yes',b'no',4.0,b'Parent',b'NO'
289,b'1',b'0',b'1',b'1',b'1',b'1',b'1',b'0',b'0',b'1',4.0,b'm',b'no',b'no',b'no',7.0,b'Parent',b'YES'
290,b'1',b'1',b'1',b'0',b'1',b'1',b'1',b'1',b'1',b'1',4.0,b'm',b'no',b'no',b'no',9.0,b'Parent',b'YES'


## 3. Translate Bytes Object

In [7]:
df_child_remove = df_child_drop

In [8]:
df_child_remove = df_child_remove.select_dtypes([object])
df_child_remove = df_child_remove.stack().str.decode('utf-8').unstack()

In [9]:
df_child_remove = df_child[['result','age']].join(df_child_remove)

In [68]:
df_child_remove

Unnamed: 0,result,age,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,jundice,austim,used_app_before,relation,Class/ASD
0,5.0,6.0,1,1,0,0,1,1,0,1,0,0,m,no,no,no,Parent,NO
1,5.0,6.0,1,1,0,0,1,1,0,1,0,0,m,no,no,no,Parent,NO
2,5.0,6.0,1,1,0,0,0,1,1,1,0,0,m,no,no,yes,?,NO
3,4.0,5.0,0,1,0,0,1,1,0,0,0,1,f,yes,no,no,?,NO
4,10.0,5.0,1,1,1,1,1,1,1,1,1,1,m,yes,no,no,Parent,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,10.0,7.0,1,1,1,1,1,1,1,1,1,1,f,yes,yes,no,Parent,YES
288,4.0,7.0,1,0,0,0,1,0,1,0,0,1,f,yes,yes,no,Parent,NO
289,7.0,4.0,1,0,1,1,1,1,1,0,0,1,m,no,no,no,Parent,YES
290,9.0,4.0,1,1,1,0,1,1,1,1,1,1,m,no,no,no,Parent,YES


## 4. One-Hot-Encoding
#### Convert not numeric data to numeric for analysis purpose

In [69]:
df_child_onehot = df_child_remove.copy()

In [70]:
#Check the different values each feature has
for feature in df_child_remove:
    unique_value = df_child_remove[feature].unique()
    print(feature + " has value: ", unique_value )

result has value:  [ 5.  4. 10.  7.  8.  3.  9.  2.  1.  6.  0.]
age has value:  [ 6.  5.  4. 11. 10.  8.  7.  9. nan]
A1_Score has value:  ['1' '0']
A2_Score has value:  ['1' '0']
A3_Score has value:  ['0' '1']
A4_Score has value:  ['0' '1']
A5_Score has value:  ['1' '0']
A6_Score has value:  ['1' '0']
A7_Score has value:  ['0' '1']
A8_Score has value:  ['1' '0']
A9_Score has value:  ['0' '1']
A10_Score has value:  ['0' '1']
gender has value:  ['m' 'f']
jundice has value:  ['no' 'yes']
austim has value:  ['no' 'yes']
used_app_before has value:  ['no' 'yes']
relation has value:  ['Parent' '?' 'Self' 'Relative' 'Health care professional' 'self']
Class/ASD has value:  ['NO' 'YES']


In [71]:
#do one-hot-encoing for each feature 
df_child_onehot['gender'] = df_child_onehot['gender'].map({'m': 0, 'f': 1})
df_child_onehot['jundice'] = df_child_onehot['jundice'].map({'no': 0, 'yes': 1})
df_child_onehot['austim'] = df_child_onehot['austim'].map({'no': 0, 'yes': 1})
df_child_onehot['used_app_before'] = df_child_onehot['used_app_before'].map({'no': 0, 'yes': 1})
df_child_onehot['relation'] = df_child_onehot['relation'].map({'Parent': 0, 'Self': 1, 'self': 1, 'Relative': 2, 'Health care professional': 3})
df_child_onehot['A1_Score'] = df_child_onehot['A1_Score'].map({'0': 0, '1': 1})
df_child_onehot['A2_Score'] = df_child_onehot['A2_Score'].map({'0': 0, '1': 1})
df_child_onehot['A3_Score'] = df_child_onehot['A3_Score'].map({'0': 0, '1': 1})
df_child_onehot['A4_Score'] = df_child_onehot['A4_Score'].map({'0': 0, '1': 1})
df_child_onehot['A5_Score'] = df_child_onehot['A5_Score'].map({'0': 0, '1': 1})
df_child_onehot['A6_Score'] = df_child_onehot['A6_Score'].map({'0': 0, '1': 1})
df_child_onehot['A7_Score'] = df_child_onehot['A7_Score'].map({'0': 0, '1': 1})
df_child_onehot['A8_Score'] = df_child_onehot['A8_Score'].map({'0': 0, '1': 1})
df_child_onehot['A9_Score'] = df_child_onehot['A9_Score'].map({'0': 0, '1': 1})
df_child_onehot['A10_Score'] = df_child_onehot['A10_Score'].map({'0': 0, '1': 1})

unique_value_ASD = df_child_onehot['Class/ASD'].unique()
val_no = unique_value[0]
val_yes = unique_value[1]
df_child_onehot['Class/ASD'] = df_child_onehot['Class/ASD'].map({val_no: 0, val_yes: 1})

#df_child_onehot['Class/ASD'] = df_child_remove['jundice'].map({'NO': 0, 'YES': 1})

## 5. Handle Missing Value

In [72]:
df_child_onehot[df_child_onehot.isna().sum(axis=1) > 0]

Unnamed: 0,result,age,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,jundice,austim,used_app_before,relation,Class/ASD
2,5.0,6.0,1,1,0,0,0,1,1,1,0,0,0,0,0,1,,0
3,4.0,5.0,0,1,0,0,1,1,0,0,0,1,1,1,0,0,,0
5,5.0,4.0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,,0
9,5.0,11.0,0,0,1,1,1,0,1,1,0,0,1,0,1,0,,0
11,3.0,5.0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,,0
19,8.0,5.0,1,1,1,1,1,1,0,1,0,1,0,0,0,0,,1
21,5.0,9.0,0,0,1,1,0,1,0,1,1,0,1,0,0,0,,0
24,8.0,11.0,1,0,1,1,1,1,0,1,1,1,0,0,0,0,,1
28,5.0,6.0,0,1,1,0,0,0,1,1,0,1,0,0,1,0,,0
32,7.0,,1,0,0,1,0,1,1,1,1,1,0,0,0,0,,1


In [73]:
#fill missing value of 'relation' with 'Parents', since most questions are answered by children's Parents
df_child_onehot['relation'].fillna(value = 0, inplace=True)

#fill missing value of 'age' with its mean
df_child_onehot['age'].fillna(round(df_child_onehot['age'].mean()), inplace=True)

In [74]:
#check if all missing values are handeled
df_child_onehot[df_child_onehot.isna().sum(axis=1) > 0]

Unnamed: 0,result,age,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,jundice,austim,used_app_before,relation,Class/ASD


## 6. Check if the data set is balaned or not

In [75]:
#find number of negative(-) class
num_of_negative = len(df_child_onehot[df_child_onehot['Class/ASD'] == 0])
num_of_negative

151

In [76]:
#find number of postive(+) class
num_of_negative = len(df_child_onehot[df_child_onehot['Class/ASD'] == 1])
num_of_negative

141

# Build Random Forest Model

## 1. Split df_child_onehot into train and test sets

In [186]:
X = df_child_onehot.iloc[:, :-1]
y = df_child_onehot.iloc[:,-1]

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## 2. Train the model

In [195]:
model = RandomForestClassifier(n_estimators=1000)

In [196]:
model.fit(X_train, y_train)

In [197]:
y_pred = model.predict(X_test)

In [202]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


## 2. Use Grid Search to Find Best Mtry and Ntree for RF Model

In [80]:
#get the number of features
num_feature = len(train_feature.columns)

In [81]:
#find three values of Mtry
mtry_1 = round(0.5 * math.sqrt(num_feature))
mtry_2 = round(math.sqrt(num_feature))
mtry_3 = round(2 * math.sqrt(num_feature))

In [82]:
#parameters list for tuning
mtry = [mtry_1, mtry_2, mtry_3]
ntree = [500,1000]

In [84]:
#do grid search 
for i in range (2):
    for j in range (3):
        nTree = ntree[i]
        mTry = mtry[j]
        #create a RF model
        model = RandomForestClassifier(n_estimators=nTree, max_features=mTry, oob_score= True, random_state = 0)
        #train the RF model
        model.fit(train_feature, train_label)
        #use decision funciton estimated on training set to find prediction based on training set
        prediction = np.argmax(model.oob_decision_function_, axis = 1)
        #print out ntree value and mtry value with their corresponding f1 score
        print("nTree = ", nTree, "     mTry = " , mTry)
        print("F1: ", f1_score(train_label, prediction))
        print("Accuracy: ", accuracy_score(train_label, prediction))

  model.fit(train_feature, train_label)


nTree =  500      mTry =  2
F1:  1.0
Accuracy:  1.0


  model.fit(train_feature, train_label)


nTree =  500      mTry =  4
F1:  1.0
Accuracy:  1.0


  model.fit(train_feature, train_label)


nTree =  500      mTry =  8
F1:  1.0
Accuracy:  1.0


  model.fit(train_feature, train_label)


nTree =  1000      mTry =  2
F1:  1.0
Accuracy:  1.0


  model.fit(train_feature, train_label)


nTree =  1000      mTry =  4
F1:  1.0
Accuracy:  1.0


  model.fit(train_feature, train_label)


nTree =  1000      mTry =  8
F1:  1.0
Accuracy:  1.0
