## Preprocessing

In [1]:
# !pip install tensorflow

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import pandas as pd
import tensorflow as tf
import numpy as np
from scipy.stats import pointbiserialr, chi2_contingency, ttest_ind
import joblib

In [2]:
#  Import and read the QChat data.
QChat_df = pd.read_csv("Resources/Toddler Autism dataset July 2018.csv")
QChat_df.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [3]:
QChat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Case_No                 1054 non-null   int64 
 1   A1                      1054 non-null   int64 
 2   A2                      1054 non-null   int64 
 3   A3                      1054 non-null   int64 
 4   A4                      1054 non-null   int64 
 5   A5                      1054 non-null   int64 
 6   A6                      1054 non-null   int64 
 7   A7                      1054 non-null   int64 
 8   A8                      1054 non-null   int64 
 9   A9                      1054 non-null   int64 
 10  A10                     1054 non-null   int64 
 11  Age_Mons                1054 non-null   int64 
 12  Qchat-10-Score          1054 non-null   int64 
 13  Sex                     1054 non-null   object
 14  Ethnicity               1054 non-null   object
 15  Jaun

In [4]:
QChat_df['Class/ASD Traits '].value_counts()

Yes    728
No     326
Name: Class/ASD Traits , dtype: int64

In [5]:
# Update Yes/No in Class/ASD Traits column to 1/0.
# Change 0 to 1 and 1 to 0 in A10 - this is a reverse question
QChat_df['Class/ASD Traits '] = QChat_df['Class/ASD Traits '].map({'Yes': 1, 'No': 0})
# QChat_df['A10'] = 1 - QChat_df['A10']  #This is not necessary.  The data is already reversed.
QChat_df

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,0
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,1
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,1
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,1
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,1050,0,0,0,0,0,0,0,0,0,1,24,1,f,White European,no,yes,family member,0
1050,1051,0,0,1,1,1,0,1,0,1,0,12,5,m,black,yes,no,family member,1
1051,1052,1,0,1,1,1,1,1,1,1,1,18,9,m,middle eastern,yes,no,family member,1
1052,1053,1,0,0,0,0,0,0,1,0,1,19,3,m,White European,no,yes,family member,0


In [6]:
QChat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Case_No                 1054 non-null   int64 
 1   A1                      1054 non-null   int64 
 2   A2                      1054 non-null   int64 
 3   A3                      1054 non-null   int64 
 4   A4                      1054 non-null   int64 
 5   A5                      1054 non-null   int64 
 6   A6                      1054 non-null   int64 
 7   A7                      1054 non-null   int64 
 8   A8                      1054 non-null   int64 
 9   A9                      1054 non-null   int64 
 10  A10                     1054 non-null   int64 
 11  Age_Mons                1054 non-null   int64 
 12  Qchat-10-Score          1054 non-null   int64 
 13  Sex                     1054 non-null   object
 14  Ethnicity               1054 non-null   object
 15  Jaun

In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
category_dummies = pd.get_dummies(QChat_df)

In [8]:
category_dummies
# category_dummies.info()
# print(category_dummies.isnull().sum())
# print(np.isinf(category_dummies).sum())

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Ethnicity_south asian,Jaundice_no,Jaundice_yes,Family_mem_with_ASD_no,Family_mem_with_ASD_yes,Who completed the test_Health Care Professional,Who completed the test_Health care professional,Who completed the test_Others,Who completed the test_Self,Who completed the test_family member
0,1,0,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,0,1
1,2,1,1,0,0,0,1,1,0,0,...,0,0,1,1,0,0,0,0,0,1
2,3,1,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,0,1
3,4,1,1,1,1,1,1,1,1,1,...,0,1,0,1,0,0,0,0,0,1
4,5,1,1,0,1,1,1,1,1,1,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,1050,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
1050,1051,0,0,1,1,1,0,1,0,1,...,0,0,1,1,0,0,0,0,0,1
1051,1052,1,0,1,1,1,1,1,1,1,...,0,0,1,1,0,0,0,0,0,1
1052,1053,1,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1


# Correlations?

In [9]:
# Use Phi coefficient to find relationship between QChat data and ASD Traits. 
correlation_df = category_dummies[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Class/ASD Traits ']]

numbers = range(1,11)
survey_correlations_list = []

for number in numbers:
    question = f'A{number}'
    phi_coefficient, p_value = pointbiserialr(correlation_df[question], correlation_df['Class/ASD Traits '])
    question_data = {'question': question, 'Phi Coefficient (φ)': phi_coefficient, 'P-value': p_value}
    survey_correlations_list.append(question_data)

survey_correlations = pd.DataFrame(survey_correlations_list)
survey_correlations


Unnamed: 0,question,Phi Coefficient (φ),P-value
0,A1,0.50381,6.336049000000001e-69
1,A2,0.463467,3.0364429999999996e-57
2,A3,0.409701,6.325775999999999e-44
3,A4,0.505204,2.3421060000000003e-69
4,A5,0.563297,2.7726619999999998e-89
5,A6,0.569424,1.27083e-91
6,A7,0.563177,3.078939e-89
7,A8,0.427155,5.4670139999999994e-48
8,A9,0.577336,1.024092e-94
9,A10,0.179833,4.116972e-09


In [10]:
# Use Phi coefficient to find relationship between binary demographic data and ASD Traits. 
correlation2_df = category_dummies[[
    'Jaundice_yes', 
    'Family_mem_with_ASD_yes', 
    'Sex_m', 
    'Class/ASD Traits ']]

columns = ['Jaundice_yes', 'Family_mem_with_ASD_yes', 'Sex_m']
survey_correlations_list2 = []

for column in columns:
    phi_coefficient, p_value = pointbiserialr(correlation2_df[column], correlation2_df['Class/ASD Traits '])
    question_data = {'question': column, 'Phi Coefficient (φ)': phi_coefficient, 'P-value': p_value}
    survey_correlations_list2.append(question_data)

demographic_correlations_df = pd.DataFrame(survey_correlations_list2)
demographic_correlations_df

Unnamed: 0,question,Phi Coefficient (φ),P-value
0,Jaundice_yes,0.07406,0.01618
1,Family_mem_with_ASD_yes,-0.013503,0.661485
2,Sex_m,0.117664,0.000129


In [11]:
# Use independent t-test for age and total score.
correlation3_df = category_dummies[[
    'Qchat-10-Score', 
    'Age_Mons', 
    'Class/ASD Traits ']]

# Split the data into two groups based on the binary column (0 and 1)
demo_columns = [correlation3_df['Age_Mons'], correlation3_df['Qchat-10-Score']]
binary_column = correlation3_df['Class/ASD Traits ']
results_list = []

for i, column in enumerate(demo_columns):
    group_0 = column[binary_column == 0]
    group_1 = column[binary_column == 1]

    # Perform an independent samples t-test
    t_statistic, p_value = ttest_ind(group_0, group_1)
    result_dict = {
        'question': correlation3_df.columns[i],
        'T-Statistic': t_statistic,
        'P-value': p_value}
    results_list.append(result_dict)

demographic_correlations2_df = pd.DataFrame(results_list)
demographic_correlations2_df

Unnamed: 0,question,T-Statistic,P-value
0,Qchat-10-Score,-2.172494,0.03004116
1,Age_Mons,-44.867971,1.5477479999999998e-246


# Prep all data to train

In [12]:
# Split our preprocessed data into our features and target arrays
y = QChat_df['Class/ASD Traits ']
X = category_dummies.drop(['Class/ASD Traits '], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
X

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,Ethnicity_south asian,Jaundice_no,Jaundice_yes,Family_mem_with_ASD_no,Family_mem_with_ASD_yes,Who completed the test_Health Care Professional,Who completed the test_Health care professional,Who completed the test_Others,Who completed the test_Self,Who completed the test_family member
0,1,0,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,0,1
1,2,1,1,0,0,0,1,1,0,0,...,0,0,1,1,0,0,0,0,0,1
2,3,1,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,0,1
3,4,1,1,1,1,1,1,1,1,1,...,0,1,0,1,0,0,0,0,0,1
4,5,1,1,0,1,1,1,1,1,1,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,1050,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
1050,1051,0,0,1,1,1,0,1,0,1,...,0,0,1,1,0,0,0,0,0,1
1051,1052,1,0,1,1,1,1,1,1,1,...,0,0,1,1,0,0,0,0,0,1
1052,1053,1,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1


In [14]:
y

0       0
1       1
2       1
3       1
4       1
       ..
1049    0
1050    1
1051    1
1052    0
1053    1
Name: Class/ASD Traits , Length: 1054, dtype: int64

In [15]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [16]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 288       
                                                                 
 dense_1 (Dense)             (None, 8)                 72        
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 369 (1.44 KB)
Trainable params: 369 (1.44 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

9/9 - 0s - loss: 0.0423 - accuracy: 0.9848 - 444ms/epoch - 49ms/step
Loss: 0.04228851944208145, Accuracy: 0.9848484992980957


In [20]:
# Export our model to HDF5 file
nn.save('Resources/QChatNN.h5')

  saving_api.save_model(


# Optimize the Model

In [2]:
QChat_df = pd.read_csv("Resources/Toddler Autism dataset July 2018.csv")
QChat_df['Class/ASD Traits '] = QChat_df['Class/ASD Traits '].map({'Yes': 1, 'No': 0})
# Drop the non-beneficial columns,  'Case_No', 'Qchat-10-Score', 'Sex', 'Ethnicity', 'Who completed the test', 'Jaundice', 'Family_mem_with_ASD'
# Remaining columns: Age_Mons'
QChat_df = QChat_df.drop(columns=['Case_No', 'Who completed the test', 'Ethnicity', 'Qchat-10-Score', 'Sex', 'Jaundice', 'Family_mem_with_ASD'])
QChat_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,0
1,1,1,0,0,0,1,1,0,0,0,36,1
2,1,0,0,0,0,0,1,1,0,1,36,1
3,1,1,1,1,1,1,1,1,1,1,24,1
4,1,1,0,1,1,1,1,1,1,1,20,1


In [3]:
# Determine the number of unique values in each column.
QChat_df.nunique()

A1                    2
A2                    2
A3                    2
A4                    2
A5                    2
A6                    2
A7                    2
A8                    2
A9                    2
A10                   2
Age_Mons             25
Class/ASD Traits      2
dtype: int64

In [4]:
# Convert categorical data to numeric with `pd.get_dummies`
category_dummies = pd.get_dummies(QChat_df)
category_dummies

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,0
1,1,1,0,0,0,1,1,0,0,0,36,1
2,1,0,0,0,0,0,1,1,0,1,36,1
3,1,1,1,1,1,1,1,1,1,1,24,1
4,1,1,0,1,1,1,1,1,1,1,20,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1049,0,0,0,0,0,0,0,0,0,1,24,0
1050,0,0,1,1,1,0,1,0,1,0,12,1
1051,1,0,1,1,1,1,1,1,1,1,18,1
1052,1,0,0,0,0,0,0,1,0,1,19,0


In [5]:
# Split our preprocessed data into our features and target arrays
y = QChat_df['Class/ASD Traits ']
X = category_dummies.drop(['Class/ASD Traits '], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Save the scaler to a file
joblib.dump(scaler, 'Resources/scaler.joblib')

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 8

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 96        
                                                                 
 dense_1 (Dense)             (None, 8)                 72        
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 177 (708.00 Byte)
Trainable params: 177 (708.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [8]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

9/9 - 0s - loss: 0.0150 - accuracy: 1.0000 - 375ms/epoch - 42ms/step
Loss: 0.014991269446909428, Accuracy: 1.0


In [9]:
#Generate training predictions
training_predictions = nn.predict(X_train)

#Generate testing predictions
testing_predictions = nn.predict(X_test)



In [10]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[  0 244]
 [  0 546]]


In [11]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[  0  82]
 [  0 182]]


In [28]:
# Export our model to HDF5 file
nn.save('Resources/QChatNN_for_new_survey.h5')

  saving_api.save_model(
