In [23]:
import sys
import pandas as pd
import sklearn
import keras
import numpy as np

In [24]:
# import the dataset
file = 'Autism-Child-Data.txt'

# read the csv
data = pd.read_table(file, sep = ',', index_col = None)

In [25]:
print(data.loc[0])

A1_Score                      1
A2_Score                      1
A3_Score                      0
A4_Score                      0
A5_Score                      1
A6_Score                      1
A7_Score                      0
A8_Score                      1
A9_Score                      0
A10_Score                     0
age                           6
gender                        m
ethnicity                Others
jundice                      no
austim                       no
contry_of_res            Jordan
used_app_before              no
result                        5
age_desc           '4-11 years'
relation                 Parent
Class                        NO
Name: 0, dtype: object


In [26]:
# Total number of records
n_records = len(data.index)

#Number of records where individual's with ASD
n_asd_yes = len(data[data['Class'] == 'YES'])

# Number of records where individual's with no ASD
n_asd_no = len(data[data['Class'] == 'NO'])

# Percentage of individuals whose are with ASD
yes_percent = float(n_asd_yes) / n_records *100

# Print the results
print("Total number of records: {}".format(n_records))
print("Individuals diagonised with ASD: {}".format(n_asd_yes))
print("Individuals not diagonised with ASD: {}".format(n_asd_no))
print("Percentage of individuals diagonised with ASD: {:.2f}%".format(yes_percent))

Total number of records: 292
Individuals diagonised with ASD: 141
Individuals not diagonised with ASD: 151
Percentage of individuals diagonised with ASD: 48.29%


In [27]:
asd_data = pd.read_table(file, sep = ',', index_col = None)

In [28]:
asd_data.describe()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.239726
std,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.284882
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [29]:
asd_data.dropna(inplace=True)
asd_data.describe()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.239726
std,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.284882
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import fbeta_score, make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [31]:
data.replace('?', np.nan, inplace=True)

In [32]:
categorical_columns = ['gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'relation']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Convert target variable to numerical format
label_encoder = LabelEncoder()
data_encoded['Class'] = label_encoder.fit_transform(data_encoded['Class'])

# Split data into features (X) and target (Y)
X = data_encoded.drop(columns=['Class'])  # Features
Y = data_encoded['Class']  # Target variable

In [33]:
X.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'result',
       'gender_m', 'ethnicity_'South Asian'', 'ethnicity_Asian',
       'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Latino',
       'ethnicity_Others', 'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jundice_yes', 'austim_yes',
       'contry_of_res_'Isle of Man'', 'contry_of_res_'New Zealand'',
       'contry_of_res_'Saudi Arabia'', 'contry_of_res_'South Africa'',
       'contry_of_res_'South Korea'', 'contry_of_res_'U.S. Outlying Islands'',
       'contry_of_res_'United Arab Emirates'',
       'contry_of_res_'United Kingdom'', 'contry_of_res_'United States'',
       'contry_of_res_Afghanistan', 'contry_of_res_Argentina',
       'contry_of_res_Armenia', 'contry_of_res_Australia',
       'contry_of_res_Austria', 'contry_of_res_Bahrain',
       'contry_of_res_Bangladesh', 'contry_of_res_Bhutan',
   

In [34]:
Y

0      0
1      0
2      0
3      0
4      1
      ..
287    1
288    0
289    1
290    1
291    0
Name: Class, Length: 292, dtype: int32

In [35]:
from sklearn import model_selection
# split the X and Y data into training and testing datasets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.2)

In [36]:
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
X_imputed = imputer.fit_transform(X)

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.metrics import fbeta_score, make_scorer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import numpy as np

classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=1),
    "Random Forest": RandomForestClassifier(n_estimators=5, random_state=1),
    "SVM": SVC(kernel='linear', C=1.0, gamma=2),
    "KNN": KNeighborsClassifier(n_neighbors=10),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=0),
    "AdaBoost": AdaBoostClassifier(random_state=0)
}

# Define a custom scoring function for fbeta_score
scoring = make_scorer(fbeta_score, beta=0.5)

# Create an empty list to store pipelines
pipelines = []

# Iterate through each classifier
for name, classifier in classifiers.items():
    print(f"Evaluating {name}")
    
    # Cross-validation with fbeta_score
    cv_scores_fbeta = cross_val_score(classifier, X_imputed, Y, cv=10, scoring=scoring)
    print(f"Cross-validated Fbeta Score: {np.mean(cv_scores_fbeta)}")
    
    # Cross-validation with ROC AUC score
    cv_scores_roc_auc = cross_val_score(classifier, X_imputed, Y, cv=10, scoring='roc_auc')
    print(f"Cross-validated ROC AUC Score: {np.mean(cv_scores_roc_auc)}")
    
    # Create a pipeline with the classifier and fit it
    classifier_clone = clone(classifier)
    pipeline = Pipeline([
        ('classifier', classifier_clone)
    ])
    pipeline.fit(X_imputed, Y)
    pipelines.append((name, pipeline))
    
    # Make predictions on the test data using the pipeline
    predictions_test = pipeline.predict(X_test_imputed)
    
    # Evaluate the predictions using fbeta_score
    fbeta = fbeta_score(Y_test, predictions_test, average='binary', beta=0.5)
    
    print(f"Fbeta Score on Test Set: {fbeta}")
    
    confusion = metrics.confusion_matrix(Y_test, predictions_test)
    print(confusion)
    print("===")

Evaluating Decision Tree
Cross-validated Fbeta Score: 1.0
Cross-validated ROC AUC Score: 1.0
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===
Evaluating Random Forest
Cross-validated Fbeta Score: 0.9511589979331914
Cross-validated ROC AUC Score: 0.9883333333333333
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===
Evaluating SVM
Cross-validated Fbeta Score: 1.0
Cross-validated ROC AUC Score: 1.0
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===
Evaluating KNN
Cross-validated Fbeta Score: 0.9826107465347972
Cross-validated ROC AUC Score: 0.9990476190476191
Fbeta Score on Test Set: 0.9770114942528735
[[24  1]
 [ 0 34]]
===
Evaluating Multinomial Naive Bayes
Cross-validated Fbeta Score: 0.7902765241166901
Cross-validated ROC AUC Score: 0.8941190476190476
Fbeta Score on Test Set: 0.8544303797468353
[[21  4]
 [ 7 27]]
===
Evaluating Logistic Regression
Cross-validated Fbeta Score: 1.0
Cross-validated ROC AUC Score: 1.0
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===
Evaluating Lin



Cross-validated Fbeta Score: 1.0
Cross-validated ROC AUC Score: 1.0
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===
Evaluating AdaBoost
Cross-validated Fbeta Score: 1.0
Cross-validated ROC AUC Score: 1.0
Fbeta Score on Test Set: 1.0
[[25  0]
 [ 0 34]]
===


In [39]:
predictions_test

array([0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1])

In [40]:
import joblib

# Save the trained model
model_filename = 'autism_model.joblib'
joblib.dump(classifiers, model_filename)

['autism_model.joblib']

In [41]:
pipelines = []

# Iterate through each classifier and create a pipeline
for name, classifier in classifiers.items():
    pipeline = Pipeline([
        ('imputer', imputer),
        ('classifier', classifier)
    ])
    pipeline.fit(X_imputed, Y)
    pipelines.append((name, pipeline))



In [42]:
joblib.dump(pipelines, 'autism_detection_pipelines.pkl')

['autism_detection_pipelines.pkl']

In [43]:
joblib.dump(imputer, 'autism_imputer.pkl')

['autism_imputer.pkl']