<a href="https://colab.research.google.com/github/Kirk-KD/toddler-asd/blob/master/asd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics, svm
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Absolute path of the dataset CSV.
CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/ASD Toddlers/data/Toddler Autism dataset July 2018.csv'
# Model inputs: columns A1..A10 followed by the remaining named columns.
FEATURE_COLUMNS = [
    *(f'A{n}' for n in range(1, 11)),
    'Age_Mons',
    'Sex',
    'Ethnicity',
    'Jaundice',
    'Family_mem_with_ASD',
]
# Name of the label column.
TARGET_COLUMN = 'ASD'
# Single seed used for NumPy's global RNG and for every random_state argument.
SEED = 25
np.random.seed(SEED)

In [None]:
# Read the raw dataset from CSV_PATH and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer=CSV_PATH)
dataset.head(n=5)

In [None]:
# Number of rows in the raw dataset.
dataset.shape[0]

In [None]:
# Distinct values present in the Ethnicity column.
dataset.loc[:, 'Ethnicity'].unique()

In [None]:
# Cleanup, rename, and drop.
# Step 1: encode the two-valued text columns as 0/1 (values not listed in a
# mapping become NaN, as with any Series.map call).
dataset = dataset.assign(**{
    'Sex': dataset['Sex'].map({'f': 0, 'm': 1}),
    'Jaundice': dataset['Jaundice'].map({'yes': 1, 'no': 0}),
    'Family_mem_with_ASD': dataset['Family_mem_with_ASD'].map({'yes': 1, 'no': 0}),
    'Class/ASD Traits ': dataset['Class/ASD Traits '].map({'Yes': 1, 'No': 0}),
})
# Step 2: give the label column (note the trailing space in its raw name) a short name.
dataset = dataset.rename(columns={'Class/ASD Traits ': 'ASD'})
# Step 3: keep only the feature columns plus the target, in that order.
dataset = dataset.loc[:, FEATURE_COLUMNS + [TARGET_COLUMN]]
dataset.head()

In [None]:
# Bar chart of how many rows fall in each target class.
# value_counts() orders its result by frequency, not by label, so pairing it
# directly with the fixed tick labels/colors below mislabels the bars whenever
# class 1 is the majority. Re-indexing by [0, 1] pins the order to the labels
# (a class that is absent is shown with a count of 0).
counts = dataset[TARGET_COLUMN].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(6,4))
plt.bar(counts.index, counts.values, color=['skyblue','salmon'])
plt.xticks(counts.index, ['0 - No ASD','1 - ASD'])
plt.ylabel('Count')
plt.title('Distribution of Target (ASD)')
plt.show()

In [None]:
# One-hot encode Ethnicity: one integer 0/1 column per category, prefixed 'Ethnicity'.
oh = pd.get_dummies(dataset['Ethnicity'], prefix='Ethnicity', dtype=int)
# Replace the text column with the indicator columns; `dataset` itself is left untouched.
dataset_oh = pd.concat([dataset.drop(columns='Ethnicity'), oh], axis=1)
# Every column of the encoded frame except the target is a model input.
FEATURE_COLUMNS_OH = dataset_oh.columns.drop(TARGET_COLUMN)
dataset_oh.head()

In [None]:
def train_test_valid_split(X, y):
    '''Split X and y into 80% train, 10% validation and 10% test partitions.

    Both splitting stages use ``random_state=SEED``.

    Returns:
        The tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    '''
    # Stage 1: keep 80% for training and hold out the remaining 20%.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )
    # Stage 2: cut the hold-out in half -> 10% validation and 10% test overall.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=SEED
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

## Decision Tree Classifier

In [None]:
class Trainer:
    '''Base class holding one train/test split and the scoring logic.

    ``fit`` reads ``self.model``, which this class does not set; subclasses
    assign it after calling ``super().__init__``.
    '''

    def __init__(self, X, y):
        '''Store X and y and split them 80/20 into train and test (seeded with SEED).'''
        self.X, self.y = X, y
        split = train_test_split(self.X, self.y, test_size=0.20, random_state=SEED)
        self.X_train, self.X_test, self.y_train, self.y_test = split

    def fit(self, results_dict):
        '''Fit ``self.model`` on the training partition and score it on the test partition.

        ``results_dict`` is filled in place: the fitted model goes under
        'model', followed by the metrics written by ``store_results``.
        '''
        fitted = self.model.fit(self.X_train, self.y_train)
        predictions = fitted.predict(self.X_test)

        results_dict['model'] = fitted
        self.store_results(results_dict, predictions)

    def store_results(self, results_dict, y_pred):
        '''Compute test-set metrics for ``y_pred`` and write them into ``results_dict``.'''
        # (key, metric function) pairs, applied in this order to (y_test, y_pred).
        scorers = (
            ('accuracy', metrics.accuracy_score),
            ('balanced_accuracy', metrics.balanced_accuracy_score),
            ('precision', metrics.precision_score),
            ('report', metrics.classification_report),
            ('confusion_matrix', metrics.confusion_matrix),
        )
        for key, scorer in scorers:
            results_dict[key] = scorer(self.y_test, y_pred)

In [None]:
class DecisionTreeTrainer(Trainer):
    '''Trainer wrapping a seeded DecisionTreeClassifier on the one-hot encoded data.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = DecisionTreeClassifier(random_state=SEED)

class DecisionTreeBalancedTrainer(Trainer):
    '''Decision-tree trainer that passes ``class_weight='balanced'`` to the estimator.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = DecisionTreeClassifier(class_weight='balanced', random_state=SEED)

In [None]:
class SVMTrainer(Trainer):
    '''Trainer wrapping a seeded ``svm.SVC`` with otherwise default settings.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = svm.SVC(random_state=SEED)

class SVMBalancedTrainer(Trainer):
    '''SVC trainer that passes ``class_weight='balanced'`` to the estimator.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = svm.SVC(class_weight='balanced', random_state=SEED)

In [None]:
class KNNTrainer(Trainer):
    '''Trainer wrapping a KNeighborsClassifier with default settings.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = KNeighborsClassifier()

In [None]:
class XGBoostTrainer(Trainer):
    '''Trainer wrapping a seeded ``xgb.XGBClassifier``.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = xgb.XGBClassifier(random_state=SEED)

class XGBoostBalancedTrainer(XGBoostTrainer):
    '''XGBoost trainer that fits with 'balanced' per-sample weights.'''

    def fit(self, results_dict):
        '''Fit with class-balancing sample weights, then score on the test partition.

        Args:
            results_dict: dict filled in place with the fitted model under
                'model' plus the metrics written by ``store_results``.
        '''
        # One weight per training row, derived from the training labels only.
        sample_weights = compute_sample_weight(class_weight='balanced', y=self.y_train)
        trained_model = self.model.fit(self.X_train, self.y_train, sample_weight=sample_weights)
        y_pred = trained_model.predict(self.X_test)

        # Store into the dict that was passed in rather than relying on a
        # same-named variable existing at module level.
        results_dict['model'] = trained_model
        self.store_results(results_dict, y_pred)

In [None]:
class AdaBoostTrainer(Trainer):
    '''Trainer wrapping a seeded AdaBoostClassifier.'''

    def __init__(self):
        features = dataset_oh[FEATURE_COLUMNS_OH]
        target = dataset_oh[TARGET_COLUMN]
        super().__init__(X=features, y=target)

        self.model = AdaBoostClassifier(random_state=SEED)

class AdaBoostBalancedTrainer(AdaBoostTrainer):
    '''AdaBoost trainer that fits with 'balanced' per-sample weights.'''

    def fit(self, results_dict):
        '''Fit with class-balancing sample weights, then score on the test partition.

        Args:
            results_dict: dict filled in place with the fitted model under
                'model' plus the metrics written by ``store_results``.
        '''
        # One weight per training row, derived from the training labels only.
        sample_weights = compute_sample_weight(class_weight='balanced', y=self.y_train)
        trained_model = self.model.fit(self.X_train, self.y_train, sample_weight=sample_weights)
        y_pred = trained_model.predict(self.X_test)

        # Store into the dict that was passed in rather than relying on a
        # same-named variable existing at module level.
        results_dict['model'] = trained_model
        self.store_results(results_dict, y_pred)

## Train Models

In [None]:
# Instantiate every trainer, keyed by display name; each constructor makes its
# own seeded train/test split.
models = {
    label: trainer_cls()
    for label, trainer_cls in (
        ('DecisionTree', DecisionTreeTrainer),
        ('DecisionTree_Balanced', DecisionTreeBalancedTrainer),
        ('SVM', SVMTrainer),
        ('SVM_Balanced', SVMBalancedTrainer),
        ('KNN', KNNTrainer),
        ('XGBoost', XGBoostTrainer),
        ('XGBoost_Balanced', XGBoostBalancedTrainer),
        ('AdaBoost', AdaBoostTrainer),
        ('AdaBoost_Balanced', AdaBoostBalancedTrainer),
    )
}

# Fit each trainer and collect one row (name + everything fit() stored) per model.
records = []

for name, model in models.items():
    result_dict = {}
    model.fit(result_dict)
    records.append(dict({'Name': name}, **result_dict))

df_results = pd.DataFrame(records)
# Display the table without the text report and confusion-matrix columns; the
# result is not assigned, so df_results itself keeps every column.
df_results.drop(columns=['report', 'confusion_matrix'])