ASD Prediction Model

In [None]:
# Import relevant modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from operator import index


data = pd.read_csv('data_csv.csv')

In [None]:
# Create a copy of data

df = data.copy()
df.head()

In [None]:
# Check the features of dataset

df.info()

In [None]:
# Add up the ten AQ item columns row-wise to get the final AQ-10 score

cols_to_sum = [f'A{item}' for item in range(1, 10)]
cols_to_sum.append('A10_Autism_Spectrum_Quotient')

df['AQ-10 score'] = df[cols_to_sum].sum(axis='columns')

In [None]:
# The individual AQ item columns are now summarised by 'AQ-10 score', so remove them
df = df.drop(columns=cols_to_sum)

In [None]:
df.head()

#### Exploratory Data Analysis

In [None]:
# Visualise the count of children with genetic disorders

sns.catplot(data=df, x='Genetic_Disorders', kind='count')

In [None]:
sns.catplot(data=df, x='Speech Delay/Language Disorder', kind='count')

In [None]:
# Check for a relationship between social responsiveness and the Q-CHAT-10 score,
# drawing one panel per value of the developmental-delay column

sns.relplot(
    data=df,
    kind='scatter',
    x='Social_Responsiveness_Scale',
    y='Qchat_10_Score',
    col='Global developmental delay/intellectual disability',
)

In [None]:
sns.relplot(data=df, x='Childhood Autism Rating Scale', y='Qchat_10_Score',
            kind='scatter', col='Genetic_Disorders')

In [None]:
sns.regplot(data=df.query('Qchat_10_Score>=6'),x='Childhood Autism Rating Scale', y='Qchat_10_Score')

In [None]:
# Identify count of children with Q10 score above 6 and genetic disorders

sns.catplot(data=df.query('Qchat_10_Score >= 6'), x='Genetic_Disorders', kind='count')

In [None]:

sns.catplot(data=df.query('Qchat_10_Score >= 6'), x='Speech Delay/Language Disorder', kind='count')

Correlation could be hidden because of the scale of measurement; normalising the data may help.

In [None]:
df.info()

In [None]:
# Finding missing values

df.isna().sum()

In [None]:
# Remove the Q-CHAT-10 score column; none of the later cells shown here use it
df = df.drop(columns='Qchat_10_Score')

In [None]:
# Shorten the verbose column names.
# Keys must match the CSV headers exactly: DataFrame.rename silently skips keys it
# cannot find, so a casing typo leaves the column unrenamed without any error.
df = df.rename(columns={
    'Social_Responsiveness_Scale': 'SRS',
    # lower-case "intellectual", matching the column name the EDA relplot reads
    'Global developmental delay/intellectual disability': 'GDD/ID',
    'Family_mem_with_ASD': 'FMHx',
    'Who_completed_the_test': 'Respondent',
    'Childhood Autism Rating Scale': 'CARS',
    'Learning disorder': 'SLD',
    'Age_Years': 'Age',
    'Speech Delay/Language Disorder': 'Speech_lang',
})

In [None]:
# Fill missing 'Social/Behavioural Issues' values with the most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on df[...]:
# that form is chained assignment, which warns on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['Social/Behavioural Issues'] = df['Social/Behavioural Issues'].fillna(
    df['Social/Behavioural Issues'].mode()[0]
)

In [None]:
# Fill missing 'Depression' values with the most frequent value.
# Assigned back instead of fillna(inplace=True) on df[...], which is chained
# assignment: it warns on pandas 2.x and does not modify df under Copy-on-Write.
df['Depression'] = df['Depression'].fillna(df['Depression'].mode()[0])

In [None]:
df.isna().sum()

In [None]:
# Boolean mask over the columns: True where the dtype is object (the categorical features)

cat = df.dtypes.eq('object')
cat_col = df.columns[cat.to_numpy()]
cat_col

In [None]:
# Replace every object column with its integer category codes
# (pandas encodes any remaining missing value as -1)
for col in cat_col:
    df[col] = df[col].astype('category').cat.codes

In [None]:
# Remove the 'Respondent' column so it is not passed to the models as a feature
df = df.drop(columns='Respondent')

In [None]:
# Fill missing SRS values with the column mean.
# Assigned back instead of fillna(inplace=True) on df[...], which is chained
# assignment: it warns on pandas 2.x and does not modify df under Copy-on-Write.
# NOTE(review): the mean is taken over all rows, including those that later end up
# in the test split — consider imputing after the split.
df['SRS'] = df['SRS'].fillna(df['SRS'].mean())

In [None]:
df.info()

In [None]:
# Separate the target: pop removes 'ASD_traits' from df in place and returns it
y = df.pop('ASD_traits')

# X is the same object as df (not a copy), so in-place changes to one show up in the other
X= df

In [None]:
X.pop("CASE_NO_PATIENT'S")

In [None]:
X.head()

#### Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified hold-out split: 30% of the rows go to the test set, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, RocCurveDisplay, classification_report, roc_auc_score

# plot_roc_curve was removed in scikit-learn 1.2 (RocCurveDisplay replaces it), so an
# unconditional import makes this cell fail on current releases. Keep the name on a
# best-effort basis only; none of the cells shown here call it.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.svm import SVC

svc = SVC()

In [None]:
svc.fit(X_train, y_train)

In [None]:
log_reg = LogisticRegression()

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
log_pred = log_reg.predict(X_test)
svc_pred = svc.predict(X_test)

In [None]:
# Confusion matrices for both fitted models, logistic regression first and SVC second,
# mirroring the pair of classification reports printed in the next cell
print(confusion_matrix(y_test, log_pred))
print('\n')
print(confusion_matrix(y_test, svc_pred))

In [None]:
print(classification_report(y_test, log_pred))

print('\n')

print(classification_report(y_test, svc_pred))

In [None]:
log_proba = log_reg.predict_proba(X_test)

In [None]:
roc_auc_score(y_test, log_proba[:, 1])

In [None]:
def plot_sklearn_roc_curve(y_real, y_pred):
    """Draw the ROC curve of the scores `y_pred` against the labels `y_real`.

    The curve is rendered with RocCurveDisplay on a 5x5-inch figure, and a green
    diagonal from (0, 0) to (1, 1) is added as a reference line.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_real, y_pred)
    display = RocCurveDisplay(fpr=false_pos_rate, tpr=true_pos_rate)
    display.plot()
    display.figure_.set_size_inches(5, 5)
    # Reference diagonal, drawn on the axes the display has just created
    plt.plot([0, 1], [0, 1], color='g')

# ROC curve for the logistic regression, using the second column of predict_proba as the score
plot_sklearn_roc_curve(y_test, log_proba[:, 1])

In [None]:
# Rank the features by their mutual information (MI) with the target

from sklearn.feature_selection import mutual_info_classif

# Estimate MI on the training split only, so held-out rows do not influence which
# features are kept. random_state is fixed because the estimator adds random noise
# to continuous features, which otherwise makes the scores differ between runs.
# NOTE(review): most columns look like integer category codes; consider passing
# discrete_features for them — confirm which columns are truly continuous.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_scores = pd.Series(mi_scores, name='MI scores', index=X_train.columns)

scores = mi_scores.sort_values(ascending=False)

print(scores)

In [None]:
# Reduced feature set without four columns
# (presumably the lowest-ranked ones in the MI scores — confirm against the printed ranking)
X_new = X.drop(columns=['Jaundice', 'Age', 'FMHx', 'Speech_lang'])

In [None]:
# Reuse the rows of the first split instead of drawing a new one. A second
# train_test_split here (different seed and test size) overwrote y_train / y_test,
# so the full and reduced models were scored on different rows, and re-running the
# earlier evaluation cells paired old predictions with new labels.
# y_train / y_test from the first split stay valid: the row order matches by index.
X_new_train = X_new.loc[X_train.index]
X_new_test = X_new.loc[X_test.index]

In [None]:
log_reg_new = LogisticRegression()

In [None]:
# Fit model on the data

log_reg_new.fit(X_new_train, y_train)

In [None]:
log_proba_new = log_reg_new.predict_proba(X_new_test)

In [None]:
log_pred_new = log_reg_new.predict(X_new_test)

In [None]:
# Print the classification report

print(classification_report(y_test, log_pred_new))