# ASD Screening Dataset Exploration

This notebook explores the AQ-10 screening dataset for autism spectrum disorder classification.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply a global look to every figure drawn below: a matplotlib style sheet
# plus seaborn's 'husl' colour palette as the default colour cycle.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the environment this notebook targets.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## Load the Dataset

In [None]:
# Read both CSV splits (paths are relative to the notebook's directory) and
# report their sizes plus the training columns as a quick sanity check.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/train.csv', '../dataset/test.csv')
)

n_train, n_test = len(train_df), len(test_df)
print(f"Training samples: {n_train}")
print(f"Test samples: {n_test}")
print(f"\nColumns: {train_df.columns.tolist()}")

In [None]:
# Preview the first rows of the training split; as the last expression in the
# cell, the frame is rendered as the cell's output.
train_df.head()

In [None]:
# Summarise the training split: per-column dtype and non-null count, which is
# the quickest way to spot columns with missing values.
train_df.info()

## Target Variable Distribution

In [None]:
# Class distribution: bar chart of counts next to a pie chart of shares.
#
# value_counts() orders classes by frequency, not by class value, so labels
# must not be hard-coded positionally. Counts are sorted by class value and
# every label/colour is looked up from the value itself, which keeps the plot
# correct no matter which class is the majority.
class_names = {0: 'No ASD', 1: 'ASD'}
class_colors = {0: '#2ecc71', 1: '#e74c3c'}

class_counts = train_df['Class/ASD'].value_counts().sort_index()
# Unknown class values fall back to their raw value / a neutral grey instead
# of silently borrowing another class's label.
class_labels = [class_names.get(value, str(value)) for value in class_counts.index]
class_palette = [class_colors.get(value, '#95a5a6') for value in class_counts.index]

fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 4))

# Bar plot
class_counts.plot(kind='bar', ax=ax_bar, color=class_palette)
ax_bar.set_title('ASD Classification Distribution')
ax_bar.set_xlabel('Class (0=No ASD, 1=ASD)')
ax_bar.set_ylabel('Count')
ax_bar.set_xticklabels(class_labels, rotation=0)

# Pie chart
class_counts.plot(kind='pie', ax=ax_pie, autopct='%1.1f%%',
                  colors=class_palette, labels=class_labels)
ax_pie.set_title('Class Distribution (%)')
ax_pie.set_ylabel('')  # drop the default y-label pandas adds to pie charts

plt.tight_layout()
plt.show()

# Print the same (class-ordered) counts that were plotted.
print(f"\nClass counts:\n{class_counts}")

## AQ-10 Score Analysis

In [None]:
# AQ-10 question columns (A1_Score ... A10_Score); reused by later cells.
aq10_cols = [f'A{i}_Score' for i in range(1, 11)]

# Total AQ-10 score per respondent. Re-running this cell is safe: the sum only
# reads the ten item columns, never the derived column itself.
train_df['AQ10_Total'] = train_df[aq10_cols].sum(axis=1)

# Both classes must share the same bin edges, otherwise each histogram is
# binned over its own min/max and the overlaid bars are not comparable.
# Unit-width bins centred on the integer totals: [-0.5, 0.5), [0.5, 1.5), ...
min_total = int(train_df['AQ10_Total'].min())
max_total = int(train_df['AQ10_Total'].max())
score_bins = np.arange(min_total, max_total + 2) - 0.5

# Distribution by class
fig, ax = plt.subplots(figsize=(10, 5))

for label, group in train_df.groupby('Class/ASD'):
    group['AQ10_Total'].hist(alpha=0.6, bins=score_bins, ax=ax,
                             label=f"{'ASD' if label==1 else 'No ASD'}")

ax.set_xticks(range(min_total, max_total + 1))  # one tick per possible total
ax.set_xlabel('AQ-10 Total Score')
ax.set_ylabel('Frequency')
ax.set_title('AQ-10 Total Score Distribution by Class')
ax.legend()
plt.show()

In [None]:
# Individual question analysis: one panel per AQ-10 item showing, for each
# response value, the percentage of respondents in each class
# (rows of the crosstab are normalised to 100%).
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
axes = axes.flatten()

for panel, col in zip(axes, aq10_cols):
    cross_tab = pd.crosstab(train_df[col], train_df['Class/ASD'], normalize='index') * 100
    # rot=0 keeps pandas' own tick labels (the actual response values) upright,
    # so nothing is hard-coded and a column with a single response still plots.
    cross_tab.plot(kind='bar', ax=panel, legend=False, rot=0,
                   color=['#2ecc71', '#e74c3c'])
    panel.set_title(f'{col}', fontsize=10)
    panel.set_xlabel('')

# One shared legend for the whole grid; without it the two bar colours cannot
# be told apart. Handles come from the first panel, whose labels are the
# crosstab column values rendered as strings.
handles, raw_labels = axes[0].get_legend_handles_labels()
class_names = {'0': 'No ASD', '1': 'ASD'}
fig.legend(handles, [class_names.get(text, text) for text in raw_labels],
           title='Class', loc='upper right')

# Both classes are drawn, so the title describes class share rather than "% ASD".
plt.suptitle('AQ-10 Questions: % of Each Class by Response', fontsize=14)
plt.tight_layout()
plt.show()

## Demographic Analysis

In [None]:
# Age distribution: overlaid per-class histograms (left) and a per-class
# box plot (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Compute the 30 bin edges once over all ages so both classes are binned
# identically; passing bins=30 per group would bin each class over its own
# min/max and make the overlaid histograms incomparable.
age_bins = np.histogram_bin_edges(train_df['age'].dropna(), bins=30)

# Age histogram
for label, group in train_df.groupby('Class/ASD'):
    group['age'].hist(alpha=0.6, bins=age_bins, ax=axes[0],
                      label=f"{'ASD' if label==1 else 'No ASD'}")
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Age Distribution by Class')
axes[0].legend()

# Age boxplot
train_df.boxplot(column='age', by='Class/ASD', ax=axes[1])
axes[1].set_title('Age by Class')
axes[1].set_xlabel('Class (0=No ASD, 1=ASD)')
axes[1].set_ylabel('Age')
# boxplot(by=...) adds its own figure-level "grouped by" title; clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Gender analysis: overall gender counts (left) and counts split by class
# (right).
#
# Tick labels are derived from the category values behind each bar rather than
# hard-coded by position: value_counts() orders by frequency while crosstab
# orders by value, so positional labels can silently mislabel the bars.
# NOTE(review): assumes gender is coded 'f' / 'm' — verify against the data;
# any other code is shown as-is.
gender_names = {'f': 'Female', 'm': 'Male'}

# Sort by value so both panels list the genders in the same order.
gender_counts = train_df['gender'].value_counts().sort_index()
gender_by_class = pd.crosstab(train_df['gender'], train_df['Class/ASD'])

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Gender distribution
gender_counts.plot(kind='bar', ax=axes[0])
axes[0].set_title('Gender Distribution')
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(
    [gender_names.get(str(code).strip().lower(), str(code)) for code in gender_counts.index],
    rotation=0,
)

# Gender by class
gender_by_class.plot(kind='bar', ax=axes[1])
axes[1].set_title('Gender by ASD Classification')
axes[1].set_xlabel('Gender')
axes[1].set_ylabel('Count')
axes[1].legend(['No ASD', 'ASD'])
axes[1].set_xticklabels(
    [gender_names.get(str(code).strip().lower(), str(code)) for code in gender_by_class.index],
    rotation=0,
)

plt.tight_layout()
plt.show()

In [None]:
# Birth and family history: share of each class within every answer group,
# one panel per risk factor. Each crosstab row is normalised to sum to 1.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (column, panel title, x-axis label) for each panel, left to right.
# NOTE(review): 'austim' is the column name exactly as this notebook reads it;
# presumably the dataset's own spelling — do not "correct" it here.
history_panels = (
    ('jaundice', 'ASD Rate by Jaundice at Birth', 'Jaundice'),
    ('austim', 'ASD Rate by Family ASD History', 'Family ASD History'),
)

for panel, (column, title, x_label) in zip(axes, history_panels):
    class_share = pd.crosstab(train_df[column], train_df['Class/ASD'], normalize='index')
    class_share.plot(kind='bar', ax=panel)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.legend(['No ASD', 'ASD'])
    panel.set_xticklabels(['No', 'Yes'], rotation=0)

plt.tight_layout()
plt.show()

## Feature Correlation

In [None]:
# Pairwise correlations between the ten AQ-10 items, their total, and the
# target, drawn as an annotated heatmap centred on zero.
corr_cols = [*aq10_cols, 'AQ10_Total', 'Class/ASD']
corr_matrix = train_df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Correlation of each AQ-10 item and age with the target, sorted ascending
# and drawn as horizontal bars (red = negative, green = non-negative).
feature_cols = [*aq10_cols, 'age']
target_corr = train_df[feature_cols].corrwith(train_df['Class/ASD']).sort_values()
bar_colors = target_corr.lt(0).map({True: '#e74c3c', False: '#2ecc71'}).tolist()

fig, ax = plt.subplots(figsize=(10, 6))
target_corr.plot(kind='barh', color=bar_colors, ax=ax)
ax.set_xlabel('Correlation with ASD')
ax.set_title('Feature Correlation with Target Variable')
ax.axvline(x=0, color='black', linestyle='--')  # zero reference line
fig.tight_layout()
plt.show()

## Key Insights

1. **Class Imbalance**: The ASD and non-ASD classes are not equally represented (see the class distribution plot), so accuracy alone can be a misleading evaluation metric
2. **AQ-10 Score**: Higher total AQ-10 scores strongly correlate with ASD classification
3. **Individual Questions**: Some questions (e.g., A7, A10) have stronger predictive power
4. **Family History**: Having a family member with ASD is associated with a higher rate of ASD classification
5. **Gender**: Males are more heavily represented in the ASD-positive group