### Importing Libraries 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.cluster import KMeans # Here is where you import the k-means algorithm from scikit-learn.
import pylab as pl # PyLab is a convenience module that bulk imports matplotlib.

In [3]:
# This option ensures the graphs you create are displayed in your notebook without the need to "call" them specifically.

%matplotlib inline

### Importing Data

In [4]:
# Path
path= r"C:\Users\nabar\OneDrive\Documents\A6 Project"
# Dataframe
df=pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'cleaned_dataset.csv'))

### Prep Data

In [5]:
df.shape

(1985, 28)

In [6]:
df.columns

Index(['CASE_NO_PATIENT'S', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
       'A9', 'A10_Autism_Spectrum_Quotient', 'Social_Responsiveness_Scale',
       'Age_Years', 'Qchat_10_Score', 'Speech Delay/Language Disorder',
       'Learning disorder', 'Genetic_Disorders', 'Depression',
       'Global developmental delay/intellectual disability',
       'Social/Behavioural Issues', 'Childhood Autism Rating Scale',
       'Anxiety_disorder', 'Sex', 'Ethnicity', 'Jaundice',
       'Family_mem_with_ASD', 'Who_completed_the_test', 'ASD_traits'],
      dtype='object')

In [7]:
print(df.info())  # Get an overview of columns, data types, and missing values
print(df.head())  # Preview the first few rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1985 entries, 0 to 1984
Data columns (total 28 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   CASE_NO_PATIENT'S                                   1985 non-null   int64  
 1   A1                                                  1985 non-null   int64  
 2   A2                                                  1985 non-null   int64  
 3   A3                                                  1985 non-null   int64  
 4   A4                                                  1985 non-null   int64  
 5   A5                                                  1985 non-null   int64  
 6   A6                                                  1985 non-null   int64  
 7   A7                                                  1985 non-null   int64  
 8   A8                                                  1985 non-null   int64  
 9

In [8]:
df_numeric = df.select_dtypes(include=[np.number])  # Keep only numerical columns

In [9]:
df_numeric.fillna(df_numeric.mean(), inplace=True)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

### Elbow Method Implementation

In [11]:
df_cleaned = df_scaled[~np.isnan(df_scaled).any(axis=1)]

In [12]:
from sklearn.impute import SimpleImputer

In [13]:
imputer = SimpleImputer(strategy='mean')  
df_cleaned = imputer.fit_transform(df_scaled)

In [14]:
import warnings
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak")

In [15]:
optimal_k = 3  # Set the best k-value based on the elbow method

kmeans_final = KMeans(n_clusters=optimal_k, random_state=42)
df['Cluster'] = kmeans_final.fit_predict(df_cleaned)

# View cluster assignments
print(df[['Cluster']].value_counts())

Cluster
1          931
0          651
2          403
Name: count, dtype: int64


In [17]:
import os

# Create the directory if it doesn't exist
os.makedirs('path/02 Data/Prepared Data', exist_ok=True)

# Now save the file in that path
df.to_csv('path/02 Data/Prepared Data/clustered_data.csv', index=False)


In [22]:
# Assuming df is your main DataFrame
df['Cluster'] = kmeans_final.fit_predict(df_cleaned)


In [23]:
import os
os.makedirs('path/02 Data/Prepared Data', exist_ok=True)
df.to_csv('path/02 Data/Prepared Data/clustered_data.csv', index=False)


In [24]:
import os
print(os.getcwd())


C:\Users\nabar\OneDrive\Documents\A6 Project\03 Scripts


In [25]:
df.to_csv(r'C:\Users\nabar\OneDrive\Documents\A6 Project\02 Data\Prepared Data\clustered_data.csv', index=False)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# 1. Define the range for number of clusters
num_cl = range(1, 10)

# 2. Initialize lists to store models, WCSS, and scores
kmeans_models = []
wcss = []
scores = []

# 3. Fit models once per k, collect inertia_ and score()
for k in num_cl:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df_cleaned)
    kmeans_models.append(model)
    wcss.append(model.inertia_)
    scores.append(model.score(df_cleaned))

# 4. Plot the Elbow Method and Score Evaluation
plt.figure(figsize=(12, 5))

# Elbow plot
plt.subplot(1, 2, 1)
plt.plot(num_cl, wcss, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.title('Elbow Method')

# Score evaluation plot
plt.subplot(1, 2, 2)
plt.plot(num_cl, scores, marker='s', linestyle='--', color='r')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Score (Negative WCSS)')
plt.title('Score Evaluation for Cluster Selection')

plt.tight_layout()
plt.show()

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
df.groupby('Cluster')[numeric_cols].mean().T

In [None]:
# Selecting a subset of numerical columns for visualization
selected_columns = ['A1', 'A2', 'A3', 'Qchat_10_Score', 'ASD_traits', 'Cluster']  # Modify based on dataset
df_subset = df[selected_columns]

# Create pairwise scatterplots with cluster coloring
sns.pairplot(df_subset, hue='Cluster', palette='viridis')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(18, 5))

# 1. Qchat_10_Score vs Childhood Autism Rating Scale
plt.subplot(1, 3, 1)
sns.scatterplot(
    x=df['Qchat_10_Score'],
    y=df['Childhood Autism Rating Scale'],
    hue=df['Cluster'],
    palette='viridis'
)
plt.title('Qchat vs CARS')
plt.xlabel('Qchat_10_Score')
plt.ylabel('CARS')

# 2. ASD_traits vs Qchat_10_Score
plt.subplot(1, 3, 2)
sns.scatterplot(
    x=df['ASD_traits'],
    y=df['Qchat_10_Score'],
    hue=df['Cluster'],
    palette='viridis'
)
plt.title('ASD Traits vs Qchat')
plt.xlabel('ASD_traits')
plt.ylabel('Qchat_10_Score')

# 3. ASD_traits vs Childhood Autism Rating Scale
plt.subplot(1, 3, 3)
sns.scatterplot(
    x=df['ASD_traits'],
    y=df['Childhood Autism Rating Scale'],
    hue=df['Cluster'],
    palette='viridis'
)
plt.title('ASD Traits vs CARS')
plt.xlabel('ASD_traits')
plt.ylabel('CARS')

plt.tight_layout()
plt.show()

### Plot 1: **Qchat vs CARS**
- **X-axis**: Qchat_10_Score (a quick ASD screening tool)
- **Y-axis**: Childhood Autism Rating Scale (CARS, a deeper clinical tool)

**Interpretation**:  
- **Cluster 0** sits higher on both axes → these individuals tend to have elevated scores on both early screening and clinical evaluation, suggesting strong ASD indicators.
- **Cluster 1** huddles in the bottom-left corner → low scores on both Qchat and CARS, likely neurotypical cases.
- **Cluster 2** is mostly low on Qchat but slightly higher on CARS → could represent mixed or subclinical patterns, where clinical traits show despite a low screening score.

---

### Plot 2: **ASD Traits vs Qchat**
- **X-axis**: ASD_traits (your binary or probabilistic ASD classification)
- **Y-axis**: Qchat_10_Score

**Interpretation**:  
- **Cluster 0** clumps at **ASD_traits = 1** and **Qchat > 6**, showing strong screening alignment.
- **Cluster 1** mostly hangs around **ASD_traits = 0** and low Qchat scores.
- **Cluster 2** spreads across low trait values but is a bit messy—this may capture borderline or noisy cases, suggesting Qchat doesn’t always align neatly with trait inference.

---

### Plot 3: **ASD Traits vs - Childhood Autism Rating Scale**
- Similar to Plot 2 but replacing Qchat with CARS.

**Interpretation**:  
- **Cluster 0** again shows a strong positive relationship: **high ASD traits → high CARS scores**.
- **Cluster 2** is interesting—it stays low on traits but is somewhat elevated on CARS. This might indicate under-detection in the trait labeling or overlap with non-ASD conditions that still score high on CARS.
- **Cluster 1** is consistently low on both.

---

These visualizations show how **unsupervised clusters align with actual ASD screening scores**—even without feeding those scores directly into the clustering. That’s a strong validation of the model.




### Cluster Analysis Interpretation Using Core ASD Features:
To validate the unsupervised clusters, we visualized how each group aligns with three key ASD-related variables:
1. Qchat_10_Score vs Childhood Autism Rating Scale
This comparison revealed a strong linear pattern within Cluster 0, which had high scores on both metrics—indicating individuals with consistent ASD indicators. Cluster 1 remained low on both, suggesting neurotypical profiles, while Cluster 2 hovered in between, showing modest CARS scores but low Qchat results.
2. ASD Traits vs Qchat_10_Score
Cluster 0 displayed perfect alignment between high ASD traits and high Qchat scores. Clusters 1 and 2, however, included individuals with low or zero ASD traits but varying Qchat scores—highlighting possible inconsistencies between screening flags and final labels.
3. ASD Traits vs Childhood Autism Rating Scale
This pairing showed that CARS scores were elevated for some cases even when ASD_traits = 0 (notably in Cluster 2), suggesting that clinical assessments may capture subtler traits not flagged in trait classification alone. Cluster 1 was again positioned at the lower-left, consistent with minimal behavioral indicators.

* The clustering structure aligns meaningfully with validated screening tools—especially in identifying one clear group (Cluster 0) with strong ASD characteristics. Cluster 2 may represent borderline or subclinical profiles worth further attention.


### Future Applications of Clustering Results:

The unsupervised clusters identified in this analysis can be used in several key ways:

- **Model Feature**: Use cluster labels as an input feature in supervised learning tasks (e.g., predicting ASD diagnosis or treatment outcomes).
- **Targeted Profiling**: Tailor screening or interventions by identifying distinct subgroups with shared characteristics.
- **Improved Screening Strategy**: Use clusters to uncover borderline or overlooked profiles that standard tools may miss.
- **Dimensionality Reduction Input**: Leverage PCA components for improved model performance and noise reduction.
- **Stakeholder Communication**: Present clusters visually to support decision-making in clinical or educational settings.

In [None]:
from sklearn.decomposition import PCA
# Reduce features to 2 principal components
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_cleaned)

# Create scatterplot with PCA-transformed features
sns.scatterplot(x=df_pca[:,0], y=df_pca[:,1], hue=df['Cluster'], palette='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA-Based Cluster Visualization')
plt.show()

In [None]:
# If your cluster assignments came from a separate array (e.g., df_cleaned)
df['Cluster'] = kmeans_final.labels_  # Ensure you're assigning a flat 1D array

# Now retry selecting numeric columns and grouping
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cluster_means = df[numeric_cols].groupby(df['Cluster']).mean()
print(cluster_means)

### Cluster Interpretation Summary:
##### Plot:
That scatter plot is a 2D map of the data created by Principal Component Analysis (PCA). Each point represents one person (or case) in the dataset. Since the dataset has many features (like A1 to A10, Qchat score, etc.), PCA compresses all of those into two new axes:
- PC1 (x-axis): The direction across the data that explains the most variation.
- PC2 (y-axis): The direction that explains the next most variation, at a right angle to PC1.
In other words, this plot is showing the “shape” of the data from a bird's eye view, using just the two most important ingredients.


##### What the Colors Mean:

The colors represent clusters, based on the k-means results:
- Each cluster groups people who had similar patterns across all those original variables.
- So when see tight blobs of the same color, it means those data points (people) are grouped together based on how similar they are across the dataset.
When those blobs are spread out from one another, that suggests the clusters are well-separated, which is exactly what we want in a good clustering solution.


##### Cluster Interpretation:

After applying k-means clustering with k = 3, the resulting clusters show clear distinctions:
- Cluster 0 represents individuals with consistent indicators of ASD traits and developmental concerns.
- Cluster 1 appears to be a neurotypical group with minimal clinical symptoms.
- Cluster 2 seems to fall in between—potentially reflecting subclinical or mixed presentations, with higher rates of comorbidities but fewer ASD traits.

### Proposals for Future Steps:
##### Future Applications of Clustering Results:
- Refinement of diagnostic pathways: The presence of distinct subgroups may help prioritize more tailored screening or follow-up assessments.
- Feature selection for supervised learning: Now that we see which features contribute to meaningful cluster separation, we can use them to train classifiers or predictive models.
- Targeted intervention strategies: The clusters suggest that different subpopulations may benefit from tailored support strategies—particularly the mixed-symptom group in Cluster 2.
- Data validation: Future work could compare clusters to clinical labels (if available) to validate these findings or guide semi-supervised approaches.


