In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
# Read the dataset
file_path = 'C:/Users/Kimbe/Downloads/mental_health_data.xlsx'
print("Loading data...")
df = pd.read_excel(file_path)
df

Loading data...


Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:00,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:00,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:00,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,2015-07-27 23:25:00,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292360,2015-08-17 09:38:00,Male,South Africa,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes
292361,2015-08-25 19:59:00,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,2015-09-26 01:07:00,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


In [3]:
df = df[df['Country'] == 'United States'].copy()  # Filtering data
df.drop('Timestamp', axis=1, inplace=True)  # Removing the Timestamp column

In [4]:
# Encode categorical features
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le


In [5]:
# Define X and y
y = df['treatment']
X = df.drop('treatment', axis=1)

In [6]:
# Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Fit Final KMeans Model
optimal_k = 6
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df['Cluster'] = clusters


In [8]:
# Summarize Clusters (Profiles)
def summarize_cluster(group):
    summary = {}
    for col in group.columns:
        if col == 'Cluster':
            continue
        if group[col].dtype == 'object':
            summary[col] = group[col].mode()[0]
        else:
            summary[col] = round(group[col].mean(), 2)
    return pd.Series(summary)

cluster_summary = df.groupby('Cluster').apply(summarize_cluster)
print("Cluster Profiles:\n")
print(cluster_summary)

Cluster Profiles:

         Gender  Country  Occupation  self_employed  family_history  \
Cluster                                                               
0          1.00      0.0        2.15           0.00            0.41   
1          1.00      0.0        1.61           0.00            0.41   
2          0.00      0.0        2.12           0.00            0.58   
3          0.94      0.0        2.03           0.00            0.37   
4          0.83      0.0        2.05           1.26            0.51   
5          1.00      0.0        2.32           0.00            0.41   

         treatment  Days_Indoors  Growing_Stress  Changes_Habits  \
Cluster                                                            
0             0.52          2.21            1.01            1.06   
1             0.52          1.46            1.07            1.22   
2             0.69          2.03            1.10            1.04   
3             0.42          1.95            0.98            1.05   
4   

  cluster_summary = df.groupby('Cluster').apply(summarize_cluster)


In [9]:
# Derive Treatment Suggestions Based on Profiles
treatment_suggestions = {}

for cluster_num, row in cluster_summary.iterrows():
    stress = row['Growing_Stress']
    mood = row['Mood_Swings']
    history = row['Mental_Health_History']
    coping = row['Coping_Struggles']
    treatment_likelihood = row['treatment']
    care = row['care_options']
    
    if history > 1.0 and mood > 1.5:
        suggestion = 'Medication + CBT'
    elif coping > 0.9:
        suggestion = 'Stress Management Coaching & Support Groups'
    elif stress > 1.2 and care > 1.1:
        suggestion = 'Stress & Behavior Therapy Combo'
    elif mood > 1.6 and row['Social_Weakness'] > 1.0:
        suggestion = 'CBT & Social Skills Training'
    elif history < 0.9 and stress < 1.0 and mood < 0.6:
        suggestion = 'Preventive Education + Peer Support'
    else:
        suggestion = 'General Counseling & Routine Screening'

    treatment_suggestions[cluster_num] = suggestion

In [10]:
# Assign Treatment Suggestions to Each Participant
df['treatment_suggestion'] = df['Cluster'].map(treatment_suggestions)

In [11]:
# Display Final Cluster Summary and Suggestions
print("\nTreatment Suggestions by Cluster:\n", treatment_suggestions)


Treatment Suggestions by Cluster:
 {0: 'Stress Management Coaching & Support Groups', 1: 'Medication + CBT', 2: 'General Counseling & Routine Screening', 3: 'General Counseling & Routine Screening', 4: 'General Counseling & Routine Screening', 5: 'Preventive Education + Peer Support'}
