In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Carregar dados preparados
data = pd.read_csv('./data/prepared_data.csv')
data.head()

Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Social_Isolation_Score,Salary_Range,Physical_Health_Shoulder_Pain,Physical_Health_Neck_Pain,Physical_Health_Back_Pain,Physical_Health_Eye_Strain,Physical_Health_Wrist_Pain
0,2025-06-01,27,Female,Asia,Professional Services,Data Analyst,Onsite,64,Stress Disorder,High,3,2,$40K-60K,1,0,0,0,0
1,2025-06-01,37,Female,Asia,Professional Services,Data Analyst,Onsite,37,Stress Disorder,High,4,2,$80K-100K,0,0,1,0,0
2,2025-06-01,32,Female,Africa,Education,Business Analyst,Onsite,36,ADHD,High,3,2,$80K-100K,1,0,0,0,0
3,2025-06-01,40,Female,Europe,Education,Data Analyst,Onsite,63,ADHD,Medium,1,2,$60K-80K,1,0,0,0,0
4,2025-06-01,52,Male,Oceania,Customer Service,Business Analyst,Onsite,61,Burnout,Medium,4,3,$60K-80K,0,0,1,0,0


In [4]:
data.drop(columns=['Survey_Date'], inplace=True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2877 entries, 0 to 2876
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Age                            2877 non-null   int64 
 1   Gender                         2877 non-null   object
 2   Region                         2877 non-null   object
 3   Industry                       2877 non-null   object
 4   Job_Role                       2877 non-null   object
 5   Work_Arrangement               2877 non-null   object
 6   Hours_Per_Week                 2877 non-null   int64 
 7   Mental_Health_Status           2877 non-null   object
 8   Burnout_Level                  2877 non-null   object
 9   Work_Life_Balance_Score        2877 non-null   int64 
 10  Social_Isolation_Score         2877 non-null   int64 
 11  Salary_Range                   2877 non-null   object
 12  Physical_Health_Shoulder_Pain  2877 non-null   int64 
 13  Phy

In [6]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Selecionar colunas categóricas
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
encoder = OneHotEncoder(sparse_output=False, drop='if_binary') # Não dropei aqui, porque quero manter todas as categorias
encoded = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols), index=data.index)

# Concatenar com colunas numéricas
data_kmeans = pd.concat([data.drop(columns=categorical_cols), encoded_df], axis=1)

# Normalizar todas as variáveis para o KMeans
scaler = MinMaxScaler()
data_kmeans = pd.DataFrame(scaler.fit_transform(data_kmeans), columns=data_kmeans.columns, index=data_kmeans.index)
data_kmeans.head()

Unnamed: 0,Age,Hours_Per_Week,Work_Life_Balance_Score,Social_Isolation_Score,Physical_Health_Shoulder_Pain,Physical_Health_Neck_Pain,Physical_Health_Back_Pain,Physical_Health_Eye_Strain,Physical_Health_Wrist_Pain,Gender_Female,...,Mental_Health_Status_Stress Disorder,Mental_Health_Status_unknown,Burnout_Level_High,Burnout_Level_Low,Burnout_Level_Medium,Salary_Range_$100K-120K,Salary_Range_$120K+,Salary_Range_$40K-60K,Salary_Range_$60K-80K,Salary_Range_$80K-100K
0,0.116279,0.966667,0.5,0.25,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.348837,0.066667,0.75,0.25,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.232558,0.033333,0.5,0.25,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.418605,0.933333,0.0,0.25,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.697674,0.866667,0.75,0.5,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [7]:
data_kmeans.columns

Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Physical_Health_Shoulder_Pain',
       'Physical_Health_Neck_Pain', 'Physical_Health_Back_Pain',
       'Physical_Health_Eye_Strain', 'Physical_Health_Wrist_Pain',
       'Gender_Female', 'Gender_Male', 'Gender_Non-binary',
       'Gender_Prefer not to say', 'Region_Africa', 'Region_Asia',
       'Region_Europe', 'Region_North America', 'Region_Oceania',
       'Region_South America', 'Industry_Customer Service',
       'Industry_Education', 'Industry_Finance', 'Industry_Healthcare',
       'Industry_Manufacturing', 'Industry_Marketing',
       'Industry_Professional Services', 'Industry_Retail',
       'Industry_Technology', 'Job_Role_Account Manager',
       'Job_Role_Business Analyst', 'Job_Role_Consultant',
       'Job_Role_Content Writer', 'Job_Role_Customer Service Manager',
       'Job_Role_Data Analyst', 'Job_Role_Data Scientist',
       'Job_Role_DevOps Engineer', 'Job_Role_Digital Mar

In [8]:
# Classificação de cluster

from sklearn.cluster import KMeans
modelo = KMeans(n_clusters=4)

modelo.fit(data_kmeans)

print('Grupos {}'.format(modelo.labels_))

Grupos [2 2 2 ... 1 3 1]


  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight


In [9]:
print(data.columns)
print(modelo.cluster_centers_)

Index(['Age', 'Gender', 'Region', 'Industry', 'Job_Role', 'Work_Arrangement',
       'Hours_Per_Week', 'Mental_Health_Status', 'Burnout_Level',
       'Work_Life_Balance_Score', 'Social_Isolation_Score', 'Salary_Range',
       'Physical_Health_Shoulder_Pain', 'Physical_Health_Neck_Pain',
       'Physical_Health_Back_Pain', 'Physical_Health_Eye_Strain',
       'Physical_Health_Wrist_Pain'],
      dtype='object')
[[ 4.88372093e-01  5.06519862e-01  4.88665803e-01  5.18134715e-01
   2.79792746e-01  4.14507772e-02  5.33678756e-01  1.33419689e-01
   1.16580311e-02  5.55111512e-16  9.52072539e-01  3.49740933e-02
   1.29533679e-02  1.73575130e-01  1.82642487e-01  1.45077720e-01
   1.43782383e-01  1.93005181e-01  1.61917098e-01  3.62694301e-02
   8.16062176e-02  9.71502591e-02  6.21761658e-02  1.56735751e-01
   5.69948187e-02  1.70984456e-01  4.79274611e-02  2.90155440e-01
   4.01554404e-02  4.27461140e-02  4.27461140e-02  4.27461140e-02
   4.66321244e-02  4.40414508e-02  4.40414508e-02  4.4041

In [10]:
grupos = pd.DataFrame(modelo.cluster_centers_, columns=data_kmeans.columns)
grupos = grupos.T

In [11]:
grupos

Unnamed: 0,0,1,2,3
Age,0.488372,0.524965,0.518067,0.486673
Hours_Per_Week,0.506520,0.493959,0.493559,0.489064
Work_Life_Balance_Score,0.488666,0.502210,0.509462,0.497493
Social_Isolation_Score,0.518135,0.417127,0.427291,0.345272
Physical_Health_Shoulder_Pain,0.279793,0.269613,0.239044,0.242120
...,...,...,...,...
Salary_Range_$100K-120K,0.116580,0.142541,0.137450,0.181948
Salary_Range_$120K+,0.077720,0.056354,0.059761,0.061605
Salary_Range_$40K-60K,0.187824,0.154696,0.177291,0.140401
Salary_Range_$60K-80K,0.308290,0.333702,0.316733,0.310888


In [12]:
data_kmeans.columns

Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Physical_Health_Shoulder_Pain',
       'Physical_Health_Neck_Pain', 'Physical_Health_Back_Pain',
       'Physical_Health_Eye_Strain', 'Physical_Health_Wrist_Pain',
       'Gender_Female', 'Gender_Male', 'Gender_Non-binary',
       'Gender_Prefer not to say', 'Region_Africa', 'Region_Asia',
       'Region_Europe', 'Region_North America', 'Region_Oceania',
       'Region_South America', 'Industry_Customer Service',
       'Industry_Education', 'Industry_Finance', 'Industry_Healthcare',
       'Industry_Manufacturing', 'Industry_Marketing',
       'Industry_Professional Services', 'Industry_Retail',
       'Industry_Technology', 'Job_Role_Account Manager',
       'Job_Role_Business Analyst', 'Job_Role_Consultant',
       'Job_Role_Content Writer', 'Job_Role_Customer Service Manager',
       'Job_Role_Data Analyst', 'Job_Role_Data Scientist',
       'Job_Role_DevOps Engineer', 'Job_Role_Digital Mar

In [13]:
data_kmeans.Work_Arrangement_Hybrid.value_counts()

Work_Arrangement_Hybrid
0.0    1961
1.0     916
Name: count, dtype: int64

In [14]:
grupo_bem_estar = [
    'Age', 'Hours_Per_Week', 'Work_Life_Balance_Score', 'Social_Isolation_Score',
    'Physical_Health_Shoulder_Pain', 'Physical_Health_Neck_Pain',
    'Physical_Health_Back_Pain', 'Physical_Health_Eye_Strain', 'Physical_Health_Wrist_Pain'
]

grupo_demografia = [
    'Gender_Female', 'Gender_Male', 'Gender_Non-binary', 'Gender_Prefer not to say',
    'Region_Africa', 'Region_Asia', 'Region_Europe', 'Region_North America',
    'Region_Oceania', 'Region_South America'
]

grupo_industria = [
    'Industry_Customer Service', 'Industry_Education', 'Industry_Finance',
    'Industry_Healthcare', 'Industry_Manufacturing', 'Industry_Marketing',
    'Industry_Professional Services', 'Industry_Retail', 'Industry_Technology'
]

grupo_funcoes = [
    'Job_Role_Account Manager', 'Job_Role_Business Analyst', 'Job_Role_Consultant',
    'Job_Role_Content Writer', 'Job_Role_Customer Service Manager',
    'Job_Role_Data Analyst', 'Job_Role_Data Scientist', 'Job_Role_DevOps Engineer',
    'Job_Role_Digital Marketing Specialist', 'Job_Role_Executive Assistant',
    'Job_Role_Financial Analyst', 'Job_Role_HR Manager', 'Job_Role_IT Support',
    'Job_Role_Marketing Specialist', 'Job_Role_Operations Manager',
    'Job_Role_Product Manager', 'Job_Role_Project Manager', 'Job_Role_Quality Assurance',
    'Job_Role_Research Scientist', 'Job_Role_Sales Representative',
    'Job_Role_Social Media Manager', 'Job_Role_Software Engineer',
    'Job_Role_Technical Writer', 'Job_Role_UX Designer'
]

grupo_arranjo = [
    'Work_Arrangement_Hybrid', 'Work_Arrangement_Onsite', 'Work_Arrangement_Remote'
]

grupo_saude_mental = [
    'Mental_Health_Status_ADHD', 'Mental_Health_Status_Anxiety',
    'Mental_Health_Status_Burnout', 'Mental_Health_Status_Depression',
    'Mental_Health_Status_PTSD', 'Mental_Health_Status_Stress Disorder',
    'Mental_Health_Status_unknown', 'Burnout_Level_High',
    'Burnout_Level_Low', 'Burnout_Level_Medium'
]

grupo_salario = [
    'Salary_Range_$40K-60K', 'Salary_Range_$60K-80K',
    'Salary_Range_$80K-100K', 'Salary_Range_$100K-120K', 'Salary_Range_$120K+'
]



In [15]:
grupos.index

Index(['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score',
       'Social_Isolation_Score', 'Physical_Health_Shoulder_Pain',
       'Physical_Health_Neck_Pain', 'Physical_Health_Back_Pain',
       'Physical_Health_Eye_Strain', 'Physical_Health_Wrist_Pain',
       'Gender_Female', 'Gender_Male', 'Gender_Non-binary',
       'Gender_Prefer not to say', 'Region_Africa', 'Region_Asia',
       'Region_Europe', 'Region_North America', 'Region_Oceania',
       'Region_South America', 'Industry_Customer Service',
       'Industry_Education', 'Industry_Finance', 'Industry_Healthcare',
       'Industry_Manufacturing', 'Industry_Marketing',
       'Industry_Professional Services', 'Industry_Retail',
       'Industry_Technology', 'Job_Role_Account Manager',
       'Job_Role_Business Analyst', 'Job_Role_Consultant',
       'Job_Role_Content Writer', 'Job_Role_Customer Service Manager',
       'Job_Role_Data Analyst', 'Job_Role_Data Scientist',
       'Job_Role_DevOps Engineer', 'Job_Role_Digital Mar

In [16]:
grupo_bem_estar

['Age',
 'Hours_Per_Week',
 'Work_Life_Balance_Score',
 'Social_Isolation_Score',
 'Physical_Health_Shoulder_Pain',
 'Physical_Health_Neck_Pain',
 'Physical_Health_Back_Pain',
 'Physical_Health_Eye_Strain',
 'Physical_Health_Wrist_Pain']

In [17]:
grupos.columns

RangeIndex(start=0, stop=4, step=1)

In [18]:
for grupo, nome in zip([grupo_bem_estar, grupo_demografia, grupo_industria, grupo_funcoes,
                       grupo_arranjo, grupo_saude_mental, grupo_salario],
                       ['Bem Estar', 'Demografia', 'Industria', 'Funcoes', 'Arranjo',
                        'Saude Mental', 'Salario']):
    plt.figure(figsize=(16, 5))  # Largura aumentada
    grupos.loc[grupo].T.plot.bar(r)
    plt.title(f'Perfil dos Clusters – {nome}')
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=8)
    plt.tight_layout()
    plt.show()


NameError: name 'r' is not defined

<Figure size 1600x500 with 0 Axes>