## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv(r'C:\repositorio\Student-Depression-Prediction-RandomForest\data\raw\Student Depression Dataset.csv')

data.sample(5)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
6519,32688,Female,21.0,Vadodara,Student,5.0,0.0,5.27,3.0,0.0,More than 8 hours,Unhealthy,MSc,Yes,11.0,2.0,Yes,1
7056,35502,Male,24.0,Jaipur,Student,2.0,0.0,7.25,3.0,0.0,7-8 hours,Unhealthy,B.Ed,No,3.0,2.0,Yes,0
24864,125228,Female,21.0,Chennai,Student,5.0,0.0,7.75,5.0,0.0,5-6 hours,Moderate,BA,No,12.0,1.0,No,0
9054,45647,Male,33.0,Ludhiana,Student,3.0,0.0,5.16,5.0,0.0,5-6 hours,Healthy,M.Ed,No,4.0,5.0,No,0
23866,120225,Female,28.0,Faridabad,Student,1.0,0.0,9.96,4.0,0.0,5-6 hours,Moderate,MBBS,Yes,9.0,5.0,Yes,1


### Verificando os valores Nulos

In [3]:
# Verificar valores faltantes por coluna
print("Valores faltantes por coluna:")
data.isnull().sum()

Valores faltantes por coluna:


id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64

### Substituindo os nulos pela média

In [4]:

data['Financial Stress'].fillna(data['Financial Stress'].median(), inplace=True)

# Verificar novamente os valores faltantes
print("Valores faltantes após tratamento:")
data.isnull().sum()

Valores faltantes após tratamento:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Financial Stress'].fillna(data['Financial Stress'].median(), inplace=True)


id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

### Codificando as variáveis categóricas

In [None]:

binary_columns = ['Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
for col in binary_columns:
    data[col] = data[col].map({'Yes': 1, 'No': 0})


ordinal_mapping = {
    'Sleep Duration': {'Less than 5 hours': 1, '5-6 hours': 2, '7-8 hours': 3, 'More than 8 hours': 4, 'Others': 0},
    'Dietary Habits': {'Unhealthy': 1, 'Moderate': 2, 'Healthy': 3, 'Others': 0}
}

for col, mapping in ordinal_mapping.items():
    data[col] = data[col].map(mapping)


le = LabelEncoder()

data['Gender'] = le.fit_transform(data['Gender']) #homens 1 | mulheres 0

# Visualizar as primeiras linhas após codificação
print("Dataset após codificação:")
data.head()

Dataset após codificação:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,1,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,2,3,B.Pharm,1,3.0,1.0,0,1
1,8,0,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,2,2,BSc,0,3.0,2.0,1,0
2,26,1,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,1,3,BA,0,9.0,1.0,1,0
3,30,0,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,3,2,BCA,1,4.0,5.0,1,1
4,32,0,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,2,2,M.Tech,1,1.0,1.0,0,0


### Remoção dos outliers

In [12]:
numerical_columns = ['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Financial Stress']

z_scores = data[numerical_columns].apply(zscore)
data = data[(z_scores < 3).all(axis=1)]

print("Tamanho do dataset após remoção de outliers:", data.shape)

Tamanho do dataset após remoção de outliers: (27882, 18)


### Remoção de colunas desnecessárias

In [13]:
columns_to_drop = ['id', 'City', 'Profession', 'Degree']
data = data.drop(columns=columns_to_drop)


### Salvar o dataset limpo em um novo arquivo CSV

In [15]:
data.to_csv(r'C:\repositorio\Student-Depression-Prediction-RandomForest\data\processed\cleaned_student_depression_dataset.csv', index=False)

In [14]:
data

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,1,33.0,5.0,0.0,8.97,2.0,0.0,2,3,1,3.0,1.0,0,1
1,0,24.0,2.0,0.0,5.90,5.0,0.0,2,2,0,3.0,2.0,1,0
2,1,31.0,3.0,0.0,7.03,5.0,0.0,1,3,0,9.0,1.0,1,0
3,0,28.0,3.0,0.0,5.59,2.0,0.0,3,2,1,4.0,5.0,1,1
4,0,25.0,4.0,0.0,8.13,3.0,0.0,2,2,1,1.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27896,0,27.0,5.0,0.0,5.75,5.0,0.0,2,1,1,7.0,1.0,1,0
27897,1,27.0,2.0,0.0,9.40,3.0,0.0,1,3,0,0.0,3.0,1,0
27898,1,31.0,3.0,0.0,6.61,4.0,0.0,2,1,0,12.0,2.0,0,0
27899,0,18.0,5.0,0.0,6.88,2.0,0.0,1,3,1,10.0,5.0,0,1
