In [2]:
import pandas as pd
from scipy.stats import zscore

#### Load the data

In [3]:
# Read the dataset from the CSV file in the current working directory.
csv_path = "cleaned_data.csv"
data = pd.read_csv(csv_path)

# Step 1: Missing values

##### Check whether any missing values exist. If so, replace them with the column mean.

In [5]:
# Count the missing (NaN/None) entries in each column.
null_mask = data.isna()
missing_values = null_mask.sum()

In [7]:
# Display the per-column count of missing values computed above.
missing_values

student_id                      0
anxiety_level                   1
self_esteem                     3
mental_health_history           1
depression                      3
headache                        1
blood_pressure                  0
sleep_quality                   1
breathing_problem               1
noise_level                     3
living_conditions               4
safety                          1
basic_needs                     1
academic_performance            1
study_load                      1
teacher_student_relationship    1
future_career_concerns          0
social_support                  2
peer_pressure                   2
extracurricular_activities      1
bullying                        2
stress_level                    2
dtype: int64

##### For the first column, since the data type isn't numeric, we can't replace missing values with the mean. Instead, we give it a unique identifier.

In [12]:
# Fill missing student IDs (first column) with placeholder strings.
# Each placeholder embeds the row's own index label, so two missing IDs do not
# collide as long as the index labels are unique.
# The values are assigned through data.iloc[...] = ... so that `data` itself is
# updated; calling fillna(inplace=True) on the result of data.iloc[:, 0] can
# operate on a temporary object and leave `data` unchanged.
missing_id_mask = data.iloc[:, 0].isna().to_numpy()
if missing_id_mask.any():
    data.iloc[missing_id_mask, 0] = [
        f'student_unknown_{row_label}' for row_label in data.index[missing_id_mask]
    ]

##### For numeric columns, we replace null values with the column mean

In [13]:
# Select the numeric columns, then fill each column's missing values with
# that column's own mean.
numeric_columns = data.select_dtypes(include='number').columns
column_means = data[numeric_columns].mean()
data[numeric_columns] = data[numeric_columns].fillna(column_means)

##### Check if all missing values are filled

In [14]:
# Recount missing values per column; every count should now be zero.
data.isna().sum()

student_id                      0
anxiety_level                   0
self_esteem                     0
mental_health_history           0
depression                      0
headache                        0
blood_pressure                  0
sleep_quality                   0
breathing_problem               0
noise_level                     0
living_conditions               0
safety                          0
basic_needs                     0
academic_performance            0
study_load                      0
teacher_student_relationship    0
future_career_concerns          0
social_support                  0
peer_pressure                   0
extracurricular_activities      0
bullying                        0
stress_level                    0
dtype: int64

# Step 2: Outliers

#### Identify and handle outliers using Z-scores

In [17]:
# Standardise each numeric column independently (column-wise z-scores,
# population standard deviation), then preview the first rows.
numeric_data = data[numeric_columns]
z_scores = zscore(numeric_data, axis=0, ddof=0)
z_scores.head()

Unnamed: 0,anxiety_level,self_esteem,mental_health_history,depression,headache,blood_pressure,sleep_quality,breathing_problem,noise_level,living_conditions,...,basic_needs,academic_performance,study_load,teacher_student_relationship,future_career_concerns,social_support,peer_pressure,extracurricular_activities,bullying,stress_level
0,0.4797,0.249367,-0.986892,-0.201464,-0.361091,-1.418416,-0.427645,0.891343,-0.486888,0.429982,...,-0.538374,0.162353,-0.472082,0.256096,0.22955,0.11314,0.187015,0.16323,-0.403101,0.006663
1,0.643271,-1.093238,1.014204,0.317634,1.768635,0.981981,-1.074112,0.891343,0.26858,-1.358483,...,-0.538374,-1.253723,1.049379,-1.190946,1.537869,-0.842454,0.890241,1.575744,1.558221,1.225905
2,0.152557,0.0256,1.014204,0.187859,-0.361091,-1.418416,-0.427645,-0.537666,-0.486888,-0.46425,...,-0.538374,-0.545685,0.288649,0.256096,-0.424609,0.11314,0.187015,-0.543028,-0.403101,0.006663
3,0.806842,-0.645703,1.014204,0.317634,1.058726,0.981981,-1.074112,0.176838,1.024048,-0.46425,...,-0.538374,-0.545685,1.049379,-1.190946,0.883709,-0.842454,0.890241,0.869487,1.558221,1.225905
4,0.806842,1.144438,-0.986892,-0.720562,-0.361091,0.981981,1.511757,-1.252171,0.26858,-0.46425,...,0.160177,0.870391,0.288649,-1.190946,-0.424609,-0.842454,1.593467,-1.955542,1.558221,0.006663


In [36]:
# Flag a row as an outlier when ANY of its numeric columns lies more than
# 3 standard deviations from that column's mean. Requiring .all(axis=1) would
# demand that every column be extreme at the same time, which flags
# essentially no rows.
outliers = (z_scores.abs() > 3).any(axis=1)
outliers[190:210]

190    False
191    False
192    False
193    False
194    False
195    False
196    False
197    False
198    False
199    False
200    False
201    False
202    False
203    False
204    False
205    False
206    False
207    False
208    False
209    False
dtype: bool

In [19]:
# Keep only the rows that were not flagged as outliers.
keep_rows = ~outliers
data = data[keep_rows]