In [72]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset.
# The path is relative to the notebook's working directory and points at the same
# ../data folder that the processed CSV is written to at the end of the notebook,
# so the notebook is not tied to one user's absolute Windows path.
data = pd.read_csv('../data/Student_Depression_Dataset.csv')

In [73]:
# Step 1: Inspect data
print("Initial Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.head())

Initial Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27

In [74]:
# Step 2: Handle missing values
# Count NaNs per column and report only the columns that actually contain any.
missing_values = data.isna().sum()
print("Columns with missing values:")
print(missing_values.loc[missing_values > 0])

# 'Financial Stress' is the only column reported above; impute it with its median.
financial_stress_median = data['Financial Stress'].median()
data['Financial Stress'] = data['Financial Stress'].fillna(financial_stress_median)

# Confirm that the frame no longer contains any missing value.
print("Remaining missing values:", data.isna().sum().sum())

Columns with missing values:
Financial Stress    3
dtype: int64
Remaining missing values: 0


In [75]:
# Step 3: Encode non-numeric columns
# Columns detected earlier: ['City', 'Profession', 'Sleep Duration', 'Degree', 'Have you ever had suicidal thoughts ?']
# 'Gender', 'Dietary Habits' and 'Family History of Mental Illness' are encoded here as well.

# 3.1. One-Hot Encoding for 'City' and 'Profession'
# drop_first=True drops one dummy per column to avoid perfectly collinear features.
data = pd.get_dummies(data, columns=['City', 'Profession'], drop_first=True)

# 3.2. Ordinal Encoding for 'Sleep Duration'
sleep_order = {'Less than 5 hours': 1, '5-6 hours': 2, '7-8 hours': 3, 'More than 8 hours': 4}
sleep_raw = data['Sleep Duration']
# Tolerate stray whitespace / quote characters around the labels before the lookup.
sleep_labels = sleep_raw.astype(str).str.strip().str.strip("'\"").str.strip()
sleep_encoded = sleep_labels.map(sleep_order)

# Series.map() turns every label that is not a key of sleep_order into NaN.
# That happens after the Step 2 missing-value check, so handle it explicitly here:
# report the offending labels and impute them with the median of the mapped values
# (the same strategy Step 2 uses for 'Financial Stress').
sleep_unmapped = sleep_encoded.isna()
if sleep_unmapped.any():
    print("Unmapped 'Sleep Duration' labels (imputed with the median):")
    print(sleep_raw[sleep_unmapped].value_counts(dropna=False))
    sleep_encoded = sleep_encoded.fillna(sleep_encoded.median())
# Keep the column float64 so that Step 4 (which selects float64 columns) scales it
# regardless of whether any label had to be imputed.
data['Sleep Duration'] = sleep_encoded.astype('float64')

# 3.3. One-Hot Encoding for 'Degree'
data = pd.get_dummies(data, columns=['Degree'], drop_first=True)

# 3.4. Binary Encoding for 'Have you ever had suicidal thoughts ?'
data['Have you ever had suicidal thoughts ?'] = data['Have you ever had suicidal thoughts ?'].map({'No': 0, 'Yes': 1})

# Binary encoding for Gender
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})

# One-hot encoding for Dietary Habits
data = pd.get_dummies(data, columns=['Dietary Habits'], drop_first=True)

# Binary encoding for Family History of Mental Illness
data['Family History of Mental Illness'] = data['Family History of Mental Illness'].map({'No': 0, 'Yes': 1})

# Fail loudly if any encoding step left missing values behind (e.g. an unexpected
# label in one of the binary columns) instead of passing NaNs on to scaling.
encoded_nulls = data.isnull().sum()
assert encoded_nulls.sum() == 0, (
    "Encoding introduced missing values:\n" + str(encoded_nulls[encoded_nulls > 0])
)

# Verify all columns are numeric
print("Post-Encoding Dataset Information:")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

Post-Encoding Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Columns: 108 entries, id to Dietary Habits_Unhealthy
dtypes: bool(94), float64(9), int64(5)
memory usage: 5.5 MB
None


In [76]:
# Step 4: Standardize numerical columns
# Only float64 columns are selected; int64 and bool columns (id, binary flags,
# one-hot dummies) are left unscaled.
numerical_cols = data.select_dtypes(include='float64').columns
print("Numerical Columns to Standardize:", numerical_cols)

# Fit a StandardScaler on the selected columns and write the scaled values back.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = scaled_values


Numerical Columns to Standardize: Index(['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Work/Study Hours', 'Financial Stress'],
      dtype='object')


In [77]:
# Step 5: Final Check
print("Final Dataset Information:")
# info() prints its own report and returns None; calling it directly avoids the
# stray "None" line that print(data.info()) would add to the output.
data.info()
# Summary statistics of the processed frame.
print(data.describe())

Final Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Columns: 108 entries, id to Dietary Habits_Unhealthy
dtypes: bool(94), float64(9), int64(5)
memory usage: 5.5 MB
None
                  id        Gender           Age  Academic Pressure  \
count   27901.000000  27901.000000  2.790100e+04       2.790100e+04   
mean    70442.149421      0.442780  1.739367e-16       1.871793e-17   
std     40641.175216      0.496724  1.000018e+00       1.000018e+00   
min         2.000000      0.000000 -1.594566e+00      -2.273869e+00   
25%     35039.000000      0.000000 -9.830196e-01      -8.261043e-01   
50%     70684.000000      0.000000 -1.676248e-01      -1.022220e-01   
75%    105818.000000      1.000000  8.516186e-01       6.216603e-01   
max    140699.000000      1.000000  6.763230e+00       1.345543e+00   

       Work Pressure          CGPA  Study Satisfaction  Job Satisfaction  \
count   2.790100e+04  2.790100e+04        2.790100e+04      2.7

In [78]:
# Sanity check: list any column that is still stored with the generic object dtype
# (i.e. un-encoded text) after the encoding steps.
object_frame = data.select_dtypes(include='object')
remaining_cols = object_frame.columns
print("Remaining Non-Numeric Columns (if any):", remaining_cols)


Remaining Non-Numeric Columns (if any): Index([], dtype='object')


In [80]:
# Persist the processed frame as CSV, without writing the RangeIndex as a column.
processed_csv_path = '../data/processed_student_depression.csv'
data.to_csv(processed_csv_path, index=False)
