In [105]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('C:/Users/KiuStudnet/PycharmProjects/Student_Depression_Analysis/data/Student_Depression_Dataset.csv')

In [106]:
# Step 1: Inspect data
print("Initial Dataset Information:")
print(data.info())
print(data.head())

Initial Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27

In [107]:
# Step 2: Handle missing values
# Check for missing values
missing_values = data.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])

# Fill missing values with the median (only 'Financial Stress' has missing values)
data['Financial Stress'] = data['Financial Stress'].fillna(data['Financial Stress'].median())

# Verify missing values have been resolved
print("Remaining missing values:", data.isnull().sum().sum())

Columns with missing values:
Financial Stress    3
dtype: int64
Remaining missing values: 0


In [108]:
from sklearn.preprocessing import LabelEncoder

# Step 3: Encode non-numeric columns
# 3.1. Label Encoding for City
city_encoder = LabelEncoder()
data['City'] = city_encoder.fit_transform(data['City'])
city_mapping = dict(zip(city_encoder.classes_, range(len(city_encoder.classes_))))
print("City Mapping:", city_mapping)

# 3.2. Label Encoding for Profession
profession_encoder = LabelEncoder()
data['Profession'] = profession_encoder.fit_transform(data['Profession'])
profession_mapping = dict(zip(profession_encoder.classes_, range(len(profession_encoder.classes_))))
print("Profession Mapping:", profession_mapping)

# 3.3. Ordinal Encoding for Sleep Duration
sleep_order = {'Less than 5 hours': 1, '5-6 hours': 2, '7-8 hours': 3, 'More than 8 hours': 4}
data['Sleep Duration'] = data['Sleep Duration'].map(sleep_order)
print("Sleep Duration Mapping:", sleep_order)

# 3.4. Label Encoding for Degree
degree_encoder = LabelEncoder()
data['Degree'] = degree_encoder.fit_transform(data['Degree'])
degree_mapping = dict(zip(degree_encoder.classes_, range(len(degree_encoder.classes_))))
print("Degree Mapping:", degree_mapping)

# 3.5. Binary Encoding for 'Have you ever had suicidal thoughts ?'
data['Have you ever had suicidal thoughts ?'] = data['Have you ever had suicidal thoughts ?'].map({'No': 0, 'Yes': 1})
print("Suicidal Thoughts Mapping: {'No': 0, 'Yes': 1}")

# 3.6. Binary Encoding for Gender
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
print("Gender Mapping: {'Male': 0, 'Female': 1}")

# 3.7. Label Encoding for Dietary Habits
dietary_encoder = LabelEncoder()
data['Dietary Habits'] = dietary_encoder.fit_transform(data['Dietary Habits'])
dietary_mapping = dict(zip(dietary_encoder.classes_, range(len(dietary_encoder.classes_))))
print("Dietary Habits Mapping:", dietary_mapping)

# 3.8. Binary Encoding for Family History of Mental Illness
data['Family History of Mental Illness'] = data['Family History of Mental Illness'].map({'No': 0, 'Yes': 1})
print("Family History Mapping: {'No': 0, 'Yes': 1}")


# Verify all columns are numeric
print("Post-Encoding Dataset Information:")
print(data.info())

City Mapping: {'3.0': 0, 'Agra': 1, 'Ahmedabad': 2, 'Bangalore': 3, 'Bhavna': 4, 'Bhopal': 5, 'Chennai': 6, 'City': 7, 'Delhi': 8, 'Faridabad': 9, 'Gaurav': 10, 'Ghaziabad': 11, 'Harsh': 12, 'Harsha': 13, 'Hyderabad': 14, 'Indore': 15, 'Jaipur': 16, 'Kalyan': 17, 'Kanpur': 18, 'Khaziabad': 19, 'Kibara': 20, 'Kolkata': 21, 'Less Delhi': 22, 'Less than 5 Kalyan': 23, 'Lucknow': 24, 'Ludhiana': 25, 'M.Com': 26, 'M.Tech': 27, 'ME': 28, 'Meerut': 29, 'Mihir': 30, 'Mira': 31, 'Mumbai': 32, 'Nagpur': 33, 'Nalini': 34, 'Nalyan': 35, 'Nandini': 36, 'Nashik': 37, 'Patna': 38, 'Pune': 39, 'Rajkot': 40, 'Rashi': 41, 'Reyansh': 42, 'Saanvi': 43, 'Srinagar': 44, 'Surat': 45, 'Thane': 46, 'Vaanya': 47, 'Vadodara': 48, 'Varanasi': 49, 'Vasai-Virar': 50, 'Visakhapatnam': 51}
Profession Mapping: {'Architect': 0, 'Chef': 1, 'Civil Engineer': 2, 'Content Writer': 3, 'Digital Marketer': 4, 'Doctor': 5, 'Educational Consultant': 6, 'Entrepreneur': 7, 'Lawyer': 8, 'Manager': 9, 'Pharmacist': 10, 'Student': 1

In [109]:
# Step 4: Standardize numerical columns
# Identify numerical columns
# numerical_cols = data.select_dtypes(include=['float64']).columns
# print("Numerical Columns to Standardize:", numerical_cols)
# 
# # Standardize using StandardScaler
# scaler = StandardScaler()
# data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [110]:
    # Step 5: Final Check
print("Final Dataset Information:")
print(data.info())
print(data.describe())

Final Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  int64  
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  int64  
 4   Profession                             27901 non-null  int64  
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         2788

In [111]:
remaining_cols = data.select_dtypes(include=['object']).columns
print("Remaining Non-Numeric Columns (if any):", remaining_cols)


Remaining Non-Numeric Columns (if any): Index([], dtype='object')


In [112]:
data.to_csv('../data/processed_student_depression.csv', index=False)
