In [27]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Step 1: Load the Data
data_path = 'artifacts/data_ingestion/mental_health_dataset.csv'
data = pd.read_csv(data_path)

# Step 2: Inspect and Clean the Data
# Check for missing values and handle them if necessary
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

# Assume no missing values or anomalies for simplicity; adjust based on actual inspection results

# Step 3: Encode Categorical Data
# Initialize one-hot encoder
one_hot_encoder = OneHotEncoder()
# Fit and transform the data, converting the output to a dense array
categorical_columns = ['Gender', 'Occupation', 'Country', 'Mental_Health_Condition', 'Consultation_History']
one_hot_encoded = one_hot_encoder.fit_transform(data[categorical_columns]).toarray()
# Create DataFrame for one-hot encoded columns
one_hot_df = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(categorical_columns))

# Map ordinal columns
severity_mapping = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3}
stress_level_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
data['Severity'] = data['Severity'].map(severity_mapping)
data['Stress_Level'] = data['Stress_Level'].map(stress_level_mapping)

# Step 4: Normalize Numerical Data
# Initialize the scaler
scaler = StandardScaler()
numerical_cols = ['Age', 'Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Concatenate all processed parts of the dataset
processed_data = pd.concat([data.drop(categorical_columns, axis=1), one_hot_df], axis=1)

# Display the processed data
print(processed_data.head())


Missing values per column:
 User_ID                      0
Age                          0
Gender                       0
Occupation                   0
Country                      0
Mental_Health_Condition      0
Severity                   501
Consultation_History         0
Stress_Level                 0
Sleep_Hours                  0
Work_Hours                   0
Physical_Activity_Hours      0
dtype: int64
   User_ID       Age  Severity  Stress_Level  Sleep_Hours  Work_Hours  \
0        1 -0.421829       2.0             1     0.002569   -0.586396   
1        2 -0.565015       NaN             0     0.236077   -0.518376   
2        3  1.654381       3.0             0     0.761470    0.229838   
3        4 -0.565015       1.0             1     1.578748   -1.674707   
4        5 -1.424137       1.0             1    -1.281725    0.501915   

   Physical_Activity_Hours  Gender_Female  Gender_Male  Gender_Non-binary  \
0                -0.043503            0.0          0.0                1