# Creating pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

class MyColumnTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pipeline = ColumnTransformer([
            ("income", StandardScaler(), ["Income", "Age"])
        ], remainder="passthrough")
    
    def fit(self, X, y=None):
        self.pipeline.fit(self.first_preprocess(X))
        return self

    def transform(self, X):
        return self.pipeline.transform(self.first_preprocess(X))

    def fit_transform(self, X, y=None):
        return self.pipeline.fit_transform(self.first_preprocess(X))

    def first_preprocess(self, df):
        df = df.copy()
        
        df.drop("Name", axis=1, inplace=True)

        df["Marital Status"] = np.where(
            df["Marital Status"] == "Divorced", 0,
            np.where(df["Marital Status"] == "Married", 1,  # Fixed incorrect condition
                     np.where(df["Marital Status"] == "Single", 2, 3)
                    )
        )

        df["Education Level"] = np.where(
            df["Education Level"] == "High School", 0,
            np.where(df["Education Level"] == "Associate Degree", 1,  # Fixed spelling ("Accociate")
                     np.where(df["Education Level"] == "Bachelor's Degree", 2,
                              np.where(df["Education Level"] == "Master's Degree", 3,
                                       4)
                             )
                    )
        )

        df["Smoking Status"] = np.where(
            df["Smoking Status"] == "Non-smoker", 0,
            np.where(df["Smoking Status"] == "Former", 1, 2)
        )

        df["Physical Activity Level"] = np.where(
            df["Physical Activity Level"] == "Sedentary", 0,
            np.where(df["Physical Activity Level"] == "Moderate", 1, 2)
        )

        df["Employed"] = (df["Employment Status"] == "Employed").astype(np.int8)
        df.drop("Employment Status", axis=1, inplace=True)

        df["Alcohol Consumption"] = np.where(
            df["Alcohol Consumption"] == "Low", 0,
            np.where(df["Alcohol Consumption"] == "Moderate", 1, 2)
        )

        df["Dietary Habits"] = np.where(
            df["Dietary Habits"] == "Unhealthy", 0,
            np.where(df["Dietary Habits"] == "Moderate", 1, 2)
        )

        df["Sleep Patterns"] = np.where(
            df["Sleep Patterns"] == "Poor", 0,
            np.where(df["Sleep Patterns"] == "Fair", 1, 2)
        )

        df["Income"] = np.log(df["Income"])  # Ensure Income > 0 to avoid log errors

        for colname in ["History of Mental Illness", "History of Substance Abuse", 
                        "Family History of Depression"]:
            df[colname] = (df[colname] == "Yes").astype(np.int8)

        return df


In [3]:
mct = MyColumnTransformer()

In [4]:
data = pd.read_csv("../Playground/depression_data.csv")

In [5]:
data.shape

(413768, 16)

In [6]:
data_transformed = mct.fit_transform(data)

In [7]:
data_transformed.shape

(413768, 15)

array([[-0.21309620791134823, -0.9912976071164842, 1, ..., 1, 'Yes', 0],
       [0.22812899754537577, 0.33038018552256504, 1, ..., 0, 'Yes', 1],
       [1.205120718805423, 1.5969880701349872, 3, ..., 1, 'No', 1],
       ...,
       [0.7671526210460916, 0.44052000157581916, 1, ..., 1, 'Yes', 1],
       [-0.27413946958569535, 1.2114987139485978, 1, ..., 0, 'No', 0],
       [1.0626653300142777, 0.7158695417089544, 3, ..., 0, 'No', 1]],
      dtype=object)

In [9]:
data.head(1)

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes


In [10]:
data_transformed[0]

array([-0.21309620791134823, -0.9912976071164842, 1, 2, 2, 0, 2, 1, 1, 1,
       1, 0, 1, 'Yes', 0], dtype=object)