In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv("./Playground/depression_data.csv")

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

class MyColumnTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pipeline = ColumnTransformer([
            ("income", StandardScaler(), ["Income", "Age"])
        ], remainder="passthrough")
    
    def fit(self, X, y=None):
        self.pipeline.fit(self.first_preprocess(X))
        return self

    def transform(self, X):
        return self.pipeline.transform(self.first_preprocess(X))

    def fit_transform(self, X, y=None):
        return self.pipeline.fit_transform(self.first_preprocess(X))

    def first_preprocess(self, df):
        df = df.copy()
        
        df.drop("Name", axis=1, inplace=True)

        df["Marital Status"] = np.where(
            df["Marital Status"] == "Divorced", 0,
            np.where(df["Marital Status"] == "Married", 1, 
                     np.where(df["Marital Status"] == "Single", 2, 3)
                    )
        )

        df["Education Level"] = np.where(
            df["Education Level"] == "High School", 0,
            np.where(df["Education Level"] == "Associate Degree", 1,  
                     np.where(df["Education Level"] == "Bachelor's Degree", 2,
                              np.where(df["Education Level"] == "Master's Degree", 3,
                                       4)
                             )
                    )
        )

        df["Smoking Status"] = np.where(
            df["Smoking Status"] == "Non-smoker", 0,
            np.where(df["Smoking Status"] == "Former", 1, 2)
        )

        df["Physical Activity Level"] = np.where(
            df["Physical Activity Level"] == "Sedentary", 0,
            np.where(df["Physical Activity Level"] == "Moderate", 1, 2)
        )

        df["Employed"] = (df["Employment Status"] == "Employed").astype(np.int8)
        df.drop("Employment Status", axis=1, inplace=True)

        df["Alcohol Consumption"] = np.where(
            df["Alcohol Consumption"] == "Low", 0,
            np.where(df["Alcohol Consumption"] == "Moderate", 1, 2)
        )

        df["Dietary Habits"] = np.where(
            df["Dietary Habits"] == "Unhealthy", 0,
            np.where(df["Dietary Habits"] == "Moderate", 1, 2)
        )

        df["Sleep Patterns"] = np.where(
            df["Sleep Patterns"] == "Poor", 0,
            np.where(df["Sleep Patterns"] == "Fair", 1, 2)
        )

        df["Income_log"] = np.log(df["Income"])  

        for colname in ["History of Mental Illness", "History of Substance Abuse", 
                        "Family History of Depression", "Chronic Medical Conditions"]:
            df[colname] = (df[colname] == "Yes").astype(np.int8)

        return df


In [4]:
ct = MyColumnTransformer()

In [5]:
df = ct.fit_transform(df)

In [7]:
dfx = df

In [9]:
df.head()

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
0,Christine Barker,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes
1,Jacqueline Lewis,55,Married,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes
2,Shannon Church,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No
3,Charles Jordan,58,Divorced,Master's Degree,3,Non-smoker,Moderate,Unemployed,9992.78,Moderate,Moderate,Poor,No,No,No,No
4,Michael Rich,18,Single,High School,0,Non-smoker,Sedentary,Unemployed,8595.08,Low,Moderate,Fair,Yes,No,Yes,Yes


In [28]:
cols = list(df.drop(["Name"], axis=1).columns)+["Income_log"]

In [29]:
dfpreprocessed = pd.DataFrame(dfx, columns=cols)

In [30]:
dfpreprocessed.to_csv("full_prepocessed_data.csv")

In [32]:
dfpreprocessed.drop(["Marital Status", "Family History of Depression", "History of Substance Abuse"], inplace=True, axis=1)

In [33]:
dfpreprocessed 

Unnamed: 0,Age,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,Chronic Medical Conditions,Income_log
0,-0.600532,1.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,10.176018
1,-0.195730,1.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,10.662197
2,1.838100,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,11.738728
3,-1.001105,0.0,3.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,9.209618
4,-1.035510,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,9.058945
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413763,1.441799,1.0,3.0,0.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,1.0,11.601242
413764,1.134778,2.0,2.0,0.0,2.0,2.0,0.0,2.0,0.0,1.0,1.0,1.0,11.479999
413765,0.657038,1.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,11.256138
413766,-0.642590,1.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,10.108755


In [34]:
dfpreprocessed.to_csv("data_with_important_features.csv")

In [35]:
X, y = dfpreprocessed.drop("Chronic Medical Conditions", axis=1), dfpreprocessed["Chronic Medical Conditions"]

In [56]:
X_train, X_test = train_test_split(dfpreprocessed,stratify=y, train_size=0.8)

In [57]:
X_train.shape, X_test.shape

((331014, 13), (82754, 13))

In [59]:
X_train, X_valid = train_test_split(X_train, stratify=X_train["Chronic Medical Conditions"], train_size=0.75)

In [60]:
X_train.shape, X_valid.shape, X_test.shape

((248260, 13), (82754, 13), (82754, 13))

In [61]:
X_train

Unnamed: 0,Age,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,Chronic Medical Conditions,Income_log
33827,-0.683091,3.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,10.039407
299209,0.485017,2.0,2.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,11.161452
42792,0.747126,0.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,11.302365
71842,0.801411,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,11.329222
38380,0.748743,1.0,2.0,3.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,1.0,11.303175
...,...,...,...,...,...,...,...,...,...,...,...,...,...
363361,-1.173060,0.0,2.0,2.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,8.008781
230455,0.665245,1.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,11.260439
280934,1.001696,2.0,3.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,11.422505
53618,1.214341,0.0,2.0,3.0,0.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,11.512857


In [62]:
X_train.to_csv('./data/train.csv')
X_valid.to_csv('./data/valid.csv')
X_test.to_csv('./data/test.csv')