In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [80]:
df = pd.read_csv("./Playground/depression_data.csv")

In [71]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

class MyColumnTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pipeline = ColumnTransformer([
            ("income", StandardScaler(), ["Income", "Income_log", "Age"])
        ], remainder="passthrough")
    
    def fit(self, X, y=None):
        self.pipeline.fit(self.first_preprocess(X))
        return self

    def transform(self, X):
        return self.pipeline.transform(self.first_preprocess(X))

    def fit_transform(self, X, y=None):
        return self.pipeline.fit_transform(self.first_preprocess(X))

    def first_preprocess(self, df):
        df = df.copy()
        
        df.drop("Name", axis=1, inplace=True)

        df["Marital Status"] = np.where(
            df["Marital Status"] == "Divorced", 0,
            np.where(df["Marital Status"] == "Married", 1, 
                     np.where(df["Marital Status"] == "Single", 2, 3)
                    )
        )

        df["Education Level"] = np.where(
            df["Education Level"] == "High School", 0,
            np.where(df["Education Level"] == "Associate Degree", 1,  
                     np.where(df["Education Level"] == "Bachelor's Degree", 2,
                              np.where(df["Education Level"] == "Master's Degree", 3,
                                       4)
                             )
                    )
        )

        df["Smoking Status"] = np.where(
            df["Smoking Status"] == "Non-smoker", 0,
            np.where(df["Smoking Status"] == "Former", 1, 2)
        )

        df["Physical Activity Level"] = np.where(
            df["Physical Activity Level"] == "Sedentary", 0,
            np.where(df["Physical Activity Level"] == "Moderate", 1, 2)
        )

        df["Employed"] = (df["Employment Status"] == "Employed").astype(np.int8)
        df.drop("Employment Status", axis=1, inplace=True)

        df["Alcohol Consumption"] = np.where(
            df["Alcohol Consumption"] == "Low", 0,
            np.where(df["Alcohol Consumption"] == "Moderate", 1, 2)
        )

        df["Dietary Habits"] = np.where(
            df["Dietary Habits"] == "Unhealthy", 0,
            np.where(df["Dietary Habits"] == "Moderate", 1, 2)
        )

        df["Sleep Patterns"] = np.where(
            df["Sleep Patterns"] == "Poor", 0,
            np.where(df["Sleep Patterns"] == "Fair", 1, 2)
        )

        df["Income_log"] = np.log(df["Income"])  

        for colname in ["History of Mental Illness", "History of Substance Abuse", 
                        "Family History of Depression", "Chronic Medical Conditions"]:
            df[colname] = (df[colname] == "Yes").astype(np.int8)

        return df


In [72]:
ct = MyColumnTransformer()

In [73]:
dfz = ct.fit_transform(df)

In [74]:
dfx = dfz

In [76]:
pd.Series(dfx[:, 0]).value_counts().shape, pd.Series(dfx[:, 1]).value_counts().shape, pd.Series(dfx[:, 2]).value_counts().shape

((405282,), (405282,), (63,))

In [78]:
df[0]

array([-0.60053188, -0.21309621, -0.99129761,  1.        ,  2.        ,
        2.        ,  0.        ,  2.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  0.        ,  1.        ,  1.        ,
        0.        ])

In [81]:
cols = ["Income", "Income_log", "Age"] + list(df.drop(["Name", "Age", "Income"], axis=1).columns)

In [86]:
dfpreprocessed = pd.DataFrame(dfx, columns=cols)

In [83]:
dfpreprocessed.to_csv("full_prepocessed_data.csv")

In [87]:
dfpreprocessed.drop(["Marital Status", "Family History of Depression", "History of Substance Abuse"], inplace=True, axis=1)

In [88]:
dfpreprocessed 

Unnamed: 0,Income,Income_log,Age,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,Chronic Medical Conditions
0,-0.600532,-0.213096,-0.991298,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0
1,-0.195730,0.228129,0.330380,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0
2,1.838100,1.205121,1.596988,3.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0
3,-1.001105,-1.090140,0.495590,3.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,-1.035510,-1.226881,-1.707206,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413763,1.441799,1.080348,1.046289,3.0,0.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,1.0
413764,1.134778,0.970315,-1.266647,2.0,0.0,2.0,2.0,0.0,2.0,0.0,1.0,1.0,1.0
413765,0.657038,0.767153,0.440520,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
413766,-0.642590,-0.274139,1.211499,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [89]:
dfpreprocessed.to_csv("data_with_important_features.csv")

In [90]:
X, y = dfpreprocessed.drop("Chronic Medical Conditions", axis=1), dfpreprocessed["Chronic Medical Conditions"]

In [91]:
X_train, X_test = train_test_split(dfpreprocessed,stratify=y, train_size=0.8)

In [92]:
X_train.shape, X_test.shape

((331014, 13), (82754, 13))

In [93]:
X_train, X_valid = train_test_split(X_train, stratify=X_train["Chronic Medical Conditions"], train_size=0.75)

In [94]:
X_train.shape, X_valid.shape, X_test.shape

((248260, 13), (82754, 13), (82754, 13))

In [97]:
X_train.to_csv('./data/train.csv', index=False)
X_valid.to_csv('./data/valid.csv', index=False)
X_test.to_csv('./data/test.csv', index=False)