### 1. Importing Dependencies

In [61]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import RobustScaler

### 2. Loading Dataset

In [3]:
df = pd.read_csv(r"D:\Study\Model X\Dementia Prediction\Dataset\processed/Outlier_Handled.csv")
df.head()

Unnamed: 0,BIRTHMO,BIRTHYR,SEX,HISPANIC,RACE,PRIMLANG,EDUC,MARISTAT,NACCLIVS,INDEPEND,...,B12DEF,THYROID,INCONTU,INCONTF,ALCOHOL,ABUSOTHR,DEP2YRS,DEPOTHR,PSYCDIS,DEMENTED
0,5.0,1952,1,0.0,1.0,1.0,16.0,1.0,4.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,5.0,1952,1,0.0,1.0,1.0,16.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,12.0,1956,1,0.0,1.0,1.0,16.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,1958,2,1.0,1.0,2.0,16.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,2.0,1945,1,1.0,1.0,1.0,12.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [10]:
print(df.columns)

Index(['BIRTHMO', 'BIRTHYR', 'SEX', 'HISPANIC', 'RACE', 'PRIMLANG', 'EDUC',
       'MARISTAT', 'NACCLIVS', 'INDEPEND', 'RESIDENC', 'HANDED', 'NACCAGE',
       'NACCAGEB', 'NACCNIHR', 'INRELTO', 'INLIVWTH', 'INRELY', 'TOBAC30',
       'TOBAC100', 'SMOKYRS', 'PACKSPER', 'CVHATT', 'CVAFIB', 'CVANGIO',
       'CVBYPASS', 'CVPACE', 'CVCHF', 'CVOTHR', 'CBSTROKE', 'CBTIA', 'PD',
       'PDOTHR', 'SEIZURES', 'DIABETES', 'HYPERTEN', 'HYPERCHO', 'B12DEF',
       'THYROID', 'INCONTU', 'INCONTF', 'ALCOHOL', 'ABUSOTHR', 'DEP2YRS',
       'DEPOTHR', 'PSYCDIS', 'DEMENTED'],
      dtype='object')


### 3. Feature Engineer

#### 3.1 Categorizing Columns

In [57]:
binary_cols = [c for c in df.columns if df[c].nunique(dropna=True) == 2]

#Remove Target Variable
binary_cols = [c for c in binary_cols if c != 'DEMENTED']

nominal_cols = [
    'HISPANIC', 'RACE', 'PRIMLANG', 'MARISTAT', 
    'NACCLIVS', 'INDEPEND', 'RESIDENC', 'HANDED', 'NACCNIHR',
     'INLIVWTH', 'INRELY', 'PDOTHR', 'DEPOTHR', 
    'ABUSOTHR', 'PSYCDIS', 'CVOTHR', 'PD', 'PACKSPER', 'CVHATT',
    'CVAFIB', 'CVANGIO', 'CVBYPASS', 'CVPACE', 'CVCHF', 'CVOTHR',
    'CBSTROKE', 'CBTIA', 'SEIZURES', 'DIABETES', 'HYPERTEN', 'HYPERCHO',
    'B12DEF', 'THYROID', 'INCONTU', 'INCONTF', 'ALCOHOL', 'ABUSOTHR',
    'PSYCDIS']

numerical_cols = ['BIRTHMO', 'BIRTHYR', 'EDUC', 'NACCAGE', 'NACCAGEB', 'SMOKYRS']


Changing SEX values 2(Female) to 0

In [58]:
df['SEX'] = df['SEX'].replace(2, 0)

In [59]:
df['SEX'].unique()

array([1, 0])

#### 3.2 Encoding Nominal Features

In [60]:
df_encoded = pd.get_dummies(df, columns=nominal_cols, prefix=nominal_cols)
df_encoded.head()

Unnamed: 0,BIRTHMO,BIRTHYR,SEX,EDUC,NACCAGE,NACCAGEB,INRELTO,TOBAC30,TOBAC100,SMOKYRS,...,INCONTF_2.0,ALCOHOL_0.0,ALCOHOL_1.0,ALCOHOL_2.0,ABUSOTHR_0.0,ABUSOTHR_1.0,ABUSOTHR_2.0,PSYCDIS_0.0,PSYCDIS_1.0,PSYCDIS_2.0
0,5.0,1952,1,16.0,70.0,70.0,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
1,5.0,1952,1,16.0,71.0,70.0,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
2,12.0,1956,1,16.0,66.0,66.0,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
3,1.0,1958,0,16.0,63.0,63.0,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
4,2.0,1945,1,12.0,77.0,77.0,3.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False


#### 3.3 Scaling Numerical Features

In [67]:
numerical_cols = ['BIRTHMO', 'BIRTHYR', 'EDUC', 'NACCAGE', 'NACCAGEB', 'SMOKYRS']

scaler = RobustScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

In [68]:
df_encoded.head()

Unnamed: 0,BIRTHMO,BIRTHYR,SEX,EDUC,NACCAGE,NACCAGEB,INRELTO,TOBAC30,TOBAC100,SMOKYRS,...,INCONTF_2.0,ALCOHOL_0.0,ALCOHOL_1.0,ALCOHOL_2.0,ABUSOTHR_0.0,ABUSOTHR_1.0,ABUSOTHR_2.0,PSYCDIS_0.0,PSYCDIS_1.0,PSYCDIS_2.0
0,-0.142857,0.8125,1,0.0,-0.384615,-0.076923,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
1,-0.142857,0.8125,1,0.0,-0.307692,-0.076923,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
2,0.857143,1.0625,1,0.0,-0.692308,-0.384615,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
3,-0.714286,1.1875,0,0.0,-0.923077,-0.615385,1.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False
4,-0.571429,0.375,1,-1.0,0.153846,0.461538,3.0,0.0,0.0,0.0,...,False,True,False,False,True,False,False,True,False,False


In [70]:
df_encoded.to_csv("D:\Study\Model X\Dementia Prediction\Dataset\processed/Feature_Engineered.csv", index=False)