In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import featuretools as ft

In [30]:
# Read the imported dataset and, in the same expression, move the row labels into
# an explicit 'index' column; then preview the first rows.
df = pd.read_csv('./HeartDiseaseDataset/ImportedDataset.csv').reset_index()
df.head()

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


# Feature Engineering

### Combining features to improve model performance

In [39]:
# Automatic Feature Engineering with FeatureTools
# Register the frame as a single dataframe whose unique key is the 'index' column.
# NOTE(review): 'num' is passed in as an ordinary input column; if it is the
# prediction target, it should probably be excluded from the generated sums - confirm.
dataframes = {
    "heart_disease": (df, 'index'),
}

# ft.dfs returns a pair (feature matrix, feature definitions). `features` keeps the
# whole pair so existing cells that use it behave as before; the unpacked names
# give later cells direct access to each part.
features = ft.dfs(
    dataframes=dataframes,
    target_dataframe_name="heart_disease",
    trans_primitives=['add_numeric'],
)
feature_matrix, feature_defs = features

In [40]:
# `features` is the raw return value of ft.dfs, so this prints a tuple (the output
# opens with "(") as plain text rather than a rich DataFrame display.
print(features)

(        age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
index                                                                           
0      63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1      67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2      67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3      37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4      41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
...     ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298    45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299    68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300    57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301    57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302    38.0  1.0  3.0     1

In [14]:
# Column groups used by the feature-engineering steps.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# 'thalach' is already listed (and scaled) as a numerical feature, so it must not
# also appear in the categorical group.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Scaling numerical before combining them
scaler = StandardScaler()
# Keep the standardized array under its own name so the cells that display
# `scaled_values` work on a fresh kernel instead of relying on leftover state.
scaled_values = scaler.fit_transform(df[numerical_features])
# Append one "<name>_scaled" column per numerical feature, in the same order.
scaled_columns = [f"{col}_scaled" for col in numerical_features]
df[scaled_columns] = scaled_values


In [18]:
# Display the frame; presumably to check that the *_scaled columns were appended.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,age_scaled,trestbps_scaled,chol_scaled,thalach_scaled,oldpeak_scaled
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0.948726,0.757525,-0.264900,0.017197,1.087338
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1.392002,1.611220,0.760415,-1.821905,0.397182
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1.392002,-0.665300,-0.342283,-0.902354,1.346147
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,-1.932564,-0.096170,0.063974,1.637359,2.122573
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,-1.489288,-0.096170,-0.825922,0.980537,0.310912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1,-1.046013,-1.234430,0.334813,-0.770990,0.138373
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2,1.502821,0.700612,-1.038723,-0.376896,2.036303
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3,0.283813,-0.096170,-2.238149,-1.515388,0.138373
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1,0.283813,-0.096170,-0.206864,1.068113,-0.896862


In [16]:
# Inspect `scaled_values` as a raw array. The rows shown match the *_scaled columns
# displayed for df, so this is presumably the StandardScaler output - TODO confirm.
scaled_values

array([[ 0.94872647,  0.75752504, -0.2649003 ,  0.01719733,  1.08733806],
       [ 1.39200191,  1.61121989,  0.76041519, -1.82190531,  0.39718162],
       [ 1.39200191, -0.6652997 , -0.34228261, -0.90235399,  1.34614673],
       ...,
       [ 0.28381332, -0.0961698 , -2.23814899, -1.51538821,  0.13837295],
       [ 0.28381332, -0.0961698 , -0.20686358,  1.06811312, -0.89686172],
       [-1.82174501,  0.35913411, -1.38694368,  1.02432497, -0.89686172]])

In [None]:
def combine_features():
    """Placeholder for the feature-combination step; no logic has been written yet.

    Raises:
        NotImplementedError: always, until the step is implemented.
    """
    # An explicit stub keeps the cell syntactically valid (a `def` with an empty
    # body cannot be parsed) and fails loudly if it is called by mistake.
    raise NotImplementedError("combine_features is not implemented yet")

In [7]:
# Peek at the label of the first column of df.
df.columns[0]

'age'

In [11]:
# Show the `scaled_values` array (one row per record, five values per row in the
# output shown here).
scaled_values

array([[ 0.94872647,  0.75752504, -0.2649003 ,  0.01719733,  1.08733806],
       [ 1.39200191,  1.61121989,  0.76041519, -1.82190531,  0.39718162],
       [ 1.39200191, -0.6652997 , -0.34228261, -0.90235399,  1.34614673],
       ...,
       [ 0.28381332, -0.0961698 , -2.23814899, -1.51538821,  0.13837295],
       [ 0.28381332, -0.0961698 , -0.20686358,  1.06811312, -0.89686172],
       [-1.82174501,  0.35913411, -1.38694368,  1.02432497, -0.89686172]])