In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Data cleaning

In [None]:
# Load the raw semicolon-separated dataset and preview its first three rows.
# NOTE(review): the path is absolute from the filesystem root -- confirm it
# matches the runtime environment, or move it to a configurable data directory.
df = pd.read_csv('/cardio_train.csv', sep=";")
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [None]:
# True if at least one cell anywhere in `df` is null.
df.isnull().values.any()

False

The check above returns `False`, so the dataset contains no null values.

In [None]:
# `id` is a row identifier, not a predictor, so it is removed.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once `id` is gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# A column is treated as continuous when it takes more than 25 distinct values.
continuous_features = [
    column
    for column in df.columns
    if df[column].nunique(dropna=False) > 25
]
print('Continuous Values are : {}'.format(continuous_features))

Continuous Values are : ['age', 'height', 'weight', 'ap_hi', 'ap_lo']


In [None]:
# Every column that was not classified as continuous is handled as categorical.
is_categorical = ~df.columns.isin(continuous_features)
categorical_features = df.columns[is_categorical].tolist()

# Give each of those columns the pandas `category` dtype.
for column in categorical_features:
    df[column] = df[column].astype("category")

In [None]:
# Confirm that the categorical columns now carry the `category` dtype.
df.dtypes

age               int64
gender         category
height            int64
weight          float64
ap_hi             int64
ap_lo             int64
cholesterol    category
gluc           category
smoke          category
alco           category
active         category
cardio         category
dtype: object

In [None]:
# Show both feature groups side by side.
for label, names in (
    ("continuous var: ", continuous_features),
    ("categorical var: ", categorical_features),
):
    print(label, names)

continuous var:  ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical var:  ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']


In [None]:
# Summary statistics; the rendered table lists only the numeric columns.
df.describe()

# Observations from the table above:
#   height: min = 55, max = 250
#   weight: min = 10, max = 200

Unnamed: 0,age,height,weight,ap_hi,ap_lo
count,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,164.359229,74.20569,128.817286,96.630414
std,2467.251667,8.210126,14.395757,154.011419,188.47253
min,10798.0,55.0,10.0,-150.0,-70.0
25%,17664.0,159.0,65.0,120.0,80.0
50%,19703.0,165.0,72.0,120.0,80.0
75%,21327.0,170.0,82.0,140.0,90.0
max,23713.0,250.0,200.0,16020.0,11000.0


In [None]:

# Start the cleaned frame as an independent copy of `df`; the outlier step
# below filters `cleaned_df` while still reading quartiles from `df`.
cleaned_df = df.copy()
cleaned_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [None]:
# Drop height/weight outliers using the 1.5 * IQR rule.
# The fences are always derived from the unfiltered `df`, so the second
# feature's fences are not influenced by rows removed for the first one.
for feature in ['height', 'weight']:
    q1, q3 = np.percentile(df[feature], [25., 75.])
    whisker = 1.5 * (q3 - q1)
    lower_fence = q1 - whisker
    upper_fence = q3 + whisker

    # Count of rows in the original frame that fall outside the fences.
    n_outliers = int((~df[feature].between(lower_fence, upper_fence)).sum())
    print('For the feature {}, No of Outliers is {}'.format(feature, n_outliers))

    # Keep only rows of the working frame that lie within [lower, upper].
    too_low = cleaned_df[feature] < lower_fence
    too_high = cleaned_df[feature] > upper_fence
    cleaned_df = cleaned_df[~(too_low | too_high)]

For the feature height, No of Outliers is 519
For the feature weight, No of Outliers is 1819


In [None]:
# Summary statistics after the height/weight outlier removal.
cleaned_df.describe()

# Observations from the table above:
#   height: new min = 143, new max = 186
#   weight: new min = 40,  new max = 107

Unnamed: 0,age,height,weight,ap_hi,ap_lo
count,67723.0,67723.0,67723.0,67723.0,67723.0
mean,19468.502163,164.310013,73.059601,128.585001,95.99303
std,2468.00731,7.565537,12.407575,156.523767,189.111316
min,10798.0,143.0,40.0,-150.0,-70.0
25%,17664.0,159.0,65.0,120.0,80.0
50%,19703.0,165.0,72.0,120.0,80.0
75%,21324.0,170.0,81.0,140.0,90.0
max,23713.0,186.0,107.0,16020.0,11000.0


In [None]:
# Discard rows in which either blood-pressure column is negative.
lo_non_negative = cleaned_df['ap_lo'] >= 0
hi_non_negative = cleaned_df['ap_hi'] >= 0
cleaned_df = cleaned_df[lo_non_negative & hi_non_negative]

In [None]:
# Report, then discard, rows where ap_hi is smaller than ap_lo.
inverted = cleaned_df['ap_hi'] < cleaned_df['ap_lo']
print('There are {} observations where ap_hi < ap_lo'.format(inverted.sum()))

consistent = cleaned_df['ap_hi'] >= cleaned_df['ap_lo']
cleaned_df = cleaned_df[consistent].reset_index(drop=True)

There are 1143 observations where ap_hi < ap_lo


In [None]:
# Count the rows where ap_hi exceeds 350 (nothing is removed in this cell).
(cleaned_df['ap_hi']>350).sum()


39

In [None]:
# Count the rows where ap_hi is below 20 (nothing is removed in this cell).
(cleaned_df['ap_hi']<20).sum()

3

In [None]:
# Keep only rows whose ap_lo and ap_hi both lie strictly between 20 and 350.
cleaned_df = cleaned_df[(cleaned_df['ap_lo'] < 350) & (cleaned_df['ap_hi'] < 350)]
cleaned_df = cleaned_df[(cleaned_df['ap_lo'] > 20) & (cleaned_df['ap_hi'] > 20)]

# Rows were dropped after the previous reset_index, so the index now has gaps.
# Later steps attach columns from objects indexed 0..n-1 (the target `y`, the
# PolynomialFeatures output); pandas aligns such assignments on index labels,
# so a gapped index would shift values onto the wrong rows and create NaNs.
# Resetting here keeps every downstream frame positionally aligned.
# reset_index returns a new frame, so no extra .copy() is needed.
cleaned_df = cleaned_df.reset_index(drop=True)

In [None]:
# Summary statistics after the blood-pressure filters.
cleaned_df.describe()

Unnamed: 0,age,height,weight,ap_hi,ap_lo
count,66489.0,66489.0,66489.0,66489.0,66489.0
mean,19463.983275,164.312954,72.997406,126.384124,81.166012
std,2468.911974,7.55372,12.382733,16.502193,9.38677
min,10798.0,143.0,40.0,60.0,30.0
25%,17657.0,159.0,65.0,120.0,80.0
50%,19702.0,165.0,71.0,120.0,80.0
75%,21323.0,170.0,81.0,140.0,90.0
max,23713.0,186.0,107.0,240.0,182.0


In [None]:
# Number of rows that survived all cleaning filters so far.
print('Total observations preserved : {}'.format(len(cleaned_df)))

Total observations preserved : 66489


In [None]:
# Preview the cleaned frame before derived features are added.
cleaned_df.head(3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [None]:
# BMI = weight / (height / 100)^2, rounded to 3 decimals.
# (assumes weight is in kg and height in cm -- TODO confirm against the data source)
height_scaled = cleaned_df["height"] / 100
cleaned_df["BMI"] = (cleaned_df["weight"] / height_scaled ** 2).round(3)

# MAP = ap_lo + (ap_hi - ap_lo) / 3, rounded to 3 decimals.
pressure_gap = cleaned_df['ap_hi'] - cleaned_df['ap_lo']
cleaned_df['MAP'] = (cleaned_df['ap_lo'] + pressure_gap / 3).round(3)

In [None]:
# Snapshot of the cleaned frame taken before `cleaned_df` is standardised and
# one-hot encoded (the snapshot keeps the original categorical columns).
cleaned_df_nooe = cleaned_df.copy()

# Target vector, re-indexed 0..n-1 so it can be attached to freshly built frames.
y = cleaned_df['cardio'].reset_index(drop=True)
y

0        0
1        1
2        1
3        1
4        0
        ..
66484    1
66485    0
66486    1
66487    1
66488    0
Name: cardio, Length: 66489, dtype: category
Categories (2, int64): [0, 1]

In [None]:

# Split the predictors by dtype. `cardio` has the category dtype, so it is
# absent from the numeric part and is dropped explicitly from the categorical part.
X_number = cleaned_df.select_dtypes(include='number')
X_category = cleaned_df.drop('cardio', axis=1).select_dtypes(exclude='number')

# Z-score the numeric columns and one-hot encode the categorical ones.
X_number = (X_number - X_number.mean()) / X_number.std()
X_category = pd.get_dummies(X_category)

# Both parts share cleaned_df's index, which may contain gaps left by earlier
# row filtering, while `y` is indexed 0..n-1. Assigning `y` as a Series would
# align on index labels, shifting labels onto the wrong rows and producing NaNs.
# Reset the index and attach the target positionally instead; `y.values` keeps
# the categorical dtype of the target.
cleaned_df = pd.concat([X_number, X_category], axis=1).reset_index(drop=True)
cleaned_df['cardio'] = y.values
cleaned_df.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,gender_1,gender_2,cholesterol_1,...,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1,cardio
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,0,1,1,...,1,0,0,1,0,1,0,0,1,0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,1,0,0,...,1,0,0,1,0,1,0,0,1,1
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,1,0,0,...,1,0,0,1,0,1,0,1,0,1


In [None]:
# Sanity check: this should be False. The raw data contained no nulls, so a
# True result means a preceding step introduced NaNs (for example through
# index misalignment when a column was assigned).
cleaned_df.isnull().values.any()

True

In [None]:
# Summary statistics of the standardised / one-hot-encoded frame.
cleaned_df.describe()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,gender_1,gender_2,cholesterol_1,...,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
count,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,...,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0,66489.0
mean,3.4277340000000004e-17,8.869895000000001e-17,-1.429175e-15,-3.2059860000000004e-17,-6.702381e-16,-1.678868e-16,-7.62811e-16,0.65468,0.34532,0.754095,...,0.111988,0.853991,0.071636,0.074373,0.913324,0.086676,0.947615,0.052385,0.195762,0.804238
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.475476,0.475476,0.430626,...,0.315355,0.353118,0.257886,0.262379,0.281362,0.281362,0.222803,0.222803,0.396789,0.396789
min,-3.510041,-2.821518,-2.664792,-4.022746,-5.450865,-2.949374,-4.528768,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7318946,-0.703356,-0.6458515,-0.3868652,-0.1242187,-0.7073695,-0.2654514,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
50%,0.09640551,0.09095465,-0.1613058,-0.3868652,-0.1242187,-0.1872732,-0.2654514,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,0.75297,0.7528802,0.6462704,0.825095,0.9411105,0.5825391,0.6481295,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,1.721008,2.871042,2.745968,6.884896,10.74214,5.17966,8.261365,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Persist the standardised, one-hot-encoded frame; the index is not written.
cleaned_df.to_csv("./cleaned_df.csv", index=False)

In [None]:
# Preview the snapshot that still holds unscaled values and raw categories.
cleaned_df_nooe.head(3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI,MAP
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.967,90.0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.928,106.667
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.508,90.0


## Feature engineering

In [None]:
# Build every pairwise interaction term (degree 2, no squares, no bias column)
# over all predictors in `cleaned_df`, dummy columns included.
X = cleaned_df.drop(columns='cardio')

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_matrix = poly.fit_transform(X)
cols = poly.get_feature_names_out(X.columns)

# Wrap the transformed array in a frame with readable column names.
Xe_with_category_interactions = pd.DataFrame(interaction_matrix, columns=cols)
Xe_with_category_interactions.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,gender_1,gender_2,cholesterol_1,...,smoke_1 alco_0,smoke_1 alco_1,smoke_1 active_0,smoke_1 active_1,alco_0 alco_1,alco_0 active_0,alco_0 active_1,alco_1 active_0,alco_1 active_1,active_0 active_1
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Interactions to discard:
#   - products of two dummy columns of the same variable (mutually exclusive,
#     hence always zero);
#   - products of a derived feature (BMI, MAP) with a column it was computed from.
redundant_interactions = [
    'gender_1 gender_2',
    'cholesterol_1 cholesterol_2',
    'cholesterol_1 cholesterol_3',
    'cholesterol_2 cholesterol_3',
    'gluc_1 gluc_2',
    'gluc_1 gluc_3',
    'gluc_2 gluc_3',
    'smoke_0 smoke_1',
    'alco_0 alco_1',
    'active_0 active_1',
    'weight BMI',
    'height BMI',
    'ap_lo MAP',
    'ap_hi MAP',
]
Xe_with_category_interactions.drop(columns=redundant_interactions, inplace=True)
Xe_with_category_interactions.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,gender_1,gender_2,cholesterol_1,...,smoke_0 active_0,smoke_0 active_1,smoke_1 alco_0,smoke_1 alco_1,smoke_1 active_0,smoke_1 active_1,alco_0 active_0,alco_0 active_1,alco_1 active_0,alco_1 active_1
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Take a real copy of the feature matrix before attaching the target.
# pd.DataFrame(existing_frame) does not copy by default, so adding `cardio`
# to it could also add the target to `Xe_with_category_interactions`.
engineered_df_with_category_interactions = Xe_with_category_interactions.copy()

# Both the feature frame (built from a NumPy array) and `y` are indexed
# 0..n-1, so this assignment is row-aligned.
engineered_df_with_category_interactions['cardio'] = y
engineered_df_with_category_interactions.to_csv("./engineered_df_with_category_interactions.csv", index=False)

In [None]:
# Display the engineered frame (features plus the `cardio` target).
engineered_df_with_category_interactions

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,gender_1,gender_2,cholesterol_1,...,smoke_0 active_1,smoke_1 alco_0,smoke_1 alco_1,smoke_1 active_0,smoke_1 active_1,alco_0 active_0,alco_0 active_1,alco_1 active_0,alco_1 active_1,cardio
0,-0.433788,0.488110,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,-0.745666,0.620495,0.727028,1.431075,2.006440,0.354575,1.866298,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,-0.806016,-1.100511,-1.372670,-1.598825,-2.254877,-0.885085,-2.092613,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66484,0.652116,0.090955,0.565513,1.431075,-0.124219,0.501403,0.648130,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
66485,-0.090721,0.488110,0.242482,-0.386865,-0.124219,-0.033267,-0.265451,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0
66486,-0.161198,2.473886,2.584453,3.249015,0.941111,0.929705,2.170795,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
66487,1.201751,-0.173816,-0.080548,0.522105,-0.124219,0.004147,0.191339,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1


#### Feature engineering without categorical interactions (fewer columns)

In [None]:
# Restrict the interaction terms to the continuous columns plus BMI and MAP.
X_continuous = cleaned_df[continuous_features + ['BMI', 'MAP']].copy()

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
continuous_matrix = poly.fit_transform(X_continuous)
cols = poly.get_feature_names_out(X_continuous.columns)

# Wrap the transformed array in a frame with readable column names.
Xe_continuous = pd.DataFrame(continuous_matrix, columns=cols)
Xe_continuous.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,age height,age weight,age ap_hi,...,weight ap_hi,weight ap_lo,weight BMI,weight MAP,ap_hi ap_lo,ap_hi BMI,ap_hi MAP,ap_lo BMI,ap_lo MAP,BMI MAP
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,-0.211736,0.385257,0.430684,...,0.88177,0.110322,0.987753,0.506185,0.12333,1.104222,0.56587,0.138153,0.070798,0.633884
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,-0.340559,0.299955,0.25533,...,0.799765,0.912219,1.654723,0.92347,0.776506,1.408545,0.786082,1.606599,0.896612,1.626413
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,-0.022361,0.178637,-0.05387,...,-0.159211,0.864336,0.564558,0.414129,-0.260648,-0.170247,-0.124884,0.924251,0.67798,0.442836


In [None]:
# Drop the products of a derived feature (BMI, MAP) with a column it was
# computed from; they are treated as redundant.
redundant_continuous = [
    'height BMI',
    'weight BMI',
    'ap_hi MAP',
    'ap_lo MAP',
]
Xe_continuous.drop(columns=redundant_continuous, inplace=True)
Xe_continuous.columns

Index(['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'BMI', 'MAP', 'age height',
       'age weight', 'age ap_hi', 'age ap_lo', 'age BMI', 'age MAP',
       'height weight', 'height ap_hi', 'height ap_lo', 'height MAP',
       'weight ap_hi', 'weight ap_lo', 'weight MAP', 'ap_hi ap_lo',
       'ap_hi BMI', 'ap_lo BMI', 'BMI MAP'],
      dtype='object')

In [None]:
engineered_df_without_category_interactions = Xe_continuous.copy()

# `Xe_continuous` was built from a NumPy array and is indexed 0..n-1, whereas
# `cleaned_df_nooe` may still carry the gapped index left by row filtering.
# Column assignment aligns on index labels, so without a reset the categorical
# values would land on the wrong rows and leave NaNs. Both frames hold the same
# rows in the same order, so a positional reset is the correct alignment.
engineered_df_without_category_interactions[categorical_features] = (
    cleaned_df_nooe[categorical_features].reset_index(drop=True)
)
engineered_df_without_category_interactions.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,age height,age weight,age ap_hi,...,ap_hi BMI,ap_lo BMI,BMI MAP,gender,cholesterol,gluc,smoke,alco,active,cardio
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,-0.211736,0.385257,0.430684,...,1.104222,0.138153,0.633884,2,1,1,0,0,1,0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,-0.340559,0.299955,0.25533,...,1.408545,1.606599,1.626413,1,3,1,0,0,1,1
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,-0.022361,0.178637,-0.05387,...,-0.170247,0.924251,0.442836,1,3,1,0,0,0,1


In [None]:
# One-hot encode the categorical predictors. The target is set aside first so
# that it is not expanded into dummy columns, then re-attached from `y`.
predictors_only = engineered_df_without_category_interactions.drop(columns='cardio')
engineered_df_without_category_interactions = pd.get_dummies(predictors_only)
engineered_df_without_category_interactions['cardio'] = y
engineered_df_without_category_interactions.head(3)

Unnamed: 0,age,height,weight,ap_hi,ap_lo,BMI,MAP,age height,age weight,age ap_hi,...,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1,cardio
0,-0.433788,0.48811,-0.888124,-0.992845,-0.124219,-1.112179,-0.569948,-0.211736,0.385257,0.430684,...,1,0,0,1,0,1,0,0,1,0
1,0.309455,-1.100511,0.969301,0.825095,0.941111,1.707131,0.952717,-0.340559,0.299955,0.25533,...,1,0,0,1,0,1,0,0,1,1
2,-0.245851,0.090955,-0.726609,0.219115,-1.189548,-0.776977,-0.569948,-0.022361,0.178637,-0.05387,...,1,0,0,1,0,1,0,1,0,1


In [None]:
# Persist the engineered frame without categorical interactions; the index is not written.
engineered_df_without_category_interactions.to_csv('./engineered_df_without_category_interactions.csv', index=False)