In [135]:
import pandas as pd 
import math
import numpy as np 

In [136]:
data = pd.read_csv("cardio_train.csv", sep=";") 
data.shape

(70000, 13)

In [137]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [138]:
# check for null values 
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None


In [139]:
def convert_age_to_years(age: int) -> int:
    age_years = math.floor(age / 365) # convert from days to years
    return int(age_years)

In [140]:
data["age_years"] = data["age"].apply(convert_age_to_years)

In [141]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47


In [142]:
def calculate_bmi(height, weight) -> float: 
    height_m = height/100 # convert height to meters
    return (weight/(height_m**2))

In [143]:
data["BMI"] = data.apply(lambda x: calculate_bmi(x.height, x.weight), axis=1)

In [144]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177


In [145]:
def get_bp_category(sys, dias): 
    if (sys <= 120 and dias <= 80): 
        return "Normal" 
    elif ((sys >= 120 and sys < 129) and (dias < 80)):
        return "Elevated" 
    elif (sys > 180 or dias > 120): 
        return "Hypertensive Crisis" 
    elif ((sys >= 130 and sys < 139) or (dias >= 80 and dias < 89)):
        return "High Blood Pressure Stage 1"
    else: 
        return "High Blood Pressure Stage 2" 

In [146]:
data["BP Category"] = data.apply(lambda x: get_bp_category(x.ap_hi, x.ap_lo), axis=1)

In [147]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,Normal
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,High Blood Pressure Stage 2
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,High Blood Pressure Stage 1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,High Blood Pressure Stage 2
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal


In [148]:
data["BP Category"].value_counts()

BP Category
Normal                         39057
High Blood Pressure Stage 2    16153
High Blood Pressure Stage 1    13313
Hypertensive Crisis             1348
Elevated                         129
Name: count, dtype: int64

In [149]:
def replace_gender(gender_num: int) -> str:
    if (gender_num == 1):
        return 'F'
    else: 
        return 'M'

In [150]:
data["gender"] = data["gender"].apply(replace_gender)

In [151]:
data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
0,0,18393,M,168,62.0,110,80,1,1,0,0,1,0,50,21.967120,Normal
1,1,20228,F,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,High Blood Pressure Stage 2
2,2,18857,F,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,High Blood Pressure Stage 1
3,3,17623,M,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,High Blood Pressure Stage 2
4,4,17474,F,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,M,168,76.0,120,80,1,1,1,0,1,0,52,26.927438,Normal
69996,99995,22601,F,158,126.0,140,90,2,2,0,0,1,1,61,50.472681,High Blood Pressure Stage 2
69997,99996,19066,M,183,105.0,180,90,3,1,0,1,0,1,52,31.353579,High Blood Pressure Stage 2
69998,99998,22431,F,163,72.0,135,80,1,2,0,0,0,1,61,27.099251,High Blood Pressure Stage 1


In [152]:
# look for impossible height and weight values 
data["height"].min()

55

In [153]:
data["BMI"].describe()

count    70000.000000
mean        27.556513
std          6.091511
min          3.471784
25%         23.875115
50%         26.374068
75%         30.222222
max        298.666667
Name: BMI, dtype: float64

In [154]:
data[data["BMI"] > 298]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
12770,18218,19594,F,75,168.0,120,80,1,1,1,0,1,1,53,298.666667,Normal


In [155]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  object 
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
 13  age_years    70000 non-null  int64  
 14  BMI          70000 non-null  float64
 15  BP Category  70000 non-null  object 
dtypes: float64(2), int64(12), object(2)
memory usage: 8.5+ MB
None


In [156]:
data.drop('age', axis=1, inplace=True)

In [157]:
data

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
0,0,M,168,62.0,110,80,1,1,0,0,1,0,50,21.967120,Normal
1,1,F,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,High Blood Pressure Stage 2
2,2,F,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,High Blood Pressure Stage 1
3,3,M,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,High Blood Pressure Stage 2
4,4,F,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,M,168,76.0,120,80,1,1,1,0,1,0,52,26.927438,Normal
69996,99995,F,158,126.0,140,90,2,2,0,0,1,1,61,50.472681,High Blood Pressure Stage 2
69997,99996,M,183,105.0,180,90,3,1,0,1,0,1,52,31.353579,High Blood Pressure Stage 2
69998,99998,F,163,72.0,135,80,1,2,0,0,0,1,61,27.099251,High Blood Pressure Stage 1


In [163]:
# account for bp less than 0 (assume sign was changed) 
sys_outliers = data[data["ap_hi"] < 0]

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
4607,6525,F,165,78.0,-100,80,2,1,0,0,1,0,41,28.650138,Normal
16021,22881,M,161,90.0,-115,70,1,1,0,0,1,0,60,34.720883,Normal
20536,29313,F,153,54.0,-100,70,1,1,0,0,1,0,42,23.068051,Normal
23988,34295,F,162,74.0,-140,90,1,1,0,0,1,1,50,28.196921,High Blood Pressure Stage 2
25240,36025,M,168,50.0,-120,80,2,1,0,0,0,1,40,17.71542,Normal
35040,50055,M,168,59.0,-150,80,1,1,0,0,1,1,63,20.904195,Normal
46627,66571,M,160,59.0,-120,80,1,1,0,0,0,0,64,23.046875,Normal


In [164]:
data[data["ap_lo"] < 0]

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
60106,85816,F,167,74.0,15,-70,1,1,0,0,1,1,61,26.533759,Normal


In [165]:
data.to_csv("cleaned_data_v1.csv", index="False")

In [170]:
# fix negative systolic values based on assumption 
def convert_sign(sys): 
    if sys < 0: 
        x = -1 * sys 
    else: 
        x = sys
    return x

In [173]:
data["sys"] = data["ap_hi"].apply(convert_sign)