In [None]:
# Rerunning the Cardio Project

In [1]:
# Import dependencies 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [65]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [2]:
# Load the cardio dataset from its CSV export and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="cardio_complete.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.51
3,3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1,28.71
4,4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0,23.01


In [None]:
# Legend for columns ***

# Gender: 1 = Female, 2 = Male

# Cholesterol:  1 = normal, 2 = above normal, 3 = far above normal range

# Glucose (blood sugar level): 1 = within normal range, 2 = above normal range, 3 = far above normal range

# Binary variables -> 0 = No, 1 = Yes

In [None]:
# CLEANING SECTION

In [3]:
# Drop the "Unnamed: 0" column (presumably an index saved by an earlier export)
# and the "id" column; neither is referenced again in this notebook.
df = df.drop(columns=["Unnamed: 0", "id"])
df.head(n=3)

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.51


In [4]:
# Convert the age column (recorded in days) to whole years.
# The conversion is guarded on the raw "age" column so that re-running this
# cell is a no-op: without the guard, a second run would divide the already
# converted years by 365 again and truncate every age to 0.
if "age" in df.columns:
    df = df.rename(columns={"age": "age_years"})
    # Divide by 365 and truncate toward zero (leap days are ignored).
    df["age_years"] = (df["age_years"] / 365).astype(int)
df.head(3)

Unnamed: 0,age_years,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.51


In [None]:
# Checking for outliers / mis-entries

In [7]:
# age_years | sanity-check the upper bound - good to go!
print("the maximum value is", df["age_years"].max())

the maximum value is 64


In [17]:
# age_years | full summary - rows with age_years <= 30 need to be removed
print("Statistical Readout:", df["age_years"].describe())

Statistical readout: count    70000.000000
mean        52.840671
std          6.766774
min         29.000000
25%         48.000000
50%         53.000000
75%         58.000000
max         64.000000
Name: age_years, dtype: float64


In [19]:
# age_years | fix | keep only people older than 30.
# In the recorded run this drops 4 entries (count goes 70000 -> 69996).
df = df.loc[df["age_years"].gt(30)]
print("Statistical Readout, Corrected:", df["age_years"].describe())

Statistical Readout, Corrected: count    69996.000000
mean        52.842020
std          6.764616
min         39.000000
25%         48.000000
50%         53.000000
75%         58.000000
max         64.000000
Name: age_years, dtype: float64


In [18]:
# height | summary stats - the extreme minimum values look like outliers to remove
print("Statistical Readout:", df["height"].describe())

Statistical Readout: count    70000.000000
mean       164.359229
std          8.210126
min         55.000000
25%        159.000000
50%        165.000000
75%        170.000000
max        250.000000
Name: height, dtype: float64


In [20]:
# Height | fix | outliers | split the observed range into 6 equal-width bins
# Findings: 86 outlier values fall between 54.8 and 120.0,
# and 1 outlier value lies above 217.5.
df["height"].value_counts(bins=6)

(152.5, 185.0]    65255
(120.0, 152.5]     4277
(185.0, 217.5]      377
(87.5, 120.0]        62
(54.804, 87.5]       24
(217.5, 250.0]        1
Name: height, dtype: int64

In [29]:
# Height | keep only rows strictly above 145 (a height of exactly 145 is dropped as well)
df = df.loc[df["height"].gt(145)]

In [30]:
# Height | drop the maximum outlier (250) by keeping only values under 217.5
df = df.loc[df["height"].lt(217.5)]

In [31]:
# Height | FIXED! | re-check the descriptive statistics before re-running the bins
print("Updated Descriptive Stats:", df["height"].describe())

Updated Descriptive Stats: count    69475.000000
mean       164.575660
std          7.681668
min        146.000000
25%        159.000000
50%        165.000000
75%        170.000000
max        207.000000
Name: height, dtype: float64


In [32]:
# Height | FIXED | bins recomputed on the cleaned values
df["height"].value_counts(bins=6)

(156.167, 166.333]    30854
(166.333, 176.5]      23321
(145.938, 156.167]    10882
(176.5, 186.667]       4150
(186.667, 196.833]      248
(196.833, 207.0]         20
Name: height, dtype: int64

In [33]:
# Systolic | descriptive stats | before cleaning
print("pre-cleaned descriptive stats:", df["systolic_bp"].describe())

pre-cleaned descriptive stats: count    69475.000000
mean       128.822339
std        154.581866
min       -150.000000
25%        120.000000
50%        120.000000
75%        140.000000
max      16020.000000
Name: systolic_bp, dtype: float64


In [None]:
# Systolic ("top") background info for analysis write-up


# When your heart beats, it squeezes and pushes blood through your arteries to the rest of your body. 
# This force creates pressure on those blood vessels, and that's your systolic blood pressure.


# Normal: Below 120
# Elevated: 120-129
# Stage 1 high blood pressure (also called hypertension): 130-139
# Stage 2 hypertension: 140 or more
# Hypertensive crisis: 180 or more. Death is coming.

In [34]:
# Systolic_bp bins | before cleaning |
# results: whack. 9 values above 2160 (5 + 3 + 1 in the top bins) are totally skewing everything.
# results cont: there are also negative values; the right tail outliers need to go before addressing them.
df["systolic_bp"].value_counts(bins=7)

(-166.171, 2160.0]    69466
(13710.0, 16020.0]        5
(11400.0, 13710.0]        3
(9090.0, 11400.0]         1
(6780.0, 9090.0]          0
(4470.0, 6780.0]          0
(2160.0, 4470.0]          0
Name: systolic_bp, dtype: int64

In [35]:
# Systolic_bp | FIXING | remove right tail outliers by keeping readings below 180
# NOTE(review): the background notes in this notebook list 180 or more as
# "hypertensive crisis", so this cut may also drop genuine readings - confirm it is intended.
df = df.loc[df["systolic_bp"].lt(180)]

In [36]:
# Systolic_bp | FIXING | re-check BINS after removing RT outliers |
# Possible non-misentry range: 85 - 179 | Need to remove left tail outliers | < 85
df.systolic_bp.value_counts(bins=7)

(85.0, 132.0]        49646
(132.0, 179.0]       18463
(-9.0, 38.0]           179
(38.0, 85.0]           131
(-150.33, -103.0]        5
(-103.0, -56.0]          2
(-56.0, -9.0]            0
Name: systolic_bp, dtype: int64

In [37]:
# Systolic_bp | FIXING | remove left tail outliers (keeps readings strictly above 85)
df = df.loc[df["systolic_bp"].gt(85)]

In [38]:
# Systolic_bp | FIXING | bins recomputed once the LT outliers are gone |
df.systolic_bp.value_counts(bins=7)

(115.429, 128.143]    28185
(128.143, 140.857]    18606
(102.714, 115.429]     8972
(140.857, 153.571]     4780
(89.91, 102.714]       3583
(153.571, 166.286]     3204
(166.286, 179.0]        779
Name: systolic_bp, dtype: int64

In [39]:
# Systolic_bp | FIXED | descriptive stats on the cleaned column
print("Updated Descriptive Statistics for Systolic BP:", df["systolic_bp"].describe())

Updated Descriptive Statistics for Systolic BP: count    68109.000000
mean       126.209414
std         15.438512
min         90.000000
25%        120.000000
50%        120.000000
75%        140.000000
max        179.000000
Name: systolic_bp, dtype: float64


In [None]:
# Diastolic Blood Pressure ("bottom number") Background Info for Writeup

# The diastolic reading, or the bottom number, is the pressure in the arteries 
# when the heart rests between beats. This is the time when the heart fills with blood and gets oxygen.


# Normal: Lower than 80
# Stage 1 hypertension: 80-89
# Stage 2 hypertension: 90 or more
# Hypertensive crisis: 120 or more. Death is near!


In [40]:
# Diastolic_BP | pre-cleaned | descriptive stats | needs fixing
print("Pre-Cleaned Descriptive Stats for  Diastolic_BP:", df["diastolic_bp"].describe())

Pre-Cleaned Descriptive Stats for  Diastolic_BP: count    68109.000000
mean        94.635452
std        181.131301
min          0.000000
25%         80.000000
50%         80.000000
75%         90.000000
max      10000.000000
Name: diastolic_bp, dtype: float64


In [41]:
# Diastolic_BP | pre-cleaned | BINS |
# same deal as with Systolic -> outliers on both the left and right tails;
# deal with the right tail outliers first.
print("Pre-cleaned bins for Diastolic Blood Pressure:", df.diastolic_bp.value_counts(bins=7))

Pre-cleaned bins for Diastolic Blood Pressure: (-10.001, 1428.571]     68084
(7142.857, 8571.429]       11
(8571.429, 10000.0]         7
(5714.286, 7142.857]        3
(4285.714, 5714.286]        2
(1428.571, 2857.143]        2
(2857.143, 4285.714]        0
Name: diastolic_bp, dtype: int64


In [42]:
# Diastolic_BP | CLEANING | remove right tail outliers (keep readings below 120) | re-check bins
# NOTE(review): the background notes in this notebook list 120 or more as
# "hypertensive crisis", so this cut may also drop genuine readings - confirm it is intended.
df = df.loc[df["diastolic_bp"].lt(120)]
print("Diastolic_BP bins with right tail outliers removed: ", df.diastolic_bp.value_counts(bins=7))

Diastolic_BP bins with right tail outliers removed:  (68.0, 85.0]      45815
(85.0, 102.0]     18110
(51.0, 68.0]       2810
(102.0, 119.0]      325
(34.0, 51.0]         60
(-0.12, 17.0]        25
(17.0, 34.0]         19
Name: diastolic_bp, dtype: int64


In [43]:
# Diastolic_BP | CLEANING | remove left tail outliers (keep readings strictly above 60) | re-check bins
# NOTE(review): the strict comparison also drops readings of exactly 60
# (the cleaned minimum is 61) - confirm that excluding 60 is intended.
df = df.loc[df["diastolic_bp"].gt(60)]
print("Diastolic_BP bins with both right AND left tail outliers removed:  ", df.diastolic_bp.value_counts(bins=7))

Diastolic_BP bins with both right AND left tail outliers removed:   (77.571, 85.857]      35262
(85.857, 94.143]      14248
(69.286, 77.571]      10456
(94.143, 102.429]      3862
(102.429, 110.714]      311
(60.941, 69.286]        245
(110.714, 119.0]         14
Name: diastolic_bp, dtype: int64


In [44]:
# Diastolic Blood Pressure | Cleaned | descriptive stats on the cleaned column
print("Updated descriptive stats for Diastolic Blood Pressure: ", df.diastolic_bp.describe())

Updated descriptive stats for Diastolic Blood Pressure:  count    64398.000000
mean        81.914376
std          7.942641
min         61.000000
25%         80.000000
50%         80.000000
75%         90.000000
max        119.000000
Name: diastolic_bp, dtype: float64


In [45]:
# Weight | pre-cleaned | descriptive stats (bins follow in the next cell) |
# Apparent outliers on both the left and the right tail.
print("pre-cleaned descriptive stats for Weight: ", df.weight.describe())

pre-cleaned descriptive state for Weight:  count    64398.000000
mean        74.367472
std         14.128542
min         11.000000
25%         65.000000
50%         72.000000
75%         82.000000
max        200.000000
Name: weight, dtype: float64


In [46]:
# Weight | pre-cleaned | 7 equal-width BINS |
print("Pre-Cleaned Bins for Weight: ", df.weight.value_counts(bins=7))

Pre-Cleaned Bins for Weight:  (65.0, 92.0]      39237
(38.0, 65.0]      18580
(92.0, 119.0]      6036
(119.0, 146.0]      462
(146.0, 173.0]       47
(10.81, 38.0]        26
(173.0, 200.0]       10
Name: weight, dtype: int64


In [47]:
# Weight | CLEANING | remove right tail outliers | keep x < 160 kg (352 pounds)
df = df.loc[df["weight"].lt(160)]

In [48]:
# Weight | CLEANING | re-check bins now that only x < 160 (kg) is kept
# 40 kg = 88 pounds is the cutoff chosen for the left tail
print("re-checking Weight bins after removing RT outliers: ", df.weight.value_counts(bins=7))

re-checking Weight bins after removing RT outliers:  (53.286, 74.429]      34077
(74.429, 95.571]      23222
(95.571, 116.714]      4265
(32.143, 53.286]       2180
(116.714, 137.857]      546
(137.857, 159.0]         69
(10.851, 32.143]          9
Name: weight, dtype: int64


In [49]:
# Weight | CLEANING | remove left tail outliers | keep x > 40 kg (88 pounds)
df = df.loc[df["weight"].gt(40)]
print("Updated Weight bins with outliers removed: ", df.weight.value_counts(bins=7))

Updated Weight bins with outliers removed:  (57.857, 74.714]      31411
(74.714, 91.571]      20906
(91.571, 108.429]      5797
(40.881, 57.857]       4800
(108.429, 125.286]     1133
(125.286, 142.143]      228
(142.143, 159.0]         37
Name: weight, dtype: int64


In [50]:
# Weight | Cleaned | descriptive stats on the cleaned column |
print("Updated Descriptive Stats for Weight in kg: ", df["weight"].describe())

Updated Descriptive Stats for Weight in kg:  count    64312.000000
mean        74.354941
std         13.933497
min         41.000000
25%         65.000000
50%         72.000000
75%         82.000000
max        159.000000
Name: weight, dtype: float64


In [58]:
# Cholesterol | make sure every value is one of the legend codes 1-3.
# An assertion is used because only the last expression of a cell is displayed;
# an extra bare value_counts() call would be computed and silently discarded.
assert df["cholesterol"].isin([1, 2, 3]).all(), "cholesterol has values outside 1-3"

# Share of each cholesterol level (recorded run):
# 75.1% of individuals have normal cholesterol levels
# 13.3% have above normal cholesterol levels
# 11.6% have much higher than normal cholesterol levels
df["cholesterol"].value_counts(normalize=True)

1    0.750793
2    0.133365
3    0.115842
Name: cholesterol, dtype: float64

In [59]:
# Glucose | make sure every value is one of the legend codes 1-3.
# An assertion is used because only the last expression of a cell is displayed;
# an extra bare value_counts() call would be computed and silently discarded.
assert df["gluc"].isin([1, 2, 3]).all(), "gluc has values outside 1-3"

# Share of each glucose level (recorded run):
# 85.1% of individuals have normal glucose levels (code 1)
# 7.3% of individuals have above normal glucose levels (code 2)
# 7.7% of individuals have much higher than normal glucose levels (code 3)
df["gluc"].value_counts(normalize=True)

1    0.850526
3    0.076642
2    0.072832
Name: gluc, dtype: float64

In [61]:
# Smoke | make sure the column is binary (0 = No, 1 = Yes) | Subjective feature |
# An assertion is used because only the last expression of a cell is displayed;
# an extra bare value_counts() call would be computed and silently discarded.
assert df["smoke"].isin([0, 1]).all(), "smoke has non-binary values"

# Share of each answer (recorded run):
# 91.2% of individuals do not smoke on a regular basis
# 8.8% of individuals smoke
df["smoke"].value_counts(normalize=True)

0    0.911727
1    0.088273
Name: smoke, dtype: float64

In [62]:
# Alcohol | make sure the column is binary (0 = No, 1 = Yes) | Subjective feature |
# An assertion is used because only the last expression of a cell is displayed;
# an extra bare value_counts() call would be computed and silently discarded.
assert df["alco"].isin([0, 1]).all(), "alco has non-binary values"

# Share of each answer (recorded run), using the legend 0 = No, 1 = Yes:
# 94.6% of individuals do not drink alcohol (alco = 0)
# 5.4% of individuals drink alcohol (alco = 1)
df["alco"].value_counts(normalize=True)

0    0.946495
1    0.053505
Name: alco, dtype: float64

In [63]:
# Active | make sure the column is binary (0 = No, 1 = Yes) | Subjective feature |
# An assertion is used because only the last expression of a cell is displayed;
# an extra bare value_counts() call would be computed and silently discarded.
assert df["active"].isin([0, 1]).all(), "active has non-binary values"

# Share of each answer (recorded run):
# 80.4% of individuals are at least somewhat physically active
# 19.6% of individuals are not at all physically active
df["active"].value_counts(normalize=True)

1    0.803583
0    0.196417
Name: active, dtype: float64

In [64]:
# Cardio | TARGET VARIABLE | classes are balanced roughly 50/50
# 50% of individuals have some form of cardiovascular disease
# 50% of individuals do NOT have any form of cardiovascular disease
df["cardio"].value_counts(normalize=True)

0    0.500342
1    0.499658
Name: cardio, dtype: float64

In [51]:
# Preview the frame after cleaning: Height / Weight / Systolic_BP / Diastolic_BP
df.head()

Unnamed: 0,age_years,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.51
3,48,2,169,82.0,150,100,1,1,0,0,1,1,28.71
5,60,1,151,67.0,120,80,2,2,0,0,0,0,29.38


In [52]:
# Count missing values per column.
# Result in the recorded run: no null values!
df.isna().sum()

age_years       0
gender          0
height          0
weight          0
systolic_bp     0
diastolic_bp    0
cholesterol     0
gluc            0
smoke           0
alco            0
active          0
cardio          0
BMI             0
dtype: int64

In [66]:
# Prepare the CLEANED dataframe for export as a CSV file.
# This binds a second name to the same DataFrame object; it is not a copy.
cardio_df_ultimate = df
cardio_df_ultimate.head(n=3)

Unnamed: 0,age_years,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,168,62.0,110,80,1,1,0,0,1,0,21.97
1,55,1,156,85.0,140,90,3,1,0,0,1,1,34.93
2,51,1,165,64.0,130,70,3,1,0,0,0,1,23.51


In [69]:
# Write the cleaned frame to disk, including the header row and the row index.
# NOTE(review): index=True writes the (now non-contiguous) row index as an
# unnamed first column, which pandas labels "Unnamed: 0" when the file is read
# back - confirm downstream readers expect that column before changing it.
cardio_df_ultimate.to_csv(path_or_buf="cardio_df_ultimate.csv", header=True, index=True)

In [None]:
# Exported to Final_Project as "cardio_df_ultimate.csv" 