In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile

import warnings
# NOTE(review): with no category given, this silences every warning for the
# rest of the kernel session; consider narrowing the filter to the specific
# categories that are known to be noise.
warnings.filterwarnings("ignore")

# Display all rows and columns of a dataframe instead of a truncated version.
# NOTE(review): with max_rows set to None, displaying a whole frame renders
# every row, so keep previews bounded with .head() on large frames.
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Dataset Overview

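1. **Age**: Age of participant, recorded in days in the raw data (integer); it is converted to whole years later in this notebook
2. **Gender**: Gender of participant (male = 1 / female = 2)
3. **Height**: Height measured in centimeters (integer)
4. **Weight**: Weight measured in kilograms (stored as a float, e.g. 62.0)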
5. **Ap_hi**: Systolic blood pressure reading taken from patient (integer)
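6. **Ap_lo**: Diastolic blood pressure reading taken from patient (integer)
7. **Cholesterol**: Total cholesterol level, described as mg/dL on a scale of 0 - 5+ units (integer), each unit denoting an increase or decrease of 20 mg/dL. Note: in this file the column takes only 3 distinct values, so it behaves as an ordinal category; verify the scale against the data source.
8. **Gluc**: Glucose level, described as mmol/L on a scale of 0 - 16+ units (integer), each unit denoting an increase or decrease of 1 mmol/L. Note: in this file the column takes only 3 distinct values, so it behaves as an ordinal category; verify the scale against the data source.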
9. **Smoke**: Whether the person smokes (binary; 0 = No, 1 = Yes).
10. **Alco**: Whether the person drinks alcohol (binary; 0 = No, 1 = Yes).
11. **Active**: Whether the person is physically active (binary; 0 = No, 1 = Yes).
12. **Cardio**: Whether the person suffers from cardiovascular disease (binary; 0 = No, 1 = Yes).

In [2]:
# Load the heart dataset from the extracted CSV and preview the first rows.
# Alternative kept for reference (read straight from the zipped download):
#   df = pd.read_csv('archive.zip', compression='zip')
df = pd.read_csv(filepath_or_buffer='./Data/heart_data.csv')
df.head(n=5)

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


## Basic Data Checkup & Feature Engineering

In [3]:
# Keep a single row-identifier column out of 'index' and 'id'.
# 'index' is dropped unless it differs from 'id' AND has fewer distinct
# values than 'id'; only in that case is 'id' dropped instead.
# NOTE(review): dropping 'id' when 'index' is the less-unique column looks
# inverted for an identifier check; confirm the intent.
same_keys = (df['index'] == df['id']).all()
if not same_keys and df['index'].nunique(dropna=False) < df['id'].nunique(dropna=False):
    redundant_col = 'id'
else:
    redundant_col = 'index'
df = df.drop(columns=[redundant_col])

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Inspect column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [5]:
# (number of rows, number of columns)
df.shape

(70000, 13)

In [6]:
# Count missing values per column, listing the columns with the most nulls first
df.isnull().sum().sort_values(ascending=False)

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [7]:
# Number of distinct (non-null) values in each column of the dataset
df.nunique()

id             70000
age             8076
gender             2
height           109
weight           287
ap_hi            153
ap_lo            157
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
dtype: int64

In [8]:
def convert_to_years(days):
    """Convert an age expressed in days to completed years.

    The conversion floor-divides by 365.25 (the average year length once
    leap years are accounted for), so partial years are discarded.

    Parameters
    ----------
    days : number
        Age in days. Anything that supports ``//`` with a float works.

    Returns
    -------
    float
        Whole number of completed years, as a float because the divisor is
        a float (e.g. ``18393`` -> ``50.0``).
    """
    days_per_year = 365.25
    completed_years = days // days_per_year
    return completed_years

# Convert age from days to completed years in a single vectorized step:
# convert_to_years only uses `//`, which pandas applies element-wise, so no
# per-row .apply is needed and the resulting float64 values are the same.
#
# Re-run guard: this cell overwrites 'age' in place, so executing it twice
# would floor the already-converted years down to 0. Ages still stored in
# days are far larger than any plausible age in years, so the conversion is
# skipped once the column no longer looks like day counts.
# NOTE(review): the guard assumes the data contains at least one participant
# older than 150 days; true for the adult ages seen here.
MAX_PLAUSIBLE_AGE_YEARS = 150
if df['age'].max() > MAX_PLAUSIBLE_AGE_YEARS:
    df['age'] = convert_to_years(df['age'])
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.0,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.0,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51.0,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48.0,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47.0,1,156,56.0,100,60,1,1,0,0,0,0


In [9]:
# Re-check dtypes after the age conversion: 'age' becomes float64 because
# floor division by 365.25 yields floats
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  float64
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 6.9 MB


In [10]:
# Print the number of distinct values per column (a NaN, if present, is
# counted as its own value, matching len(Series.unique())).
# NOTE: despite its name, obj_col holds every column, not only object-dtype ones.
obj_col = df.columns
for col in obj_col:
    n_distinct = df[col].nunique(dropna=False)
    print("{}: {}".format(col, n_distinct))

id: 70000
age: 28
gender: 2
height: 109
weight: 287
ap_hi: 153
ap_lo: 157
cholesterol: 3
gluc: 3
smoke: 2
alco: 2
active: 2
cardio: 2


In [12]:
# Frequency of each age in completed years, most common first
df['age'].value_counts()

55.0    4225
53.0    4166
57.0    3987
59.0    3844
49.0    3664
51.0    3601
54.0    3309
56.0    3307
58.0    3110
52.0    3043
50.0    2971
63.0    2937
61.0    2933
60.0    2933
47.0    2303
45.0    2214
43.0    2137
41.0    2004
62.0    1991
64.0    1986
39.0    1880
48.0    1705
40.0    1524
46.0    1499
44.0    1408
42.0    1315
29.0       3
30.0       1
Name: age, dtype: int64

## Exploratory Data Analysis