# Cardio disease analysis

In [4]:
# Importing main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay


# Notebook-wide matplotlib defaults, applied in a single update:
# 10x10 figure size, font size 12 and 300 dpi.
plt.rcParams.update(
    {
        "figure.figsize": (10.0, 10.0),
        "font.size": 12,
        "figure.dpi": 300,
    }
)

In [5]:
# Load the semicolon-delimited cardio dataset into a DataFrame.
df = pd.read_csv(
    "cardio_train.csv",
    delimiter=";",
)

#### Data description reported by authors

There are 3 types of input features:

1. Objective: factual information;
2. Examination: results of medical examination;
3. Subjective: information given by the patient.


| Feature | Feature type | Name in dataset | Data type |
| :---: | :-------------------: | :-----: | :-----------: |
|Age | Objective Feature | age | int (days)|
|Height | Objective Feature | height | int (cm) |
|Weight | Objective Feature | weight | float (kg) |
|Gender | Objective Feature | gender | categorical code (1: women, 2: men) |
|Systolic blood pressure | Examination Feature | ap_hi | int |
|Diastolic blood pressure | Examination Feature | ap_lo | int |
|Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
|Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
|Smoking | Subjective Feature | smoke | binary |
|Alcohol intake | Subjective Feature | alco | binary |
|Physical activity | Subjective Feature | active | binary |
|Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

All of the dataset values were collected at the moment of medical examination. 

In [240]:
# Preview the first rows of the loaded data to sanity-check the parsed columns.
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [241]:
# List the dtype pandas inferred for each column.
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [242]:
# Summarise row count, per-column non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


Add the BMI feature from height and weight

In [243]:
# Body-mass index: weight (kg) divided by squared height in metres.
# Height is stored in cm, hence the division by 100.
BMI = df["weight"] / (df["height"] / 100) ** 2

# DataFrame.insert raises ValueError if the column already exists, so guard the
# call to keep this cell safe to re-run on the same kernel.
if "BMI" not in df.columns:
    df.insert(4, "BMI", BMI)

Convert age from days to years

In [244]:
# Age is recorded in days; express it as whole years. The int cast truncates the
# fractional part, and dividing by 365 ignores leap days.
df["age"] = df["age"].div(365).astype(int)

In [245]:
# Preview the frame after adding the BMI column and converting age.
df.head()

Unnamed: 0,id,age,gender,height,BMI,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,21.96712,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,34.927679,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,23.507805,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,28.710479,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,23.011177,56.0,100,60,1,1,0,0,0,0


Filter dataset by $BMI \geq 18.5$

In [246]:
# Keep only rows whose BMI is at least 18.5. The mask is built from the frame's
# own "BMI" column rather than the standalone `BMI` Series, so the filter does
# not depend on that global still being index-aligned with `df`.
# `.copy()` yields an independent frame, so later column assignments on `df`
# do not raise SettingWithCopyWarning.
df = df.loc[df["BMI"] >= 18.5].copy()

Search for duplicated rows

In [247]:
# Count rows that exactly repeat an earlier row across every column.
# NOTE(review): the comparison includes the `id` column, so records that differ
# only by id are not counted as duplicates — confirm that is intended.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows: {n_duplicates}")

Duplicate rows: 0


In [248]:
# Re-inspect row count, columns and dtypes after the BMI filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69350 entries, 0 to 69999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           69350 non-null  int64  
 1   age          69350 non-null  int64  
 2   gender       69350 non-null  int64  
 3   height       69350 non-null  int64  
 4   BMI          69350 non-null  float64
 5   weight       69350 non-null  float64
 6   ap_hi        69350 non-null  int64  
 7   ap_lo        69350 non-null  int64  
 8   cholesterol  69350 non-null  int64  
 9   gluc         69350 non-null  int64  
 10  smoke        69350 non-null  int64  
 11  alco         69350 non-null  int64  
 12  active       69350 non-null  int64  
 13  cardio       69350 non-null  int64  
dtypes: float64(2), int64(12)
memory usage: 7.9 MB
