# The project: Heart Diseases Prediction

The goal of the competition is to predict heart disease based on the given data. The target variable is 'cardio'. The evaluation metric for the competition is ROC-AUC.

**Plan of the notebook:**
1. [Import of Modules & Files Opening](#importing)
2. [Data Preprocessing and Exploratory Data Analysis (EDA)](#preprocessing_and_eda)
3. [Development of ML-models](#ml_models)
4. [Conclusion](#conlusion)

<a name='importing'></a>
# 1. Import of Modules & Files Opening

In [65]:
%pip install phik
%pip install scikit-learn --upgarde
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.

Usage:   
  /Users/midle/Desktop/Coding/Python Apps/heart_disease_app/venv/bin/python -m pip install [options] <requirement specifier> [package-index-options] ...
  /Users/midle/Desktop/Coding/Python Apps/heart_disease_app/venv/bin/python -m pip install [options] -r <requirements file> [package-index-options] ...
  /Users/midle/Desktop/Coding/Python Apps/heart_disease_app/venv/bin/python -m pip install [options] [-e] <vcs project url> ...
  /Users/midle/Desktop/Coding/Python Apps/heart_disease_app/venv/bin/python -m pip install [options] [-e] <local project path> ...
  /Users/midle/Desktop/Coding/Python Apps/heart_disease_app/venv/bin/python -m pip install [options] <archive url/path> ...

no such option: --upgarde
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [66]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import phik
from phik.report import plot_correlation_matrix
from phik import report

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Render floats with thousands separators and three decimals in displayed frames.
pd.set_option('display.float_format', '{:,.3f}'.format)
# Turn off pandas' chained-assignment checks for the in-place cleaning below.
pd.set_option('mode.chained_assignment', None)

In [67]:
# Read the training data from train.csv (path is relative to the working directory)
train_csv_path = 'train.csv'
df_train = pd.read_csv(train_csv_path)

In [68]:
# Read test.csv once; `df_test_orig` keeps a handle on the frame exactly as loaded,
# while `df_test` is the name the preprocessing cells below work with.
df_test_orig = pd.read_csv('test.csv')
df_test = df_test_orig

In [69]:
# Remove the `id` column from both frames (rebinds the names to new frames).
df_train = df_train.drop(columns=['id'])
df_test = df_test.drop(columns=['id'])

In [70]:
# Turn `gender` into a 0/1 flag: every 2 becomes 0, all other values are kept.
# NOTE(review): presumably 1 encodes female in the raw data - confirm against the data description.
for frame in (df_train, df_test):
    frame['gender'] = frame['gender'].replace(2, 0)

# Rename the recoded column to match its new meaning.
df_train = df_train.rename({'gender': 'female'}, axis='columns')
df_test = df_test.rename({'gender': 'female'}, axis='columns')

In [71]:
# Median height per gender flag, measured on the training data only.
male_median_height = df_train.loc[df_train['female'] == 0, 'height'].median()
female_median_height = df_train.loc[df_train['female'] == 1, 'height'].median()

# In both train.csv and test.csv, heights below the cut-off for the row's gender
# flag (153 for female == 0, 145 for female == 1) are replaced by the matching
# training median.
for frame in (df_train, df_test):
    frame.loc[(frame['female'] == 0) & (frame['height'] < 153), 'height'] = male_median_height
    frame.loc[(frame['female'] == 1) & (frame['height'] < 145), 'height'] = female_median_height

In [72]:
# Heights above 210 are treated as data-entry errors. Show the gender flag of the
# affected training rows before they are overwritten.
display(df_train.loc[df_train['height'] > 210, 'female'])

# Replace each implausible height with the training median of the row's own gender
# flag rather than assuming every such row is female, and apply the same rule to
# the test frame so both frames are cleaned consistently.
for frame in (df_train, df_test):
    too_tall = frame['height'] > 210
    frame.loc[too_tall & (frame['female'] == 0), 'height'] = male_median_height
    frame.loc[too_tall & (frame['female'] == 1), 'height'] = female_median_height

6486    1
Name: female, dtype: int64

In [73]:
# Median weight per gender flag, measured on the training data only.
female_median_weight = df_train.loc[df_train['female'] == 1, 'weight'].median()
male_median_weight = df_train.loc[df_train['female'] == 0, 'weight'].median()

# Weights of 30 or less are replaced by the training median for the row's gender flag.
df_train.loc[(df_train['female'] == 0) & (df_train['weight'] <= 30), 'weight'] = male_median_weight
df_train.loc[(df_train['female'] == 1) & (df_train['weight'] <= 30), 'weight'] = female_median_weight

# The test masks must use the test frame's own `female` column: building them from
# df_train aligns on the row index and picks the gender of an unrelated training row.
df_test.loc[(df_test['female'] == 0) & (df_test['weight'] <= 30), 'weight'] = male_median_weight
df_test.loc[(df_test['female'] == 1) & (df_test['weight'] <= 30), 'weight'] = female_median_weight

In [74]:
# Body-mass index: weight divided by the square of (height / 100).
# Each frame must use its own `height` column; dividing the test weights by
# df_train['height'] aligns on the row index and mixes in unrelated training rows.
df_train['bmi'] = df_train['weight'] / ((df_train['height'] / 100) ** 2)
df_test['bmi'] = df_test['weight'] / ((df_test['height'] / 100) ** 2)

# `bmi` replaces the two raw measurements as a model feature.
df_train = df_train.drop(['weight', 'height'], axis=1)
df_test = df_test.drop(['weight', 'height'], axis=1)

In [75]:
# Negative pressure readings are replaced by their absolute value
# (non-negative readings are left as they are).
for frame in (df_train, df_test):
    for pressure_col in ('ap_hi', 'ap_lo'):
        frame[pressure_col] = frame[pressure_col].abs()

In [76]:
# Readings in [700, 2600] for ap_hi and [500, 1600] for ap_lo are divided by 10.
# NOTE(review): presumably these carry one extra digit - confirm with the data owner.
for frame in (df_train, df_test):
    hi_in_range = frame['ap_hi'].between(700, 2600)
    frame.loc[hi_in_range, 'ap_hi'] = frame.loc[hi_in_range, 'ap_hi'] / 10

    lo_in_range = frame['ap_lo'].between(500, 1600)
    frame.loc[lo_in_range, 'ap_lo'] = frame.loc[lo_in_range, 'ap_lo'] / 10

In [77]:
# Readings still above 2600 (ap_hi) or 1600 (ap_lo) are divided by 100.
for frame in (df_train, df_test):
    hi_too_large = frame['ap_hi'].gt(2600)
    frame.loc[hi_too_large, 'ap_hi'] = frame.loc[hi_too_large, 'ap_hi'] / 100

    lo_too_large = frame['ap_lo'].gt(1600)
    frame.loc[lo_too_large, 'ap_lo'] = frame.loc[lo_too_large, 'ap_lo'] / 100

In [78]:
# Readings in [7, 26] for ap_hi and [5, 16] for ap_lo may be missing a trailing
# zero, so they are multiplied by 10.
for frame in (df_train, df_test):
    hi_missing_zero = frame['ap_hi'].between(7, 26)
    frame.loc[hi_missing_zero, 'ap_hi'] = frame.loc[hi_missing_zero, 'ap_hi'] * 10

    lo_missing_zero = frame['ap_lo'].between(5, 16)
    frame.loc[lo_missing_zero, 'ap_lo'] = frame.loc[lo_missing_zero, 'ap_lo'] * 10

In [79]:
# Keep the per-dataset medians of both pressure columns; the cells below use them
# as replacement values.
train_median_ap_hi = df_train.loc[:, 'ap_hi'].median()
train_median_ap_lo = df_train.loc[:, 'ap_lo'].median()

test_median_ap_hi = df_test.loc[:, 'ap_hi'].median()
test_median_ap_lo = df_test.loc[:, 'ap_lo'].median()

In [80]:
# Readings outside 70..260 (ap_hi) or 50..160 (ap_lo) are overwritten with the
# median of the same column in the same dataset.
for frame, median_hi, median_lo in (
    (df_train, train_median_ap_hi, train_median_ap_lo),
    (df_test, test_median_ap_hi, test_median_ap_lo),
):
    hi_out_of_range = frame['ap_hi'].gt(260) | frame['ap_hi'].lt(70)
    frame.loc[hi_out_of_range, 'ap_hi'] = median_hi

    lo_out_of_range = frame['ap_lo'].gt(160) | frame['ap_lo'].lt(50)
    frame.loc[lo_out_of_range, 'ap_lo'] = median_lo

In [81]:
def change_blood_pressure(df):
    """Swap `ap_hi` and `ap_lo` in place on rows where `ap_hi` is below `ap_lo`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `ap_hi` and `ap_lo` columns. It is modified in place.

    Returns
    -------
    None
    """
    swapped = df['ap_hi'] < df['ap_lo']
    if not swapped.any():
        return

    # `.to_numpy()` drops the column labels so the values are assigned by position
    # (ap_lo -> ap_hi, ap_hi -> ap_lo) instead of being re-aligned by column name.
    df.loc[swapped, ['ap_hi', 'ap_lo']] = df.loc[swapped, ['ap_lo', 'ap_hi']].to_numpy()

# Apply the in-place pressure swap to the training frame, then the test frame.
for frame in (df_train, df_test):
    change_blood_pressure(frame)

In [82]:
# Where both pressure readings are equal, overwrite ap_hi with the dataset's own
# ap_hi median.
for frame, median_hi in ((df_train, train_median_ap_hi), (df_test, test_median_ap_hi)):
    same_reading = frame['ap_hi'].eq(frame['ap_lo'])
    frame.loc[same_reading, 'ap_hi'] = median_hi

In [83]:
# Show the columns left in the training frame after preprocessing.
df_train.columns

Index(['age', 'female', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke',
       'alco', 'active', 'cardio', 'bmi'],
      dtype='object')

In [84]:
# Separate the target (`cardio`) from the features, then hold out 25% of the rows
# for validation using a fixed seed.
y = df_train['cardio']
X = df_train.drop(columns=['cardio'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

In [85]:
# Metric name kept in a variable for the model-selection code.
scoring = 'roc_auc'

# Columns that get standardised vs. columns handed to the model unchanged.
numeric_columns = ['age', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'bmi']
skip_columns = ['female', 'smoke', 'alco', 'active']

numeric_transformer = StandardScaler()

# Column transformer: scale the numeric columns, pass the remaining flags through.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_transformer', numeric_transformer, numeric_columns),
        ('skip', 'passthrough', skip_columns),
    ]
)

In [86]:
# Random forest with fixed hyper-parameters, wrapped with the preprocessor so the
# scaling is fitted together with the classifier.
best_forest = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_depth=12,
    min_samples_leaf=10,
    random_state=42,
)
best_forest_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_forest),
])
best_forest_pipe.fit(X_train, y_train)

In [87]:
from pickle import dump

# Serialise the fitted pipeline (preprocessing + classifier) to ./main.pcl.
with open("./main.pcl", mode="wb") as pickle_sink:
    dump(best_forest_pipe, pickle_sink)