Predicting a patient's probability of having a heart attack using health data.

## Libraries

In [None]:
import joblib
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

## CSV dosyasını okuma
CSV dosyası pandas kütüphanesi ile DataFrame olarak okunuyor.

In [None]:
# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="heart_attack_prediction_dataset.csv")

# Sütun İsimleri

In [None]:
# Show the names of all columns in the loaded frame.
df.columns

Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk'],
      dtype='object')

# İlk Birkaç Satır
Veriyi anlamak için ilk birkaç satırın nasıl göründüğüne bakabiliriz

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


# Bir Satırı Seçme
Bazı sütunların gösterilmediğini görüyoruz (...) veri yapısını anlamak için ilk satırı inceleyelim.

In [None]:
# Show the first row on its own so that every column is visible.
df.iloc[0]

Patient ID                                     BMW7812
Age                                                 67
Sex                                               Male
Cholesterol                                        208
Blood Pressure                                  158/88
Heart Rate                                          72
Diabetes                                             0
Family History                                       0
Smoking                                              1
Obesity                                              0
Alcohol Consumption                                  0
Exercise Hours Per Week                       4.168189
Diet                                           Average
Previous Heart Problems                              0
Medication Use                                       0
Stress Level                                         9
Sedentary Hours Per Day                       6.615001
Income                                          261404
BMI       

# Veri Ön İşleme
Blood Pressure (tansiyon) sütununun iki değeri a/b şeklinde gösterdiğini görüyoruz. Bunları büyük ve küçük tansiyon verilerini iki sütuna bölecek şekilde ayırmamız gerekiyor.

In [None]:
# Split "systolic/diastolic" strings into two columns. str.split yields
# strings, so cast to int to make both columns numeric rather than relying
# on implicit conversion later on.
df[["Blood Pressure (systolic)", "Blood Pressure (diastolic)"]] = (
    df["Blood Pressure"].str.split("/", expand=True).astype(int)
)

In [None]:
# Preview the frame again; the two new blood pressure columns are at the end.
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Blood Pressure (systolic),Blood Pressure (diastolic)
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0,158,88
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0,165,93
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,28.176571,587,4,4,France,Europe,Northern Hemisphere,0,174,99
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0,163,100
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0,91,88


# One Hot Encoder
Cinsiyet, diyet gibi metin olan sütunları düzenliyoruz.

In [None]:
# Drop the columns that are not used as features, then one-hot encode the
# categorical text columns.
data = (
    df
    .drop(columns=["Blood Pressure", "Patient ID"])
    .pipe(pd.get_dummies, columns=["Sex", "Diet", "Country", "Continent", "Hemisphere"])
)

In [None]:
# Preview the one-hot encoded frame.
data.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Country_United States,Country_Vietnam,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America,Hemisphere_Northern Hemisphere,Hemisphere_Southern Hemisphere
0,67,208,72,0,0,1,0,0,4.168189,0,...,False,False,False,False,False,False,False,True,False,True
1,21,389,98,1,1,1,1,1,1.813242,1,...,False,False,False,False,False,False,True,False,True,False
2,21,324,72,1,0,0,0,0,2.078353,1,...,False,False,False,False,False,True,False,False,True,False
3,84,383,73,1,1,1,0,1,9.82813,1,...,False,False,False,False,False,False,True,False,True,False
4,66,318,93,1,1,1,1,0,5.804299,1,...,False,False,False,True,False,False,False,False,True,False


## Makine Öğrenmesi
Modelin girdi olarak kullanacağı sütunları ve tahmin edilecek hedef sütunu belirliyoruz.

In [None]:
# Model inputs. heart_attack_risk() builds its single-row frame with these
# same column names in this same order.
train_columns = [
    "Age",
    "Cholesterol",
    "Smoking",
    "Alcohol Consumption",
    "Diabetes",
    "Obesity",
    "Previous Heart Problems",
    "Blood Pressure (systolic)",
    "Blood Pressure (diastolic)",
    "Sex_Male",
    "Sex_Female",
]

# Target column the regression is trained to predict.
predict_column = "Heart Attack Risk"

In [None]:
# Ordinary least-squares linear regression with default settings.
regr = linear_model.LinearRegression()

Veriyi train ve test olarak bölüyoruz (%80 train, %20 test)

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore the fitted model and the reported error, reproducible
# across runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

Train verilerinde, seçilen sütunları kullanarak modeli Heart Attack Risk sütununu tahmin edecek şekilde eğitiyoruz.

In [None]:
# Fit the regression on the training split: train_columns are the inputs,
# predict_column is the target.
model = regr.fit(train[train_columns], train[predict_column])

In [None]:
# Persist the fitted model to disk; heart_attack_risk() loads this file.
joblib.dump(model, "model.pkl")

['model.pkl']

# Modeli Test Etme
Bu kısımda eğitilmiş modeli, test verilerinden Heart Attack Risk sütununu çıkararak test ediyoruz.

In [None]:
# Predict the target for the held-out rows using only the feature columns
# ("sonuclar" = results).
sonuclar = regr.predict(test[train_columns])

In [None]:
# Display the predicted values.
sonuclar

array([0.34997906, 0.38340381, 0.36959762, ..., 0.38145423, 0.35069714,
       0.35502835])

# Model Başarısını Ölçme
Bulunan sonuçları gerçek Heart Attack Risk değerleriyle kıyaslayarak model başarısını ölçüyoruz.

Tahmin edilen sonuç ile gerçek sonuç arasındaki mutlak hata ortalaması

In [None]:
# Mean absolute difference between the true target values of the test split
# and the model's predictions.
mean_absolute_error(y_true=test[predict_column], y_pred=sonuclar)

0.45998254410892464

# Modeli Kullanma
API için tahmin

In [None]:
def heart_attack_risk(age, cholesterol, blood_pressure, sex,
    smoking=False, alcohol=False, diabetes=False, obesity=False, previous = 0):
    """Predict the heart attack risk score for a single patient.

    Builds a one-row DataFrame from the arguments, using the same column
    names and order as the training features, and evaluates the model
    stored in "model.pkl" on it.

    Args:
        age: Patient age.
        cholesterol: Cholesterol value.
        blood_pressure: String of the form "systolic/diastolic", e.g. "120/80".
        sex: "Male" or "Female" (case and surrounding whitespace are ignored).
        smoking: Truthy if the patient smokes; encoded as 0/1.
        alcohol: Truthy if the patient consumes alcohol; encoded as 0/1.
        diabetes: Truthy if the patient has diabetes; encoded as 0/1.
        obesity: Truthy if the patient is obese; encoded as 0/1.
        previous: Value for the "Previous Heart Problems" column (0 or 1).

    Returns:
        The array returned by the model's predict() for the single input row.

    Raises:
        ValueError: If blood_pressure is not two numeric values separated by
            "/", or if sex is neither "Male" nor "Female".
    """
    # Validate the inputs first so that bad requests fail with a clear message
    # instead of an IndexError or a silently meaningless prediction.
    pressure_parts = str(blood_pressure).split("/")
    if len(pressure_parts) != 2:
        raise ValueError(
            f"blood_pressure must look like 'systolic/diastolic', got {blood_pressure!r}"
        )
    systolic, diastolic = (float(part) for part in pressure_parts)

    sex_key = str(sex).strip().lower()
    if sex_key not in ("male", "female"):
        # Otherwise both one-hot columns would be 0, which never occurs in the
        # training data.
        raise ValueError(f"sex must be 'Male' or 'Female', got {sex!r}")

    # Every value is wrapped in a one-element list so the frame has exactly one
    # row; key order matches the training feature order.
    features = pd.DataFrame(data={
        "Age": [age],
        "Cholesterol": [cholesterol],
        "Smoking": [int(smoking)],
        "Alcohol Consumption": [int(alcohol)],
        "Diabetes": [int(diabetes)],
        "Obesity": [int(obesity)],
        "Previous Heart Problems": [int(previous)],
        "Blood Pressure (systolic)": [systolic],
        "Blood Pressure (diastolic)": [diastolic],
        "Sex_Male": [int(sex_key == "male")],
        "Sex_Female": [int(sex_key == "female")],
    })

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load a model.pkl that comes from a trusted source.
    model = joblib.load("model.pkl")
    return model.predict(features)

# Örnek API çağrıları

In [None]:
# Example: 20-year-old male who drinks alcohol and has previous heart problems.
heart_attack_risk(age=20, cholesterol=300, blood_pressure='120/80', sex='Male', alcohol=True, previous=1)

array([0.35046476])

In [None]:
# Example: 26-year-old female smoker with diabetes.
heart_attack_risk(age=26, cholesterol=80, blood_pressure='130/70', sex='Female', smoking=True, diabetes=True)

array([0.3401804])

In [None]:
# Example: 69-year-old male with smoking, alcohol, obesity and diabetes flags set.
# NOTE(review): these inputs match the dataset row displayed by the next cell
# (Heart Rate == 40), but that row has Previous Heart Problems = 1 while this
# call leaves `previous` at its default of 0 - confirm whether previous=1 was
# intended.
heart_attack_risk(age=69, cholesterol=379, blood_pressure='173/75', sex='Male', smoking=True, alcohol=True, obesity=True, diabetes=True)

array([0.39150718])

In [None]:
# Show the first row of the encoded data whose heart rate is exactly 40.
data.loc[data["Heart Rate"] == 40].iloc[0]

Age                                       69
Cholesterol                              379
Heart Rate                                40
Diabetes                                   1
Family History                             1
Smoking                                    1
Obesity                                    1
Alcohol Consumption                        1
Exercise Hours Per Week             4.184648
Previous Heart Problems                    1
Medication Use                             0
Stress Level                               5
Sedentary Hours Per Day             9.060509
Income                                267997
BMI                                28.332747
Triglycerides                             68
Physical Activity Days Per Week            3
Sleep Hours Per Day                        6
Heart Attack Risk                          0
Blood Pressure (systolic)                173
Blood Pressure (diastolic)                75
Sex_Female                             False
Sex_Male  