In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


- age : age of a person
- sex : gender of a person
- cp : chest pain . it has 4 values -> typical angine - 0 , atypical angina - 1 , non-anginal pain - 2 , asymptotatic - 3
- trestbps : blood pressure
- chol : cholestrol
- fbs : Fasting blood sugar > 120 mg/dl (1 for true, 0 for false)
- restecg : Resting electrocardiographic results . it has 3 values -> normal - 0 , ST-T wave abnormality - 2 , Left ventricular hypertrophy - 3
- thalach : Maximum heart rate achieved to the patient
- exang : Exercise-induced angina -> Exercise-Induced Angina Present (Value: 1): This means that the individual experiences angina (chest pain or discomfort) during physical activity or exercise , Exercise-Induced Angina Absent (Value: 0): This means that the individual does not experience angina during physical activity or exercise.
- oldpeak : Oldpeak (ST depression): This column typically contains numerical values that represent the magnitude of ST depression observed during exercise relative to rest. The values could be positive or negative, depending on the direction of the ST segment changes.
- slope :  the "slope" column represents the slope of the peak exercise ST segment . upsloping - 1 , flat - 2 , downsloping - 3
- ca : number of major vessels (0-3) colored by flourosopy
- thal : Thalassemia type. 0 - normal , 1 - fixed defect , 2 - reversable defect
- target : Presence or absence of heart disease (1 for presence, 0 for absence)

In [3]:
np.unique(df['cp'])

array([0, 1, 2, 3], dtype=int64)

In [4]:
np.unique(df['restecg'])

array([0, 1, 2], dtype=int64)

In [5]:
np.unique(df['slope'])

array([0, 1, 2], dtype=int64)

In [6]:
np.unique(df['thal'])

array([0, 1, 2, 3], dtype=int64)

In [7]:
x = df.drop('target',axis=1)
y = df['target']

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
logreg_model = LogisticRegression()
logreg_model.fit(x_train, y_train)
logreg_predictions = logreg_model.predict(x_test)

accuracy = accuracy_score(y_test, logreg_predictions)
report = classification_report(y_test, logreg_predictions)

print(f"Logistic Regression Accuracy: {accuracy}")
print("Classification Report:\n", report)

Logistic Regression Accuracy: 0.7804878048780488
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.69      0.76       102
           1       0.74      0.87      0.80       103

    accuracy                           0.78       205
   macro avg       0.79      0.78      0.78       205
weighted avg       0.79      0.78      0.78       205



In [10]:
model2 = DecisionTreeClassifier()
model2.fit(x_train,y_train)
model2_pred = model2.predict(x_test)

accuracy = accuracy_score(y_test, model2_pred)
report = classification_report(y_test, model2_pred)

print(f"Logistic Regression Accuracy: {accuracy}")
print("Classification Report:\n", report)

Logistic Regression Accuracy: 0.9853658536585366
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [11]:
model3 = RandomForestClassifier()
model3.fit(x_train,y_train)
model3_pred = model3.predict(x_test)

accuracy = accuracy_score(y_test, model3_pred)
report = classification_report(y_test, model3_pred)

print(f"Logistic Regression Accuracy: {accuracy}")
print("Classification Report:\n", report)

Logistic Regression Accuracy: 0.9853658536585366
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [12]:
# Assuming model is your trained RandomForestClassifier
feature_importances = model3.feature_importances_
print("Feature Importances:\n", feature_importances)

Feature Importances:
 [0.07970902 0.02953324 0.14787794 0.07083752 0.08203603 0.00934804
 0.02027481 0.10584953 0.04937247 0.11960239 0.04308242 0.12506861
 0.11740799]


In [12]:
import pickle
with open('heart_disease_trained.pkl','wb') as file:
    pickle.dump(model3,file)

In [13]:
with open('heart_disease_trained.pkl','rb') as file:
    loadedmodel = pickle.load(file)

In [14]:
loadedmodel.predict(np.array([[52,1,0,125,212,0,1,168,0,1.0,2,2,3]]))

array([0], dtype=int64)