In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

Data Collection and Processing

In [51]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('heart.csv')

In [52]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [53]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
70127,43,1,0,150,248,0,1,172,0,1.50704,2,0,2,1
70128,43,1,0,120,178,0,0,121,1,2.507041,1,0,3,0
70129,65,1,0,120,178,0,1,141,0,0.407041,2,0,3,1
70130,41,0,2,112,269,0,0,173,1,0.007041,2,0,2,1
70131,65,1,0,135,255,0,0,128,0,2.807041,1,1,3,0


In [54]:
# number of rows and columns in the dataset
heart_data.shape

(70132, 14)

In [55]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70132 entries, 0 to 70131
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       70132 non-null  int64  
 1   sex       70132 non-null  int64  
 2   cp        70132 non-null  int64  
 3   trestbps  70132 non-null  int64  
 4   chol      70132 non-null  int64  
 5   fbs       70132 non-null  int64  
 6   restecg   70132 non-null  int64  
 7   thalach   70132 non-null  int64  
 8   exang     70132 non-null  int64  
 9   oldpeak   70132 non-null  float64
 10  slope     70132 non-null  int64  
 11  ca        70132 non-null  int64  
 12  thal      70132 non-null  int64  
 13  target    70132 non-null  int64  
dtypes: float64(1), int64(13)
memory usage: 7.5 MB


In [56]:
# checking for missing values
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [57]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0,70132.0
mean,54.422674,0.684894,0.937475,129.825543,248.43401,0.137455,0.550305,151.85128,0.314393,0.99502,1.408031,0.603975,2.301146,0.611447
std,9.495172,0.464561,1.017204,16.235747,45.73167,0.34433,0.519782,21.375304,0.464277,1.092893,0.646497,0.924992,0.548951,0.487425
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,0.0,0.0,120.0,215.0,0.0,0.0,139.0,0.0,0.00536,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,245.0,0.0,1.0,154.0,0.0,0.800231,1.0,0.0,2.0,1.0
75%,62.0,1.0,2.0,140.0,278.0,0.0,1.0,169.0,1.0,1.603678,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,395.0,1.0,2.0,203.0,1.0,6.207006,2.0,4.0,3.0,1.0


In [58]:
# checking the distribution of Target Variable
heart_data['target'].value_counts()

target
1    42882
0    27250
Name: count, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

Splitting the Features and Target

In [59]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [60]:
print(X)

       age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang   oldpeak  \
0       63    1   3       145   233    1        0      150      0  2.300000   
1       37    1   2       130   250    0        1      187      0  3.500000   
2       41    0   1       130   204    0        0      172      0  1.400000   
3       56    1   1       120   236    0        1      178      0  0.800000   
4       57    0   0       120   354    0        1      163      1  0.600000   
...    ...  ...  ..       ...   ...  ...      ...      ...    ...       ...   
70127   43    1   0       150   248    0        1      172      0  1.507040   
70128   43    1   0       120   178    0        0      121      1  2.507041   
70129   65    1   0       120   178    0        1      141      0  0.407041   
70130   41    0   2       112   269    0        0      173      1  0.007041   
70131   65    1   0       135   255    0        0      128      0  2.807041   

       slope  ca  thal  
0          0   0     1  
1

In [61]:
print(Y)

0        1
1        1
2        1
3        1
4        1
        ..
70127    1
70128    0
70129    1
70130    1
70131    0
Name: target, Length: 70132, dtype: int64


Splitting the Data into Training data & Test Data

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [63]:
print(X.shape, X_train.shape, X_test.shape)

(70132, 13) (56105, 13) (14027, 13)


Model Training

In [64]:
# Khởi tạo các mô hình
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [65]:
# Đánh giá các mô hình
for name, model in models.items():
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    print(f'{name} Accuracy: {accuracy_score(Y_test, Y_pred):.2f}')
    print(classification_report(Y_test, Y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      5450
           1       0.97      0.99      0.98      8577

    accuracy                           0.98     14027
   macro avg       0.98      0.98      0.98     14027
weighted avg       0.98      0.98      0.98     14027

Decision Tree Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5450
           1       1.00      1.00      1.00      8577

    accuracy                           1.00     14027
   macro avg       1.00      1.00      1.00     14027
weighted avg       1.00      1.00      1.00     14027

Random Forest Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5450
           1       1.00      1.00      1.00      8577

    accuracy                           1.00     14027
   macro avg       1.00      1.00  

Check overfitting

In [67]:
from sklearn.model_selection import cross_val_score

# Giả sử model là mô hình của bạn
cv_scores = cross_val_score(model, X, Y, cv=5)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.98773793 0.99985742 0.9999287  1.         0.99786112]
Mean cross-validation score: 0.9970770341458459


Đánh giá mô hình:
|-Logistic Regression là một mô hình đơn giản và dễ giải thích, có độ chính xác cao và ít khả năng overfitting.
|-Random Forest và Gradient Boosting: Có tiềm năng mạnh mẽ, nhưng cần điều chỉnh đúng cách để tránh overfitting.
|-SVM: Cần phải điều chỉnh các tham số hoặc sử dụng các kỹ thuật xử lý dữ liệu.
|-Decision Tree và KNN: Dễ bị overfitting, nên sẽ cần kiểm tra kỹ lưỡng hoặc tránh sử dụng nếu không thể điều chỉnh tốt.

Đề xuất mô hình:
Dựa trên các kết quả trên, Logistic Regression là mô hình khả thi nhất và một mô hình dễ hiểu và giải thích, Logistic Regression là lựa chọn tốt nhất. 

Logistic Regression

In [68]:
model = LogisticRegression()

In [69]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [70]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [71]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9824971036449515


In [72]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [73]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.980109788265488


Building a Predictive System

In [74]:
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[0]
The Person does not have a Heart Disease




Saving the trained model

In [75]:
import pickle

In [76]:
filename = 'heart_disease_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [77]:
# loading the saved model
loaded_model = pickle.load(open('heart_disease_model.sav', 'rb'))

In [78]:
for column in X.columns:
  print(column)

age
sex
cp
trestbps
chol
fbs
restecg
thalach
exang
oldpeak
slope
ca
thal
