In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the csv data to a Pandas DataFrame
heart_data = pd.read_csv('heart_disease_data.csv')
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Here is a description of each column:

age: age of the patient                                     
sex: gender of the patient (0 = female, 1 = male)                                                 
cp: chest pain type (0 = typical angina, 1 = atypical angina, 2 = non-anginal pain, 3 = asymptomatic)                                                
trestbps: resting blood pressure (in mm Hg)                                          
chol: serum cholesterol (in mg/dl)                                                           
fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)                                                  
restecg: resting electrocardiographic results (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
thalach: maximum heart rate achieved                                                   
exang: exercise induced angina (1 = yes, 0 = no)                                                 
oldpeak: ST depression induced by exercise relative to rest                                           
slope: the slope of the peak exercise ST segment (0 = downsloping, 1 = flat, 2 = upsloping)               
ca: number of major vessels colored by fluoroscopy (0-3)                                              
thal: thalassemia (0 = normal, 1 = fixed defect, 2 = reversible defect)                                
target: presence of heart disease (1 = yes, 0 = no)                                      

In [6]:
# checking for missing values
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [19]:
# statistical measures about the data
# heart_data.describe()

In [8]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [10]:
model = LogisticRegression()
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(training_data_accuracy)

0.8512396694214877


In [12]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
test_data_accuracy

0.819672131147541

In [13]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [14]:
X = heart_data.drop(columns='target', axis=1)
Y = heart_data['target']

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Load your dataset and perform data preprocessing (X and Y).

# Split the data into training and test sets.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Create a CatBoost classifier.
model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss')

# Training the CatBoost model with training data.
model.fit(X_train, Y_train)

# Accuracy on the training data.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print("Training Data Accuracy:", training_data_accuracy)

# Accuracy on the test data.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print("Test Data Accuracy:", test_data_accuracy)


0:	learn: 0.6645517	total: 47.4ms	remaining: 4.69s
1:	learn: 0.6409540	total: 48.4ms	remaining: 2.37s
2:	learn: 0.6196643	total: 50.1ms	remaining: 1.62s
3:	learn: 0.6013568	total: 51.1ms	remaining: 1.23s
4:	learn: 0.5822471	total: 52ms	remaining: 988ms
5:	learn: 0.5675466	total: 52.8ms	remaining: 828ms
6:	learn: 0.5493878	total: 53.6ms	remaining: 713ms
7:	learn: 0.5367728	total: 54.5ms	remaining: 627ms
8:	learn: 0.5243598	total: 55.4ms	remaining: 560ms
9:	learn: 0.5119162	total: 56.2ms	remaining: 506ms
10:	learn: 0.5031497	total: 57ms	remaining: 461ms
11:	learn: 0.4931284	total: 57.7ms	remaining: 423ms
12:	learn: 0.4840162	total: 58.7ms	remaining: 393ms
13:	learn: 0.4735327	total: 59.2ms	remaining: 364ms
14:	learn: 0.4626056	total: 60.1ms	remaining: 340ms
15:	learn: 0.4550936	total: 60.9ms	remaining: 320ms
16:	learn: 0.4474164	total: 61.8ms	remaining: 302ms
17:	learn: 0.4397248	total: 62.7ms	remaining: 286ms
18:	learn: 0.4321199	total: 63.5ms	remaining: 271ms
19:	learn: 0.4261718	total

# Prediction System

In [18]:
input_data = (43,1,0,120,177,0,0,120,1,2.5,1,0,3)
np_data = np.asarray(input_data)
reshaped_df = np_data.reshape(1,-1)
pred = model.predict(reshaped_df)
if pred[0] == 0:
    print("the person doesnt have a heart disease")
else:
    print('the person does have a heart desease')

the person doesnt have a heart disease


In [17]:
# Get user input for the features
# age = float(input("Enter age: "))
# sex = float(input("Enter sex (0 for female, 1 for male): "))
# cp = float(input("Enter chest pain type (0-3): "))
# trestbps = float(input("Enter resting blood pressure: "))
# chol = float(input("Enter serum cholesterol (mg/dl): "))
# fbs = float(input("Enter fasting blood sugar (0 for <= 120 mg/dl, 1 for > 120 mg/dl): "))
# restecg = float(input("Enter resting electrocardiographic results (0-2): "))
# thalach = float(input("Enter maximum heart rate achieved: "))
# exang = float(input("Enter exercise-induced angina (0 for no, 1 for yes): "))
# oldpeak = float(input("Enter ST depression induced by exercise relative to rest: "))
# slope = float(input("Enter slope of the peak exercise ST segment (0-2): "))
# ca = float(input("Enter number of major vessels colored by fluoroscopy (0-3): "))
# thal = float(input("Enter thalassemia type (0-2): "))

# Create a NumPy array with the input data
inputs = (43,1,0,120,177,0,0,120,1,2.5,1,0,3)
# input_data = np.array([age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal])
np_data = np.asarray(inputs)
# Reshape the input data to match the model's input shape (assuming your model expects a 2D array)
reshaped_data = np_data.reshape(1, -1)

# Make predictions
pred = model.predict(reshaped_data)

if pred[0] == 0:
    print("The person doesn't have heart disease.")
else:
    print("The person does have heart disease.")

The person doesn't have heart disease.
