### 1. Importing the Libraries

In [1]:
import pandas as pd

### 2. Importing the Dataset

In [2]:
data = pd.read_csv("csv/heart.csv")
data.tail()
value_counts = data["target"].value_counts()
print(value_counts)

1    526
0    499
Name: target, dtype: int64


### 3. Taking Care of Missing Values

In [3]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

### 4. Taking Care of Duplicate Values

In [4]:
data_dup = data.duplicated().any()

In [5]:
data_dup

True

In [6]:
data = data.drop_duplicates()

In [7]:
data_dup = data.duplicated().any()

In [8]:
data_dup

False

### 5. Data Processing

In [9]:
cate_val = []
cont_val = []
for column in data.columns:
    if data[column].nunique() <=10:
        cate_val.append(column)
    else:
        cont_val.append(column)

In [10]:
cate_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [11]:
cont_val

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

### 6. Encoding Categorical Data

In [12]:
cate_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [13]:
data['cp'].unique()

array([0, 1, 2, 3], dtype=int64)

In [14]:
cate_val.remove('sex')
cate_val.remove('target')
data = pd.get_dummies(data,columns = cate_val,drop_first=True)

In [15]:
import pandas as pd

def data_processing(df):
    # Check for missing values and remove rows with null values
    df = df.dropna()

    # Remove duplicate rows
    df = df.drop_duplicates()

    # Initialize lists for categorical and binary columns
    categorical_columns = []
    binary_columns = []

    # Categorize columns based on the number of unique values
    for column in df.columns:
        if df[column].nunique() > 10:
            # More than 10 unique values, consider it numerical
            continue
        elif df[column].nunique() == 2:
            # 2 unique values, consider it binary
            binary_columns.append(column)
        else:
            # Less than or equal to 10 unique values, consider it categorical
            categorical_columns.append(column)

    # Remove binary columns from the categorical list
    categorical_columns = [col for col in categorical_columns if col not in binary_columns]

    # Create dummy data for non-binary categorical columns
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    return df

# Example usage:
# Create a sample dataframe


df = pd.DataFrame(data)

processed_data = data_processing(data)
processed_data


Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,52,1,125,212,168,1.0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,53,1,140,203,155,3.1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,70,1,145,174,125,2.6,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,61,1,148,203,161,0.0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,62,0,138,294,106,1.9,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,68,0,120,211,115,1.5,1,0,1,0,...,0,1,0,0,0,0,0,0,1,0
733,44,0,108,141,175,0.6,1,0,1,0,...,0,1,0,0,0,0,0,0,1,0
739,52,1,128,255,161,0.0,0,0,0,0,...,1,0,1,1,0,0,0,0,0,1
843,59,1,160,273,125,0.0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0


### 7. Feature Scaling

In [16]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,52,1,125,212,168,1.0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,53,1,140,203,155,3.1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,70,1,145,174,125,2.6,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,61,1,148,203,161,0.0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,62,0,138,294,106,1.9,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
st = StandardScaler()
data[cont_val] = st.fit_transform(data[cont_val])

In [19]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,-0.267966,1,-0.376556,-0.667728,0.806035,-0.037124,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.15726,1,0.47891,-0.841918,0.237495,1.773958,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1.724733,1,0.764066,-1.403197,-1.074521,1.342748,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0.728383,1,0.935159,-0.841918,0.499898,-0.899544,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,0.839089,0,0.364848,0.919336,-1.905464,0.739054,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


### 8. Splitting The Dataset Into The Training Set And Test Set

In [20]:
X = data.drop('target',axis=1)

In [21]:
y = data['target']

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,
                                               random_state=42)

In [24]:
y_test

245    1
349    0
135    0
389    1
66     1
      ..
402    1
123    1
739    0
274    1
256    1
Name: target, Length: 61, dtype: int64

### 9. Logistic Regression

In [25]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,-0.267966,1,-0.376556,-0.667728,0.806035,-0.037124,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.15726,1,0.47891,-0.841918,0.237495,1.773958,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1.724733,1,0.764066,-1.403197,-1.074521,1.342748,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0.728383,1,0.935159,-0.841918,0.499898,-0.899544,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,0.839089,0,0.364848,0.919336,-1.905464,0.739054,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [26]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm


In [27]:
log = LogisticRegression()
log.fit(X_train,y_train)
y_pred_log = log.predict(X_test)
f1_score(y_test,y_pred_log)

0.7868852459016394

In [28]:
svm = svm.SVC()
svm.fit(X_train,y_train)
y_pred_svm = svm.predict(X_test)
f1_score(y_test,y_pred_svm)

0.8064516129032258

In [29]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred_knn=knn.predict(X_test)
accuracy_score(y_test,y_pred_knn)

0.7377049180327869

In [30]:
score = []
# for_later
for k in range(1,40):
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    y_pred=knn.predict(X_test)
    score.append(accuracy_score(y_test,y_pred))

In [31]:
score

[0.7213114754098361,
 0.8032786885245902,
 0.7049180327868853,
 0.7049180327868853,
 0.7377049180327869,
 0.8032786885245902,
 0.7868852459016393,
 0.8032786885245902,
 0.7704918032786885,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7377049180327869,
 0.7377049180327869,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7540983606557377,
 0.7377049180327869,
 0.7540983606557377,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869]

In [32]:
# knn=KNeighborsClassifier(n_neighbors=2)
# knn.fit(X_train,y_train)
# y_pred=knn.predict(X_test)
# accuracy_score(y_test,y_pred)

### Non-Linear ML Algorithms

In [33]:
data = pd.read_csv("csv/heart.csv")


In [34]:
data = data.drop_duplicates()

In [35]:
X = data.drop('target',axis=1)
y=data['target']

In [36]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,
                                                random_state=42)

### 12. Decision Tree Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred_dtc= dt.predict(X_test)

### 13. Random Forest Classifier

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred_rf= rf.predict(X_test)


In [40]:
from sklearn.ensemble import GradientBoostingClassifier

In [41]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)
y_pred_xgb = gbc.predict(X_test)
accuracy_score(y_test,y_pred_xgb)

0.8032786885245902

In [42]:
import numpy as np


final_data = pd.DataFrame({'Models':['LR','SVM','KNN','DT','RF','GB'],
                          'F1 SCORE':[f1_score(y_test,y_pred_log)*100,
                                f1_score(y_test,y_pred_svm)*100,
                                f1_score(y_test,y_pred_knn)*100,
                                f1_score(y_test,y_pred_dtc)*100,
                                f1_score(y_test,y_pred_rf)*100,
                                f1_score(y_test,y_pred_xgb)*100],
                            'RMSE':[np.sqrt(mean_squared_error(y_test,y_pred_log)),
                                np.sqrt(mean_squared_error(y_test,y_pred_svm)),
                                np.sqrt(mean_squared_error(y_test,y_pred_knn)),
                                np.sqrt(mean_squared_error(y_test,y_pred_dtc)),
                                np.sqrt(mean_squared_error(y_test,y_pred_rf)),
                                np.sqrt(mean_squared_error(y_test,y_pred_xgb))]})


In [43]:
final_data

Unnamed: 0,Models,F1 SCORE,RMSE
0,LR,78.688525,0.461644
1,SVM,80.645161,0.443533
2,KNN,75.757576,0.512148
3,DT,66.666667,0.5581
4,RF,85.245902,0.384111
5,GB,80.645161,0.443533


In [44]:
import seaborn as sns

In [45]:
X=data.drop('target',axis=1)
y=data['target']

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# rf = RandomForestClassifier()
# rf.fit(X,y)

In [48]:
import pickle
from sklearn.ensemble import RandomForestClassifier
X=data.drop('target',axis=1)
y=data['target']
# Create and train your RandomForestClassifier model
rf = RandomForestClassifier()
rf.fit(X, y)  # Replace X and y with your training data

# Specify the file path where you want to save the model
model_file = 'random_forest_model.pkl'

# Open the file in binary write mode ('wb')
with open(model_file, 'wb') as file:
    # Use pickle.dump() to serialize and save the model to the file
    pickle.dump(rf, file)

print(f'RandomForestClassifier model saved to {model_file}')


RandomForestClassifier model saved to random_forest_model.pkl


In [49]:

new_data_2=pd.read_csv("csv\heart.csv")
new_data_2.shape
value_counts = new_data_2['target'].value_counts()
print(value_counts)
new_data_2=new_data_2.drop(["target"],axis=1)
new_data_2.shape

1    526
0    499
Name: target, dtype: int64


(1025, 13)

In [50]:
import pickle
from sklearn.ensemble import RandomForestClassifier
X=data.drop('target',axis=1)
y=data['target']
# Create and train your RandomForestClassifier model
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train) 

# Specify the file path where you want to save the model
model_file = 'random_forest_model.pkl'

# Open the file in binary write mode ('wb')
with open(model_file, 'wb') as file:
    # Use pickle.dump() to serialize and save the model to the file
    pickle.dump(rf, file)

print(f'RandomForestClassifier model saved to {model_file}')


RandomForestClassifier model saved to random_forest_model.pkl


In [51]:
import pickle


model_file = 'random_forest_model.pkl'


with open(model_file, 'rb') as file:
    
    loaded_model = pickle.load(file)
predictions = loaded_model.predict(new_data_2)  
print("RandomForestClassifier model loaded and used for inference:", predictions)
len(predictions)
import numpy as np
predictions_array = np.array(predictions)
count_ones = np.count_nonzero(predictions_array == 1)
count_zeros = np.count_nonzero(predictions_array == 0)
print("Number of postives:", count_ones)
print("Number of negatives:", count_zeros)
new_data_3=pd.read_csv("csv\heart.csv")
new_data_3['Predictions'] = predictions
new_data_3


RandomForestClassifier model loaded and used for inference: [0 0 0 ... 0 1 0]
Number of postives: 526
Number of negatives: 499


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,Predictions
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1,1


In [52]:
import pickle
import pandas as pd
from sklearn.metrics import confusion_matrix

# Load the random forest model as you've shown
model_file = 'random_forest_model.pkl'
with open(model_file, 'rb') as file:
    loaded_model = pickle.load(file)

# Assuming 'new_data_2' is your DataFrame
# Predict using the loaded model
predictions = loaded_model.predict(new_data_2)

# Assuming 'actual_labels' is a Series or list containing the true labels (0 or 1)
# You should replace this with your actual ground truth labels
actual_labels = new_data_3["target"]

# Calculate the confusion matrix
confusion_matrix_result = confusion_matrix(actual_labels, predictions)

# Create a DataFrame to display the confusion matrix
classification_table = pd.DataFrame(confusion_matrix_result, columns=["Predicted 0", "Predicted 1"], index=["Actual 0", "Actual 1"])

classification_table


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,499,0
Actual 1,0,526


### 15. Prediction on New Data

In [53]:
import pandas as pd

In [54]:
new_data = pd.DataFrame({
    'age':52,
    'sex':1,
    'cp':0,
    'trestbps':126,
    'chol':100,
    'fbs':0,
    'restecg':1,
    'thalach':168,
    'exang':0,
    'oldpeak':1.0,
     'slope':2,
    'ca':2,
    'thal':3,    
},index=[0])

In [55]:
new_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,126,100,0,1,168,0,1.0,2,2,3


In [56]:
p = rf.predict(new_data)

if p[0]==0:
    print("No Disease")
else:
    print("Disease")

No Disease


### 16. Save Model Using Joblib

In [57]:
import joblib

In [58]:
joblib.dump(rf,'model_joblib_heart')

['model_joblib_heart']

In [59]:
model = joblib.load('model_joblib_heart')

In [60]:
z=model.predict(new_data)
z[0]

0

In [61]:
data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
723,68,0,2,120,211,0,0,115,0,1.5,1,0,2,1
733,44,0,2,108,141,0,1,175,0,0.6,1,0,2,1
739,52,1,0,128,255,0,1,161,1,0.0,2,1,3,0
843,59,1,3,160,273,0,0,125,0,0.0,2,0,2,0
878,54,1,0,120,188,0,1,113,0,1.4,1,1,3,0


### GUI

In [62]:
from tkinter import *
import joblib

In [63]:
from tkinter import *
import joblib
import numpy as np
from sklearn import *
def show_entry_fields():
    p1=int(e1.get())
    p2=int(e2.get())
    p3=int(e3.get())
    p4=int(e4.get())
    p5=int(e5.get())
    p6=int(e6.get())
    p7=int(e7.get())
    p8=int(e8.get())
    p9=int(e9.get())
    p10=float(e10.get())
    p11=int(e11.get())
    p12=int(e12.get())
    p13=int(e13.get())
    model = joblib.load('model_joblib_heart')
    result=model.predict([[p1,p2,p3,p4,p5,p6,p7,p8,p8,p10,p11,p12,p13]])
    
    if result == 0:
        Label(master, text="No Heart Disease").grid(row=31)
    else:
        Label(master, text="Possibility of Heart Disease").grid(row=31)
    
    
master = Tk()
master.title("Heart Disease Prediction System")
# master.configure(bg='red')

label = Label(master, text = "Heart Disease Prediction System"
                          , bg = "black", fg = "white"). \
                               grid(row=0,columnspan=2)


Label(master, text="Enter Your Age").grid(row=1)
Label(master, text="Male Or Female [1/0]").grid(row=2)
Label(master, text="Enter Value of CP").grid(row=3)
Label(master, text="Enter Value of trestbps").grid(row=4)
Label(master, text="Enter Value of chol").grid(row=5)
Label(master, text="Enter Value of fbs").grid(row=6)
Label(master, text="Enter Value of restecg").grid(row=7)
Label(master, text="Enter Value of thalach").grid(row=8)
Label(master, text="Enter Value of exang").grid(row=9)
Label(master, text="Enter Value of oldpeak").grid(row=10)
Label(master, text="Enter Value of slope").grid(row=11)
Label(master, text="Enter Value of ca").grid(row=12)
Label(master, text="Enter Value of thal").grid(row=13)



e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)
e12 = Entry(master)
e13 = Entry(master)

e1.grid(row=1, column=1)
e2.grid(row=2, column=1)
e3.grid(row=3, column=1)
e4.grid(row=4, column=1)
e5.grid(row=5, column=1)
e6.grid(row=6, column=1)
e7.grid(row=7, column=1)
e8.grid(row=8, column=1)
e9.grid(row=9, column=1)
e10.grid(row=10, column=1)
e11.grid(row=11, column=1)
e12.grid(row=12, column=1)
e13.grid(row=13, column=1)



Button(master, text='Predict', command=show_entry_fields).grid()

mainloop()

Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_2868\3193530594.py", line 6, in show_entry_fields
    p1=int(e1.get())
       ^^^^^^^^^^^^^
ValueError: invalid literal for int() with base 10: ''
