# Import the dependencies

In [423]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data Collection

In [424]:
data = pd.read_csv('D:/try2/heart.csv')

In [425]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [426]:
data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [427]:
data.shape

(303, 14)

In [428]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


# Checking for missing values

In [429]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [430]:
# No missing values

# Handling Duplicate data

In [431]:
data_dup = data.duplicated().any()
data_dup

True

In [432]:
data = data.drop_duplicates()

In [433]:
data_dup = data.duplicated().any()
data_dup

False

# Data processing

In [434]:
cate_val = []
cont_val = []

for col in data.columns:
    if data[col].nunique() <= 10:
        cate_val.append(col)
    else:
        cont_val.append(col)

In [435]:
cate_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [436]:
cont_val

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Encoding Categorical data

In [437]:
cate_val

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [438]:
cate_val.remove('sex')
cate_val.remove('fbs')
cate_val.remove('restecg')
cate_val.remove('exang')
cate_val.remove('target')

In [439]:
data = pd.get_dummies(data,columns=cate_val,drop_first=False)

In [440]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,1,0,150,0,2.3,1,...,False,True,False,False,False,False,False,True,False,False
1,37,1,130,250,0,1,187,0,3.5,1,...,False,True,False,False,False,False,False,False,True,False
2,41,0,130,204,0,0,172,0,1.4,1,...,True,True,False,False,False,False,False,False,True,False
3,56,1,120,236,0,1,178,0,0.8,1,...,True,True,False,False,False,False,False,False,True,False
4,57,0,120,354,0,1,163,1,0.6,1,...,True,True,False,False,False,False,False,False,True,False


In [441]:
data.describe()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,0.543046
std,9.04797,0.466426,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.49897
min,29.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0
25%,48.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,0.0
50%,55.5,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0
75%,61.0,1.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,1.0
max,77.0,1.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,1.0


# Checking the distribution of target variable

In [442]:
data['target'].value_counts()

target
1    164
0    138
Name: count, dtype: int64

# Feature Scaling

In [443]:
from sklearn.preprocessing import StandardScaler

In [444]:
st = StandardScaler()
data[cont_val] = st.fit_transform(data[cont_val])

In [445]:
data.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.949794,1,0.764066,-0.261285,1,0,0.018826,0,1.084022,1,...,False,True,False,False,False,False,False,True,False,False
1,-1.928548,1,-0.091401,0.067741,0,1,1.636979,0,2.118926,1,...,False,True,False,False,False,False,False,False,True,False
2,-1.485726,0,-0.091401,-0.822564,0,0,0.980971,0,0.307844,1,...,True,True,False,False,False,False,False,False,True,False
3,0.174856,1,-0.661712,-0.203222,0,1,1.243374,0,-0.209608,1,...,True,True,False,False,False,False,False,False,True,False
4,0.285561,0,-0.661712,2.080602,0,1,0.587366,1,-0.382092,1,...,True,True,False,False,False,False,False,False,True,False


# Split into training and testing dataset

In [446]:
X = data.drop(columns='target', axis=1)
Y = data['target']

In [447]:
from sklearn.model_selection import train_test_split

In [448]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)

# Logistic Regression

In [449]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [450]:
log=LogisticRegression()
log.fit(X_train,Y_train)

In [420]:
rfe = RFE(log, n_features_to_select=10)
rfe.fit(X, Y)

In [421]:
print(rfe.support_)

[False  True False False False False False  True  True  True False  True
  True False  True False  True False False False  True False False False
  True]


In [422]:
X = data.drop(columns='age', axis=1)
# X = data.drop(columns='cp_0', axis=1)
# X = data.drop(columns='cp_1', axis=1)
# X = data.drop(columns='cp_2', axis=1)
# X = data.drop(columns='cp_3', axis=1)
X = data.drop(columns='trestbps', axis=1)
X = data.drop(columns='chol', axis=1)
# X = data.drop(columns='thalach', axis=1)
# X = data.drop(columns='slope_0', axis=1)
# X = data.drop(columns='slope_2', axis=1)
# X = data.drop(columns='ca_1', axis=1)
# X = data.drop(columns='ca_2', axis=1)
# X = data.drop(columns='ca_3', axis=1)
# X = data.drop(columns='thal_1', axis=1)
# X = data.drop(columns='thal_2', axis=1)
# X = data.drop(columns='thal_3', axis=1)

In [230]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)

In [231]:
log=LogisticRegression()
log.fit(X_train,Y_train)

In [451]:
ypred1=log.predict(X_test)

In [452]:
unique_values, counts = np.unique(ypred1, return_counts=True)

# Print the results
for value, count in zip(unique_values, counts):
    print(f"{value} occurs {count} times")

0 occurs 26 times
1 occurs 35 times


In [453]:
accuracy_score(Y_test,ypred1)

0.8688524590163934

# SVC

In [454]:
from sklearn import svm

In [455]:
svm=svm.SVC()

In [456]:
svm.fit(X_train,Y_train)

In [457]:
ypred2=svm.predict(X_test)

In [458]:
accuracy_score(Y_test,ypred2)

0.8032786885245902

# KNeighbours Classifier


In [459]:
from sklearn.neighbors import KNeighborsClassifier

In [460]:
knn=KNeighborsClassifier(n_neighbors=9)

In [461]:
knn.fit(X_train,Y_train)

In [462]:
X_test.shape

(61, 25)

In [463]:
if not isinstance(X_test, np.ndarray):
    X_test = np.array(X_test)

# Check the shape of 'X_test'
if X_test.ndim == 1:
    # If 'X_test' is a 1D array, reshape it to a 2D array with one row
    X_test = X_test.reshape(1, -1)

In [464]:
ypred5=knn.predict(X_test)



In [465]:
accuracy_score(Y_test,ypred5)

0.8032786885245902

# Non-Linear ML Algorithms

In [371]:
data = pd.read_csv('D:/try2/heart.csv')

In [372]:
data = data.drop_duplicates()

In [373]:
X = data.drop(columns='target', axis=1)
Y = data['target']

In [374]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)

# Random Forest Classifier

In [466]:
from sklearn.ensemble import RandomForestClassifier

In [467]:
rt=RandomForestClassifier()

In [468]:
rt.fit(X_train,Y_train)

In [469]:
ypred3=rt.predict(X_test)



In [470]:
accuracy_score(Y_test,ypred3)

0.7704918032786885

# Gradient Boosting Classifier

In [471]:
from sklearn.ensemble import GradientBoostingClassifier

In [472]:
gbc = GradientBoostingClassifier()

In [473]:
gbc.fit(X_train,Y_train)

In [474]:
ypred4=gbc.predict(X_test)



In [475]:
accuracy_score(Y_test,ypred4)

0.7704918032786885

# Decision Tree Classifier

In [476]:
from sklearn.tree import DecisionTreeClassifier

In [477]:
dt=DecisionTreeClassifier()

In [478]:
dt.fit(X_train,Y_train)

In [479]:
ypred6=dt.predict(X_test)



In [480]:
accuracy_score(Y_test,ypred6)

0.7049180327868853

In [390]:
final_data=pd.DataFrame({'Models':['LR','SVM','RF','GC','KNN','DT'],'Acc':[accuracy_score(Y_test,ypred1),accuracy_score(Y_test,ypred2),accuracy_score(Y_test,ypred3),accuracy_score(Y_test,ypred4),accuracy_score(Y_test,ypred5),accuracy_score(Y_test,ypred6)]})

In [391]:
final_data

Unnamed: 0,Models,Acc
0,LR,0.868852
1,SVM,0.803279
2,RF,0.754098
3,GC,0.803279
4,KNN,0.803279
5,DT,0.721311
