Importing the necessary libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
wr.filterwarnings('ignore')

Import the dataset

In [5]:
# Read the heart dataset from the current working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

Understanding and displaying the data

Label encoding is performed on the feature 'sex'; rows whose encoded value is 0 are then dropped and the index is reset.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'sex' column with the integer codes produced by a LabelEncoder.
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])

# Keep only the rows whose encoded 'sex' is non-zero and renumber them from 0.
# NOTE(review): this discards every row of the class encoded as 0 - confirm that
# restricting the analysis to a single 'sex' value is intended.
df = df.loc[df['sex'] != 0].reset_index(drop=True)

In [None]:
# Textual overview of the dataframe: structure, cardinality, dtypes, missing values, statistics.
# df.info() writes its report directly to stdout and returns None, so it is called as
# a statement after its heading instead of being passed to print() (which printed "None").
print('The information about the dataset is : ')
df.info()
print()
print('The uniqueness of the dataset is : \n',df.nunique(),'\n')
print('Checking if the datatype is numeric or object type, true when object : \n',df.dtypes=='object','\n')
print('Checking for the missing values : \n',df.isnull().sum(),'\n')
print('The basic statistics of the data, that is data description : \n',df.describe(),'\n')

Exploratory Data Analysis
Includes univariate, bivariate and multivariate analysis as part of data preprocessing

Univariate Analysis 
Univariate analysis is being done using box plot to detect the outliers

In [None]:
# One horizontal box per column of df, drawn on a large canvas to expose outliers.
plt.figure(figsize=(20, 20))
sns.set_style(style='darkgrid')
sns.set_palette(palette='pastel')
sns.boxplot(
    data=df,
    orient='h',
    width=0.5,
)

Count plot to check the class balance in the target feature

In [None]:
# Class balance of the target: one bar per target value, bar height = number of rows.
plt.figure(figsize=(10,8))
sns.countplot(x='target',data=df)
# The classes are on the x axis and the counts on the y axis (the labels were swapped).
plt.xlabel('target')
plt.ylabel('count')
plt.show()

Kernel Density plots for skewness and the data distribution

In [None]:
# Histogram + KDE for every int64/float64 column, titled with the column's skewness.
sns.set_style('darkgrid')
numerical_columns=df.select_dtypes(include=['int64','float64']).columns

# Lay the plots out two per row. The grid needs ceil(n/2) rows; using n rows with
# two columns left the lower half of the figure empty.
n_cols=2
n_rows=max(1,-(-len(numerical_columns)//n_cols))
plt.figure(figsize=(14,n_rows*3))
for idx, feature in enumerate(numerical_columns,1):
    plt.subplot(n_rows,n_cols,idx)
    sns.histplot(df[feature],kde=True)
    plt.title(f"{feature}| skewness:{round(df[feature].skew(),2)}")

plt.tight_layout()
plt.show()

Bivariate analysis is being performed using pairplots

In [None]:
# Pairwise scatter/density grid of all columns, coloured by the target class.
sns.set_palette('pastel')
sns.pairplot(df,hue='target')
# pairplot builds a whole grid of axes; plt.title would only label the last subplot,
# so the heading is attached to the figure itself, nudged above the top row.
plt.suptitle('Pair Plot Of dataframe',y=1.02)
plt.show()

Multivariate analysis using the correlation matrix as a heatmap

In [None]:
# Pairwise correlation of every column of df, rendered as an annotated heat map.
matrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    matrix,
    annot=True,
    fmt='.2f',
    cmap='Pastel2',
    annot_kws={'size': 10},
)
plt.title('Correlation matrix')
plt.show()

Splitting the data and extracting the target features

Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows (and on the target column too) before
# any train/test split, so test-set statistics influence the scaling - confirm acceptable.
sc=MinMaxScaler()
scaled=sc.fit_transform(df)
# fit_transform returns a bare array; restore the column names and index so the scaled
# frame (and everything sliced from it) stays labelled like df instead of 0..n-1.
df_scaled=pd.DataFrame(scaled,columns=df.columns,index=df.index)

In [None]:
# Predictors are every column but the last; the last column is the label.
x, y = df_scaled.iloc[:, :-1], df_scaled.iloc[:, -1]

Feature Transformation

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA for each component count 0..11 and record the total explained-variance ratio.
# After the loop, `pca` is the model fitted with the largest count in `nums`.
nums = np.arange(12)
var_ratio = []
for num in nums:
  pca = PCA(n_components=num).fit(x)
  var_ratio.append(pca.explained_variance_ratio_.sum())

# Cumulative explained variance as a function of the number of components kept.
plt.figure(figsize=(4, 2), dpi=150)
plt.grid()
plt.plot(nums, var_ratio, marker='o')
plt.title('n_components vs. Explained Variance Ratio')
plt.xlabel('n_components')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Scree plot: variance explained by each individual component of the PCA object
# currently bound to `pca` (whichever one was fitted most recently before this cell).
PC_values = np.arange(1, pca.n_components_ + 1)
plt.grid()
plt.plot(
    PC_values,
    pca.explained_variance_ratio_,
    'o-',
    color='blue',
    linewidth=2,
)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

The optimal number of principal components will be 8 or 9.

In [None]:
# Project the predictors onto 8 principal components and report the share of variance kept.
pca = PCA(n_components=8)
x_pca = pca.fit_transform(x)
retained_ratios = pca.explained_variance_ratio_
print(sum(retained_ratios))

Splitting the data into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the PCA-projected predictors. The split is stratified on y, like the
# split of the unprojected predictors, so both feature sets keep the class ratio in
# train and test and are evaluated on comparable partitions.
x_train_pca,x_test_pca,y_train_pca,y_test_pca=train_test_split(x_pca,y,test_size=0.30,random_state=10,stratify=y)

In [None]:
# 70/30 stratified split of the scaled predictors (class ratio preserved in both parts).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=10,
    stratify=y,
)
print(x_train.head())

Model selection and fitting the model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

Decision Tree Classifier

In [None]:
# Decision tree (Gini criterion, fixed seed) fitted on the training partition.
dt = DecisionTreeClassifier(random_state=42, criterion='gini')
model1 = dt.fit(x_train, y_train)

# Predictions on both partitions; the accuracies below are percentages.
y_pred1_tr = model1.predict(x_train)
y_pred1 = model1.predict(x_test)

acc1_tr = 100 * accuracy_score(y_true=y_train, y_pred=y_pred1_tr)
print(acc1_tr)
acc1 = 100 * accuracy_score(y_true=y_test, y_pred=y_pred1)
print(acc1)

In [None]:
# Draw the fitted decision tree, truncated to the first 5 levels.
from sklearn import tree

tree.plot_tree(
    dt,
    max_depth=5,
    filled=True,
    rounded=True,
)
plt.show()

In [None]:
# Confusion matrix of the decision tree on the test partition.
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
print(cm1)

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test partition.
rep1 = classification_report(y_true=y_test, y_pred=y_pred1)
print(rep1)

In [None]:
# RBF-kernel support vector classifier fitted on the training partition.
svm = SVC(C=1.0, kernel='rbf', gamma=0.5)
model2 = svm.fit(x_train, y_train)

# Predictions on both partitions; the accuracies below are percentages.
y_pred2_tr = model2.predict(x_train)
y_pred2 = model2.predict(x_test)

acc2_tr = 100 * accuracy_score(y_true=y_train, y_pred=y_pred2_tr)
print(acc2_tr)
acc2 = 100 * accuracy_score(y_true=y_test, y_pred=y_pred2)
print(acc2)

In [None]:
# Confusion matrix of the SVM on the test partition.
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
print(cm2)

In [None]:
# Classification report of the SVM (model 2) on the test partition.
# Stored as rep2: it was previously written to rep3, which a later cell overwrites,
# so the SVM report was lost and rep2 was never defined.
rep2=classification_report(y_test,y_pred2)
print(rep2)

In [None]:
# Random forest of 100 trees fitted on the training partition.
# random_state is pinned so the forest, its accuracies and everything derived from it
# are the same on every run of the notebook.
rf=RandomForestClassifier(n_estimators=100,random_state=42)
model3=rf.fit(x_train,y_train)
# Training accuracy (percent).
y_pred3_tr=model3.predict(x_train)
acc3_tr=accuracy_score(y_train,y_pred3_tr)*100
print(acc3_tr)
# Test accuracy (percent).
y_pred3=model3.predict(x_test)
acc3=accuracy_score(y_test,y_pred3)*100
print(acc3)

In [None]:
# Confusion matrix of the random forest on the test partition.
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred3)
print(cm3)

In [None]:
# Per-class precision / recall / F1 of the random forest on the test partition.
rep3 = classification_report(y_true=y_test, y_pred=y_pred3)
print(rep3)

In [None]:
# L2-regularised logistic regression fitted on the training partition.
logr = LogisticRegression(C=1.0, penalty='l2')
model4 = logr.fit(x_train, y_train)

# Predictions on both partitions; the accuracies below are percentages.
y_pred4_tr = model4.predict(x_train)
y_pred4 = model4.predict(x_test)

acc4_tr = 100 * accuracy_score(y_true=y_train, y_pred=y_pred4_tr)
print(acc4_tr)
acc4 = 100 * accuracy_score(y_true=y_test, y_pred=y_pred4)
print(acc4)

In [None]:
# Confusion matrix of the logistic regression on the test partition.
cm4 = confusion_matrix(y_true=y_test, y_pred=y_pred4)
print(cm4)

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test partition.
rep4 = classification_report(y_true=y_test, y_pred=y_pred4)
print(rep4)

In [None]:
# AdaBoost (100 estimators) fitted on the training partition; seed pinned for reproducibility.
adb=AdaBoostClassifier(n_estimators=100,learning_rate=1,random_state=42)
model5=adb.fit(x_train,y_train)
# Training accuracy (percent). The predictions must come from model5: they were
# previously taken from model1 (the decision tree), so the AdaBoost training
# accuracy was really the decision tree's.
y_pred5_tr=model5.predict(x_train)
acc5_tr=accuracy_score(y_train,y_pred5_tr)*100
print(acc5_tr)
# Test accuracy (percent).
y_pred5=model5.predict(x_test)
acc5=accuracy_score(y_test,y_pred5)*100
print(acc5)

In [None]:
# Confusion matrix of AdaBoost on the test partition.
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred5)
print(cm5)

In [None]:
# Per-class precision / recall / F1 of AdaBoost on the test partition.
rep5 = classification_report(y_true=y_test, y_pred=y_pred5)
print(rep5)

In [None]:
# Histogram-based gradient boosting fitted on the training partition.
hgb = HistGradientBoostingClassifier(learning_rate=0.1)
model6 = hgb.fit(x_train, y_train)

# Predictions on both partitions; the accuracies below are percentages.
y_pred6_tr = model6.predict(x_train)
y_pred6 = model6.predict(x_test)

acc6_tr = 100 * accuracy_score(y_true=y_train, y_pred=y_pred6_tr)
print(acc6_tr)
acc6 = 100 * accuracy_score(y_true=y_test, y_pred=y_pred6)
print(acc6)

In [None]:
# Confusion matrix of the gradient-boosting model on the test partition.
cm6 = confusion_matrix(y_true=y_test, y_pred=y_pred6)
print(cm6)

In [None]:
# Per-class precision / recall / F1 of the gradient-boosting model on the test partition.
rep6 = classification_report(y_true=y_test, y_pred=y_pred6)
print(rep6)

In [None]:
# Decision tree (Gini criterion, fixed seed) fitted on the PCA-projected training partition.
dt_pca=DecisionTreeClassifier(criterion='gini',random_state=42)
model1_pca=dt_pca.fit(x_train_pca,y_train_pca)
# Training accuracy (percent). The predictions are for x_train_pca, so they must be
# scored against y_train_pca; y_train belongs to a separate train_test_split call
# and is not the label vector these rows were trained on.
y_pred1_tr_pca=model1_pca.predict(x_train_pca)
acc1_tr_pca=accuracy_score(y_train_pca,y_pred1_tr_pca)*100
print(acc1_tr_pca)
# Test accuracy (percent).
y_pred1_pca=model1_pca.predict(x_test_pca)
acc1_pca=accuracy_score(y_test_pca,y_pred1_pca)*100
print(acc1_pca)

In [None]:
# Draw the decision tree fitted on the PCA features, truncated to the first 5 levels.
from sklearn import tree

tree.plot_tree(
    dt_pca,
    max_depth=5,
    filled=True,
    rounded=True,
)
plt.show()

In [None]:
# Confusion matrix of the PCA decision tree on the PCA test partition.
cm1_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred1_pca)
print(cm1_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA decision tree on the PCA test partition.
rep1_pca = classification_report(y_true=y_test_pca, y_pred=y_pred1_pca)
print(rep1_pca)

In [None]:
# RBF-kernel support vector classifier fitted on the PCA-projected training partition.
svm_pca = SVC(C=1.0, kernel='rbf', gamma=0.5)
model2_pca = svm_pca.fit(x_train_pca, y_train_pca)

# Predictions on both partitions; the accuracies below are percentages.
y_pred2_tr_pca = model2_pca.predict(x_train_pca)
y_pred2_pca = model2_pca.predict(x_test_pca)

acc2_tr_pca = 100 * accuracy_score(y_true=y_train_pca, y_pred=y_pred2_tr_pca)
print(acc2_tr_pca)
acc2_pca = 100 * accuracy_score(y_true=y_test_pca, y_pred=y_pred2_pca)
print(acc2_pca)

In [None]:
# Confusion matrix of the PCA SVM on the PCA test partition.
cm2_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred2_pca)
print(cm2_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA SVM on the PCA test partition.
rep2_pca = classification_report(y_true=y_test_pca, y_pred=y_pred2_pca)
print(rep2_pca)

In [None]:
# Random forest of 100 trees fitted on the PCA-projected training partition.
# random_state is pinned so the reported accuracies are the same on every run.
rf_pca=RandomForestClassifier(n_estimators=100,random_state=42)
model3_pca=rf_pca.fit(x_train_pca,y_train_pca)
# Training accuracy (percent).
y_pred3_tr_pca=model3_pca.predict(x_train_pca)
acc3_tr_pca=accuracy_score(y_train_pca,y_pred3_tr_pca)*100
print(acc3_tr_pca)
# Test accuracy (percent).
y_pred3_pca=model3_pca.predict(x_test_pca)
acc3_pca=accuracy_score(y_test_pca,y_pred3_pca)*100
print(acc3_pca)

In [None]:
# Confusion matrix of the PCA random forest on the PCA test partition.
cm3_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred3_pca)
print(cm3_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA random forest on the PCA test partition.
rep3_pca = classification_report(y_true=y_test_pca, y_pred=y_pred3_pca)
print(rep3_pca)

In [None]:
# L2-regularised logistic regression fitted on the PCA-projected training partition.
logr_pca = LogisticRegression(C=1.0, penalty='l2')
model4_pca = logr_pca.fit(x_train_pca, y_train_pca)

# Predictions on both partitions; the accuracies below are percentages.
y_pred4_tr_pca = model4_pca.predict(x_train_pca)
y_pred4_pca = model4_pca.predict(x_test_pca)

acc4_tr_pca = 100 * accuracy_score(y_true=y_train_pca, y_pred=y_pred4_tr_pca)
print(acc4_tr_pca)
acc4_pca = 100 * accuracy_score(y_true=y_test_pca, y_pred=y_pred4_pca)
print(acc4_pca)

In [None]:
# Confusion matrix of the PCA logistic regression on the PCA test partition.
cm4_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred4_pca)
print(cm4_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA logistic regression on the PCA test partition.
rep4_pca = classification_report(y_true=y_test_pca, y_pred=y_pred4_pca)
print(rep4_pca)

In [None]:
# AdaBoost (100 estimators) fitted on the PCA-projected training partition.
# random_state is pinned so the reported accuracies are the same on every run.
adb_pca=AdaBoostClassifier(n_estimators=100,learning_rate=1,random_state=42)
model5_pca=adb_pca.fit(x_train_pca,y_train_pca)
# Training accuracy (percent).
y_pred5_tr_pca=model5_pca.predict(x_train_pca)
acc5_tr_pca=accuracy_score(y_train_pca,y_pred5_tr_pca)*100
print(acc5_tr_pca)
# Test accuracy (percent).
y_pred5_pca=model5_pca.predict(x_test_pca)
acc5_pca=accuracy_score(y_test_pca,y_pred5_pca)*100
print(acc5_pca)

In [None]:
# Confusion matrix of the PCA AdaBoost model on the PCA test partition.
cm5_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred5_pca)
print(cm5_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA AdaBoost model on the PCA test partition.
rep5_pca = classification_report(y_true=y_test_pca, y_pred=y_pred5_pca)
print(rep5_pca)

In [None]:
# Histogram-based gradient boosting fitted on the PCA-projected training partition.
hgb_pca = HistGradientBoostingClassifier(learning_rate=0.1)
model6_pca = hgb_pca.fit(x_train_pca, y_train_pca)

# Predictions on both partitions; the accuracies below are percentages.
y_pred6_tr_pca = model6_pca.predict(x_train_pca)
y_pred6_pca = model6_pca.predict(x_test_pca)

acc6_tr_pca = 100 * accuracy_score(y_true=y_train_pca, y_pred=y_pred6_tr_pca)
print(acc6_tr_pca)
acc6_pca = 100 * accuracy_score(y_true=y_test_pca, y_pred=y_pred6_pca)
print(acc6_pca)

In [None]:
# Confusion matrix of the PCA gradient-boosting model on the PCA test partition.
cm6_pca = confusion_matrix(y_true=y_test_pca, y_pred=y_pred6_pca)
print(cm6_pca)

In [None]:
# Per-class precision / recall / F1 of the PCA gradient-boosting model on the PCA test partition.
rep6_pca = classification_report(y_true=y_test_pca, y_pred=y_pred6_pca)
print(rep6_pca)

Feature importance based on mean decrease in impurity in Random Forest

In [None]:
import time

# Label the importances with the real predictor names instead of "feature 0..n".
# x holds every column of df except the last, in df's column order, so the first
# x.shape[1] names of df are exactly the predictors the forest was trained on.
features=list(df.columns)
feature_names=features[:x.shape[1]]

start_time=time.time()
# Mean-decrease-in-impurity importances of the forest, and their spread across its trees.
importances=rf.feature_importances_
# The loop variable is not called `tree`, to avoid shadowing the sklearn `tree` module name.
std=np.std([est.feature_importances_ for est in rf.estimators_],axis=0)
elasped_time=time.time()-start_time
print(f'Elasped time to compute the importances : {elasped_time: .3f} seconds')

In [None]:
# Bar chart of the forest's impurity-based importances; error bars show the per-tree spread.
forest_importances = pd.Series(data=importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax, yerr=std)
ax.set(
    title="Feature Importances Using MDI",
    ylabel='Mean decrease in impurity',
)
fig.tight_layout()
plt.show()

DICE (Diverse Counterfactual Explanations)

In [None]:
import dice_ml

# 70/30 split of the (unscaled) dataframe used to build the DiCE data interface.
# random_state is pinned so the query rows taken from test_df are the same on every run.
train_df1,test_df1=train_test_split(df,test_size=0.30,random_state=10)
# Preview only: printing the whole training frame floods the notebook output.
print(train_df1.head())
# Predictor-only views of both partitions (label column removed).
train_df=train_df1.drop('target',axis=1)
test_df=test_df1.drop('target',axis=1)
# DiCE data interface: the listed columns are treated as continuous, 'target' is the outcome.
d=dice_ml.Data(dataframe=train_df1, continuous_features=['age','trestbps','chol','thalach','oldpeak','slope','ca','thal'], outcome_name='target')

In [None]:
# Wrap the fitted decision tree for DiCE and build a random-sampling counterfactual explainer.
# NOTE(review): dt was fitted on MinMax-scaled features (x_train), whereas the DiCE data
# interface `d` is built from the unscaled df - confirm the model is queried in the
# feature space it was trained on.
m=dice_ml.Model(model=dt,backend='sklearn')
exp=dice_ml.Dice(d,m,method='random')

In [None]:
# Two counterfactuals that flip the predicted class for the first test row (decision-tree explainer).
first_query = test_df.iloc[0:1]
e1 = exp.generate_counterfactuals(
    first_query,
    total_CFs=2,
    desired_class='opposite',
)
e1.visualize_as_dataframe(show_only_changes=True)

In [None]:
# Wrap the fitted random forest for DiCE and build a random-sampling counterfactual explainer.
m2=dice_ml.Model(model=rf,backend='sklearn')
# The explainer must be built from m2: it was previously built from m (the decision
# tree wrapper), so every "random forest" explanation actually explained the tree.
exp2=dice_ml.Dice(d,m2,method='random')

In [None]:
# Two counterfactuals that flip the predicted class for the first test row, via exp2.
first_query = test_df.iloc[0:1]
e2 = exp2.generate_counterfactuals(
    first_query,
    total_CFs=2,
    desired_class='opposite',
)
e2.visualize_as_dataframe(show_only_changes=True)

In [None]:
# Ten counterfactuals for the first test row via exp2, with 'chol' restricted to [100, 300].
first_query = test_df.iloc[0:1]
chol_bounds = {'chol': [100, 300]}
e3 = exp2.generate_counterfactuals(
    first_query,
    total_CFs=10,
    desired_class='opposite',
    permitted_range=chol_bounds,
)
e3.visualize_as_dataframe(show_only_changes=True)

In [None]:
# Per-instance (local) feature importance for the first 10 test rows, via exp2.
query_instances=test_df[0:10]
imp=exp2.local_feature_importance(query_instances,total_CFs=10)
# local_feature_importance fills `local_importance` (one dict per query row);
# `summary_importance` is only produced by global_feature_importance.
print(imp.local_importance)

In [None]:
# Global feature importance aggregated over the first 20 test rows, via exp2.
query_instances = test_df.iloc[0:20]
imp = exp2.global_feature_importance(
    query_instances,
    total_CFs=10,
)
print(imp.summary_importance)

In [None]:
# Global feature importance aggregated over the first 20 test rows, via exp.
query_instances = test_df.iloc[0:20]
imp = exp.global_feature_importance(
    query_instances,
    total_CFs=10,
)
print(imp.summary_importance)

In [None]:
# Per-instance (local) feature importance for the first 10 test rows, via exp.
query_instances=test_df[0:10]
imp=exp.local_feature_importance(query_instances,total_CFs=10)
# local_feature_importance fills `local_importance` (one dict per query row);
# `summary_importance` is only produced by global_feature_importance.
print(imp.local_importance)