In [48]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [49]:
# import and read the csv file and create Dataframe
Path="/content/heart.csv"
df=pd.read_csv(Path)
print(df.shape)
df.head(10)

(918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [50]:
# Check the null values in each column
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [51]:
# check the total values in each column
df.count()

Age               918
Sex               918
ChestPainType     918
RestingBP         918
Cholesterol       918
FastingBS         918
RestingECG        918
MaxHR             918
ExerciseAngina    918
Oldpeak           918
ST_Slope          918
HeartDisease      918
dtype: int64

In [52]:
# check te no of unique values in eac column
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [53]:
# Identify the datatypes
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [54]:
# Generate the categorial variable lists
heart_cat=df.dtypes[df.dtypes=="object"].index.tolist()
df[heart_cat].nunique()

Sex               2
ChestPainType     4
RestingECG        3
ExerciseAngina    2
ST_Slope          3
dtype: int64

After reviewing the Object datatype columns, we can safely conclude that no further binning is required as the max no of unique values is less than 5. 

In [55]:
# Create a OneHotEncoder instance
enc=OneHotEncoder(sparse=False)

# Fit and transform 
fit_enc=enc.fit_transform(df[heart_cat])
# create a dataframe
heart_df=pd.DataFrame(fit_enc)

# adding headers to the columns using the categorial list
heart_df.columns=enc.get_feature_names(heart_cat)
print(heart_df.shape)
heart_df.head()

(918, 14)




Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [56]:
# merging the hot encoded vfeatures and dropping the originals
df=df.merge(heart_df, left_index=True, right_index=True).drop(heart_cat, axis=1)
print(df.shape)
df.head(10)

(918, 21)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195,0,122,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
5,39,120,339,0,170,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
6,45,130,237,0,170,0.0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
7,54,110,208,0,142,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8,37,140,207,0,130,1.5,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
9,48,120,284,0,120,0.0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [57]:
# confirming that the daatypes haave been converted successfully to numerical values
df.dtypes

Age                    int64
RestingBP              int64
Cholesterol            int64
FastingBS              int64
MaxHR                  int64
Oldpeak              float64
HeartDisease           int64
Sex_F                float64
Sex_M                float64
ChestPainType_ASY    float64
ChestPainType_ATA    float64
ChestPainType_NAP    float64
ChestPainType_TA     float64
RestingECG_LVH       float64
RestingECG_Normal    float64
RestingECG_ST        float64
ExerciseAngina_N     float64
ExerciseAngina_Y     float64
ST_Slope_Down        float64
ST_Slope_Flat        float64
ST_Slope_Up          float64
dtype: object

In [58]:
# spliting the data into training and testing data
X=df.drop(["HeartDisease"],1)
y=df["HeartDisease"].copy()
print(X.shape)

(918, 20)


  X=df.drop(["HeartDisease"],1)


In [60]:
# creating an instance for RandomForestClassifier to check feature importance
rf_model=RandomForestClassifier(n_estimators=128, random_state=42)
rf_model=rf_model.fit(X,y)

In [61]:
# sorting the features based on their importance
importance=sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importance

[(0.14638916836866278, 'ST_Slope_Up'),
 (0.0986586099442513, 'ST_Slope_Flat'),
 (0.09855405092355378, 'Oldpeak'),
 (0.09416404480387255, 'Cholesterol'),
 (0.08909008115592841, 'MaxHR'),
 (0.07789605313493954, 'ChestPainType_ASY'),
 (0.07131157370247568, 'Age'),
 (0.06899488984229563, 'RestingBP'),
 (0.05658076279474096, 'ExerciseAngina_N'),
 (0.054077256123423706, 'ExerciseAngina_Y'),
 (0.025790001933562748, 'Sex_F'),
 (0.022033433306620286, 'FastingBS'),
 (0.02018033708569506, 'Sex_M'),
 (0.017900295257714933, 'ChestPainType_ATA'),
 (0.013980800988716346, 'RestingECG_LVH'),
 (0.012304087907864717, 'ChestPainType_NAP'),
 (0.011091784976410555, 'RestingECG_Normal'),
 (0.008415913021095962, 'RestingECG_ST'),
 (0.00683797464263075, 'ChestPainType_TA'),
 (0.00574888008554417, 'ST_Slope_Down')]

In [62]:
# Creating a dataaframe for the feature importance
importance_df=pd.DataFrame(importance, columns=["Relevance", "Features"])
print(importance_df.shape)
importance_df.head()

(20, 2)


Unnamed: 0,Relevance,Features
0,0.146389,ST_Slope_Up
1,0.098659,ST_Slope_Flat
2,0.098554,Oldpeak
3,0.094164,Cholesterol
4,0.08909,MaxHR


In [65]:
# choosing the columns with relevance below 1%
importance_df=importance_df[importance_df["Relevance"]<0.01]
least_relevant=importance_df["Features"].tolist()
least_relevant


['RestingECG_ST', 'ChestPainType_TA', 'ST_Slope_Down']

In [67]:
# Dropping all columns with least relevance (leass than 1%)
X=X.drop(least_relevant, axis=1)
print(X.shape)
X.head()

(918, 17)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingECG_LVH,RestingECG_Normal,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,54,150,195,0,122,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0


In [68]:
# split the dataset into training and testing dataset
X_train, X_test, y_train, y_test=train_test_split(X,y, random_state=42, stratify=y)

In [69]:
# creating a standard scaler
scaler=StandardScaler()

# fit the standard scaler
X_scaler=scaler.fit(X_train)

# Transform/Scale the data
X_train_scaled=X_scaler.transform(X_train)
X_test_scaled=X_scaler.transform(X_test)

# Recreating an RandomForestClassiifier for training and testing on split data

In [70]:
rf_model1=RandomForestClassifier(n_estimators=128, random_state=42)
rf_model1

RandomForestClassifier(n_estimators=128, random_state=42)

In [73]:
# training and calculating the accuracy score
rf_model1=rf_model1.fit(X_train_scaled, y_train)

# evaluate the model
from sklearn.metrics import accuracy_score
y_pred=rf_model1.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.891


In [74]:
# display confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 87,  16],
       [  9, 118]])

In [78]:
# print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.84      0.93      0.87      0.89      0.78       103
          1       0.88      0.93      0.84      0.90      0.89      0.79       127

avg / total       0.89      0.89      0.88      0.89      0.89      0.79       230



0=Less risk
1=More risk