In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier

ModuleNotFoundError: No module named 'numpy'

In [None]:
#Load the CSV file named below into the DataFrame used by the rest of the notebook
data = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')

In [None]:
#Report the dataset dimensions as a (rows, columns) tuple
print(data.shape)

#Preview the first 5 rows of the dataset
data.head(n=5)

In [None]:
#Count the missing (null) values in each column
print(data.isna().sum())

#Count the distinct values in each column
print(data.nunique(axis=0))

In [None]:
#Remove fully duplicated rows, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

#Row & column count once the duplicates are gone
data.shape

In [None]:
#Lift the column display limit so wide frames are printed in full
pd.options.display.max_columns = None
#Summary statistics for every column, numeric and non-numeric alike
print(data.describe(include='all'))

# EXPLORATORY DATA ANALYSIS

In [None]:
#Checking for class imbalance in the target column

#Separate the rows of each target class
hd_0 = data[data['HeartDisease'] == 'No']
hd_1 = data[data['HeartDisease'] == 'Yes']

#Per-class row counts, taken from the label-filtered subsets.
#Unpacking value_counts() positionally is fragile: its result is ordered by
#frequency rather than by label, so the two counts would silently swap if
#'Yes' ever outnumbered 'No', and the unpacking raises unless exactly two
#classes are present.
heart_disease_count_0, heart_disease_count_1 = len(hd_0), len(hd_1)

#Print the shape of each class subset
print('Negative:', hd_0.shape)
print('Positive:', hd_1.shape)

# DATA PREPROCESSING

In [None]:
#Collect the names of all columns stored with the object dtype
obj_list = data.select_dtypes(include=['object']).columns
obj_list

In [None]:
#Encode every object-typed column (features as well as the target) as integer
#codes 0..n_classes-1 so the values are numeric for the steps that follow.
from sklearn.preprocessing import LabelEncoder

#Keep one fitted encoder per column. A single shared encoder is refit on each
#iteration, which discards every mapping except the last one and leaves no
#way to translate the codes back to the original labels.
label_encoders = {}
for obj in obj_list:
    le = LabelEncoder()
    data[obj] = le.fit_transform(data[obj].astype(str))
    label_encoders[obj] = le

#Preview the encoded frame instead of displaying the whole dataset
data.head()

In [None]:
#Count missing values per column before proceeding further
data.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split

#Prepare the inputs for random over-sampling:
#y is the target column, X is every remaining column
y = data['HeartDisease']
X = data.drop(columns='HeartDisease')

In [None]:
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy=0.5)
X_over, y_over = oversample.fit_resample(X, y)

In [None]:
#Splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size = 0.3,random_state=42)
print(np.mean(y_train), np.mean(y),np.mean(y_test))

## Random Forest

In [None]:
#Build a random forest with a fixed seed, then fit it on the training partition
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

#Class predictions for the test partition
y_pred = clf.predict(X_test)

#Value returned by the classifier's .score() on each partition
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

In [None]:
#Confusion matrix of the test labels against the model's predictions
my_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#Print four cells of the confusion matrix, each addressed as [true, predicted]
for caption, cell in (
    ("TP is:", (1, 1)),
    ("TN is:", (0, 0)),
    ("FP is:", (0, 1)),
    ("FN is:", (1, 0)),
):
    print(caption, my_matrix[cell])

In [None]:
#Evaluate each metric on the test labels vs. the predictions and print it
for caption, metric in (
    ('Precision score:', precision_score),
    ('Recall score:', recall_score),
    ('Accuracy score:', accuracy_score),
    ('F1 score:', f1_score),
):
    print(caption, metric(y_test, y_pred))

In [None]:
#Rank the fitted forest's feature importances and draw them as a horizontal
#bar chart.
feature_importances = clf.feature_importances_

#Pair each feature name with its importance and sort ascending; barh draws the
#first entry at the bottom, so the most important feature ends up on top.
features = sorted(zip(X.columns, feature_importances), key=lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

#Use the explicit Axes API throughout instead of mixing it with pyplot state
fig, ax = plt.subplots(figsize=(12, 8))
ax.margins(y=0.001)

ax.barh(y=cols, width=width)

#Label the figure so it can be read on its own
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature')
ax.set_title('Random forest feature importances')

plt.show()