# 1.0 Begin by importing dataset 
MD, R. (2020). UCI Heart Disease Data. Www.kaggle.com. https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data h.D.

Acknowledgements <br>
Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. <br>
University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. <br>
University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. <br>
V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D. <br>

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot

plt.show()
sb.set() # set the default Seaborn style for graphics

dataframe = pd.read_csv('HeartDisease.csv')
dataframe.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


# 2.0 Identify and get rid of any duplicates.
Identified 0 duplicate rows, no need to drop duplicates

In [2]:
dataframe.duplicated().sum()

#dataframe = dataframe.drop_duplicates()
#dataframe.duplicated().sum()
#dataframe

0

# 2.1 Data cleaning - drop irrelevant data & make data more readable
Drop ID, it is not needed in prediction of heart disease <br>
Note slope = the slope of the peak exercise ST segment <br>
Oldpeak = ST depression induced by exercise relative to rest

In [3]:
dataframe.drop(['id' , 'dataset'], axis='columns', inplace=True)

dataframe = dataframe.rename(columns = 
                        {'cp':'chest_pain',
                        'trestbps':'rest_blood_pressure',
                         'chol':'cholesterol_level',
                         'fbs':'diabetic',
                         'restecg':'resting_ecg',
                         'thalch':'max_heart_rate',
                         'exang':'exercise_angina',
                         'ca':'number_major_vessels',
                         'num':'number'}).copy()

dataframe

Unnamed: 0,age,sex,chest_pain,rest_blood_pressure,cholesterol_level,diabetic,resting_ecg,max_heart_rate,exercise_angina,oldpeak,slope,number_major_vessels,thal,number
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,62,Male,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,58,Male,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


# 2.2 Getting rid / fixing NaN values (0) 

In [4]:
dataframe.isnull().sum()

age                       0
sex                       0
chest_pain                0
rest_blood_pressure      59
cholesterol_level        30
diabetic                 90
resting_ecg               2
max_heart_rate           55
exercise_angina          55
oldpeak                  62
slope                   309
number_major_vessels    611
thal                    486
number                    0
dtype: int64

**Drop columns with too many NaN - this can affect statistical data.** <br>
**Drop number_major_vessels, 66% NaN. Drop Thal, 52% NaN. Drop Slope, 33% NaN. Drop oldpeak - related to slope.**

In [5]:
dataframe.drop(['number_major_vessels' , 'thal' , 'slope' , 'oldpeak'], axis='columns', inplace=True)
dataframe.isnull().sum()

age                     0
sex                     0
chest_pain              0
rest_blood_pressure    59
cholesterol_level      30
diabetic               90
resting_ecg             2
max_heart_rate         55
exercise_angina        55
number                  0
dtype: int64

**Drop rows with NaN values so statistical data is not affected later.**

In [6]:
dataframe.dropna(inplace=True)
dataframe.reset_index(drop=True, inplace=True)

dataframe.isnull().sum()

age                    0
sex                    0
chest_pain             0
rest_blood_pressure    0
cholesterol_level      0
diabetic               0
resting_ecg            0
max_heart_rate         0
exercise_angina        0
number                 0
dtype: int64

In [7]:
dataframe.shape

(744, 10)

#### Lets check the data's min to ensure there is no 0 for numerical values - We have found 0's in rest_blood_pressure and cholestrol_levels, decide to drop these rows. BP = 0 would mean the person would be having a heart attack. Cholestrol level = 0 is not possible, assume data as faulty

In [8]:
dataframe.describe()

Unnamed: 0,age,rest_blood_pressure,cholesterol_level,max_heart_rate,number
count,744.0,744.0,744.0,744.0,744.0
mean,53.127688,132.762097,219.822581,138.821237,0.924731
std,9.398811,18.610367,93.735536,25.843072,1.129433
min,28.0,0.0,0.0,60.0,0.0
25%,46.0,120.0,197.0,120.0,0.0
50%,54.0,130.0,231.0,140.0,1.0
75%,60.0,140.0,270.25,160.0,1.0
max,77.0,200.0,603.0,202.0,4.0


#### Dropping the rows with 0 for rest_blood_pressure and cholesterol_level

In [9]:
dataframe = dataframe.drop(dataframe[dataframe['rest_blood_pressure'] == 0].index)
dataframe = dataframe.drop(dataframe[dataframe['cholesterol_level'] == 0].index)

In [10]:
dataframe.describe()

Unnamed: 0,age,rest_blood_pressure,cholesterol_level,max_heart_rate,number
count,664.0,664.0,664.0,664.0,664.0
mean,52.631024,132.759036,246.307229,141.278614,0.813253
std,9.4421,17.816792,57.561657,25.046787,1.079665
min,28.0,92.0,85.0,69.0,0.0
25%,46.0,120.0,210.0,123.0,0.0
50%,54.0,130.0,239.5,143.0,0.0
75%,59.0,140.0,275.0,160.0,1.0
max,77.0,200.0,603.0,202.0,4.0


In [11]:
dataframe

Unnamed: 0,age,sex,chest_pain,rest_blood_pressure,cholesterol_level,diabetic,resting_ecg,max_heart_rate,exercise_angina,number
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,0
...,...,...,...,...,...,...,...,...,...,...
739,62,Male,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,1
740,46,Male,asymptomatic,134.0,310.0,False,normal,126.0,False,2
741,54,Female,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,1
742,55,Male,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,2


**After cleaning, we are left with 664 rows and 10 columns.**

# Exporting as xlsx for easier opening in other documents

In [12]:
import xlsxwriter

#Path has to be changed to direct path if permission error occurs.
#path = r"C:\Users\Username\Documents\GitHub\SC1015_MiniProject\HeartDiseaseCleaned.xlsx"
path = r"HeartDiseaseCleaned.xlsx"

writer = pd.ExcelWriter(path, engine = 'xlsxwriter')
dataframe.to_excel(writer, sheet_name = 'HeartDiseaseCleaned')
writer.close()
print("Finished Exporting")

Finished Exporting
