### DSCI 100: GROUP PROJECT PROPOSAL

GROUP 10 - SECTION 007: Michelle Ho, Arian Afshari, Kyle Wise, Zidong Zhou

In [14]:
library(tidyverse)
library(tidymodels)
library(dplyr)
library(repr)
options(repr.matrix.max.rows = 7)

**Introduction:** One of the leading causes of death in Hungary are cardiovascular diseases (CVDs) (OECD and WHO, 2017). It constitutes approximately 52.4% of the main diagnoses underlying cause of death and 47.6% of the secondary diagnoses prior to death. Several risk factors are known to be associated with CVDs; the most prominent among them are high blood pressure, high low-density lipoprotein (LDL) cholesterol levels, smoking, and overall, leading an unhealthy lifestyle (CDC, 2022). Moreover, susceptibility to heart disease increases with age and other uncontrollable factors such as sex (McLaren, 2023).
As data science becomes progressively useful in analyzing and interpreting patient outcomes, we plan to use the tools that we’ve learnt in class to predict the angiographic disease status of CVDs. Our project will use the Heart Disease data set from the UCI Machine Learning Repository to build a classification model that will predict CVD diagnoses in presenting patients. The diagnosis of heart disease in the data set is defined as a binary label, whether the patient has the disease or not. The disease is present if there is greater than 50% diameter narrowing in any of the major blood vessels in a patient. Ie., if the value in the num parameter is 1, then more than 50% of the diameter of the major blood vessels are narrowing (cite UCI). When blood vessels narrow, the heart does not receive the blood that it requries and thus, results in heart disease. Overall, heart failure is a common result caused by CVDs. The dataset that we will be using features 14 attributes that will be examined to determine which are possible indicators of heart disease which will then be used to predict the presence of heart disease in patients.

In [3]:
# setting the seed such that the numbers used in the analysis are reasonably random
set.seed(3789)

In [4]:
# loading the data and mutating the class label column to factor 
heart_disease_data <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data", 
                          col_names = c("age", "sex", "cp", "trestbps", "chol", "fbs","restecg", "thalach", "exang", "oldpeak", 
                                        "slope", "ca", "thal", "num")) |>
mutate(num = as_factor(num))
heart_disease_data

[1mRows: [22m[34m294[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): trestbps, chol, fbs, restecg, thalach, exang, slope, ca, thal
[32mdbl[39m (5): age, sex, cp, oldpeak, num

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<fct>
28,1,2,130,132,0,2,185,0,0,?,?,?,0
29,1,2,120,243,0,0,160,0,0,?,?,?,0
29,1,2,140,?,0,0,170,0,0,?,?,?,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
56,1,4,155,342,1,0,150,1,3,2,?,?,1
58,0,2,180,393,0,0,110,1,1,2,?,7,1
65,1,4,130,275,0,1,115,1,1,2,?,?,1


In [5]:
# partitioning the data into a training (75%) and testing (25%) set; num will be used as the class label
heart_disease_split <- initial_split(heart_disease_data, prop = 0.75, strata = num)

heart_disease_train <- training(heart_disease_split)
heart_disease_train

heart_disease_test <- testing(heart_disease_split)
heart_disease_test

age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<fct>
28,1,2,130,132,0,2,185,0,0,?,?,?,0
29,1,2,140,?,0,0,170,0,0,?,?,?,0
30,0,1,170,237,0,1,170,0,0,?,?,6,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
54,0,3,130,294,0,1,100,1,0,2,?,?,1
58,0,2,180,393,0,0,110,1,1,2,?,7,1
65,1,4,130,275,0,1,115,1,1,2,?,?,1


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<fct>
29,1,2,120,243,0,0,160,0,0,?,?,?,0
32,0,2,105,198,0,0,165,0,0,?,?,?,0
32,1,2,110,225,0,0,184,0,0,?,?,?,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
47,1,4,160,291,0,1,158,1,3.0,2,?,?,1
52,1,4,160,331,0,0,94,1,2.5,?,?,?,1
56,1,4,155,342,1,0,150,1,3.0,2,?,?,1


In [6]:
# identifying the proportion of cases with a diagnosis of heart disease (positive angiographic disease status) and the proportion without in the full data
# set and the training data set to ensure that the proportions are preserved during the split
heart_disease_data_proportions <- heart_disease_data |>
                      group_by(num) |>
                      summarize(people = n()) |>
                      mutate(percent = 100*people/nrow(heart_disease_data))
heart_disease_data_proportions

heart_disease_train_proportions <- heart_disease_train |>
                      group_by(num) |>
                      summarize(people = n()) |>
                      mutate(percent = 100*people/nrow(heart_disease_train))
heart_disease_train_proportions

num,people,percent
<fct>,<int>,<dbl>
0,188,63.94558
1,106,36.05442


num,people,percent
<fct>,<int>,<dbl>
0,141,64.09091
1,79,35.90909


In [18]:
# summarize the training data with a table that reports the number of observations in each class, the means of the predictor variables, and how many rows have missing data
heart_disease_class_observations <- heart_disease_train |>
                      group_by(diagnosis = num) |>
                      summarize(people = n())
heart_disease_class_observations

# those that have a diagnoses have a num value of 1, indicating that there is more than 50% blood vessels diameter narrowing
# those that do not have a diagnoses have a num value of 0, indicating that there is more than 50% blood vessels diameter narrowing

heart_disease_train_summary <- data.frame(unclass(summary(heart_disease_train)),
                                            check.names = FALSE)
index_name<-c("min","first_quatile","median","mean","third_quatile","max","number of missig data")
rownames(heart_disease_train_summary)<-index_name
heart_disease_train_summary

diagnosis,people
<fct>,<int>
0,141
1,79


Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
min,Min. :28.00,0: 56,1: 9,Min. : 92.0,Min. : 85.0,0 :197,0 :171,Min. : 82.0,0 :153,Min. :0.0000,1 : 11,0 : 1,3 : 6,0:141
first_quatile,1st Qu.:43.00,1:164,2:73,1st Qu.:120.0,1st Qu.:211.0,1 : 16,1 : 42,1st Qu.:123.0,1 : 66,1st Qu.:0.0000,2 : 68,NA's:219,6 : 7,1: 79
median,Median :49.00,,3:46,Median :130.0,Median :238.0,NA's: 7,2 : 6,Median :140.0,NA's: 1,Median :0.0000,3 : 1,,7 : 10,
mean,Mean :48.11,,4:92,Mean :132.3,Mean :248.9,,NA's: 1,Mean :139.4,,Mean :0.5877,NA's:140,,NA's:197,
third_quatile,3rd Qu.:54.00,,,3rd Qu.:140.0,3rd Qu.:277.0,,,3rd Qu.:155.0,,3rd Qu.:1.0000,,,,
max,Max. :65.00,,,Max. :180.0,Max. :603.0,,,Max. :190.0,,Max. :5.0000,,,,
number of missig data,,,,NA's :1,NA's :15,,,NA's :1,,,,,,


In [19]:
heart_disease_train[heart_disease_train=="?"]<-NA
names <- c("sex","cp","fbs","restecg","exang","ca","thal","num","slope")
heart_disease_train[,names] <- lapply(heart_disease_train[,names] , factor)
heart_disease_train$trestbps=as.numeric(heart_disease_train$trestbps)
heart_disease_train$thalach=as.numeric(heart_disease_train$thalach)
heart_disease_train$oldpeak=as.numeric(heart_disease_train$oldpeak)
heart_disease_train$chol=as.numeric(heart_disease_train$chol)
summary(heart_disease_train)

      age        sex     cp        trestbps          chol         fbs     
 Min.   :28.00   0: 56   1: 9   Min.   : 92.0   Min.   : 85.0   0   :197  
 1st Qu.:43.00   1:164   2:73   1st Qu.:120.0   1st Qu.:211.0   1   : 16  
 Median :49.00           3:46   Median :130.0   Median :238.0   NA's:  7  
 Mean   :48.11           4:92   Mean   :132.3   Mean   :248.9             
 3rd Qu.:54.00                  3rd Qu.:140.0   3rd Qu.:277.0             
 Max.   :65.00                  Max.   :180.0   Max.   :603.0             
                                NA's   :1       NA's   :15                
 restecg       thalach       exang        oldpeak        slope        ca     
 0   :171   Min.   : 82.0   0   :153   Min.   :0.0000   1   : 11   0   :  1  
 1   : 42   1st Qu.:123.0   1   : 66   1st Qu.:0.0000   2   : 68   NA's:219  
 2   :  6   Median :140.0   NA's:  1   Median :0.0000   3   :  1             
 NA's:  1   Mean   :139.4              Mean   :0.5877   NA's:140             
          