# Load Data From The Original Source On The Web 

In [None]:
# Load the packages used throughout this analysis.
# tidyverse and themis are installed only when they are missing, and always
# *before* they are attached: a fresh environment no longer fails on
# `library(tidyverse)`, and an existing one is not reinstalled on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}
library(tidyverse)
library(repr)
library(tidymodels)
library(themis)
# Cap the number of rows shown when a table is rendered in the notebook.
options(repr.matrix.max.rows = 6)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [10]:
# Read the Cleveland heart-disease data straight from the project's GitHub
# repository. `col_names = FALSE` makes readr treat the first line as data and
# generate placeholder names (X1, X2, ...); descriptive names are assigned in
# the wrangling step.
cleveland <- read_csv(
  "https://raw.githubusercontent.com/JennWan/Group_Project/main/new%20data/newcleveland_data.csv",
  col_names = FALSE
)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m15[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (10): X2, X3, X6, X7, X9, X11, X12, X13, X14, X15
[32mdbl[39m  (5): X1, X4, X5, X8, X10

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


# Wrangling and Cleaning The Data

In [None]:
# Replace the placeholder column names with descriptive ones.
# NOTE(review): the recorded read_csv output reports 15 columns while only 14
# names are supplied here — confirm the extra column is handled as intended.
colnames(cleveland) <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)
cleveland

# Keep the ten columns of interest and coerce each to its working type:
# age, trestbps, chol, thalach and ca are stored as integers;
# cp, fbs, exang, thal and num are stored as factors.
cleveland_tidy <- cleveland |>
  select(age, cp, trestbps, chol, fbs, thalach, exang, ca, thal, num) |>
  mutate(
    across(c(age, trestbps, chol, thalach, ca), as.integer),
    across(c(cp, fbs, exang, thal, num), as_factor)
  )

In [None]:
# Re-encode `ca` as a factor, then discard every row in which `ca` or `thal`
# is missing.
cleveland_tidy <- cleveland_tidy |>
  mutate(ca = as_factor(ca)) |>
  drop_na(ca, thal)
cleveland_tidy

In [None]:
# Restrict the heart-disease data to the class label (`num`) and the three
# predictors (age, chol, thalach), then draw 300 rows at random.
# The seed is fixed first so the same rows are drawn on every run; without it
# every later table and split differs between executions.
# slice_sample() is the current replacement for the superseded sample_n().
set.seed(2000)
cleveland_filter <- cleveland |>
  select(age, chol, thalach, num) |>
  mutate(num = as_factor(num)) |>
  slice_sample(n = 300)
cleveland_filter

In [None]:
# Check for class imbalance: tabulate how many observations fall into each
# heart-disease class (`num`) and what share of the data set each represents.
num_obs <- nrow(cleveland_filter)
cleveland_filter |>
  count(num, name = "count") |>
  mutate(percentage = count / num_obs * 100)

In [None]:
# Standardise chol and thalach (divide by the standard deviation, then subtract
# the mean) and balance the classes by upsampling `num` to a 1:1 ratio.
# `skip = FALSE` makes the upsampling step run at bake() time as well, so the
# baked data below is the balanced version.
# An explicit `seed` pins which rows are duplicated; by default the seed is
# drawn at random, so the balanced data changed on every run.
# NOTE(review): `age` is a predictor but is left on its original scale, and the
# upsampling happens before the train/test split, so copies of one observation
# may land in both sets — confirm both are intended.
recipe <- recipe(num ~ ., data = cleveland_filter) |>
  step_scale(chol, thalach) |>
  step_center(chol, thalach) |>
  step_upsample(num, over_ratio = 1, skip = FALSE, seed = 2000) |>
  prep()

preprocessed_cleveland <- bake(recipe, new_data = cleveland_filter)
preprocessed_cleveland

In [None]:
# Re-check the class balance after preprocessing: tabulate how many
# observations fall into each heart-disease class (`num`) and what share of
# the preprocessed data each represents.
num_obs <- nrow(preprocessed_cleveland)
preprocessed_cleveland |>
  count(num, name = "count") |>
  mutate(percentage = count / num_obs * 100)

In [None]:
# Split the preprocessed data into a training set (75% of the rows) and a test
# set (the remainder), stratified on `num`. The seed is fixed immediately
# before the split so the partition is the same on every run.
set.seed(2000)

cleveland_split <- initial_split(
  data = preprocessed_cleveland,
  prop = 0.75,
  strata = num
)
cleveland_train <- cleveland_split |> training()
cleveland_test <- cleveland_split |> testing()

## Preliminary Exploratory Data Analysis

# Data Analysis