In [1]:
library(ggplot2)
library(costsensitive)
library(caret)
library(tidyverse)
library(fastDummies)

Loading required package: lattice

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     [32m✔[39m [34mtidyr    [39m 1.3.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mlift()[39m   masks [34mcaret[39m::lift()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Thank you for using fastDummies!

To acknowledge our work, please cite the package:

Kaplan, J. & S

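Reference manual for the `costsensitive` package: https://cran.r-project.org/web/packages/costsensitive/costsensitive.pdf

Related Kaggle notebook on cost-sensitive decision trees: https://www.kaggle.com/code/datawrangler/cost-sensitive-decision-tree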

In [2]:
# Load the Framingham data set (path is relative to the working directory).
fram <- read.csv('Framingham.csv')

In [10]:
# Show the structure of `fram`: dimensions, column names, types, first values.
str(fram)

'data.frame':	2561 obs. of  16 variables:
 $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
 $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
 $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
 $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
 $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
 $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
 $ prevalentHyp   : int  0 0 0 1 0 1 0 0 1 1 ...
 $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
 $ sysBP          : num  106 121 128 150 130 ...
 $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
 $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
 $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
 $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
 $ TenYearCHD     : int  0 0 0 1 0 0 1 0 0 0 ...


In [11]:
# Columns holding categorical codes (including the target); each one is
# converted to a factor in place.
fcols <- c(
  "male", "education", "currentSmoker", "BPMeds",
  "prevalentStroke", "prevalentHyp", "diabetes", "TenYearCHD"
)
fram[fcols] <- lapply(fram[fcols], factor)

In [7]:
# Bar chart of the target class counts, written to classes.png.
options(repr.plot.width = 16, repr.plot.height = 10)

# Build the plot before opening the graphics device so that a failure while
# constructing it cannot leave a png device open.
class_plot <- ggplot(fram, aes(x = TenYearCHD)) +
  geom_bar(fill = "brown4") +
  ggtitle("Target class distribution") +
  theme(
    plot.title = element_text(size = 24),
    axis.title = element_text(size = 20),
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18)
  )

png(filename = "classes.png", width = 1300, height = 800)

# ggplot objects are only drawn when printed. Relying on top-level
# auto-printing yields an empty file when this code is run via source() or
# from inside a function, so print explicitly.
print(class_plot)

dev.off()

#### Train-test split and categorical feature encoding

In [3]:
# Categorical predictors to expand into indicator columns (the target,
# TenYearCHD, is deliberately not in this list).
dcols <- c(
  "male", "education", "currentSmoker", "BPMeds",
  "prevalentStroke", "prevalentHyp", "diabetes"
)

# Append the indicator columns, then discard the original categorical columns.
fram <- fram %>%
  dummy_cols(select_columns = dcols) %>%
  select(-all_of(dcols))

In [13]:
# Preview the first rows of `fram`.
head(fram)

Unnamed: 0_level_0,age,cigsPerDay,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD,male_0,⋯,currentSmoker_0,currentSmoker_1,BPMeds_0,BPMeds_1,prevalentStroke_0,prevalentStroke_1,prevalentHyp_0,prevalentHyp_1,diabetes_0,diabetes_1
Unnamed: 0_level_1,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,39,0,195,106.0,70,26.97,80,77,0,0,⋯,1,0,1,0,1,0,1,0,1,0
2,46,0,250,121.0,81,28.73,95,76,0,1,⋯,1,0,1,0,1,0,1,0,1,0
3,48,20,245,127.5,80,25.34,75,70,0,0,⋯,0,1,1,0,1,0,1,0,1,0
4,61,30,225,150.0,95,28.58,65,103,1,1,⋯,0,1,1,0,1,0,0,1,1,0
5,46,23,285,130.0,84,23.1,85,85,0,1,⋯,0,1,1,0,1,0,1,0,1,0
6,43,0,228,180.0,110,30.3,77,99,0,1,⋯,1,0,1,0,1,0,0,1,1,0


In [4]:
# Fix the RNG seed so the random operations that follow are reproducible.
set.seed(1234)

In [5]:
# Sample 75% of the rows for training (partitioned on the outcome); the
# remaining rows form the test set.
train_indices <- createDataPartition(
  y = fram$TenYearCHD,
  p = 0.75,
  list = FALSE
)
training <- fram[train_indices, ]
test <- fram[-train_indices, ]

# Make sure the outcome is a factor in both subsets, as needed for
# classification.
training[["TenYearCHD"]] <- as.factor(training[["TenYearCHD"]])
test[["TenYearCHD"]] <- as.factor(test[["TenYearCHD"]])

#### Cost-insensitive classifier

In [17]:
# Baseline (cost-insensitive) kNN: tune k over 1..10 with 5-fold
# cross-validation.
# NOTE(review): predictors are not centred/scaled before kNN and no seed is
# set immediately before the CV resampling -- consider `preProcess` and
# `set.seed()` here; left unchanged so the recorded results stay valid.
knn_grid <- data.frame(k = seq(from = 1, to = 10, by = 1))
knn_control <- trainControl(method = "cv", number = 5)
knn <- caret::train(
  TenYearCHD ~ .,
  data = training,
  method = "knn",
  trControl = knn_control,
  tuneGrid = knn_grid
)

In [24]:
# Score the baseline model on the held-out set, treating class "1" as the
# positive class.
knn_pred <- predict(knn, newdata = test)
knn_cm <- confusionMatrix(
  data = knn_pred,
  reference = test$TenYearCHD,
  positive = "1"
)
print(knn_cm)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 538  93
         1   6   3
                                          
               Accuracy : 0.8453          
                 95% CI : (0.8149, 0.8725)
    No Information Rate : 0.85            
    P-Value [Acc > NIR] : 0.6548          
                                          
                  Kappa : 0.0323          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.031250        
            Specificity : 0.988971        
         Pos Pred Value : 0.333333        
         Neg Pred Value : 0.852615        
             Prevalence : 0.150000        
         Detection Rate : 0.004687        
   Detection Prevalence : 0.014063        
      Balanced Accuracy : 0.510110        
                                          
       'Positive' Class : 1               
                              

#### Rejection sampling

In [104]:
# Find the position(s) of the column(s) in `fram` whose name matches
# 'TenYearCHD'.
grep('TenYearCHD', colnames(fram))

In [58]:
# Per-observation weights passed to cost.proportionate.classifier():
# rows with TenYearCHD == 1 get the larger weight in both schemes.
weights <- ifelse(training$TenYearCHD == 1, 0.7, 0.3)
weights2 <- ifelse(training$TenYearCHD == 1, 0.95, 0.05)

# Base learner handed to cost.proportionate.classifier().
classifier <- caret::train

# Separate predictors from the target by column NAME. A hard-coded position
# (previously -9) silently drops the wrong column as soon as the encoding
# step adds, removes or reorders columns.
target_col <- "TenYearCHD"
stopifnot(
  target_col %in% names(training),
  target_col %in% names(test)
)
X_train <- training[, names(training) != target_col]
y_train <- training$TenYearCHD
X_test <- test[, names(test) != target_col]
y_test <- test$TenYearCHD

In [51]:
# Inspect the class of the training labels.
class(y_train)

In [59]:
# Cost-proportionate kNN using the `weights` vector (0.7 / 0.3).
# NOTE(review): no seed is set before this call -- presumably the resampling
# inside cost.proportionate.classifier() is random, so results may vary
# between runs; confirm and consider set.seed().
knn_rs <- cost.proportionate.classifier(
  X_train,
  y_train,
  weights,
  classifier,
  method = "knn",
  trControl = knn_control,
  tuneGrid = knn_grid
)

# Predict classes on the test predictors and convert them to a factor so
# they can be compared against `y_test`.
knn_pred_rs <- predict(
  knn_rs,
  X_test,
  aggregation = "weighted",
  type = "prob",
  output_type = "class"
)
knn_pred_rs <- as.factor(knn_pred_rs)

knn_cm_rs <- confusionMatrix(
  data = knn_pred_rs,
  reference = y_test,
  positive = "1"
)
print(knn_cm_rs)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 510  77
         1  34  19
                                         
               Accuracy : 0.8266         
                 95% CI : (0.795, 0.8551)
    No Information Rate : 0.85           
    P-Value [Acc > NIR] : 0.9548         
                                         
                  Kappa : 0.166          
                                         
 Mcnemar's Test P-Value : 6.707e-05      
                                         
            Sensitivity : 0.19792        
            Specificity : 0.93750        
         Pos Pred Value : 0.35849        
         Neg Pred Value : 0.86882        
             Prevalence : 0.15000        
         Detection Rate : 0.02969        
   Detection Prevalence : 0.08281        
      Balanced Accuracy : 0.56771        
                                         
       'Positive' Class : 1              
                                         


In [60]:
# Cost-proportionate kNN using the more extreme `weights2` vector
# (0.95 / 0.05).
knn_rs2 <- cost.proportionate.classifier(
  X_train,
  y_train,
  weights2,
  classifier,
  method = "knn",
  trControl = knn_control,
  tuneGrid = knn_grid
)

# Predict classes on the test predictors and convert them to a factor so
# they can be compared against `y_test`.
knn_pred_rs2 <- predict(
  knn_rs2,
  X_test,
  aggregation = "weighted",
  type = "prob",
  output_type = "class"
)
knn_pred_rs2 <- as.factor(knn_pred_rs2)

knn_cm_rs2 <- confusionMatrix(
  data = knn_pred_rs2,
  reference = y_test,
  positive = "1"
)
print(knn_cm_rs2)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0  57   3
         1 487  93
                                          
               Accuracy : 0.2344          
                 95% CI : (0.2021, 0.2692)
    No Information Rate : 0.85            
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.0239          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.9688          
            Specificity : 0.1048          
         Pos Pred Value : 0.1603          
         Neg Pred Value : 0.9500          
             Prevalence : 0.1500          
         Detection Rate : 0.1453          
   Detection Prevalence : 0.9062          
      Balanced Accuracy : 0.5368          
                                          
       'Positive' Class : 1               
                              