In [1]:
# Cell purpose: fit a bagged ensemble of classification trees on the credit data
# and print its summary.
# install.packages("ipred")
library(ipred)

# Load the dataset; stringsAsFactors = TRUE turns the text columns into factors
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

# Set seed for reproducibility (the bootstrap samples drawn below are random)
set.seed(2023)

# Train the model using "bootstrap aggregating" (bagging) with 25 trees;
# coob = TRUE additionally requests the out-of-bag misclassification estimate
mybag <- bagging(default ~ ., data = credit, nbagg = 25, coob = TRUE)

# Display basic model information
mybag


Bagging classification trees with 25 bootstrap replications 

Call: bagging.data.frame(formula = default ~ ., data = credit, nbagg = 25, 
    coob = TRUE)

Out-of-bag estimate of misclassification error:  0.265 


In [2]:
# Resubstitution: score the same rows the bagged model was trained on
credit_pred <- predict(mybag, newdata = credit)

# Attach "caret", which provides confusionMatrix()
library(caret)

# Cross-tabulate predicted vs. observed default status ("yes" = positive class)
confusionMatrix(
  data = credit_pred,
  reference = credit[["default"]],
  positive = "yes"
)

Loading required package: ggplot2

Loading required package: lattice



Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  700   3
       yes   0 297
                                          
               Accuracy : 0.997           
                 95% CI : (0.9913, 0.9994)
    No Information Rate : 0.7             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.9928          
                                          
 Mcnemar's Test P-Value : 0.2482          
                                          
            Sensitivity : 0.9900          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.9957          
             Prevalence : 0.3000          
         Detection Rate : 0.2970          
   Detection Prevalence : 0.2970          
      Balanced Accuracy : 0.9950          
                                          
       'Positive' Class : yes             
                              

In [3]:
# Load the package (left commented out: caret was already attached in an earlier cell)
#library(caret)

# Set seed for reproducibility (fold assignment and bootstrap samples are random)
set.seed(2023)

# Set up the control object: 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# Train a bagged-tree model (method = "treebag"). Accuracy is the default metric,
# but metric = "Kappa" is passed explicitly here, so Kappa is the metric in use.
m <- train(default ~ ., data = credit, method = "treebag", trControl = ctrl, metric = "Kappa")

# Display basic model information
m

Bagged CART 

1000 samples
  16 predictor
   2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 900, 900, 900, 900, 900, 900, ... 
Resampling results:

  Accuracy  Kappa    
  0.738     0.3270854


In [4]:
# Resubstitution: predict on the same data the caret model was trained on
credit_pred <- predict(m, newdata = credit)

# Cross-tabulate predicted vs. observed default status ("yes" = positive class)
confusionMatrix(
  data = credit_pred,
  reference = credit[["default"]],
  positive = "yes"
)

Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  700   7
       yes   0 293
                                          
               Accuracy : 0.993           
                 95% CI : (0.9856, 0.9972)
    No Information Rate : 0.7             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.9832          
                                          
 Mcnemar's Test P-Value : 0.02334         
                                          
            Sensitivity : 0.9767          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.9901          
             Prevalence : 0.3000          
         Detection Rate : 0.2930          
   Detection Prevalence : 0.2930          
      Balanced Accuracy : 0.9883          
                                          
       'Positive' Class : yes             
                              

In [8]:
# AdaBoost won't work in the next cell: caret's "adaboost" method requires the fastAdaboost package, which is not available for my version of R

In [9]:
# Cell purpose: attempt to train an AdaBoost model through caret.
# NOTE: in the recorded session this cell stops at train() with
# "Required packages are missing: fastAdaboost", so `m` is not reassigned here.
# Load the package
library(caret)

# Set seed for reproducibility
set.seed(2023)

# Set up the control object: 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# Train the model (method = "adaboost"). Accuracy is the default metric,
# but metric = "Kappa" is passed explicitly here, so Kappa is the metric in use.
m <- train(default ~ ., data = credit, method = "adaboost", trControl = ctrl, metric = "Kappa")

# Display basic model information
m

ERROR: Error: Required packages are missing: fastAdaboost


In [11]:
# Cell purpose: fit a random forest on the credit data and print its summary.
# install.packages("randomForest")
library(randomForest)

# Load the dataset (re-read from disk; text columns become factors)
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

# Set seed for reproducibility (tree construction is random)
set.seed(2023)

# Train the model using "random forest" with 500 trees and sqrt(16) = 4 features in each split.
# Neither value is passed explicitly; both are the defaults reported in the printed summary.
m_rf <- randomForest(default ~ ., data = credit)

# Display basic model information (includes the OOB error rate and confusion matrix)
m_rf


Call:
 randomForest(formula = default ~ ., data = credit) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4

        OOB estimate of  error rate: 23.2%
Confusion matrix:
     no yes class.error
no  636  64  0.09142857
yes 168 132  0.56000000

In [12]:
# Resubstitution: score the same rows the random forest was trained on
p_rf <- predict(m_rf, newdata = credit)

# Attach "caret", which provides confusionMatrix()
library(caret)

# Cross-tabulate predicted vs. observed default status ("yes" = positive class)
confusionMatrix(
  data = p_rf,
  reference = credit[["default"]],
  positive = "yes"
)

Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  700   0
       yes   0 300
                                     
               Accuracy : 1          
                 95% CI : (0.9963, 1)
    No Information Rate : 0.7        
    P-Value [Acc > NIR] : < 2.2e-16  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         
                                     
            Sensitivity : 1.0        
            Specificity : 1.0        
         Pos Pred Value : 1.0        
         Neg Pred Value : 1.0        
             Prevalence : 0.3        
         Detection Rate : 0.3        
   Detection Prevalence : 0.3        
      Balanced Accuracy : 1.0        
                                     
       'Positive' Class : yes        
                                     

In [13]:
# Cell purpose: tune the random forest's "mtry" with caret using 10-fold cross-validation.
# Load the package
library(caret)

# Set seed for reproducibility (fold assignment and forest construction are random)
set.seed(2023)

# Set up a grid of values for "mtry" (candidate values 2, 4, ..., 16)
grid <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12, 14, 16))

# Set up the control object: 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# Train the model (method = "rf") over the mtry grid. Accuracy is the default metric,
# but metric = "Kappa" is passed explicitly here, so the best mtry is chosen by Kappa.
m <- train(default ~ ., data = credit, method = "rf", trControl = ctrl, metric = "Kappa", tuneGrid = grid)

# Display basic model information (per-mtry Accuracy/Kappa and the selected value)
m

Random Forest 

1000 samples
  16 predictor
   2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 900, 900, 900, 900, 900, 900, ... 
Resampling results across tuning parameters:

  mtry  Accuracy  Kappa    
   2    0.720     0.1044429
   4    0.745     0.2778118
   6    0.755     0.3300787
   8    0.753     0.3450902
  10    0.753     0.3466416
  12    0.752     0.3558284
  14    0.753     0.3614863
  16    0.753     0.3668012

Kappa was used to select the optimal model using the largest value.
The final value used for the model was mtry = 16.

In [14]:
# Resubstitution: predict on the same data the tuned random forest was trained on
credit_pred <- predict(m, newdata = credit)

# Cross-tabulate predicted vs. observed default status ("yes" = positive class)
confusionMatrix(
  data = credit_pred,
  reference = credit[["default"]],
  positive = "yes"
)

Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  700   0
       yes   0 300
                                     
               Accuracy : 1          
                 95% CI : (0.9963, 1)
    No Information Rate : 0.7        
    P-Value [Acc > NIR] : < 2.2e-16  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         
                                     
            Sensitivity : 1.0        
            Specificity : 1.0        
         Pos Pred Value : 1.0        
         Neg Pred Value : 1.0        
             Prevalence : 0.3        
         Detection Rate : 0.3        
   Detection Prevalence : 0.3        
      Balanced Accuracy : 1.0        
                                     
       'Positive' Class : yes        
                                     

In [15]:
# Show the structure of the credit data frame: its dimensions plus each column's type and first values
str(credit)

'data.frame':	1000 obs. of  17 variables:
 $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
 $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
 $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
 $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
 $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
 $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
 $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
 $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
 $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
 $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
 $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
 $ exi