In [1]:
# Load the credit data from the local CSV file, turning text columns into factors
credit <- read.csv(file = "credit.csv", stringsAsFactors = TRUE)

# Inspect column names and types; `default` is the outcome we want to predict
str(credit)

'data.frame':	1000 obs. of  17 variables:
 $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
 $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
 $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
 $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
 $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
 $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
 $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
 $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
 $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
 $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
 $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
 $ exi

In [2]:
# Fix the RNG state so the same split is drawn on every run
set.seed(2023)

# Draw 900 distinct indices from 1..1000 (sampling without replacement)
train_sample <- sample.int(n = 1000, size = 900)

# Peek at the drawn indices
str(train_sample)

 int [1:900] 885 464 431 361 968 755 282 556 866 497 ...


In [4]:
# Training set: the 900 rows whose indices were drawn into train_sample
credit_train <- credit[train_sample, ]

# Test set: the remaining 100 rows (negative indexing drops the training rows)
credit_test <- credit[-train_sample, ]

# Compare the class balance (% of non-defaults vs. defaults) in the two sets
round(100 * proportions(table(credit_train$default)), digits = 1)
round(100 * proportions(table(credit_test$default)), digits = 1)


  no  yes 
69.9 30.1 


 no yes 
 71  29 

In [5]:
# One-time setup if the package is missing: install.packages("C50")
library(C50)

# Fit a C5.0 decision tree. Predictors are every column except the 17th
# (the class label, "default"), which is passed separately as the target.
credit_model <- C5.0(x = credit_train[-17], y = credit_train$default)

# Alternative way to train the model, using the formula interface:
# credit_model <- C5.0(default ~ ., data = credit_train)

In [6]:
# Display model info: auto-printing the fitted object shows the call,
# the number of samples and predictors, and the tree size
credit_model


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default)

Classification Tree
Number of samples: 900 
Number of predictors: 16 

Tree size: 45 

Non-standard options: attempt to group attributes


In [7]:
# Display the summary of the decision tree model, which prints the
# fitted tree structure branch by branch
summary(credit_model)



Call:
C5.0.default(x = credit_train[-17], y = credit_train$default)


C5.0 [Release 2.07 GPL Edition]  	Wed Feb  8 12:40:47 2023
-------------------------------

Class specified by attribute `outcome'

Read 900 cases (17 attributes) from undefined.data

Decision tree:

checking_balance in {> 200 DM,unknown}: no (414/53)
checking_balance in {< 0 DM,1 - 200 DM}:
:...months_loan_duration <= 22:
    :...credit_history in {perfect,very good}:
    :   :...years_at_residence <= 1: no (2)
    :   :   years_at_residence > 1:
    :   :   :...phone = no: yes (15/1)
    :   :       phone = yes:
    :   :       :...amount <= 6148: no (3)
    :   :           amount > 6148: yes (2)
    :   credit_history in {critical,good,poor}:
    :   :...purpose in {business,car0,renovations}: no (27/4)
    :       purpose = education:
    :       :...savings_balance in {< 100 DM,> 1000 DM,100 - 500 DM,
    :       :   :                   500 - 1000 DM}: yes (9/1)
    :       :   savings_balance = unknown: no (4)

In [10]:
# Load the library that provides CrossTable()
library(gmodels)

# Score the training data itself, i.e. the same rows the tree was fitted on
credit_pred <- predict(credit_model, newdata = credit_train)

# Confusion matrix of the training labels against the predictions, showing
# only the counts and the proportion of the table total in each cell
CrossTable(
  x = credit_train$default,
  y = credit_pred,
  dnn = c("Actual", "Predictions"),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  900 

 
             | Predictions 
      Actual |        no |       yes | Row Total | 
-------------|-----------|-----------|-----------|
          no |       597 |        32 |       629 | 
             |     0.663 |     0.036 |           | 
-------------|-----------|-----------|-----------|
         yes |        96 |       175 |       271 | 
             |     0.107 |     0.194 |           | 
-------------|-----------|-----------|-----------|
Column Total |       693 |       207 |       900 | 
-------------|-----------|-----------|-----------|

 


In [11]:
# Score the held-out test set with the single-tree C5.0 model
credit_pred <- predict(credit_model, newdata = credit_test)

# Confusion matrix of the test labels against the predictions
# (counts and table-total proportions only)
CrossTable(
  x = credit_test$default,
  y = credit_pred,
  dnn = c("Actual default", "Predicted default"),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
               | Predicted default 
Actual default |        no |       yes | Row Total | 
---------------|-----------|-----------|-----------|
            no |        65 |         6 |        71 | 
               |     0.650 |     0.060 |           | 
---------------|-----------|-----------|-----------|
           yes |        19 |        10 |        29 | 
               |     0.190 |     0.100 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        84 |        16 |       100 | 
---------------|-----------|-----------|-----------|

 


In [12]:
# Fit a boosted C5.0 ensemble, allowing up to 10 boosting iterations (trees)
credit_boost10 <- C5.0(
  x = credit_train[-17],
  y = credit_train$default,
  trials = 10
)

# Print the fitted ensemble
credit_boost10


Call:
C5.0.default(x = credit_train[-17], y = credit_train$default, trials = 10)

Classification Tree
Number of samples: 900 
Number of predictors: 16 

Number of boosting iterations: 10 
Average tree size: 39.6 

Non-standard options: attempt to group attributes


In [13]:
# Score the training set with the boosted ensemble
credit_boost10_train_pred <- predict(credit_boost10, newdata = credit_train)

# Confusion matrix for the training set (counts and table-total proportions)
CrossTable(
  x = credit_train$default,
  y = credit_boost10_train_pred,
  dnn = c("Actual default", "Predicted default"),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  900 

 
               | Predicted default 
Actual default |        no |       yes | Row Total | 
---------------|-----------|-----------|-----------|
            no |       625 |         4 |       629 | 
               |     0.694 |     0.004 |           | 
---------------|-----------|-----------|-----------|
           yes |        38 |       233 |       271 | 
               |     0.042 |     0.259 |           | 
---------------|-----------|-----------|-----------|
  Column Total |       663 |       237 |       900 | 
---------------|-----------|-----------|-----------|

 


In [14]:
# Score the held-out test set with the boosted ensemble
credit_boost10_test_pred <- predict(credit_boost10, newdata = credit_test)

# Confusion matrix for the test set (counts and table-total proportions)
CrossTable(
  x = credit_test$default,
  y = credit_boost10_test_pred,
  dnn = c("Actual default", "Predicted default"),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
               | Predicted default 
Actual default |        no |       yes | Row Total | 
---------------|-----------|-----------|-----------|
            no |        60 |        11 |        71 | 
               |     0.600 |     0.110 |           | 
---------------|-----------|-----------|-----------|
           yes |        17 |        12 |        29 | 
               |     0.170 |     0.120 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        77 |        23 |       100 | 
---------------|-----------|-----------|-----------|

 


In [15]:
# Row and column labels for the cost matrix. Both dimensions list the class
# levels in the same order; the names record that rows are the predicted
# classes and columns are the actual (true) classes.
matrix_dimensions <- list(
  Predicted = c("no", "yes"),
  Actual = c("no", "yes")
)

# Check for correctness
matrix_dimensions

# Build the cost matrix row by row:
#   predicted "no":  actual "no" costs 0, actual "yes" costs 4
#   predicted "yes": actual "no" costs 1, actual "yes" costs 0
error_cost <- matrix(
  c(0, 4,
    1, 0),
  nrow = 2,
  byrow = TRUE,
  dimnames = matrix_dimensions
)

# Check for correctness
error_cost

Unnamed: 0,no,yes
no,0,4
yes,1,0


In [16]:
# Fit a C5.0 tree that weights misclassifications according to error_cost
credit_cost <- C5.0(
  x = credit_train[-17],
  y = credit_train$default,
  costs = error_cost
)

# Score the held-out test set with the cost-sensitive model
credit_cost_pred <- predict(credit_cost, newdata = credit_test)

# Confusion matrix for the test set (counts and table-total proportions)
CrossTable(
  x = credit_test$default,
  y = credit_cost_pred,
  dnn = c("Actual default", "Predicted default"),
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE
)


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
               | Predicted default 
Actual default |        no |       yes | Row Total | 
---------------|-----------|-----------|-----------|
            no |        28 |        43 |        71 | 
               |     0.280 |     0.430 |           | 
---------------|-----------|-----------|-----------|
           yes |         6 |        23 |        29 | 
               |     0.060 |     0.230 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        34 |        66 |       100 | 
---------------|-----------|-----------|-----------|

 
