# Model Tuning
![](banner_model_tuning.jpg)
_<p style="text-align: center;"> Which model performs best? </p>_

In [None]:
# Locate setup.R: start in the working directory and prepend "../" up to 10 times
# until the file is found, then load it.
f = "setup.R"
for (i in 1:10) {
  if (!file.exists(f)) f = paste0("../", f) else break
}
source(f)


## Introduction

Motivation, context, history, related topics ...

## Synopsis

## Exposition

### Data

Consider this pedagogical dataset.

In [None]:
# Pedagogical dataset: 22 observations, three numeric predictors (x1, x2, x3)
# and a two-level class label ("A"/"B").
#
# stringsAsFactors=TRUE keeps `class` a factor on R >= 4.0, where data.frame()
# no longer converts strings to factors by default. The cells below need a factor:
# svm() performs classification only for a factor response, and confusionMatrix()
# expects factors with matching levels. On older R this is the default anyway.
data = data.frame(x1=c(1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5, 1.0, 1.5, 2.0, 3.0, 1.5, 2.0, 3.0, 6.5, 8.0, 9.0, 9.5),
                  x2=c(1.0, 0.5, 3.0, 4.0, 5.5, 5.0, 3.5, 4.0, 7.0, 6.5, 9.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5),
                  x3=c(5.0, 5.5, 5.0, 5.0, 5.5, 5.0, 3.5, 4.0, 7.0, 6.5, 9.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5),
                  class=c("A","A","B","B","A","A","A","B","B","A","B", "A","A","B","B","A","A","A","B","B","A","B"),
                  stringsAsFactors=TRUE)

# One scatter plot per pair of predictors, points colored by class,
# all on the same 0-10 axes so the panels are directly comparable.
p1 = ggplot(data) + xlim(0,10) + ylim(0,10) + geom_point(aes(x=x1, y=x2, color=class)) + theme.legend_below
p2 = ggplot(data) + xlim(0,10) + ylim(0,10) + geom_point(aes(x=x1, y=x3, color=class)) + theme.legend_below
p3 = ggplot(data) + xlim(0,10) + ylim(0,10) + geom_point(aes(x=x2, y=x3, color=class)) + theme.legend_below

# Show the three panels side by side.
grid.arrange(p1, p2, p3, nrow=1)

### Automatic Iteration

In [None]:
# Build a list one element at a time inside a loop, then flatten it.
x = list()
for (i in seq_len(5)) {
  x[[i]] = 10 * i
}
x          # a list
unlist(x)  # a vector

### Cross Validation (Revisited)

#### Choose Number of Folds 

In [None]:
# Number of cross-validation folds; the cells that follow read this value.
nfold = 5
# NOTE(review): fmt is not defined in this file - presumably a display helper loaded via setup.R.
fmt(nfold)

#### Training Data & Testing Data for Each Fold 

In [None]:
# Partition the observations into nfold folds; the seed is fixed so the
# partition is the same on every run.
set.seed(0)
fold = createFolds(data$class, k=nfold)

# For fold i: the test set is the rows in fold i,
# the training set is every remaining row.
data.train = vector("list", nfold)
data.test  = vector("list", nfold)
for (i in seq_len(nfold)) {
  data.test[[i]]  = data[fold[[i]], ]
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]), ]
}

# Display the five training sets, then the five testing sets.
layout(fmt(data.train[[1]]),
       fmt(data.train[[2]]),
       fmt(data.train[[3]]),
       fmt(data.train[[4]]),
       fmt(data.train[[5]]))
layout(fmt(data.test[[1]]),
       fmt(data.test[[2]]),
       fmt(data.test[[3]]),
       fmt(data.test[[4]]),
       fmt(data.test[[5]]))

#### A Confusion Matrix for Each Fold 

In [None]:
# For each fold: fit an SVM on the training rows, predict class probabilities
# for the held-out rows, threshold them at 0.5, and store the confusion matrix
# as proportions (cells sum to 1).
cm = vector("list", nfold)
for (i in seq_len(nfold)) {
  set.seed(0)  # reset before every fit so each fold's model is reproducible
  model = svm(class ~ x1 + x2 + x3,
              data.train[[i]],
              kernel="polynomial", degree=3, cost=10,
              probability=TRUE)
  prob = attr(predict(model, data.test[[i]], probability=TRUE),
              "probabilities")
  # as.class comes from setup.R
  class.predicted = as.class(prob, "A", cutoff=0.5)
  CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
  cm[[i]] = CM / sum(CM)
}

# Display the five normalized confusion matrices.
layout(fmt.cm(cm[[1]]),
       fmt.cm(cm[[2]]),
       fmt.cm(cm[[3]]),
       fmt.cm(cm[[4]]),
       fmt.cm(cm[[5]]))

#### A Performance Measurement for Each Fold 

In [None]:
# Accuracy of a fold = sum of the two diagonal cells of its normalized
# confusion matrix (the share of test rows whose prediction matches the truth).
accuracy = vector("list", nfold)
for (i in seq_len(nfold)) {
  accuracy[[i]] = cm[[i]][1, 1] + cm[[i]][2, 2]
}

# Display the five per-fold accuracies.
layout(fmt(accuracy[[1]]),
       fmt(accuracy[[2]]),
       fmt(accuracy[[3]]),
       fmt(accuracy[[4]]),
       fmt(accuracy[[5]]))

#### A Performance Measurement for the Model

In [None]:
# Cross-validated accuracy of the model: the mean of the per-fold accuracies.
# unlist() turns the list of per-fold values into a numeric vector for mean().
cv_accuracy = mean(unlist(accuracy))
fmt(cv_accuracy)

#### Put It All Together

In [None]:
# Complete cross-validation in one cell: split the data into folds, fit and
# score one model per fold, then average the per-fold accuracies.
nfold = 5

# Training Data & Testing Data for Each Fold
# Fold i is held out as the test set; every other row forms the training set.
set.seed(0)  # fixed seed so the same partition is produced on every run
fold = createFolds(data$class, k=nfold)
data.train = list()
data.test  = list()
for (i in seq_len(nfold)) {
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]),]
  data.test[[i]]  = data[fold[[i]],]
}

# A Confusion Matrix for Each Fold
# The model uses all three predictors (x1, x2, x3), the same formula as the
# step-by-step cells this cell consolidates, so it reproduces their result.
cm = list()
for (i in seq_len(nfold)) {
  set.seed(0)  # reset before every fit so each fold's model is reproducible
  model = svm(class ~ x1+x2+x3, data.train[[i]], kernel="polynomial", degree=3, cost=10, probability=TRUE)
  prob = attr(predict(model, data.test[[i]], probability=TRUE), "probabilities")
  class.predicted = as.class(prob, "A", cutoff=0.5)  # as.class comes from setup.R
  CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
  cm[[i]] = CM/sum(CM)  # store proportions rather than counts
}

# A Performance Measurement for Each Fold
# Accuracy = sum of the diagonal cells of the normalized confusion matrix.
accuracy = list()
for (i in seq_len(nfold)) {
  accuracy[[i]] = cm[[i]][1,1]+cm[[i]][2,2]
}

# A Performance Measurement for the Model
cv_accuracy = mean(unlist(accuracy))
fmt(cv_accuracy)

### Model Tuning by Cut-off Value Selection

In [None]:
# Start from an empty results table and append one row per candidate cut-off.
tune = data.frame()
for (q in c(0.10, 0.25, 0.50, 0.75, 0.90)) {
  tune = rbind(tune, data.frame(cutoff = q))
}

tune

In [None]:
# Tune the cut-off: estimate cross-validated accuracy for several cut-off values.
nfold = 5

# Training Data & Testing Data for Each Fold
# The partition does not depend on the cut-off, so it is built once.
set.seed(0)
fold = createFolds(data$class, k=nfold)
data.train = list()
data.test  = list()
for (i in seq_len(nfold)) {
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]),]
  data.test[[i]]  = data[fold[[i]],]
}

# Class Probabilities for Each Fold
# The fitted model and its predicted probabilities do not depend on the cut-off
# either (the cut-off only affects how probabilities become class labels), so
# each fold is fitted once instead of once per cut-off.
prob = list()
for (i in seq_len(nfold)) {
  set.seed(0)  # reset before every fit so each fold's model is reproducible
  model = svm(class ~ x1+x2+x3, data.train[[i]], kernel="polynomial", degree=3, cost=10, probability=TRUE)
  prob[[i]] = attr(predict(model, data.test[[i]], probability=TRUE), "probabilities")
}

tune = data.frame()
for (q in c(0.10, 0.25, 0.50, 0.75, 0.90))  # try several values for cut-off
{

    # A Confusion Matrix for Each Fold
    cm = list()
    for (i in seq_len(nfold)) {
      class.predicted = as.class(prob[[i]], "A", cutoff=q)  # as.class comes from setup.R
      CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
      cm[[i]] = CM/sum(CM)  # store proportions rather than counts
    }

    # A Performance Measurement for Each Fold
    # Accuracy = sum of the diagonal cells of the normalized confusion matrix.
    accuracy = list()
    for (i in seq_len(nfold)) { accuracy[[i]] = cm[[i]][1,1]+cm[[i]][2,2] }

    # A Performance Measurement for the Model
    cv_accuracy = mean(unlist(accuracy))

    # Gather Results
    tune = rbind(tune, data.frame(method="svm", cutoff=q, nfold, cv_accuracy))

}

tune

In [None]:
# Show the cut-off with the highest cross-validated accuracy.
# which.max() returns only the first maximum, so if several rows tie, only the earliest is shown.
tune[which.max(tune$cv_accuracy), ]

### Model Tuning by Hyper-Parameter Value Selection

In [None]:
# Enumerate every (degree, cost) pair: one row per combination,
# degree varying slowest.
tune = data.frame()
for (j in 1:4) {                          # try several values for degree
  for (k in c(0.1, 1, 10, 100, 1000)) {   # try several values for cost
    tune = rbind(tune, data.frame(degree = j, cost = k))
  }
}

tune

In [None]:
# Tune degree and cost: estimate cross-validated accuracy for every
# (degree, cost) combination.
nfold = 5

# Training Data & Testing Data for Each Fold
# The partition does not depend on the hyper-parameters, so it is built once
# rather than once per combination.
set.seed(0)
fold = createFolds(data$class, k=nfold)
data.train = list()
data.test  = list()
for (i in seq_len(nfold)) {
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]),]
  data.test[[i]]  = data[fold[[i]],]
}

tune = data.frame()
for (j in 1:4)  # try several values for degree hyper-parameter
for (k in c(0.1, 1, 10, 100, 1000))  # try several values for cost hyper-parameter
{

    # A Confusion Matrix for Each Fold
    cm = list()
    for (i in seq_len(nfold)) {
      set.seed(0)  # reset before every fit so each model is reproducible
      model = svm(class ~ x1+x2+x3, data.train[[i]], kernel="polynomial", degree=j, cost=k, probability=TRUE)
      prob = attr(predict(model, data.test[[i]], probability=TRUE), "probabilities")
      class.predicted = as.class(prob, "A", cutoff=0.5)  # as.class comes from setup.R
      CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
      cm[[i]] = CM/sum(CM)  # store proportions rather than counts
    }

    # A Performance Measurement for Each Fold
    # Accuracy = sum of the diagonal cells of the normalized confusion matrix.
    accuracy = list()
    for (i in seq_len(nfold)) { accuracy[[i]] = cm[[i]][1,1]+cm[[i]][2,2] }

    # A Performance Measurement for the Model
    cv_accuracy = mean(unlist(accuracy))

    # Gather Results
    tune = rbind(tune, data.frame(method="svm", degree=j, cost=k, nfold, cv_accuracy))

}

tune

In [None]:
# Show the (degree, cost) combination with the highest cross-validated accuracy.
# which.max() returns only the first maximum, so if several rows tie, only the earliest is shown.
tune[which.max(tune$cv_accuracy), ]

### Model Tuning by Exhaustive Variable Selection

Variable selection is also called "feature selection" or "attribute selection".

In [None]:
# exhaustive() is provided by setup.R.
# NOTE(review): presumably returns every combination of the column names with "class" always retained - confirm in setup.R.
exhaustive(names(data), keep="class") # a list

In [None]:
# One row per variable combination; each combination is shown as a
# comma-separated string of its column names.
tune = data.frame()
for (f in exhaustive(names(data), keep="class")) {
  tune = rbind(tune,
               data.frame(variables = paste(f, collapse=", ")))
}

tune # a data.frame

In [None]:
# Tune the variable set: estimate cross-validated accuracy for every
# combination of variables produced by exhaustive() (from setup.R).
nfold = 5

# Training Data & Testing Data for Each Fold
# The partition does not depend on the chosen variables, so it is built once
# rather than once per combination.
set.seed(0)
fold = createFolds(data$class, k=nfold)
data.train = list()
data.test  = list()
for (i in seq_len(nfold)) {
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]),]
  data.test[[i]]  = data[fold[[i]],]
}

tune = data.frame()
for (f in exhaustive(names(data), keep="class")) # try several combinations of variables
{

    # A Confusion Matrix for Each Fold
    cm = list()
    for (i in seq_len(nfold)) {
      set.seed(0)  # reset before every fit so each model is reproducible
      # Train on the selected columns only; `class ~ .` then uses every selected
      # column other than class as a predictor. drop=FALSE keeps the subset a
      # data.frame even if f names a single column.
      model = svm(class ~ ., data.train[[i]][, f, drop=FALSE], kernel="polynomial", degree=3, cost=10, probability=TRUE)
      prob = attr(predict(model, data.test[[i]], probability=TRUE), "probabilities")
      class.predicted = as.class(prob, "A", cutoff=0.5)  # as.class comes from setup.R
      CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
      cm[[i]] = CM/sum(CM)  # store proportions rather than counts
    }

    # A Performance Measurement for Each Fold
    # Accuracy = sum of the diagonal cells of the normalized confusion matrix.
    accuracy = list()
    for (i in seq_len(nfold)) { accuracy[[i]] = cm[[i]][1,1]+cm[[i]][2,2] }

    # A Performance Measurement for the Model
    cv_accuracy = mean(unlist(accuracy))

    # Gather Results
    tune = rbind(tune, data.frame(method="svm", variables=paste(f, collapse=", "), nfold, cv_accuracy))

}

tune

In [None]:
# Show the variable combination with the highest cross-validated accuracy.
# which.max() returns only the first maximum, so if several rows tie, only the earliest is shown.
tune[which.max(tune$cv_accuracy), ]

### Generalized Model Tuning

In [None]:
# Generalized tuning: estimate cross-validated accuracy for every combination
# of cut-off, variable set, degree and cost.
nfold = 5

# Training Data & Testing Data for Each Fold
# The partition depends on none of the tuned settings, so it is built once
# rather than on every pass of the nested loops.
# NOTE(review): this cell seeds the partition with 12345 while the other cells
# use 0, so its accuracies come from a different split - confirm this is intended.
set.seed(12345)
fold = createFolds(data$class, k=nfold)
data.train = list()
data.test  = list()
for (i in seq_len(nfold)) {
  data.train[[i]] = data[setdiff(seq_len(nrow(data)), fold[[i]]),]
  data.test[[i]]  = data[fold[[i]],]
}

tune = data.frame()
for (q in c(0.10, 0.25, 0.50, 0.75, 0.90))  # try several values for cut-off
for (f in exhaustive(names(data), keep="class")) # try several combinations of variables
for (j in 1:4)  # try several values for degree
for (k in c(0.1, 1, 10, 100, 1000))  # try several values for cost
{

    # A Confusion Matrix for Each Fold
    cm = list()
    for (i in seq_len(nfold)) {
      set.seed(0)  # reset before every fit so each model is reproducible
      # Train on the selected columns only; `class ~ .` then uses every selected
      # column other than class as a predictor.
      model = svm(class ~ ., data.train[[i]][,f], kernel="polynomial", degree=j, cost=k, probability=TRUE)
      prob = attr(predict(model, data.test[[i]], probability=TRUE), "probabilities")
      class.predicted = as.class(prob, "A", cutoff=q)  # as.class comes from setup.R
      CM = confusionMatrix(class.predicted, data.test[[i]]$class)$table
      cm[[i]] = CM/sum(CM)  # store proportions rather than counts
    }

    # A Performance Measurement for Each Fold
    # Accuracy = sum of the diagonal cells of the normalized confusion matrix.
    accuracy = list()
    for (i in seq_len(nfold)) { accuracy[[i]] = cm[[i]][1,1]+cm[[i]][2,2] }

    # A Performance Measurement for the Model
    cv_accuracy = mean(unlist(accuracy))

    # Gather Results
    tune = rbind(tune, data.frame(method="svm", cutoff=q, degree=j, cost=k, variables=paste(f, collapse=", "), nfold, cv_accuracy))

}

size(tune)
tune

In [None]:
# Show the combination of settings with the highest cross-validated accuracy.
# which.max() returns only the first maximum, so if several rows tie, only the earliest is shown.
tune[which.max(tune$cv_accuracy), ]

## Code

### Useful Functions

In [None]:
# as.class              # from setup.R
# exhaustive            # from setup.R

# help(confusionMatrix) # from caret library
# help(createFolds)     # from caret library
# help(list)            # from base library
# help(paste)           # from base library
# help(rbind)           # from base library
# help(setdiff)         # from base library
# help(unlist)          # from base library
# help(which.max)       # from base library

## Expectations

Know about this:
* How to evaluate model performance by cross validation, conceptually and using R.
* How to tune a model by hyper-parameter value selection, conceptually and using R.
* How to tune a model by variable selection, conceptually and using R.
* How to tune a model by hyper-parameter value and variable selection, conceptually and using R.
* How tuning affects compute time, conceptually.

## Further Reading

* http://www-stat.wharton.upenn.edu/~stine/mich/DM_03.pdf
* http://www.milanor.net/blog/cross-validation-for-predictive-analytics-using-r/
* http://www.cs.cmu.edu/~./awm/tutorials/overfit10.pdf

<p style="text-align:left; font-size:10px;">
Copyright (c) Berkeley Data Analytics Group, LLC
<span style="float:right;">
Document revised April 5, 2020
</span>
</p>