# Forward Variable Selection
![](banner_forward_variable_selection.jpg)
<p style="text-align: center;"><em>Which combination of instruments sounds best?</em></p>

In [1]:
# Locate setup.R by looking in the working directory and then in successive
# parent directories (at most 10 attempts), then source whatever path results.
f = "setup.R"
for (i in 1:10) {
    if (file.exists(f)) break
    f = paste0("../", f)
}
source(f)


## Introduction

Motivation, context, history, related topics ...

## Terms

## Data

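Consider this pedagogical dataset: 22 observations of 9 numeric predictor variables (x1 to x9) and a two-level class variable (A or B).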

In [2]:
# Pedagogical dataset: 22 observations, 9 numeric predictors (x1..x9) and a class label (A or B).
# stringsAsFactors=TRUE is given explicitly so that class is a factor on every R version:
# since R 4.0.0 data.frame() keeps strings as character by default, while the classification
# code that uses this data (svm with class ~ ., confusionMatrix) expects a factor.
data = data.frame(x1=c(1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5, 1.0, 1.5, 2.0, 3.0, 1.5, 2.0, 3.0, 6.5, 8.0, 9.0, 9.5),
                  x2=c(1.0, 0.5, 3.0, 4.0, 5.5, 5.0, 3.5, 4.0, 7.0, 6.5, 9.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5),
                  x3=c(5.0, 5.5, 5.0, 5.0, 5.5, 5.0, 3.5, 4.0, 7.0, 6.5, 9.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 6.5, 8.0, 9.0, 9.5),
                  x4=c(3.0, 2.5, 4.0, 4.0, 1.5, 1.0, 1.5, 3.0, 7.0, 6.5, 9.0, 1.0, 3.5, 1.0, 3.0, 4.0, 3.0, 6.0, 7.5, 8.0, 9.0, 2.5),
                  x5=c(3.0, 2.5, 2.0, 4.0, 1.5, 1.0, 1.5, 3.0, 5.0, 6.5, 9.0, 1.0, 3.5, 1.0, 3.0, 4.0, 3.0, 6.0, 7.5, 8.0, 9.0, 2.5),
                  x6=c(3.0, 2.5, 4.0, 3.0, 1.5, 1.0, 1.5, 2.0, 7.0, 6.5, 9.0, 1.0, 3.5, 1.0, 3.0, 4.0, 3.0, 6.0, 7.0, 8.0, 9.0, 2.5),
                  x7=c(3.0, 2.5, 4.0, 4.0, 1.5, 3.0, 1.5, 3.0, 7.0, 6.5, 7.0, 1.0, 3.5, 1.0, 3.0, 4.0, 3.0, 6.0, 7.0, 8.0, 9.0, 2.5),
                  x8=c(3.0, 2.5, 4.0, 4.0, 1.5, 1.0, 1.5, 3.0, 7.0, 6.5, 9.0, 8.0, 3.5, 3.0, 3.0, 4.0, 3.0, 6.0, 3.5, 8.0, 9.0, 2.5),
                  x9=c(3.0, 2.5, 4.0, 4.0, 2.5, 1.0, 1.5, 3.0, 7.0, 6.5, 9.0, 1.0, 9.5, 1.0, 3.0, 4.0, 3.0, 6.0, 7.5, 2.0, 9.0, 2.5),
                  class=c("A","A","B","B","A","A","A","B","B","A","B", "A","A","B","B","A","A","A","B","B","A","B"),
                  stringsAsFactors=TRUE)

data

x1,x2,x3,x4,x5,x6,x7,x8,x9,class
1.0,1.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,A
1.5,0.5,5.5,2.5,2.5,2.5,2.5,2.5,2.5,A
2.0,3.0,5.0,4.0,2.0,4.0,4.0,4.0,4.0,B
3.0,4.0,5.0,4.0,4.0,3.0,4.0,4.0,4.0,B
4.0,5.5,5.5,1.5,1.5,1.5,1.5,1.5,2.5,A
5.0,5.0,5.0,1.0,1.0,1.0,3.0,1.0,1.0,A
6.0,3.5,3.5,1.5,1.5,1.5,1.5,1.5,1.5,A
6.5,4.0,4.0,3.0,3.0,2.0,3.0,3.0,3.0,B
8.0,7.0,7.0,7.0,5.0,7.0,7.0,7.0,7.0,B
9.0,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.5,A


## Forward Variable Selection

Variable selection is also called "feature selection" or "attribute selection".

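Consider some possible selections of variables. Each call below lists the candidate selections formed by adding one more variable to those already kept.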

In [3]:
# Candidate selections when only "class" is kept so far.
# selective() is not defined in this notebook (listed under Useful Functions as coming from setup.R).
selective(names(data), keep="class") # a list

In [4]:
# Candidate selections when "x2" and "class" are kept so far.
# selective() is not defined in this notebook (listed under Useful Functions as coming from setup.R).
selective(names(data), keep=c("x2","class")) # a list

In [5]:
# Candidate selections when "x2", "x1" and "class" are kept so far.
# selective() is not defined in this notebook (listed under Useful Functions as coming from setup.R).
selective(names(data), keep=c("x2","x1","class")) # a list

Select the one variable that produces a better model than the other variables do.

In [6]:
# Forward selection, step 1: score each candidate selection (one predictor plus class)
# by the cross-validated accuracy of a polynomial-kernel SVM, one row of tune per candidate.

nfold = 5

# The folds depend only on data$class and the seed, not on the candidate variables,
# so they are built once here instead of once per candidate. Seeding keeps them repeatable.
set.seed(0)
fold = createFolds(data$class, k=nfold)

tune = data.frame()
for (f in selective(names(data), keep="class"))
{
    # f holds the column names of one candidate selection (used to subset the training data)
    accuracy = numeric(nfold)

    for (i in 1:nfold)
    {
        data.train = data[setdiff(1:nrow(data), fold[[i]]),]   # every row outside fold i
        data.test  = data[fold[[i]],]                          # the rows of fold i

        set.seed(0)   # re-seed before every fit, as the original cell did
        model = svm(class ~ ., data.train[,f], kernel="polynomial", degree=3, cost=10, probability=TRUE)
        prob = attr(predict(model, data.test, probability=TRUE), "probabilities")

        # as.class comes from setup.R; presumably it labels by comparing the probability
        # of "A" against the cutoff -- confirm there
        class.predicted = as.class(prob, "A", cutoff=0.5)

        CM = confusionMatrix(class.predicted, data.test$class)$table
        cm = CM/sum(CM)
        accuracy[i] = cm[1,1]+cm[2,2]   # diagonal share of the confusion matrix = accuracy on fold i
    }

    accuracy.cv = mean(accuracy)
    tune = rbind(tune, data.frame(method="svm", variables=paste(f, collapse=", "), nfold, accuracy.cv))
}

fmt(tune)

method,variables,nfold,accuracy.cv
svm,"x1, class",5,0.37
svm,"x2, class",5,0.41
svm,"x3, class",5,0.49
svm,"x4, class",5,0.5
svm,"x5, class",5,0.5
svm,"x6, class",5,0.5
svm,"x7, class",5,0.46
svm,"x8, class",5,0.46
svm,"x9, class",5,0.54


In [7]:
# Keep the row of tune with the highest cross-validated accuracy.
# which.max returns the index of the first maximum, so a tie goes to the earlier row.
best = tune[which.max(tune$accuracy.cv),]
fmt(best)

method,variables,nfold,accuracy.cv
svm,"x9, class",5,0.54


Select a second variable that, in combination with the first selected variable, produces a better model than the other variables do.

In [8]:
# Forward selection, step 2: score each candidate selection that extends the current best
# selection by the cross-validated accuracy of a polynomial-kernel SVM, one row of tune per candidate.

nfold = 5

# The folds depend only on data$class and the seed, not on the candidate variables,
# so they are built once here instead of once per candidate. Seeding keeps them repeatable.
set.seed(0)
fold = createFolds(data$class, k=nfold)

tune = data.frame()

# string2vector is not defined in this notebook; presumably it turns the comma-separated
# variables string of best back into a vector of names -- confirm in setup.R
for (f in selective(names(data), keep=string2vector(best$variables)))
{
    # f holds the column names of one candidate selection (used to subset the training data)
    accuracy = numeric(nfold)

    for (i in 1:nfold)
    {
        data.train = data[setdiff(1:nrow(data), fold[[i]]),]   # every row outside fold i
        data.test  = data[fold[[i]],]                          # the rows of fold i

        set.seed(0)   # re-seed before every fit, as the original cell did
        model = svm(class ~ ., data.train[,f], kernel="polynomial", degree=3, cost=10, probability=TRUE)
        prob = attr(predict(model, data.test, probability=TRUE), "probabilities")

        # as.class comes from setup.R; presumably it labels by comparing the probability
        # of "A" against the cutoff -- confirm there
        class.predicted = as.class(prob, "A", cutoff=0.5)

        CM = confusionMatrix(class.predicted, data.test$class)$table
        cm = CM/sum(CM)
        accuracy[i] = cm[1,1]+cm[2,2]   # diagonal share of the confusion matrix = accuracy on fold i
    }

    accuracy.cv = mean(accuracy)
    tune = rbind(tune, data.frame(method="svm", variables=paste(f, collapse=", "), nfold, accuracy.cv))
}

fmt(tune)

method,variables,nfold,accuracy.cv
svm,"x9, x1, class",5,0.36
svm,"x9, x2, class",5,0.4
svm,"x9, x3, class",5,0.5
svm,"x9, x4, class",5,0.45
svm,"x9, x5, class",5,0.51
svm,"x9, x6, class",5,0.45
svm,"x9, x7, class",5,0.51
svm,"x9, x8, class",5,0.46


In [9]:
# Keep the row of tune with the highest cross-validated accuracy.
# which.max returns the index of the first maximum, so a tie goes to the earlier row.
best = tune[which.max(tune$accuracy.cv),]
fmt(best)

method,variables,nfold,accuracy.cv
svm,"x9, x5, class",5,0.51


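Select a third variable that, in combination with the first and second selected variables, produces a better model than the other variables do. Note that the best two-variable model found above (accuracy 0.51) is less accurate than the best one-variable model (0.54); this walkthrough continues adding variables regardless.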

In [10]:
# Forward selection, step 3: score each candidate selection that extends the current best
# selection by the cross-validated accuracy of a polynomial-kernel SVM, one row of tune per candidate.

nfold = 5

# The folds depend only on data$class and the seed, not on the candidate variables,
# so they are built once here instead of once per candidate. Seeding keeps them repeatable.
set.seed(0)
fold = createFolds(data$class, k=nfold)

tune = data.frame()

# string2vector is not defined in this notebook; presumably it turns the comma-separated
# variables string of best back into a vector of names -- confirm in setup.R
for (f in selective(names(data), keep=string2vector(best$variables)))
{
    # f holds the column names of one candidate selection (used to subset the training data)
    accuracy = numeric(nfold)

    for (i in 1:nfold)
    {
        data.train = data[setdiff(1:nrow(data), fold[[i]]),]   # every row outside fold i
        data.test  = data[fold[[i]],]                          # the rows of fold i

        set.seed(0)   # re-seed before every fit, as the original cell did
        model = svm(class ~ ., data.train[,f], kernel="polynomial", degree=3, cost=10, probability=TRUE)
        prob = attr(predict(model, data.test, probability=TRUE), "probabilities")

        # as.class comes from setup.R; presumably it labels by comparing the probability
        # of "A" against the cutoff -- confirm there
        class.predicted = as.class(prob, "A", cutoff=0.5)

        CM = confusionMatrix(class.predicted, data.test$class)$table
        cm = CM/sum(CM)
        accuracy[i] = cm[1,1]+cm[2,2]   # diagonal share of the confusion matrix = accuracy on fold i
    }

    accuracy.cv = mean(accuracy)
    tune = rbind(tune, data.frame(method="svm", variables=paste(f, collapse=", "), nfold, accuracy.cv))
}

fmt(tune)

method,variables,nfold,accuracy.cv
svm,"x9, x5, x1, class",5,0.54
svm,"x9, x5, x2, class",5,0.45
svm,"x9, x5, x3, class",5,0.46
svm,"x9, x5, x4, class",5,0.46
svm,"x9, x5, x6, class",5,0.46
svm,"x9, x5, x7, class",5,0.56
svm,"x9, x5, x8, class",5,0.51


In [13]:
# Keep the row of tune with the highest cross-validated accuracy.
# which.max returns the index of the first maximum, so a tie goes to the earlier row.
best = tune[which.max(tune$accuracy.cv),]
fmt(best)

method,variables,nfold,accuracy.cv
svm,"x9, x5, x7, class",5,0.56


## Code

### Useful Functions

In [12]:
# as.class              # from setup.R
# selective             # from setup.R
# fmt                   # used above, not defined in this notebook; presumably from setup.R
# string2vector         # used above, not defined in this notebook; presumably from setup.R

# help(confusionMatrix) # from caret library
# help(createFolds)     # from caret library
# help(svm)             # used above; presumably from e1071 library -- confirm in setup.R
# help(list)            # from base library
# help(paste)           # from base library
# help(rbind)           # from base library
# help(setdiff)         # from base library
# help(which.max)       # from base library

## Expectations



## Further Reading

* http://www-stat.wharton.upenn.edu/~stine/mich/DM_03.pdf
* http://www.milanor.net/blog/cross-validation-for-predictive-analytics-using-r/
* http://www.cs.cmu.edu/~./awm/tutorials/overfit10.pdf

<p style="text-align:left; font-size:10px;">
Copyright (c) Berkeley Data Analytics Group, LLC
<span style="float:right;">
Document revised April 5, 2021
</span>
</p>