In [1]:
# This assignment is adapted from, and was completed by following, the tutorial "Your First Machine Learning Project in R Step-By-Step" by Jason Brownlee.

## https://machinelearningmastery.com/machine-learning-in-r-step-by-step/

In [None]:
# Install caret only when it is not already available, so re-running this
# cell does not download and reinstall the package every time.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}

In [None]:
library(caret)

In [None]:
# Load the iris data frame that ships with R's `datasets` package into the
# workspace.
data("iris", package = "datasets")
# Work under a generic name so the remaining cells do not refer to `iris`.
dataset <- iris

In [None]:
# Optionally replace `dataset` with the contents of a local CSV file.
filename <- "iris.csv"
if (file.exists(filename)) {
  # The file is read without a header row, so column names are assigned
  # explicitly below.
  # stringsAsFactors = TRUE keeps the class column a factor: since R 4.0.0
  # read.csv() leaves strings as character by default, which would make
  # levels(dataset$Species) return NULL and break the class-based plots.
  dataset <- read.csv(filename, header = FALSE, stringsAsFactors = TRUE)
  colnames(dataset) <- c(
    "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"
  )
} else {
  # Do not abort the whole notebook when the file is absent; whatever
  # `dataset` was loaded earlier in the session stays in place.
  warning(
    sprintf("File '%s' not found; keeping the existing `dataset`.", filename),
    call. = FALSE
  )
}

In [None]:
# Fix the RNG seed so the train/validation split is identical on every run;
# without it each execution holds out a different 20% of the rows.
set.seed(7)
# Row indices of an 80% sample of the data, drawn with respect to the class
# column. NOTE: despite its name, `validation_index` holds the TRAINING rows.
validation_index <- createDataPartition(dataset$Species, p = 0.80, list = FALSE)
# The rows NOT selected above (the remaining 20%) are held back for validation.
validation <- dataset[-validation_index, ]
# From here on `dataset` refers only to the 80% used for training and
# cross-validating the models.
dataset <- dataset[validation_index, ]

In [None]:
# Number of rows and columns in the working dataset.
c(nrow(dataset), ncol(dataset))

In [None]:
# Report the class of every column in the dataset.
sapply(X = dataset, FUN = class)

In [None]:
# Expected output of sapply(dataset, class):
# Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species
#    "numeric"    "numeric"    "numeric"    "numeric"     "factor"

In [None]:
# Take a peek at the first rows of the data (head() shows 6 rows by default;
# the count is spelled out here to make that explicit).
head(dataset, n = 6L)

In [None]:
# Show the distinct levels of the class column.
levels(dataset[["Species"]])

In [None]:
# Tabulate the class column once, then report absolute counts next to the
# share (in percent) each class takes of the rows.
class_counts <- table(dataset$Species)
percentage <- prop.table(class_counts) * 100
cbind(freq = class_counts, percentage = percentage)

In [None]:
# Per-column summary of the dataset.
summary(object = dataset)

In [None]:
# Separate the inputs (first four columns, kept as a data frame) from the
# output (fifth column, extracted as a plain vector).
x <- dataset[, seq_len(4)]
y <- dataset[[5]]

In [None]:
# Draw one boxplot per input attribute, side by side on a single image.
# par() returns the previous settings, which are restored afterwards so that
# later base-graphics plots are not squeezed into the same multi-panel layout.
old_par <- par(mfrow = c(1, ncol(x)))
for (i in seq_along(x)) {
  # Title each panel with the name of the column actually being plotted
  # (taken from `x`, not from the unrelated global `iris`).
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)

In [None]:
# Plot the class variable `y` on its own to see the class breakdown.
# NOTE(review): presumably `y` is a factor, in which case plot() draws a bar
# chart of the per-class counts — confirm `y` has not been read as character.
plot(y)

In [None]:
# Scatterplot matrix of the input attributes in `x`, grouped by the class
# labels in `y`, using caret's "ellipse" plot type.
# NOTE(review): the ellipse plot type appears to need the separate `ellipse`
# package to be installed — confirm before running.
# The call is left at top level so the returned plot object is auto-printed.
featurePlot(x=x, y=y, plot="ellipse")

In [None]:
# Box-and-whisker plots of each input attribute in `x`, broken down by the
# class labels in `y`.
# The call is left at top level so the returned plot object is auto-printed.
featurePlot(x=x, y=y, plot="box")

In [None]:
# Density plots of each input attribute, one curve per class value.
# Both axes use a "free" relation so every panel gets its own axis range.
free_axis <- list(relation = "free")
scales <- list(x = free_axis, y = free_axis)
featurePlot(x = x, y = y, plot = "density", scales = scales)

In [None]:
# Models are compared on classification accuracy ...
metric <- "Accuracy"
# ... estimated with 10-fold cross-validation.
control <- trainControl(method = "cv", number = 10)

In [None]:
# Fit one caret model of the given type to predict Species from all other
# columns.
#
# The seed is reset to the same value before every fit so that each model is
# trained from the same RNG state, which keeps the models comparable.
#
# method  : caret method string (e.g. "lda", "rpart").
# data    : training data frame; defaults to the global `dataset`.
# metric  : performance metric passed to train(); defaults to global `metric`.
# control : trainControl() object; defaults to the global `control`.
# Returns the fitted object produced by train().
train_with_cv <- function(method, data = dataset, metric = metric,
                          control = control) {
  set.seed(7)
  train(Species ~ ., data = data, method = method, metric = metric,
        trControl = control)
}

# The `fit.*` names are kept as-is because later cells refer to them.
# a) linear algorithms
fit.lda <- train_with_cv("lda", dataset, metric, control)
# b) nonlinear algorithms
# CART
fit.cart <- train_with_cv("rpart", dataset, metric, control)
# kNN
fit.knn <- train_with_cv("knn", dataset, metric, control)
# c) advanced algorithms
# SVM
fit.svm <- train_with_cv("svmRadial", dataset, metric, control)
# Random Forest
fit.rf <- train_with_cv("rf", dataset, metric, control)

In [None]:
# Collect the fitted models under short labels, gather their resampling
# results into one object, and print a summary of it.
models <- list(
  lda = fit.lda,
  cart = fit.cart,
  knn = fit.knn,
  svm = fit.svm,
  rf = fit.rf
)
results <- resamples(models)
summary(results)

In [None]:
# Compare the models visually with a dot plot of the collected resampling
# results in `results`.
# The call is left at top level so the returned plot object is auto-printed.
dotplot(results)