In [1]:
knitr::opts_chunk$set(echo = TRUE)
library(caret)
library(ranger)
library(rpart)
library(rattle)
library(e1071)

# Load the data set from "adult.csv". The file is read without a header row,
# and the strings " ?", "NA" and "." are treated as missing values.
df <- read.csv("adult.csv", header = FALSE, na.strings = c(" ?", "NA", "."))
names(df) <- c(
  "age", "workclass", "fnlwgt", "education", "educationNum",
  "maritalStatus", "occupation", "relationship", "race", "sex",
  "capitalGain", "capitalLoss", "hoursPerWeek", "nativeCountry", "target"
)

# Give every column an explicit type so the rest of the script does not depend
# on how read.csv() happened to parse it.
df$age <- as.numeric(df$age)
df$workclass <- as.factor(df$workclass)
df$fnlwgt <- as.numeric(df$fnlwgt)
df$education <- as.factor(df$education)
df$educationNum <- as.numeric(df$educationNum)
df$maritalStatus <- as.factor(df$maritalStatus)
df$occupation <- as.factor(df$occupation)
df$relationship <- as.factor(df$relationship)
df$race <- as.factor(df$race)
df$sex <- as.factor(df$sex)
df$capitalGain <- as.numeric(df$capitalGain)
df$capitalLoss <- as.numeric(df$capitalLoss)
df$hoursPerWeek <- as.numeric(df$hoursPerWeek)
df$nativeCountry <- as.factor(df$nativeCountry)
# The outcome has to be a factor as well: the script later passes it to
# confusionMatrix() and rewrites its levels. read.csv() only produces factors
# by default on R < 4.0.0, so convert explicitly.
df$target <- as.factor(df$target)

# Impute missing values in the factor columns of a data frame.
#
# Each NA in a factor column is replaced by that column's most frequent level;
# ties go to the first such level in level order. Non-factor columns are
# returned unchanged.
#
# x: a data frame (may have zero columns).
# Returns: x with the NAs in its factor columns filled in.
repcat <- function(x) {
  # seq_len() instead of 1:ncol(x), so a zero-column input is a no-op rather
  # than an "undefined columns selected" error.
  for (i in seq_len(ncol(x))) {
    # drop = TRUE always yields the column vector, including for tibble-like
    # inputs where x[, i] would stay a one-column table.
    column <- x[, i, drop = TRUE]
    if (!is.factor(column)) {
      next
    }
    mode_idx <- which.max(table(column))
    # A factor without any levels has no mode; leave it as is instead of
    # failing with "replacement has length zero".
    if (length(mode_idx) == 0L) {
      next
    }
    column[is.na(column)] <- levels(column)[mode_idx]
    x[, i] <- column
  }
  x
}
# Fill the missing entries of the factor columns via repcat().
df <- repcat(df)

# Collapse the detailed categories into broader groups. Each replacement vector
# is positional: element k renames the k-th existing level, and levels that
# receive the same new label are merged into one.
# NOTE(review): this presumably relies on the default level order of the raw
# data and on the exact level counts (16 / 14 / 41) -- confirm against the file.
levels(df$education) <- c(
  "HS", "HS", "HS", "Ele", "Middle", "Middle", "HS", "UGrad",
  "UGrad", "UGrad", "PGrad", "HS", "PGrad", "Ele", "UGrad", "UGrad"
)
levels(df$occupation) <- c(
  "WCollar", "Military", "BCollar", "WCollar", "BCollar", "BCollar",
  "BCollar", "Other", "BCollar", "WCollar", "BCollar", "WCollar",
  "WCollar", "BCollar"
)
levels(df$nativeCountry) <- c(
  "Asia", "NAmerica", "Asia", "SAmerica", "CAmerica", "CAmerica",
  "SAmerica", "SAmerica", "Europe", "Europe", "Europe", "Europe",
  "CAmerica", "CAmerica", "Europe", "CAmerica", "Asia", "Europe",
  "Asia", "Asia", "Europe", "Europe", "CAmerica", "Asia",
  "Asia", "NAmerica", "CAmerica", "NAmerica", "SAmerica", "Asia",
  "Europe", "Europe", "CAmerica", "Europe", "Asia", "Asia",
  "Asia", "SAmerica", "NAmerica", "Asia", "Europe"
)

# Encode these two factors as their integer level codes. as.numeric() converts
# the values; is.numeric() would only test the type and return a single FALSE,
# which gets recycled over the whole column and erases the data.
df$workclass <- as.numeric(df$workclass)
df$relationship <- as.numeric(df$relationship)

# Variant 1: median imputation, then centering and scaling, via caret.
preProcess_missingdata_model <- preProcess(
  df,
  method = c("medianImpute", "center", "scale")
)
trainData <- predict(preProcess_missingdata_model, newdata = df)

# Expand the predictors into dummy variables, then put the outcome column back
# next to them.
dummies_model <- dummyVars(target ~ ., data = trainData)
trainData_mat <- predict(dummies_model, newdata = trainData)
trainData <- data.frame(trainData_mat)
trainData$target <- df$target

# Partition the rows with p = 0.70 for training; the rest is the test set.
# The seed is set immediately before so the partition is repeatable.
set.seed(2018)
splitIndex <- createDataPartition(
  trainData$target,
  p = 0.70, list = FALSE, times = 1
)
train <- trainData[splitIndex, ]
test <- trainData[-splitIndex, ]

# Decision tree (rpart) fitted through caret and scored on the test rows with
# " >50K" as the positive class.
dfmodel <- train(target ~ ., data = train, method = "rpart")
pred <- predict(dfmodel, newdata = test)
cm <- confusionMatrix(data = pred, reference = test$target, positive = " >50K")
cm

# Random forest (ranger) on the same split. The levels of both the reference
# and the predictions are relabelled to "0"/"1" before they are compared.
dfmodel <- ranger(target ~ ., data = train)
levels(test$target) <- c("0", "1")
pred2 <- predict(dfmodel, data = test)$predictions
levels(pred2) <- c("0", "1")
cm2 <- confusionMatrix(data = pred2, reference = test$target, positive = "1")
cm2

# Impute missing values in the numeric columns of a data frame.
#
# Each NA in a numeric column is replaced by the mean of that column's
# non-missing values. Non-numeric columns are returned unchanged.
#
# x: a data frame (may have zero columns).
# Returns: x with the NAs in its numeric columns filled in.
repnum <- function(x) {
  # seq_len() instead of 1:ncol(x), so a zero-column input is a no-op rather
  # than an "undefined columns selected" error.
  for (i in seq_len(ncol(x))) {
    # drop = TRUE always yields the column vector, including for tibble-like
    # inputs where x[, i] would stay a one-column table.
    column <- x[, i, drop = TRUE]
    if (is.numeric(column)) {
      x[is.na(column), i] <- mean(column, na.rm = TRUE)
    }
  }
  x
}
# Mean-impute the numeric columns once and keep the result. A bare repnum(df)
# call would run the imputation a second time, discard the value and print the
# whole data frame.
df <- repnum(df)

# Variant 2: df is used as it stands at this point; caret only centers and
# scales here, with no imputation step.
preProcess_missingdata_model <- preProcess(
  df,
  method = c("center", "scale")
)
trainData <- predict(preProcess_missingdata_model, newdata = df)

# Expand the predictors into dummy variables, then put the outcome column back
# next to them.
dummies_model <- dummyVars(target ~ ., data = trainData)
trainData_mat <- predict(dummies_model, newdata = trainData)
trainData <- data.frame(trainData_mat)
trainData$target <- df$target

# Partition the rows with p = 0.70 for training; the rest is the test set.
# The seed is set immediately before so the partition is repeatable.
set.seed(2018)
splitIndex <- createDataPartition(
  trainData$target,
  p = 0.70, list = FALSE, times = 1
)
train <- trainData[splitIndex, ]
test <- trainData[-splitIndex, ]

# Decision tree (rpart) fitted through caret and scored on the test rows with
# " >50K" as the positive class.
dfmodel <- train(target ~ ., data = train, method = "rpart")
pred <- predict(dfmodel, newdata = test)
cm <- confusionMatrix(data = pred, reference = test$target, positive = " >50K")
cm

# Random forest (ranger) on the same split. The levels of both the reference
# and the predictions are relabelled to "0"/"1" before they are compared.
dfmodel <- ranger(target ~ ., data = train)
levels(test$target) <- c("0", "1")
pred2 <- predict(dfmodel, data = test)$predictions
levels(pred2) <- c("0", "1")
cm2 <- confusionMatrix(data = pred2, reference = test$target, positive = "1")
cm2

# Variant 3: k-nearest-neighbour imputation, centering and scaling via caret.
# NOTE(review): df is taken as it stands at this point in the script; if the
# earlier imputation steps have already filled its NAs, knnImpute may have
# nothing left to impute -- confirm this comparison is what was intended.
preProcess_missingdata_model <- preProcess(
  df,
  method = c("knnImpute", "center", "scale")
)
trainData <- predict(preProcess_missingdata_model, newdata = df)

# Expand the predictors into dummy variables, then put the outcome column back
# next to them.
dummies_model <- dummyVars(target ~ ., data = trainData)
trainData_mat <- predict(dummies_model, newdata = trainData)
trainData <- data.frame(trainData_mat)
trainData$target <- df$target

# Partition the rows with p = 0.70 for training; the rest is the test set.
# The seed is set immediately before so the partition is repeatable.
set.seed(2018)
splitIndex <- createDataPartition(
  trainData$target,
  p = 0.70, list = FALSE, times = 1
)
train <- trainData[splitIndex, ]
test <- trainData[-splitIndex, ]

# Decision tree (rpart) fitted through caret and scored on the test rows with
# " >50K" as the positive class.
dfmodel <- train(target ~ ., data = train, method = "rpart")
pred <- predict(dfmodel, newdata = test)
cm <- confusionMatrix(data = pred, reference = test$target, positive = " >50K")
cm

# Random forest (ranger) on the same split. The levels of both the reference
# and the predictions are relabelled to "0"/"1" before they are compared.
dfmodel <- ranger(target ~ ., data = train)
levels(test$target) <- c("0", "1")
pred2 <- predict(dfmodel, data = test)$predictions
levels(pred2) <- c("0", "1")
cm2 <- confusionMatrix(data = pred2, reference = test$target, positive = "1")
cm2

# Variant 4: work on a copy of df in which the three recoded columns are
# converted with as.numeric() instead of being kept as they are.
df1 <- df
df1$education <- as.numeric(df1$education)
df1$occupation <- as.numeric(df1$occupation)
df1$nativeCountry <- as.numeric(df1$nativeCountry)

# Same caret preprocessing as the knnImpute variant, applied to the copy.
preProcess_missingdata_model <- preProcess(
  df1,
  method = c("knnImpute", "center", "scale")
)
trainData <- predict(preProcess_missingdata_model, newdata = df1)

# Expand the predictors into dummy variables, then put the outcome column back
# next to them. The outcome is read from df, not from the df1 copy.
dummies_model <- dummyVars(target ~ ., data = trainData)
trainData_mat <- predict(dummies_model, newdata = trainData)
trainData <- data.frame(trainData_mat)
trainData$target <- df$target

# Partition the rows with p = 0.70 for training; the rest is the test set.
# The seed is set immediately before so the partition is repeatable.
set.seed(2018)
splitIndex <- createDataPartition(
  trainData$target,
  p = 0.70, list = FALSE, times = 1
)
train <- trainData[splitIndex, ]
test <- trainData[-splitIndex, ]

# Decision tree (rpart) fitted through caret and scored on the test rows with
# " >50K" as the positive class.
dfmodel <- train(target ~ ., data = train, method = "rpart")
pred <- predict(dfmodel, newdata = test)
cm <- confusionMatrix(data = pred, reference = test$target, positive = " >50K")
cm

# Random forest (ranger) on the same split. The levels of both the reference
# and the predictions are relabelled to "0"/"1" before they are compared.
dfmodel <- ranger(target ~ ., data = train)
levels(test$target) <- c("0", "1")
pred2 <- predict(dfmodel, data = test)$predictions
levels(pred2) <- c("0", "1")
cm2 <- confusionMatrix(data = pred2, reference = test$target, positive = "1")
cm2

Loading required package: lattice
Loading required package: ggplot2
Rattle: A free graphical interface for data science with R.
Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
Type 'rattle()' to shake, rattle, and roll your data.

Attaching package: 'rattle'

The following object is masked from 'package:ranger':

    importance

"cannot open file 'adult.csv': No such file or directory"

ERROR: Error in file(file, "rt"): cannot open the connection
