In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import os
# Collect the full path of every file found under /kaggle/input, then print
# them one per line in the order os.walk yields them.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [43]:
library("ggplot2")
library("dplyr")
library("corrplot")
library("plyr")
library("randomForest")
library(yardstick)
library(ggplot2)
# library("RSNNS")

In [3]:
# Read the semicolon-separated bank marketing file, then keep only rows
# 26001 through 40000 for the rest of the analysis.
df <- read.csv(file = "../input/bank-marketing-analysis/bank-additional-full.csv", sep = ";")
df <- df[26001:40000, ]
# Rows x columns of the retained slice (not of the complete file).
dim(df)

In [4]:
# Compact overview of df: one line per column with its type and first values
str(df)

In [5]:
# Tabulate the target (count() yields columns `x` and `freq`) and add each
# class's share of the total, rounded to one decimal place.
y_share <- mutate(count(df$y), prop = round(freq / sum(freq), 1))

# A single stacked bar drawn in polar coordinates renders as a pie chart.
ggplot(y_share, aes(x = "", y = prop, fill = x)) +
  geom_col(width = 1, color = "black") +
  coord_polar(theta = "y", start = 0, direction = 1)

In [6]:
# Encode the target as integer 0/1 ("no" -> 0, "yes" -> 1).
# Going through as.character() matters: if read.csv() delivered `y` as a
# factor, as.integer() on the revalued factor would return the level codes
# (1/2) instead of the labels (0/1). With the extra conversion the result is
# 0/1 whether `y` arrived as character or as factor.
y_h <- c("no" = 0, "yes" = 1)
df$y <- as.integer(as.character(revalue(df$y, y_h)))

# Gather every numeric column (now including the encoded target).
numericvars <- which(sapply(df, is.numeric))
all_num_var <- df[, numericvars]

# Pearson correlation between all numeric columns.
corr_var <- cor(all_num_var, use = "pairwise.complete.obs")

# Correlations with the target, sorted from highest to lowest.
corr_sort <- as.matrix(sort(corr_var[, "y"], decreasing = TRUE))

# Keep the variables whose absolute correlation with y exceeds 0.2
# (y itself qualifies, with correlation 1).
CorHigh <- names(which(apply(corr_sort, 1, function(x) abs(x) > 0.2)))
cor_numVar <- corr_var[CorHigh, CorHigh]

# Lower-triangle heat map of the retained correlations with coefficients.
corrplot(cor_numVar, tl.col = "black", number.cex = .7, addCoef.col = "black",
         method = "color", type = "lower")

In [7]:
# Remember the numeric variables picked by the correlation filter.
final_numeric_var <- CorHigh

# Positions (named by column) of every column that is not numeric.
factorvars <- which(sapply(df, function(column) !is.numeric(column)))
factorvars

In [8]:
# Recode several categorical columns into two-level 0/1 factors.
# Each entry maps the original labels of one column to its new label;
# "unknown" is folded into the 0 level.
# NOTE(review): `default` maps yes -> 0 and no -> 1, the reverse of
# `loan`/`housing` - confirm this polarity is intended.
binary_maps <- list(
  contact  = c("cellular" = 1, "telephone" = 0),
  loan     = c("yes" = 1, "unknown" = 0, "no" = 0),
  housing  = c("yes" = 1, "unknown" = 0, "no" = 0),
  default  = c("yes" = 0, "unknown" = 0, "no" = 1),
  poutcome = c("nonexistent" = 0, "failure" = 0, "success" = 1)
)
for (col in names(binary_maps)) {
  df[[col]] <- as.factor(revalue(df[[col]], binary_maps[[col]]))
}

# Treat pdays as categorical (as.factor gives an unordered factor).
df$pdays <- as.factor(df$pdays)

In [9]:
# PCA on the numeric predictors. Only the first 20 columns are scanned, so
# anything stored in later columns is left out of the decomposition.
numericvars <- which(sapply(df[, 1:20], is.numeric))
numericvars
all_num_var <- df[, numericvars]
PCA1 <- prcomp(all_num_var, center = TRUE, scale. = TRUE)

# Importance (variance explained) of each component.
summary(PCA1)

# Keep the first 6 principal components and append the target as a 7th column.
# NOTE(review): an earlier note spoke of 7 components covering ~95% of the
# variance; the code keeps 6 - check the summary above and adjust n_keep.
n_keep <- 6
final_num_value <- cbind(PCA1$x[, seq_len(n_keep)], df[, "y"])
colnames(final_num_value) <- c(paste0("PC", seq_len(n_keep)), "y")
colnames(final_num_value)
dim(final_num_value)

# Attach every non-numeric column to obtain the final modelling dataset.
factor_var <- which(sapply(df, function(column) !is.numeric(column)))
final_dataset <- cbind(final_num_value, df[, factor_var])
head(final_dataset)

In [10]:
# Random 80/20 split into training and validation rows.
# The seed is set *before* sampling so the split is identical on every
# Restart & Run All; previously the first set.seed() call came only after
# the split had already been drawn.
set.seed(1234)
train <- sample(nrow(final_dataset), floor(0.8 * nrow(final_dataset)))
train_set <- final_dataset[train, ]
valid <- final_dataset[-train, ]

# Sanity check: the two parts must add up to the full dataset.
stopifnot(nrow(train_set) + nrow(valid) == nrow(final_dataset))
dim(train_set)
dim(valid)

## Random Forest

In [11]:
# Fit a classification forest on the training rows. The target is turned
# into a factor inside the formula; importance and proximity measures are
# requested so they can be inspected afterwards.
set.seed(1234)
rf_model <- randomForest(
  as.factor(y) ~ .,
  data = train_set,
  importance = TRUE,
  proximity = TRUE
)
rf_model

### Mean Accuracy Decrease
#### When the values of a variable are randomly permuted, the prediction accuracy of the random forest drops. The larger the drop, the more important the variable.

In [12]:
# Variable importance, type 1 (the accuracy-based measure), reshaped into a
# two-column data frame: feature name and its importance score.
imp_acc <- importance(rf_model, type = 1)
accImportance <- data.frame(
  Feature = rownames(imp_acc),
  Importance = imp_acc[, 1]
)
accImportance

In [13]:
# Horizontal bar chart of the accuracy-based importances; features are
# ordered by score so the most important one ends up at the top.
p <- ggplot(accImportance, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#53cfff") +
  coord_flip() +
  theme_light(base_size = 20) +
  labs(x = "", y = "Importance",
       title = "Random Forest Feature Accuracy Importance\n") +
  theme(plot.title = element_text(size = 18))
p

### Mean Gini Decrease
#### Calculate the influence of each variable on the observations at each node of the classification tree, that is, the ability to classify, so as to compare the importance of variables. The larger the value, the more important the variable is.

In [14]:
# Variable importance, type 2 (the Gini-based measure), reshaped into a
# two-column data frame: feature name and its importance score.
imp_gini <- importance(rf_model, type = 2)
giniImportance <- data.frame(
  Feature = rownames(imp_gini),
  Importance = imp_gini[, 1]
)
giniImportance

In [15]:
# Horizontal bar chart of the Gini-based importances; features are ordered
# by score so the most important one ends up at the top.
p <- ggplot(giniImportance, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#53cfff") +
  coord_flip() +
  theme_light(base_size = 20) +
  labs(x = "", y = "Importance",
       title = "Random Forest Feature Gini Importance\n") +
  theme(plot.title = element_text(size = 18))
p

### Prediction of Validation Set

In [16]:
# valid$y = as.integer(valid$y)

In [34]:
# Predict the validation rows. as.integer() on the predicted factor yields
# level codes, so 1 is subtracted below to compare against the 0/1 target.
pred <- as.integer(predict(rf_model, valid))

# Accuracy = 1 - (number of mismatches / number of predictions).
n_wrong <- sum(abs((pred - 1) - valid[, "y"]))
acc <- 1 - (n_wrong / length(pred))
acc

# Show the fitted model summary again for comparison.
rf_model

In [18]:
# Search for the mtry value with the lowest out-of-bag (OOB) error.
# n is the number of columns in train_set (the target included), so the
# candidate mtry values run from 1 to n - 1.
n <- ncol(train_set)

# Preallocate one slot per candidate instead of growing the vector.
err_rate <- numeric(n - 1)

# seq_len() yields an empty sequence when n - 1 is 0, unlike 1:(n - 1).
for (i in seq_len(n - 1)) {
  # Same seed for every candidate so only mtry differs between fits.
  set.seed(12345)
  # Formula written as in the other model fits in this notebook.
  rf_train <- randomForest(as.factor(y) ~ ., data = train_set, mtry = i, ntree = 1000)
  # Mean OOB misclassification rate across the 1000 trees.
  err_rate[i] <- mean(rf_train$err.rate[, "OOB"])
}

# Mean OOB error of every candidate, and the mtry that minimises it.
err_rate
m <- which.min(err_rate)
m

In [19]:
# Plot the fitted forest (for a randomForest object: error rates vs. number of trees)
plot(rf_model)

In [25]:
# Refit the forest with mtry = 4.
# NOTE(review): 4 is hard-coded; presumably it is the value found by the
# mtry search (`m`) - confirm and consider passing `m` instead.
# Changes from the earlier version of this cell:
#   * `oob_score = TRUE` was dropped: it is not a randomForest() argument
#     (it is a scikit-learn name) and was silently ignored. OOB error is
#     always reported by randomForest.
#   * a seed is set so the refit is reproducible.
set.seed(1234)
rf_model <- randomForest(as.factor(y) ~ ., data = train_set,
                         importance = TRUE, proximity = TRUE, mtry = 4)
rf_model

In [21]:
# Heat map of the confusion matrix stored in the fitted forest.
# The former `pls.cmx <- confusionMatrix(pls_predicting, test$type)` lines
# were removed: `pls_predicting` and `test` are never defined in this
# notebook and no loaded package provides confusionMatrix(), so the cell
# failed on a fresh kernel.
# Only the "0" and "1" columns are kept, leaving a square count matrix
# (presumably this drops randomForest's class.error column - verify).
confusion <- rf_model$confusion[, c("0", "1")]
autoplot(conf_mat(confusion), type = "heatmap")

In [46]:
# Score the validation rows with the current model; the predicted factor is
# converted to its integer level codes.
pred <- as.integer(predict(rf_model, valid))

# Shift the codes down by one so they line up with the 0/1 target.
pred_label <- pred - 1

# Accuracy = 1 - share of predictions that differ from the target.
acc <- 1 - (sum(abs(pred_label - valid[, "y"])) / length(pred))
print(acc)

# Cross-tabulation: rows are predicted labels, columns are observed labels.
confusion <- table(pred_label, valid$y, dnn = NULL)
confusion

In [40]:
# Accuracy, precision, recall and F-score from the 2x2 confusion table.
# Assumes `confusion` is laid out as table(predicted, actual), as built in
# the preceding cell, and - as before - scores the first class (label 0).
# NOTE(review): if the "yes" class (label 1) is the class of interest,
# use the [2, 2] cell and the second row/column instead.
#
# Fixes relative to the earlier version:
#   * precision is stored in `precision`, no longer overwriting the
#     prediction vector `pred`;
#   * precision/recall were swapped for this table layout (row sum gives
#     precision, column sum gives recall). The F-score is symmetric in the
#     two, so its value is unchanged.
acc <- (confusion[1, 1] + confusion[2, 2]) / sum(confusion)
precision <- confusion[1, 1] / (confusion[1, 1] + confusion[1, 2])
recall <- confusion[1, 1] / (confusion[1, 1] + confusion[2, 1])
f_score <- 2 * (precision * recall) / (precision + recall)
f_score

In [None]:
library(yardstick)
library(ggplot2)

# Stand-alone demo of a yardstick confusion-matrix heat map on synthetic
# labels: 100 random 0/1 "observations" and 100 random 0/1 "predictions".
set.seed(123)
n_demo <- 100
obs_draw <- sample(0:1, n_demo, replace = TRUE)
pred_draw <- sample(0:1, n_demo, replace = TRUE)
truth_predicted <- data.frame(obs = obs_draw, pred = pred_draw)

# conf_mat() needs factor columns.
for (label_col in c("obs", "pred")) {
  truth_predicted[[label_col]] <- as.factor(truth_predicted[[label_col]])
}

cm <- conf_mat(truth_predicted, obs, pred)
print(class(cm))

# Heat map with a pink-to-cyan fill gradient.
autoplot(cm, type = "heatmap") +
  scale_fill_gradient(low = "pink", high = "cyan")