Skip to content

MNoorFawi/reporting-to-word-with-R

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

23 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

reporting in word

Reporting Analysis done with R into a Word Document

We will use R to analyse the Bank Marketing dataset and apply a Logistic Regression model on it to predict if the client will subscribe a term deposit (variable y), then deliver results in a Word document using R officer & flextable libraries.

options(warn = -1)
library(officer)
library(flextable)
library(ggplot2)
library(reshape)
library(dplyr)
library(gridExtra)
library(ROCR)
data <- read.csv("bank.csv", sep = ";")
str(data)
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 11 9 1 7 9 4 9 9 9 1 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(data)
##       age                 job          marital         education   
##  Min.   :19.00   management :969   divorced: 528   primary  : 678  
##  1st Qu.:33.00   blue-collar:946   married :2797   secondary:2306  
##  Median :39.00   technician :768   single  :1196   tertiary :1350  
##  Mean   :41.17   admin.     :478                   unknown  : 187  
##  3rd Qu.:49.00   services   :417                                   
##  Max.   :87.00   retired    :230                                   
##                  (Other)    :713                                   
##  default       balance      housing     loan           contact    
##  no :4445   Min.   :-3313   no :1962   no :3830   cellular :2896  
##  yes:  76   1st Qu.:   69   yes:2559   yes: 691   telephone: 301  
##             Median :  444                         unknown  :1324  
##             Mean   : 1423                                         
##             3rd Qu.: 1480                                         
##             Max.   :71188                                         
##                                                                   
##       day            month         duration       campaign     
##  Min.   : 1.00   may    :1398   Min.   :   4   Min.   : 1.000  
##  1st Qu.: 9.00   jul    : 706   1st Qu.: 104   1st Qu.: 1.000  
##  Median :16.00   aug    : 633   Median : 185   Median : 2.000  
##  Mean   :15.92   jun    : 531   Mean   : 264   Mean   : 2.794  
##  3rd Qu.:21.00   nov    : 389   3rd Qu.: 329   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 293   Max.   :3025   Max.   :50.000  
##                  (Other): 571                                  
##      pdays           previous          poutcome      y       
##  Min.   : -1.00   Min.   : 0.0000   failure: 490   no :4000  
##  1st Qu.: -1.00   1st Qu.: 0.0000   other  : 197   yes: 521  
##  Median : -1.00   Median : 0.0000   success: 129             
##  Mean   : 39.77   Mean   : 0.5426   unknown:3705             
##  3rd Qu.: -1.00   3rd Qu.: 0.0000                            
##  Max.   :871.00   Max.   :25.0000                            
## 
docx <- read_docx()
officer::body_add_par(docx, value = "Table of Content", 
                      style = "heading 1")
body_add_toc(docx, level = 2)
body_add_break(docx)
title_case <- function(x) {
  lower_case <- tolower(x)
  without_underscore <- gsub("_", " ", lower_case, perl = TRUE)
  titled <- gsub("(^|[[:space:]])([[:alpha:]])",
                 "\\1\\U\\2",
                 without_underscore,
                 perl = TRUE)
  titled
}

# trim <- function(x) {
#   gsub("^\\s+|\\s+$", "", x)
# }

create_flextable <- function(df, theme = "box") {
  thm <- paste0("flextable::theme_", theme, "(ft)")
  ft <- flextable::flextable(df)
  ns <- title_case(names(df))
  ex <- paste0('"', names(df), '"', "=", '"', ns, '"', collapse = ",")
  expr <- paste0("flextable::set_header_labels(ft,")
  exp <- paste0(expr, ex, ")")
  ft <- eval(parse(text = exp))
  ft <- eval(parse(text = thm))
  ft <- ft %>%
    flextable::bold(part = "header") %>%
    flextable::bold(j = 1) %>%
    flextable::font(fontname = "Arrial Narrow", part = "all") %>%
    flextable::fontsize(size = 10, part = "all") %>%
    flextable::align(align = "left", part = "all") %>%
    flextable::autofit()
  ft
}
body_add_par(docx, "Exploring Data", style = "heading 1")
body_add_par(docx, "Summarizing Data", style = "heading 2")
body_add_par(docx, '- Examining the "age" and "balance" averages of each "Marital Status" group',
             style = "Normal")
age_balance_marital <- data %>% group_by(marital) %>%
  summarise(age_average = mean(age),
            balance_average = mean(balance))
abm <- create_flextable(age_balance_marital, "box")
flextable::body_add_flextable(docx, value = abm, align = "left")
body_add_par(docx, "", style = "Normal")
body_add_par(docx, '- Examining the "age" and "balance" averages of each "Job" who subscribed in this campaign',
             style = "Normal")
age_balance_job <-
  data %>% group_by(job, y) %>% filter(y == "yes") %>%
  summarise(age_average = mean(age),
            balance_average = mean(balance)) %>%
  arrange(desc(balance_average))
abj <- create_flextable(age_balance_job, "zebra")
flextable::body_add_flextable(docx, value = abj, align = "left")
body_add_par(docx, "", style = "Normal")
body_add_par(docx, '- Examining the "duration" of each "Contact Method"',
             style = "Normal")
contact_duration <- data %>% group_by(contact) %>%
  summarise(duration_average_min = mean(duration)) %>%
  mutate(duration_average_min = duration_average_min / 60)
cd <- create_flextable(contact_duration, "vanilla")
flextable::body_add_flextable(docx, value = cd, align = "left")
cursor_end(docx)
body_add_break(docx)
body_add_par(docx, "Visualizing Data", style = "heading 2")
body_add_par(docx, "- Visualizing distribution of numeric columns: ", 
             style = "Normal")
# in case we don't know the exact spelling of the column names we can 
# extract them with this regex
regex <- "(?=.*age)|(?=.*balanc)|(?=.*durat)"
cols <- colnames(data)
nums <- grep(regex, cols, perl = TRUE, value = TRUE)

age_balance_duration <- data[, c(nums, "y")] %>%
  mutate(balance = log(balance, base = 10),
         duration = duration / 60) %>%
  dplyr::rename(log10_balance = balance,
                duration_min = duration) %>%
  melt(id.vars = "y")

ggplot(age_balance_duration, aes(x = value, fill = y)) +
  geom_density(color = "white", alpha = 0.6) +
  facet_wrap( ~ variable, scales = "free") +
  theme_minimal() + scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "top")

filepath <- "."
filename <- paste0(dirname(filepath), "/density.png")
ggsave(filename = filename, height = 4, width = 6)
body_add_img(docx, src = filename, height = 4, width = 6)
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "- Visualizing relationship between numeric columns and y: ", style = "Normal")
ggplot(age_balance_duration, aes(x = value, y = as.numeric(y))) +
  geom_point(position = position_jitter(w = 0.05, h = 0.05),
             alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap( ~ variable, scales = "free", ncol = 2) +
  theme_minimal() + ylab("Subscription")

filename <- paste0(dirname(filepath), "/scatter.png")
ggsave(filename = filename, height = 4, width = 6)
body_add_img(docx, src = filename, height = 4, width = 6)
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "- spotting days, months and jobs in which users tend to subscribe", style = "Normal")
data <- data %>%
  mutate(month =
           factor(gsub("(^[[:alpha:]])", "\\U\\1",
                       month, perl = TRUE),
                  month.abb))
gg <- ggplot(data, aes(x = as.numeric(month), color = y)) +
  geom_freqpoly(stat = "count") +
  xlab("month") + 
  scale_x_continuous(breaks = seq(1, 12, 1), labels = month.abb) +
  scale_y_continuous(breaks = seq(0, 1500, 250)) +
  theme_minimal() + scale_color_brewer(palette = "Set1") +
  xlab("last contact month of the year")

gg2 <- ggplot(data, aes(x = job, fill = y)) +
  geom_bar(width = 0.8,
           color = "white",
           alpha = 0.8) +
  theme_minimal() + scale_fill_brewer(palette = "Set1")

gg3 <- ggplot(data, aes(x = day, fill = y)) +
  geom_histogram(bins = 30, col = "white") + theme_minimal() +
  scale_fill_brewer(palette = "Accent") +
  xlab("last contact day of the month")

ggsave("daymonthjob.png", 
       arrangeGrob(gg, gg2, gg3), height = 4, width = 7)
body_add_img(docx, src = "daymonthjob.png", height = 4, width = 7)

body_add_par(docx, "", style = "Normal")
body_add_par(docx, "- Visualizing frequency of categorical variables and their relationship with y: ",
             style = "Normal")

# function (not in)
'%!in%' <- function(x, y)
  ! ('%in%'(x, y))

vars <- colnames(data)
# summary of numeric data
summary(data[, vars[sapply(data[, vars], class) %in% 
                      c("numeric", "integer")]])
##       age           balance           day           duration   
##  Min.   :19.00   Min.   :-3313   Min.   : 1.00   Min.   :   4  
##  1st Qu.:33.00   1st Qu.:   69   1st Qu.: 9.00   1st Qu.: 104  
##  Median :39.00   Median :  444   Median :16.00   Median : 185  
##  Mean   :41.17   Mean   : 1423   Mean   :15.92   Mean   : 264  
##  3rd Qu.:49.00   3rd Qu.: 1480   3rd Qu.:21.00   3rd Qu.: 329  
##  Max.   :87.00   Max.   :71188   Max.   :31.00   Max.   :3025  
##     campaign          pdays           previous      
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000  
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000  
##  Median : 2.000   Median : -1.00   Median : 0.0000  
##  Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426  
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000  
##  Max.   :50.000   Max.   :871.00   Max.   :25.0000
# change pdays variable to factor of intervals and delete it
data$days_after_last_contact <-
  cut(data$pdays,
      breaks = c(-1, 0, 150, seq(300, 900, 300)),
      include.lowest = T)
data$pdays <- NULL

# change campaign and previous to factor of intervals and rename them
data <- data %>% mutate(
  campaign = cut(
    campaign,
    breaks = c(seq(1, 10, 2),
               seq(25, 50, 25)),
    include.lowest = TRUE
  ),
  previous = cut(
    previous,
    breaks = c(seq(0, 6, 2),
               seq(15, 25, 10)),
    include.lowest = TRUE
  )
) %>% dplyr::rename(
    number_contacts_this_campaign = campaign,
    number_contacts_before_this_campaign = previous
  )

# new column names
vars <- colnames(data)
# categorical variables
cat_vars <-
  vars[sapply(data[, vars], class) %in% c("factor", "character")]
# we have already examined month and job
cat_vars <- cat_vars[cat_vars %!in% c("month", "job")]

cat_melted <- data[, cat_vars] %>%
  melt(id.vars = "y")
ggplot(cat_melted, aes(x = value, fill = y)) + 
  geom_bar(width = 0.5, color = "white") +
  theme_minimal() + theme(
    legend.position = c(0.6, 0.1),
    legend.direction = "horizontal",
    legend.key.size = unit(10, "mm"),
    legend.title = element_text("Subscription")
  ) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  scale_fill_brewer(palette = "Paired")

ggsave("catvars.png", height = 5, width = 7)
body_add_img(docx, src = "catvars.png", height = 5, width = 7)
body_add_break(docx)
body_add_par(docx, "Modeling Data", style = "heading 1")
body_add_par(docx, "Preparing data for modeling", 
             style = "heading 2")
body_add_par(docx, "- one hot encoding of categorical variables and normalizing numeric ones", 
             style = "Normal")
body_add_par(docx, paste("dimenstions of original data:",
                         paste(dim(data), collapse = " , ")),
             style = "Normal")

data$day <- factor(data$day)
# predictors and outcome
pred_vars <- setdiff(vars, "y")

## to one hot encode factor values and normalize numeric ones if needed
cat <-
  pred_vars[sapply(data[, pred_vars], class) %in% c("factor", "character")]
num <-
  pred_vars[sapply(data[, pred_vars], class) %in% c("numeric", "integer")]

for (i in cat) {
  dict <- unique(data[, i])
  for (key in dict) {
    data[[paste0(i, "_", key)]] <- 1.0 * (data[, i] == key)
  }
}

data[, num] <- apply(data[, num], 2, function(x) {
  (x - min(x)) / (max(x) - min(x))
})

data <- data[, -which(colnames(data) %in% cat)]

body_add_par(docx, paste("dimenstions of new encoded data:",
                         paste(dim(data), collapse = " , ")),
             style = "Normal")
cursor_end(docx)
body_add_par(docx, "Train/Test Splitting", 
             style = "heading 2")
body_add_par(docx, "- splitting data to train and test datasets (80/20)", style = "Normal")

sample_size <- floor(0.8 * nrow(data))
## set the seed to make your splits reproducible
set.seed(13)
train_indices <- sample(seq(nrow(data)), size = sample_size)
train <- data[train_indices,]
test <- data[-train_indices,]

body_add_par(docx, paste("nrows of training dataset:", 
                         nrow(train)), 
             style = "Normal")

body_add_par(docx, paste("nrows of test dataset:", 
                         nrow(test)), 
             style = "Normal")
body_add_par(docx, "Fitting Classification Model", 
             style = "heading 2")
body_add_par(docx, "- Applying Logistic Regression algorithm to data: ", style = "Normal")

glmmodel <- glm(y ~ ., data = train, family = binomial(link = "logit"))

# how well the model fits the data it has seen
train$pred <- predict(glmmodel, newdata = train, type = "response")

# visualizing classification thresheold
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "Spotting Classification Threshold Probability Value: ",
             style = "Normal")

ggplot(train, aes(x = pred, fill = y)) + 
  geom_density(alpha = 0.6, col = "white") +
  scale_fill_brewer(palette = "Set1") + theme_minimal()

ggsave("threshold.png", height = 4, width = 5)
body_add_img(docx, src = "threshold.png", height = 4, width = 5)
body_add_par(docx, "", style = "Normal")
predobj <- prediction(train$pred, train$y)
perf <- performance(predobj, "sens", "spec")
sensitivity <- perf@y.values[[1]]
specificity <- perf@x.values[[1]]
acc <- performance(predobj, "acc")
accuracy <- acc@y.values[[1]]
error.rate <- 1 - accuracy
threshold <- acc@x.values[[1]]
errors <- data.frame(cbind(threshold,
                           cbind(error.rate,
                                 cbind(
                                   accuracy,
                                   cbind(sensitivity, specificity)
                                 ))))

error.data <- melt(errors, id.vars = "threshold")
ggplot(error.data, aes(x = threshold, y = value, 
                       col = variable, linetype = variable)) + 
  geom_line() + theme_minimal() +
  scale_color_brewer(palette = "Set1") + 
  scale_x_continuous(breaks = seq(0, 1, 0.1)) +
  theme(legend.position = "top")

ggsave("threshold2.png", height = 4, width = 5)
body_add_img(docx, src = "threshold2.png", height = 4, width = 5)
body_add_par(docx, "The model seems to classify well at probablity: 0.12",
             style = "Normal")
body_add_par(docx, "", style = "Normal")
body_add_par(docx, "Evaluating the Model", 
             style = "heading 2")
body_add_par(docx, "- Measuring Model Accuracy: ", style = "Normal")

(tab <- table(train$pred >= 0.12, train$y))
##        
##           no  yes
##   FALSE 2690   74
##   TRUE   508  344
(`train accuracy` <- round((tab[1, 1] + tab[2, 2]) / sum(tab), 2))
## [1] 0.84
test$pred <- predict(glmmodel, newdata = test, type = "response")
(tab <- table(test$pred >= 0.12, test$y))
##        
##          no yes
##   FALSE 683  15
##   TRUE  119  88
(`test accuracy` <- round((tab[1, 1] + tab[2, 2]) / sum(tab), 2))
## [1] 0.85
accuracy <- t(cbind(`train accuracy`, `test accuracy`))
accuracy <- as.data.frame(cbind(rownames(accuracy), accuracy))

ns <- names(accuracy)
ft <- flextable(accuracy)
# to exclude headers in flextable
# set_header_labels(ft, V1 = NULL, V2 = NULL)
# names(df) <- paste0("X", 1:ncol(df))

# so we need to know in advance the column names but there's a trick 
# to apply it without knowing them
ex <- paste0('"', ns, '"', "=NULL", collapse = ",")
expr <- paste0('flextable::set_header_labels(ft,')
exp <- paste0(expr, ex, ")")
ft <- eval(parse(text = exp))

ft <- ft %>% bold(j = 1) %>% font(fontname = "Arial") %>%
  fontsize(size = 11) %>% border_remove() %>% autofit() %>%
  align(align = "left", part = "all") %>% 
  color(i = 2, j = 2, color = "darkblue") %>%
  bold(j = 2, i = 2)
flextable::body_add_flextable(docx, value = ft, align = "left")
print(docx, target = "report.docx")

Releases

No releases published

Packages

No packages published

Languages