-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_emails.R
107 lines (107 loc) 路 3.58 KB
/
get_emails.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#' Get emails and their contents
#'
#' @description Get the content of Hillary Rodham Clinton's emails by release.
#'
#' @param release Name of the batch of release of emails; see details.
#' @param save.dir Directory in which to save the extracted text; defaults to
#' \code{getwd()}.
#' @param extractor Full path to the pdf extractor (pdf to text); see details.
#'
#' @details Below are the valid values for \code{release}; follows the
#' \href{http://graphics.wsj.com/hillary-clinton-email-documents/}{WSJ} naming
#' convention.
#' \itemize{
#' \item Benghazi
#' \item June
#' \item July
#' \item August
#' \item September
#' \item October
#' \item November
#' \item January 7
#' \item January 29
#' \item February 19
#' \item february 29
#' \item December
#' \item Non-disclosure
#' }
#'
#' The \code{extractor} argument is the full path to your pdftotext.exe
#' extractor; visit \href{http://www.foolabs.com/xpdf/download.html}{xpdf} to
#' download it, or try \code{\link{get_xpdf}}, which attempts to download and
#' unzip the pdf to text extractor. See examples.
#'
#' @return Fetches the email zip file from the WSJ and extracts text files
#' into \code{save.dir}; returns \code{save.dir}, the directory that contains
#' the parsed txt files.
#'
#' @examples
#' \dontrun{
#' # get xpdf extractor
#' ext <- get_xpdf()
#'
#' # create
#' dir.create("./emails")
#'
#' # get emails released in august
#' emails_aug <- get_emails(release = "August", save.dir = "./emails",
#' extractor = ext)
#'
#' # use manually downloaded extractor
#' ext <- "C:/xpdfbin-win-3.04/bin64/pdftotext.exe"
#'
#' # get emails related to Benghazi released in December
#' emails_bengh <- get_emails(release = "Benghazi", extractor = ext,
#' save.dir = "./emails")
#'
#' files <- list.files(emails_bengh)
#' content <- lapply(1:length(files), function(x){
#' readLines(paste0(emails_bengh, "/", files[[x]]))
#' })
#' }
#'
#' @seealso \code{\link{get_xpdf}}
#'
#' @author John Coene \email{jcoenep@gmail.com}
#'
#' @export
get_emails <- function(release, save.dir = getwd(), extractor){
  # Validate every input up front so nothing is downloaded when the call
  # cannot succeed.
  if (missing(extractor)) {
    stop("Missing extractor, see get_xpdf")
  }
  if (!dir.exists(save.dir)) {
    stop("save.dir does not exist")
  }
  if (!file.exists(extractor)) {
    stop("wrong path to extractor")
  }

  # Smoke-test the extractor by asking for its version. The path is quoted so
  # that locations containing spaces (e.g. "C:/Program Files/...") still run.
  # The accepted statuses are the exit codes listed in the xpdf manual
  # (0-3 and 99); anything else is treated as "this is not pdftotext".
  v <- suppressWarnings(system(paste(shQuote(extractor), "-v")))
  if (!v %in% c(99, 0, 1, 2, 3)) {
    stop("incorrect path to extractor, see get_xpdf")
  }

  uri <- checkRelease(release) # check release input and return URL

  # Work in a private scratch directory rather than the session-wide
  # tempdir(): unrelated files living there must not be picked up as emails,
  # and the scratch directory can be removed safely afterwards.
  temp_zip <- tempfile(fileext = ".zip")
  temp_dir <- tempfile(pattern = "emails_")
  # Registered before anything is created so the cleanup also runs when the
  # download, the unzip or the extraction fails part-way through.
  on.exit(unlink(c(temp_zip, temp_dir), recursive = TRUE), add = TRUE)
  dir.create(temp_dir)

  # mode = "wb": the archive is binary and must not be written in text mode.
  download.file(uri, destfile = temp_zip, mode = "wb")
  unzip(zipfile = temp_zip, exdir = temp_dir)
  unlink(temp_zip) # the archive is no longer needed once unzipped

  # Keep only files whose extension is .pdf and derive the .txt names by
  # swapping that extension (anchored, so "pdf" elsewhere in a name is
  # neither matched nor rewritten).
  files <- list.files(temp_dir, pattern = "\\.pdf$", ignore.case = TRUE)
  if (length(files) == 0L) {
    warning("no pdf files found in the downloaded release")
    return(save.dir)
  }
  dest <- sub("\\.pdf$", ".txt", files, ignore.case = TRUE)

  cat("Extracting content from", length(files), "pdf files...\n")
  pb <- txtProgressBar(style = 3)
  for (i in seq_along(files)) {
    # file.path() keeps the separators portable across platforms.
    pdf <- file.path(temp_dir, files[i])
    txt <- file.path(save.dir, dest[i])
    # -nopgbrk: do not emit page-break characters in the text output.
    system(paste(shQuote(extractor), "-nopgbrk", shQuote(pdf), shQuote(txt)),
           wait = TRUE)
    setTxtProgressBar(pb, i / length(files))
  }
  close(pb)

  message("emails .txt files saved at: ", save.dir)
  save.dir
}