# sherlock-holmes.R
# install.packages("ngram")
# install.packages("tm")
library(ngram)
library(tm)
# Bigram frequency table for a plain-text corpus.
#
# Pipeline: download the text -> collapse it into one string -> normalise it
# (lower-case, no punctuation, no digits) -> drop stopwords -> count bigrams.

# Read the text file line by line and join the lines into a single string.
url <- "https://raw.githubusercontent.com/voltek62/RsparkleR-examples/master/examples/advs.txt"
txt <- readLines(url)
data.sentence <- concatenate(txt)

# Lower-case the text, remove punctuation and numbers, and fix spacing.
data.sentence.staging <- preprocess(
  data.sentence,
  case = "lower",
  remove.punct = TRUE,
  remove.numbers = TRUE,
  fix.spacing = TRUE
)

# Build the stopword list: English stopwords plus the corpus-specific "holmes".
# The text has already lost its punctuation at this point, so contractions
# appear as "dont", "im", "ill", ... and would never match the apostrophised
# entries of the stopword list. Add a punctuation-free copy of every entry so
# the list is normalised the same way as the text.
# Trade-off: stripped contractions collide with a few ordinary words
# ("we'll" -> "well", "i'll" -> "ill"); those are indistinguishable in the
# punctuation-free text anyway and are removed too.
stopword_list <- c(stopwords("en"), "holmes")
stopword_list <- unique(c(stopword_list, gsub("[[:punct:]]", "", stopword_list)))

# One alternation of whole-word matches: \bw1\b|\bw2\b|...|\bwn\b
stopwords_regex <- paste(stopword_list, collapse = "\\b|\\b")
stopwords_regex <- paste0("\\b", stopwords_regex, "\\b")
data.sentence.prepared <- stringr::str_replace_all(
  data.sentence.staging,
  stopwords_regex,
  ""
)

# Deleting words leaves runs of blanks (and possibly leading/trailing ones);
# collapse them so the tokenizer sees single-space separated words again.
data.sentence.prepared <- trimws(gsub("[[:space:]]+", " ", data.sentence.prepared))

# Bigrams only: build the 2-gram model and print the first rows of its
# phrase table.
ng <- ngram(data.sentence.prepared, n = 2)
print(head(get.phrasetable(ng)))