-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_nlp.R
30 lines (22 loc) · 1.02 KB
/
03_nlp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
################################################################################
# Some NLP. Assumes 01_webscrape_site.R has already been run
################################################################################
library(ngrams) # for text analysis (Natural Language Processing NLP)
# bigrams -----------------------------------------------------------------
# Collapse all titles into one long string, separated by a single space.
# NOTE(review): bigrams are counted over the whole string, so a pair can span
# the end of one title and the start of the next - confirm this is acceptable.
title_text_str <- paste0(letters_tbl$title, collapse = " ")
# Remove some stopwords (manual list - could use a pre-defined corpus).
# Each run of space-delimited stopwords is replaced by ONE space instead of
# being deleted outright: removing " on " together with both flanking spaces
# would glue the neighbouring words together ("letter on trade" ->
# "lettertrade") and would skip the second of two consecutive stopwords
# (" of the "). Matching is case-sensitive, so capitalised forms are kept.
title_text_str <- str_replace_all(
  title_text_str,
  "(?: (?:on|of|to|the|and|in))+ ",
  " "
)
# Find all word pairs (bigrams); words are split on spaces.
ng2 <- ngram(title_text_str, n = 2, sep = " ")
# Frequency table of the bigrams.
ng2_tbl <- get.phrasetable(ng2) %>% as_tibble()
ng2_tbl # not really any surprises there!
# wordcloud ---------------------------------------------------------------
# Single-word (n = 1) frequency table built from the cleaned title string.
# No word cloud is drawn here; only the frequency table is produced.
word_freq <- as_tibble(
  get.phrasetable(
    ngram(title_text_str, n = 1, sep = " ")
  )
)
word_freq
# doesn't look interesting