# Complete_code
# This is all the code used, step by step in chronological order, with comments explaining each part.
# Install package
install.packages("stm")
install.packages("tm")
install.packages("SnowballC")
install.packages("stringr")
# Run packages
library("stm")
library("stringr")
# Read data
# CSV file containing the texts and their meta-data.
data <- read.csv("texts.csv")

# Clean news articles
# The news articles downloaded from the LexisNexis database contain information
# that we did not need (e.g. certain meta-data, dates, etc.). Considering the
# structure of each news article, we swapped a certain word in each article for
# either 'JvH1' or 'JvH2'. This makes it possible to delete all text before the
# string 'JvH1' and after the string 'JvH2', leaving only the plain article
# text.
#
# The text column is addressed as `Content` (capital C), matching every later
# step of this script: column names are case-sensitive, so mixing `content` and
# `Content` would refer to two different columns.

# Delete everything up to and including the 'JvH1' marker.
data$Content <- gsub(".*JvH1", "", data$Content)
# Delete the ' JvH2:' marker and everything after it.
data$Content <- gsub(" JvH2:.*", "", data$Content)
# Delete leading and trailing whitespace in the column.
data$Content <- trimws(data$Content, which = "both")

# Save the cleaned data as a separate CSV file; the cleaned text also stays in
# the main data frame called 'data'.
write.csv(data, "News_cleaned.csv")
# Remove HTML residue from all texts (stringr).
# The entity patterns are written with a group -- "&(gt|lt|amp);" rather than
# the three entity names spelled out one after another -- so that the pattern
# text itself can never be mistaken for an entity and decoded when this script
# is shown on a web page. A decoded pattern is what previously left an
# unbalanced string literal on the first line of this block.

# HTML-encoded apostrophes, double quotes and slashes ("weird encoding").
data$Content <- str_replace_all(data$Content, "&(#x27|quot|#x2F);", "'")
# Opening link tags.
data$Content <- str_replace_all(data$Content, "<a(.*?)>", " ")
# HTML-encoded '>', '<' and '&'.
data$Content <- str_replace_all(data$Content, "&(gt|lt|amp);", " ")
# Numeric character references.
data$Content <- str_replace_all(data$Content, "&#[:digit:]+;", " ")
# Any remaining HTML tags. This runs after the entity replacements above, which
# leave literal angle brackets untouched, so the tags are still intact here.
data$Content <- str_remove_all(data$Content, "<[^>]*>")
# Remove Twitter-specific noise from all texts (base R).
# The patterns are applied strictly in the order listed; each match is deleted.
tweet_noise_patterns <- c(
  # HTML-encoded ampersand. Matched as a whole entity (the group keeps the
  # pattern from being decoded when rendered as HTML) so that no stray "amp"
  # token is left behind once punctuation is stripped below.
  "&(amp);?",
  # Retweet / via attributions together with the handles that follow them.
  "(RT|via)((?:\\b\\W*@\\w+)+)",
  # Remaining @mentions.
  "@\\w+",
  # Punctuation.
  "[[:punct:]]",
  # Digits.
  "[[:digit:]]",
  # URLs. Punctuation is already gone at this point, so a link has collapsed
  # into a single run of word characters starting with "http".
  "http\\w+",
  # Leading and trailing whitespace.
  "^\\s+|\\s+$"
)
for (noise_pattern in tweet_noise_patterns) {
  data$Content <- gsub(noise_pattern, "", data$Content)
}
# Note on removing HTML stuff: textProcessor also removes many of the unwanted
# characters from the text. Because we are working with tweets and many
# weird/unwanted characters popped up, we run the two cleaning blocks above as
# an extra. We might be overdoing it, but better safe than sorry.
# Clean and prepare the data with textProcessor and prepDocuments.
# The full data frame is passed as metadata so the covariates stay aligned
# with the processed documents.
processed <- textProcessor(documents = data$Content, metadata = data)
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# Convenience handles on the prepared corpus.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

# Compare models that use different numbers of topics. The output contains
# metrics that help determine what the 'optimal' number of topics is.
kresult <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(60, 70, 80, 90, 100),
  prevalence = ~ Company + Type,
  data = meta
)
# Estimate model
# Topic model with 80 topics; Company and Type enter as prevalence covariates.
model80 <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 80,
  prevalence = ~ Company + Type,
  max.em.its = 200,
  data = out$meta,
  init.type = "Spectral"
)

# Same specification as above, except that 60 topics are estimated.
model60 <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 60,
  prevalence = ~ Company + Type,
  max.em.its = 200,
  data = out$meta,
  init.type = "Spectral"
)

# Plot topic quality for the 80-topic model. Topic quality is a trade-off
# between semantic coherence and exclusivity.
plot(topicQuality(model = model80, documents = docs))
# Produce output containing four different methods of deriving the words that
# form a topic.
labelTopics(model80)

# Display the top 10 FREX words for each selected topic.
# The plot is requested through the plot() generic, which dispatches to the STM
# method for model80, and `labeltype` is spelled out in full instead of relying
# on partial matching of the abbreviated argument name `label`.
plot(
  model80,
  type = "labels",
  topics = c(
    8, 12, 18, 21, 28, 29, 31, 32, 33, 36, 37, 38,
    42, 44, 46, 47, 49, 53, 64, 65, 67, 70, 73
  ),
  labeltype = "frex",
  n = 10,
  width = 55
)
# Run a regression analysis for all 80 topics, using 'Type' as the covariate,
# and summarise the results for the 23 selected topics.
out$meta$Company <- as.factor(out$meta$Company)
prep <- estimateEffect(
  1:80 ~ Type,
  model80,
  meta = out$meta,
  uncertainty = "Global"
)

# The topic numbers must be passed as ONE vector. Written as separate
# arguments (topics = 8, 12, 18, ...), only topic 8 is bound to `topics`; the
# remaining numbers are taken as further positional arguments of summary()
# instead of as topics.
summary(
  prep,
  topics = c(
    8, 12, 18, 21, 28, 29, 31, 32, 33, 36, 37, 38,
    42, 44, 46, 47, 49, 53, 64, 65, 67, 70, 73
  )
)
# Visualise the regression results from estimateEffect (`prep`) as differences
# in topic proportion between two values of the covariate Type.
effect_topics <- c(
  8, 12, 18, 21, 28, 29, 31, 32, 33, 36, 37, 38,
  42, 44, 46, 47, 49, 53, 64, 65, 67, 70, 73
)

# Twitter compared with the news: 23 topics, one custom label per topic.
twitter_news_labels <- c(
  "Government", "Factory worker union", "Hospital donation", "Fair wages",
  "Environmental impact", "Data protection", "Plastic bags",
  "Garment factory workers", "Supply chain transparency", "Abuse",
  "Uyghur workers", "Conscious fashion", "Copy right violation",
  "ABF Food profits", "Canceled rent", "Racist H&M ad", "Recycable cotton",
  "Hunting", "Furloughing employees", "Mohair wool",
  "Covid19 safety measures", "Recycable fashion", "Gender equality"
)
plot(
  prep,
  covariate = "Type",
  topics = effect_topics,
  model = model80,
  method = "difference",
  cov.value1 = "Twitter",
  cov.value2 = "News",
  xlab = "News vs. Twitter",
  main = "Effect of source",
  xlim = c(-0.3, 0.9),
  labeltype = "custom",
  custom.labels = twitter_news_labels
)

# Same kind of plot, but now the news and the company reports are compared.
# NOTE(review): this call passes 23 topics but only 13 custom labels, and the
# labels do not line up with the labels used for the same topic numbers in the
# plot above. The topic selection presumably belongs to a different set of 13
# topics -- TODO confirm which topics these labels describe.
news_reports_labels <- c(
  "Ban Mohair", "(Gender) Equality", "Fair wages", "Boycot Brazilian leather",
  "Ban plastic bags", "Uyghur workers", "Copyright violation",
  "Quartely results", "Cancel orders", "Factory workers union",
  "Racist H&M ad", "Sustainable fashion", "Covid19 safety measures"
)
plot(
  prep,
  covariate = "Type",
  topics = effect_topics,
  model = model80,
  method = "difference",
  cov.value1 = "News",
  cov.value2 = "Company reports",
  xlab = "Company reports vs News",
  main = "Effect of source",
  xlim = c(-0.3, 0.1),
  labeltype = "custom",
  custom.labels = news_reports_labels
)