---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(tidytext)
library(lubridate)
library(ggrepel)
library(ggthemes)
```
Raw data: email joe at residualthoughts dot com if you'd like me to share it with you; otherwise you can pull the Reddit data yourself through the Python API (or straight from Pushshift, sketched below).
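A rough sketch of pulling submissions directly from R, assuming the Pushshift search endpoint is available; the parameters and field names here are assumptions to verify against the actual response. The rest of the notebook expects columns named post_id, title, text, created_date, score, and numcomms.
```{r, eval=FALSE}
# Sketch only: pull recent r/nutrition submissions from the (assumed) Pushshift endpoint
library(jsonlite)

pull_submissions <- function(subreddit = "nutrition", size = 100, before = NULL) {
  url <- paste0(
    "https://api.pushshift.io/reddit/search/submission/?subreddit=", subreddit,
    "&size=", size,
    if (!is.null(before)) paste0("&before=", before) else ""
  )
  fromJSON(url)$data %>%
    transmute(
      post_id      = id,
      title        = title,
      text         = selftext,   # body text; may be missing on link posts
      created_date = as.POSIXct(created_utc, origin = "1970-01-01", tz = "UTC"),
      score        = score,
      numcomms     = num_comments
    )
}

# e.g. one page of the most recent submissions
# df <- pull_submissions()
```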
```{r}
df <- read_csv("~/downloads/reddit_nutrition_submissions.csv")
```
```{r}
# create date
df <- df %>% mutate(dt = as.Date(created_date))
```
Custom stop words filter out words that appear often in Reddit text but that we don't want to analyze (imgur.com, for example)
```{r}
custom_stop_words <- tibble(word = c("na", "NA", "drink", "cocktail", "cocktails", "recipe", "recipes", "shake", "ice", "1", "2", "4", "oz", "ounces", "http", "https", "amp", "favorite", "love", "removed", "imgur", "i.imgur", "keywords", "utf8", "qid", "sr"))
```
Creating a combined text column from both the title and the body text
```{r}
df <- df %>% mutate(combined_text = paste0(title," ", text))
```
Tokenizing datasets
```{r}
tokenized_df <- df %>%
select(post_id, combined_text, dt, score, numcomms) %>%
unnest_tokens(word, combined_text)
tokenized_df <- tokenized_df %>%
anti_join(stop_words, by = "word") %>%
anti_join(custom_stop_words, by = "word")
bigram_df <- df %>%
select(post_id, combined_text, dt, score, numcomms) %>%
unnest_tokens(bigram, combined_text, token = "ngrams", n = 2)
```
Posts per year, used below to normalize mention counts
```{r}
post_per_year <- df %>% count(year = year(dt)) %>% rename(yearly_posts = n)
```
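Everything below is normalized to mentions per thousand posts using this table, so it's worth a quick look at the raw posting volume itself; a small sketch:
```{r}
# yearly posting volume; the mention counts below are divided by these totals
post_per_year %>%
  ggplot(aes(x = year, y = yearly_posts)) +
  geom_col(fill = "navy") +
  scale_x_continuous(breaks = c(2010:2019)) +
  theme_minimal() +
  labs(x = "", y = "Posts", title = "Yearly Posts on Reddit's Nutrition Forum")
```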
# Visualizing bigrams
```{r}
library(ggplot2)
library(igraph)
library(ggraph)
count_bigrams <- function(dataset) {
dataset %>%
# note: bigrams here come from the post body (text), not the combined title + body column
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word1 %in% custom_stop_words$word,
!word2 %in% custom_stop_words$word) %>%
count(word1, word2, sort = TRUE)
}
visualize_bigrams <- function(bigrams, min_number = 200) {
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigrams %>%
filter(n > min_number) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
}
```
```{r}
bg <- count_bigrams(df)
bg %>%
filter(!is.na(word1),
!is.na(word2)) %>%
visualize_bigrams()
```
# Visualizing only recent bigrams
Showing the bigram chart for 2018 and 2019 only, to see newer trends
```{r}
bg_recent <- df %>%
filter(year(dt) == 2019 |
year(dt) == 2018) %>%
count_bigrams()
bg_recent %>%
filter(!is.na(word1),
!is.na(word2)) %>%
visualize_bigrams(min_number = 100)
```
The most commonly used words by year. For nutrition this isn't very insightful, as the top words are mostly the same each year.
```{r}
tokenized_df %>%
count(word, year = year(dt), sort = T) %>%
filter(n > 1400) %>%
ggplot(aes(x = reorder(word, n), y = n)) +
geom_col() +
coord_flip() +
facet_wrap(~year, scales = "free")
```
Here, I'm going to look at the words with the most growth from 2018 to 2019.
```{r}
# combining bigrams and single words
bigram_and_word <- bigram_df %>%
rename(word = bigram) %>%
rbind(tokenized_df) %>%
rename(token = word)
# only recent posts to shrink it
bigram_and_word <- bigram_and_word %>%
filter(dt > '2017-12-31')
# getting top words from each
counts_2018 <- bigram_and_word %>%
filter(year(dt) == 2018) %>%
count(token, year = year(dt), sort = T) %>%
filter(n > 100) %>%
inner_join(post_per_year) %>%
mutate(post_per_thousand_2018 = 1000* n / yearly_posts)
counts_2019 <- bigram_and_word %>%
filter(year(dt) == 2019) %>%
count(token, year = year(dt), sort = T) %>%
filter(n > 50) %>%
inner_join(post_per_year) %>%
mutate(post_per_thousand_2019 = 1000* n / yearly_posts)
diff <- counts_2018 %>%
inner_join(counts_2019, by = c("token" = "token"))
diff <- diff %>%
mutate(difference = post_per_thousand_2019 - post_per_thousand_2018) %>%
select(token, difference) %>%
arrange(-difference)
```
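For a quick read on the result, a small chart of the biggest movers (a sketch built on the diff table above, which is already sorted by growth):
```{r}
# the 20 tokens with the largest growth in mentions per thousand posts
diff %>%
  head(20) %>%
  ggplot(aes(x = reorder(token, difference), y = difference)) +
  geom_col(fill = "navy") +
  coord_flip() +
  theme_minimal() +
  labs(x = "", y = "Change in mentions per thousand posts, 2018 to 2019",
       title = "Fastest-Growing Tokens on Reddit's Nutrition Forum")
```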
# Functions to plot trends over time
```{r}
word_over_time_month <- function(ingredient){
tokenized_df %>%
mutate(month = as.Date(cut(dt, breaks = "month"))) %>%
distinct() %>%
filter(word == ingredient) %>%
count(month) %>%
ggplot(aes(x = month, y = n)) +
geom_col()
}
word_over_time_year <- function(ingredient){
tokenized_df %>%
mutate(year = year(dt)) %>%
filter(word == ingredient) %>%
distinct() %>%
count(year) %>%
inner_join(post_per_year) %>%
mutate(posts_per_thousand = 1000 * n / yearly_posts)
}
bigram_year <- function(two_words){
bigram_df %>%
mutate(year = year(dt)) %>%
filter(bigram == two_words) %>%
distinct() %>%
count(year) %>%
inner_join(post_per_year) %>%
mutate(posts_per_thousand = 1000 * n / yearly_posts)
}
```
# Greek yogurt
```{r}
bigram_year("greek yogurt") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Greek Yogurt Mentions on Reddit's Nutrition Forum")
```
```{r}
bigram_year("plant based") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Plant Based' on Reddit's Nutrition Forum")
```
```{r}
word_over_time_year("potassium") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy")
```
```{r}
bigram_year("amino acids") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("healthy fats") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("green smoothie") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy")
```
```{r}
bigram_year("high protein") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy")
```
```{r}
word_over_time_year("chobani") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("avocado") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
avocado <- read_csv("~/Downloads/avo_trends.csv")
avocado_yearly <- avocado %>%
group_by(Month) %>%
summarise(yearly_google_index = mean(avocado)) %>%
rename(year = Month) # the date column was accidentally named "Month" when importing from Google Sheets; it actually holds the year
```
We have to create a separate dataset for the labels; this is an annoying step
```{r}
index_label_data <- word_over_time_year("avocado") %>%
inner_join(avocado_yearly, by = c("year" = "year")) %>%
mutate(reddit_index = posts_per_thousand / 3.021148,   # fixed scaling constants put both series on a comparable index
google_index = yearly_google_index / 31.9167) %>%
select(year, reddit_index, google_index) %>%
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart
arrange(year) %>%
filter(year == 2017)
```
```{r}
word_over_time_year("avocado") %>%
inner_join(avocado_yearly, by = c("year" = "year")) %>%
mutate(reddit_index = posts_per_thousand / 3.021148,
google_index = yearly_google_index / 31.9167) %>%
select(year, reddit_index, google_index) %>%
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart
arrange(year) %>%
ggplot(aes(x = year, y = index, group = type)) +
geom_line(aes(color = type), size = 1) +
geom_text(data = index_label_data, aes(label = type, vjust = -2)) +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
theme(legend.position = "none") +
labs(x = "", y = "index", title = "Avocados on Google and Reddit")
```
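As an aside, the separate label dataset can be avoided: a ggplot2 layer's data argument also accepts a function of the plot data, so the year == 2017 filter can live inside geom_text itself. A sketch of the same chart:
```{r}
word_over_time_year("avocado") %>%
  inner_join(avocado_yearly, by = c("year" = "year")) %>%
  mutate(reddit_index = posts_per_thousand / 3.021148,
         google_index = yearly_google_index / 31.9167) %>%
  select(year, reddit_index, google_index) %>%
  gather(key = "type", value = "index", 2:3) %>%
  ggplot(aes(x = year, y = index, group = type)) +
  geom_line(aes(color = type), size = 1) +
  # the layer data is computed from the plot data, so no separate label table is needed
  geom_text(data = function(d) dplyr::filter(d, year == 2017),
            aes(label = type, vjust = -2)) +
  scale_x_continuous(breaks = c(2010:2019)) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(x = "", y = "index", title = "Avocados on Google and Reddit")
```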
```{r}
word_over_time_year("avocado") %>%
inner_join(avocado_yearly, by = c("year" = "year")) %>%
mutate(reddit_index = posts_per_thousand / 3.021148,
google_index = yearly_google_index / 31.9167) %>%
select(year, reddit_index, google_index) %>%
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart
arrange(year) %>%
ggplot(aes(x = year, y = index, group = type)) +
geom_col(aes(fill = type)) +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
theme(legend.position = "none") +
facet_wrap(~type, scales = "free", ncol = 1) +
labs(x = "", y = "index", title = "Interest in Avocados on Google and Reddit's Nutrition Forum",
caption = "Based on Google Trends data and mentions of Avocado on reddit.com/r/nutrition")
```
### Nutrients
Here, combining sugar, salt, fat, and carbs
```{r}
sugar <- word_over_time_year("sugar") %>%
mutate(nutrient = "sugar")
salt <- word_over_time_year("salt") %>%
rbind(word_over_time_year("sodium")) %>%
group_by(year) %>%
summarise(n = sum(n), yearly_posts = mean(yearly_posts), posts_per_thousand = sum(posts_per_thousand)) %>%
mutate(nutrient = "salt")
fat <- word_over_time_year("fat") %>%
mutate(nutrient = "fat")
protein <- word_over_time_year("protein") %>%
mutate(nutrient = "protein")
carbs <- word_over_time_year("carbs") %>%
rbind(word_over_time_year("carbohydrates")) %>%
group_by(year) %>%
summarise(n = sum(n), yearly_posts = mean(yearly_posts), posts_per_thousand = sum(posts_per_thousand)) %>%
mutate(nutrient = "carbs")
fiber <- word_over_time_year("fiber") %>%
mutate(nutrient = "fiber")
nutrient_df <- rbind(sugar, salt, fat, protein, carbs, fiber)
```
```{r}
yr_2015_labels <- nutrient_df %>% filter(year == 2015)
nutrient_df %>%
filter(year > 2010) %>%
ggplot(aes(x = year, y = posts_per_thousand, group = nutrient)) +
geom_line(aes(color = nutrient), size = 1) +
geom_text_repel(data = nutrient_df %>% filter(year == 2019 |
year == 2011), aes(label = nutrient)) +
scale_x_continuous(breaks = c(2010:2019)) +
theme_fivethirtyeight()+
theme(legend.position = "none") +
labs(y = "Mentions per Thousand Posts", title = "The Most Commonly Mentioned Nutrients \non Reddit's Nutrition Forum")
```
```{r}
nutrient_df %>%
filter(year > 2010) %>%
ggplot(aes(x = year, y = posts_per_thousand, group = nutrient)) +
geom_col(aes(fill = nutrient), position = "dodge") +
geom_text(aes(label = nutrient))
```
```{r}
word_over_time_year("sugar") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) + theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Greek Yogurt Mentions on Reddit's Nutrition Forum")
```
# Gut health
```{r}
word_over_time_year("gut") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Gut Health' on Reddit's Nutrition Forum")
```
```{r}
word_over_time_year("amino") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("potassium") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("matcha") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("bone broth") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("turmeric") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("kombucha") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("vinegar") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Possibly a leading indicator relative to Google Trends; a rough way to check this follows the chart.
```{r}
word_over_time_year("fat") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
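One way to make the "leading indicator" idea concrete would be a lagged cross-correlation between the Reddit series and a Google Trends export. This is a sketch only: fat_trends.csv and its columns (date, fat_index) are hypothetical placeholders for a real Google Trends download.
```{r, eval=FALSE}
# hypothetical Google Trends export for "fat"; file name and columns are assumptions
fat_google <- read_csv("~/Downloads/fat_trends.csv") %>%
  group_by(year = year(date)) %>%
  summarise(google_index = mean(fat_index))

fat_joined <- word_over_time_year("fat") %>%
  inner_join(fat_google, by = "year")

# with only ~10 yearly points this is indicative at best; a peak at negative lags
# would hint that Reddit mentions lead Google interest
ccf(fat_joined$posts_per_thousand, fat_joined$google_index, lag.max = 3)
```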
```{r}
word_over_time_year("carbs") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Interesting
# Keto
```{r}
word_over_time_year("keto") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Keto' on Reddit's Nutrition Forum")
```
Interesting to compare against the broader market via Google Trends: a sign that keto is moving mass market. A sketch of that comparison follows.
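Sketch only: keto_trends.csv and its columns (date, keto_index) are hypothetical placeholders for a real Google Trends download, mirroring the avocado comparison above.
```{r, eval=FALSE}
# hypothetical Google Trends export for "keto"; file name and columns are assumptions
keto_google <- read_csv("~/Downloads/keto_trends.csv") %>%
  group_by(year = year(date)) %>%
  summarise(google_index = mean(keto_index))

word_over_time_year("keto") %>%
  inner_join(keto_google, by = "year") %>%
  select(year, reddit_index = posts_per_thousand, google_index) %>%
  gather(key = "type", value = "index", 2:3) %>%
  ggplot(aes(x = year, y = index, group = type)) +
  geom_col(aes(fill = type)) +
  scale_x_continuous(breaks = c(2010:2019)) +
  theme_minimal() +
  theme(legend.position = "none") +
  facet_wrap(~type, scales = "free", ncol = 1) +
  labs(x = "", y = "index",
       title = "Interest in Keto on Google and Reddit's Nutrition Forum")
```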
```{r}
bigram_year("chia seeds") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("gut health") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Gut Health' per Thousand Posts on Reddit's Nutrition Forum") +
theme_minimal()
```
# Kale
```{r}
word_over_time_year("kale") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Kale' on Reddit's Nutrition Forum")
```
```{r}
word_over_time_year("fiber") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("spinach") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("quinoa") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("cereal") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Amino acids
```{r}
word_over_time_year("amino") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("oatmeal") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Almond milk
```{r}
bigram_year("almond milk") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Gluten free
```{r}
bigram_year("gluten free") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
Buy buy buy!
```{r}
bigram_year("almond butter") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("protein shake") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("grass fed") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("intermittent fasting") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col(fill = "navy") +
scale_x_continuous(breaks = c(2010:2019)) +
theme_minimal() +
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Intermittent Fasting' on Reddit's Nutrition Forum")
```
```{r}
bigram_year("coconut oil") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
word_over_time_year("butter") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```
```{r}
bigram_year("bulletproof coffee") %>%
ggplot(aes(x = year, y = posts_per_thousand)) +
geom_col() +
scale_x_continuous(breaks = c(2010:2019))
```