Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
630 lines (511 sloc)
16.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "R Notebook" | |
output: html_notebook | |
--- | |
```{r} | |
library(tidyverse) | |
library(tidytext) | |
library(lubridate) | |
library(ggrepel) | |
library(ggthemes) | |
``` | |
Raw data, email joe at residualthoughts dot com if you'd like me to share it with you; otherwise you can pull reddit data through the python api | |
```{r} | |
df <- read_csv("~/downloads/reddit_nutrition_submissions.csv") | |
``` | |
```{r} | |
# create date | |
df <- df %>% mutate(dt = as.Date(created_date)) | |
``` | |
Custom stop words filter out some words we don't want to analyse that appear often in reddit text (imgur.com, for example) | |
```{r} | |
custom_stop_words <- tibble(word = c("na", "drink", "cocktail", "recipe", "shake", "ice", "1", "2", "oz", "ounces", "http","https", "cocktails", "recipes", "amp", "4", "favorite", "love", "removed", "imgur", "i.imgur", "NA", "keywords", "na", "utf8", "amp", "qid", "sr")) | |
``` | |
Creating a combined text column with both heading and body text | |
```{r} | |
df <- df %>% mutate(combined_text = paste0(title," ", text)) | |
``` | |
Tokenizing datasets | |
```{r} | |
tokenized_df <- df %>% | |
select(post_id, combined_text, dt, score, numcomms) %>% | |
unnest_tokens(word, combined_text) | |
tokenized_df <- tokenized_df %>% | |
anti_join(stop_words) %>% | |
anti_join(custom_stop_words) | |
bigram_df <- df %>% | |
select(post_id, combined_text, dt, score, numcomms) %>% | |
unnest_tokens(bigram, combined_text, "ngrams", n = 2) | |
``` | |
posts per year | |
```{r} | |
post_per_year <- df %>% count(year = year(dt)) %>% rename(yearly_posts = n) | |
``` | |
# Visualizing bigrams | |
```{r} | |
library(ggplot2) | |
library(igraph) | |
library(ggraph) | |
count_bigrams <- function(dataset) { | |
dataset %>% | |
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% | |
separate(bigram, c("word1", "word2"), sep = " ") %>% | |
filter(!word1 %in% stop_words$word, | |
!word2 %in% stop_words$word, | |
!word1 %in% custom_stop_words$word, | |
!word2 %in% custom_stop_words$word) %>% | |
count(word1, word2, sort = TRUE) | |
} | |
visualize_bigrams <- function(bigrams, min_number = 200) { | |
set.seed(2016) | |
a <- grid::arrow(type = "closed", length = unit(.15, "inches")) | |
bigrams %>% | |
filter(n > min_number) %>% | |
graph_from_data_frame() %>% | |
ggraph(layout = "fr") + | |
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) + | |
geom_node_point(color = "lightblue", size = 5) + | |
geom_node_text(aes(label = name), vjust = 1, hjust = 1) + | |
theme_void() | |
} | |
``` | |
```{r} | |
bg <- count_bigrams(df) | |
bg %>% | |
filter(!is.na(word1), | |
!is.na(word2)) %>% | |
visualize_bigrams() | |
``` | |
# visualize only recent bigrams | |
Showing bigram chart for 2018 and 2019 to see newer trends | |
```{r} | |
bg_recent <- df %>% | |
filter(year(dt) == 2019 | | |
year(dt) == 2018) %>% | |
count_bigrams() | |
bg_recent %>% | |
filter(!is.na(word1), | |
!is.na(word2)) %>% | |
visualize_bigrams(min_number = 100) | |
``` | |
See the most commonly used words by year. For nutrition, this isn't very insightful, as the top words are mostly the same each year | |
```{r} | |
tokenized_df %>% | |
count(word, year = year(dt), sort = T) %>% | |
filter(n > 1400) %>% | |
ggplot(aes(x = reorder(word, n), y = n)) + | |
geom_col() + | |
coord_flip() + | |
facet_wrap(~year, scales = "free") | |
``` | |
Here, I'm going to look at the words with the most growth from 2018 to 2019. | |
```{r} | |
# combining bigrams and single words | |
bigram_and_word <- bigram_df %>% | |
rename(word = bigram) %>% | |
rbind(tokenized_df) %>% | |
rename(token = word) | |
# only recent posts to shrink it | |
bigram_and_word <- bigram_and_word %>% | |
filter(dt > '2017-12-31') | |
# getting top words from each | |
counts_2018 <- bigram_and_word %>% | |
filter(year(dt) == 2018) %>% | |
count(token, year = year(dt), sort = T) %>% | |
filter(n > 100) %>% | |
inner_join(post_per_year) %>% | |
mutate(post_per_thousand_2018 = 1000* n / yearly_posts) | |
counts_2019 <- bigram_and_word %>% | |
filter(year(dt) == 2019) %>% | |
count(token, year = year(dt), sort = T) %>% | |
filter(n > 50) %>% | |
inner_join(post_per_year) %>% | |
mutate(post_per_thousand_2019 = 1000* n / yearly_posts) | |
diff <- counts_2018 %>% | |
inner_join(counts_2019, by = c("token" = "token")) | |
diff <- diff %>% | |
mutate(difference = post_per_thousand_2019 - post_per_thousand_2018) %>% | |
select(token, difference) %>% | |
arrange(-difference) | |
``` | |
# Functions to plot trends over time | |
```{r} | |
word_over_time_month <- function(ingredient){ | |
tokenized_df %>% | |
mutate(month = as.Date(cut(tokenized_df$dt, breaks = "month"))) %>% | |
distinct() %>% | |
filter(word == ingredient) %>% | |
count(month) %>% | |
ggplot(aes(x = month, y = n)) + | |
geom_col() | |
} | |
word_over_time_year <- function(ingredient){ | |
tokenized_df %>% | |
mutate(year = year(dt)) %>% | |
filter(word == ingredient) %>% | |
distinct() %>% | |
count(year) %>% | |
inner_join(post_per_year) %>% | |
mutate(posts_per_thousand = 1000 * n / yearly_posts) | |
} | |
bigram_year <- function(two_words){ | |
bigram_df %>% | |
mutate(year = year(dt)) %>% | |
filter(bigram == two_words) %>% | |
distinct() %>% | |
count(year) %>% | |
inner_join(post_per_year) %>% | |
mutate(posts_per_thousand = 1000 * n / yearly_posts) | |
} | |
``` | |
# Greek yogurt | |
```{r} | |
bigram_year("greek yogurt") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Greek Yogurt Mentions on Reddit's Nutrition Forum") | |
``` | |
```{r} | |
bigram_year("plant based") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Plant Based' on Reddit's Nutrition Forum") | |
``` | |
```{r} | |
word_over_time_year("potassium") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") | |
``` | |
```{r} | |
bigram_year("amino acids") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("healthy fats") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("green smoothie") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") | |
``` | |
```{r} | |
bigram_year("high protein") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") | |
``` | |
```{r} | |
word_over_time_year("chobani") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("avocado") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
avocado <- read_csv("~/Downloads/avo_trends.csv") | |
avocado_yearly <- avocado %>% | |
group_by(Month) %>% | |
summarise(yearly_google_index = mean(avocado)) %>% | |
rename(year = Month) # i had accidentally named the date month when importing google sheet | |
``` | |
Have to create dataset for labels, this is an annoying step | |
```{r} | |
index_label_data <- word_over_time_year("avocado") %>% | |
inner_join(avocado_yearly, by = c("year" = "year")) %>% | |
mutate(reddit_index = posts_per_thousand / 3.021148, | |
google_index = yearly_google_index / 31.9167) %>% | |
select(year, reddit_index, google_index) %>% | |
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart | |
arrange(year) %>% | |
filter(year == 2017) | |
``` | |
```{r} | |
word_over_time_year("avocado") %>% | |
inner_join(avocado_yearly, by = c("year" = "year")) %>% | |
mutate(reddit_index = posts_per_thousand / 3.021148, | |
google_index = yearly_google_index / 31.9167) %>% | |
select(year, reddit_index, google_index) %>% | |
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart | |
arrange(year) %>% | |
ggplot(aes(x = year, y = index, group = type)) + | |
geom_line(aes(color = type), size = 1) + | |
geom_text(data = index_label_data, aes(label = type, vjust = -2)) + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
theme(legend.position = "none") + | |
labs(x = "", y = "index", title = "Avocados on Google and Reddit") | |
``` | |
```{r} | |
word_over_time_year("avocado") %>% | |
inner_join(avocado_yearly, by = c("year" = "year")) %>% | |
mutate(reddit_index = posts_per_thousand / 3.021148, | |
google_index = yearly_google_index / 31.9167) %>% | |
select(year, reddit_index, google_index) %>% | |
gather(key = "type", value = "index", 2:3) %>% # have to reshape data to plot them together on one chart | |
arrange(year) %>% | |
ggplot(aes(x = year, y = index, group = type)) + | |
geom_col(aes(fill = type)) + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
theme(legend.position = "none") + | |
facet_wrap(~type, scales = "free", ncol = 1) + | |
labs(x = "", y = "index", title = "Interest in Avocados on Google and Reddit's Nutrition Forum", | |
caption = "Based on Google Trends data and mentions of Avocado on reddit.com/r/nutrition") | |
``` | |
### Nutrients | |
Here, combining sugar, salt, fat, and carbs | |
```{r} | |
sugar <- word_over_time_year("sugar") %>% | |
mutate(nutrient = "sugar") | |
salt <- word_over_time_year("salt") %>% | |
rbind(word_over_time_year("sodium")) %>% | |
group_by(year) %>% | |
summarise(n = sum(n), yearly_posts = mean(yearly_posts), posts_per_thousand = sum(posts_per_thousand)) %>% | |
mutate(nutrient = "salt") | |
fat <- word_over_time_year("fat") %>% | |
mutate(nutrient = "fat") | |
protein <- word_over_time_year("protein") %>% | |
mutate(nutrient = "protein") | |
carbs <- word_over_time_year("carbs") %>% | |
rbind(word_over_time_year("carbohydrates")) %>% | |
group_by(year) %>% | |
summarise(n = sum(n), yearly_posts = mean(yearly_posts), posts_per_thousand = sum(posts_per_thousand)) %>% | |
mutate(nutrient = "carbs") | |
fiber <- word_over_time_year("fiber") %>% | |
mutate(nutrient = "fiber") | |
nutrient_df <- rbind(sugar, salt, fat, protein, carbs, fiber) | |
``` | |
```{r} | |
yr_2015_labels <- nutrient_df %>% filter(year == 2015) | |
nutrient_df %>% | |
filter(year > 2010) %>% | |
ggplot(aes(x = year, y = posts_per_thousand, group = nutrient)) + | |
geom_line(aes(color = nutrient), size = 1) + | |
geom_text_repel(data = nutrient_df %>% filter(year == 2019 | | |
year == 2011), aes(label = nutrient)) + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_fivethirtyeight()+ | |
theme(legend.position = "none") + | |
labs(y = "Mentions per Thousand Posts", title = "The Most Commonly Mentioned Nutrients \non Reddit's Nutrition Forum") | |
``` | |
```{r} | |
nutrient_df %>% | |
filter(year > 2010) %>% | |
ggplot(aes(x = year, y = posts_per_thousand, group = nutrient)) + | |
geom_col(aes(fill = nutrient), position = "dodge") + | |
geom_text(aes(label = nutrient)) | |
``` | |
```{r} | |
word_over_time_year("sugar") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Greek Yogurt Mentions on Reddit's Nutrition Forum") | |
``` | |
# gut health | |
```{r} | |
word_over_time_year("gut") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Gut Health' on Reddit's Nutrition Forum") | |
``` | |
```{r} | |
word_over_time_year("amino") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("potassium") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("matcha") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("bone broth") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("turmeric") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("kombucha") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("vinegar") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
Possible leading indicator to google trends | |
```{r} | |
word_over_time_year("fat") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("carbs") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
Interesting | |
# Keto | |
```{r} | |
word_over_time_year("keto") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Keto' on Reddit's Nutrition Forum") | |
``` | |
Interesting to compare to broader market /google trends --> moving mass market | |
```{r} | |
bigram_year("chia seeds") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("gut health") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Gut Health' per Thousand Posts on Reddit's Nutrition Forum") + | |
theme_minimal() | |
``` | |
# kale | |
```{r} | |
word_over_time_year("kale") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Kale' on Reddit's Nutrition Forum") | |
``` | |
```{r} | |
word_over_time_year("fiber") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("spinach") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("quinoa") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("cereal") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
Amino acids | |
```{r} | |
word_over_time_year("amino") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("oatmeal") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
kalita wave | |
```{r} | |
bigram_year("almond milk") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
What's a steam wand | |
```{r} | |
bigram_year("gluten free") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
Buy buy buy! | |
```{r} | |
bigram_year("almond butter") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("protein shake") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("grass fed") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("intermittent fasting") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col(fill = "navy") + | |
scale_x_continuous(breaks = c(2010:2019)) + | |
theme_minimal() + | |
labs(x = "", y = "Mentions per Thousand Posts", title = "Mentions of 'Intermittent Fasting' on Reddit's Nutrition Forum") | |
``` | |
```{r} | |
bigram_year("coconut oil") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
word_over_time_year("butter") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |
```{r} | |
bigram_year("bulletproof coffee") %>% | |
ggplot(aes(x = year, y = posts_per_thousand)) + | |
geom_col() + | |
scale_x_continuous(breaks = c(2010:2019)) | |
``` | |