In [None]:
library(tidyverse)
library(data.table)
library(dplyr)
library(stringr)
library(lubridate)
library(repr)

In [None]:
# Load the raw export from the working directory, reading it as a file with
# the UTF-8 encoding option set.
twitter_csv <- fread(file = "UBC-ISD.csv", encoding = "UTF-8")

In [None]:
# Keep only the columns whose names begin with "twitter".
twitter_data <- select(twitter_csv, starts_with("twitter"))

# Shorten the column names: remove the first "twitter.tweet/" match and then
# the first "twitter.user/" match from each name. The patterns are regular
# expressions, so the "." matches any single character.
colnames(twitter_data) <- colnames(twitter_data) |>
  str_replace("twitter.tweet/", "") |>
  str_replace("twitter.user/", "")

# Display the resulting column names as the cell output.
colnames(twitter_data)

In [None]:
# Reduce the data to the five columns of interest, in this order.
twitter_info <- select(
  twitter_data,
  "originalText",
  "text",
  "possiblySensitive",
  "screenName",
  "created"
)

In [None]:

# Collect every hashtag-like token found in `x`.
#
# A token is a "#" followed by one or more non-whitespace characters, so any
# punctuation attached to the tag (e.g. "#tag," or "#tag.") is part of the
# match.
#
# x: character vector of text to search.
# Returns: the matches from all elements of `x`, flattened into a single
#   vector by unlist().
extract_hashtags <- function(x) {
  unlist(str_extract_all(x, "#\\S+"))
}

In [None]:
# Extract the hashtags from each tweet and reshape to one row per
# (tweet, hashtag) pair.
#
# NOTE: this overwrites `twitter_info` in place. Re-running this cell without
# first rebuilding `twitter_info` would unnest the already-expanded rows again
# and duplicate hashtags.
twitter_info <- twitter_info |>
  # List-column holding the hashtags found in each row's `text`.
  mutate(hashtags = lapply(text, extract_hashtags)) |>
  # One row per hashtag. unnest() drops rows whose list element is empty
  # (keep_empty = FALSE is the default), so tweets without hashtags vanish
  # here without any extra filtering.
  unnest(hashtags) |>
  # Remove a single trailing period, e.g. "#COVID19." -> "#COVID19".
  # NOTE(review): other trailing punctuation ("," "!" ":") is left attached
  # and such tags are counted separately -- confirm whether that is intended.
  mutate(hashtags = str_replace_all(hashtags, "\\.$", "")) |>
  # A missing `text` yields an NA hashtag; drop those rows explicitly. Every
  # real match starts with "#", so this is the only thing the previous
  # comparison against the literal string "character(0)" ever removed.
  filter(!is.na(hashtags))

In [None]:
# Tally how many rows carry each hashtag and order the tally from the most
# to the least frequent.
cleaned_hashtags <- twitter_info |>
  group_by(hashtags) |>
  summarize(count = n()) |>
  arrange(desc(count))

# Persist the tally. write.csv() keeps its default row.names = TRUE, so the
# file starts with an unnamed column of row numbers.
write.csv(cleaned_hashtags, file = "cleaned_hashtags.csv")


In [None]:
# Daily usage counts for the four hashtags that are plotted.
#
# Timestamps are truncated with floor_date() so that every tweet is counted on
# the calendar day it was posted. round_date() rounds to the *nearest* day,
# which moves anything posted after noon onto the following day and skews the
# daily totals.
#
# Assumes `created` is already a date-time (POSIXct) column -- TODO confirm
# against the fread() result; the plot's scale_x_datetime() relies on it too.
twitter_time_adjusted <- twitter_info |>
  filter(
    hashtags %in% c("#COVID19", "#COVID19ON", "#vhcON", "#COVID19Vaccine")
  ) |>
  mutate(created = floor_date(created, unit = "day")) |>
  group_by(created, hashtags) |>
  # .groups = "drop" returns an ungrouped result (no separate ungroup() needed)
  # and silences summarise()'s regrouping message.
  summarise(freq = n(), .groups = "drop")


In [None]:

# Size used by the notebook (repr) when rendering plots inline.
options(repr.plot.width = 12, repr.plot.height = 6)

# Daily frequency of each selected hashtag: faint points for the raw daily
# counts plus a tightly-fitted loess curve per hashtag.
graph <- twitter_time_adjusted |>
  ggplot(aes(x = created, y = freq, color = hashtags)) +
  geom_point(alpha = 0.2) +
  # `size` controls the line width here. ggplot2 3.4.0+ deprecates it for
  # lines in favour of `linewidth`; it is kept so older versions still work.
  geom_smooth(
    aes(group = hashtags),
    span = 0.05,
    method = "loess",
    se = FALSE,
    size = 0.5
  ) +
  scale_x_datetime(date_breaks = "1 month", date_labels = "%b %Y") +
  # name = "" blanks the legend title. The scale name takes precedence over
  # labs(), so no separate `color` label is needed there.
  scale_color_manual(
    name = "",
    values = c(
      "#COVID19" = "#ca2c2c",
      "#COVID19ON" = "#da69be",
      "#COVID19Vaccine" = "#24bf8e",
      "#vhcON" = "#3867e9"
    )
  ) +
  labs(
    title = "Usage of the Four Most Popular Covid-Related Hashtags",
    subtitle = "Drawn from UBC-ISD Dataset",
    y = "Frequency (# of Tweets)",
    x = "Date"
  ) +
  # theme_bw() is a complete theme and replaces every theme setting made
  # before it, so it must come first and the tweaks must follow it. In the
  # reverse order the aspect ratio and rotated x-axis labels are discarded.
  theme_bw() +
  theme(
    aspect.ratio = 1 / 2,
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

graph

# Pass the plot explicitly rather than relying on last_plot().
ggsave("Hashtag_Usage.png", plot = graph, width = 10, height = 5)

