In [1]:
library(tidyverse)
library(data.table)
library(dplyr)

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.5
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.0
[32mv[39m [34mggplot2  [39m 3.5.1     [32mv[39m [34mtibble   [39m 3.2.1
[32mv[39m [34mlubridate[39m 1.9.3     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.0.2     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: 'data.table'


The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    y

In [2]:
UBC_ISD <- fread("UBC-ISD.csv", encoding = "UTF-8") #loading in dataset

In [3]:
options(scipen=999) #getting rid of scientific notation

**NFRF username IDs** 

Because elon musk got rid of academic API access I couldn't directly retrieve the twitter IDs of the health communicators. Instead I pulled the IDS from the ISD dataset from tweets made by those health communicators and then manually searched for the ids of and appended usernames which were missing from the dataset.

These are used later on in the NFRF analysis. 

In [4]:
usernames_NFRF <- c("adriandix", "AlikaMD", "AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", 
                    "AntibioticDoc", "asapscience", "ASPphysician", "atRachelGilmore", "binhanv", 
                    "BirinderNarang", "blackdocscanada", "BogochIsaac", "bornk", "carlyweeks", 
                    "CaulfieldTim", "CDCofBC", "cdube_sante", "cfpcceo", "ChiefSciCan", "cmcovidtf", 
                    "CMOH_Alberta", "CMOH_NL", "conquercovid19", "COVID_19_Canada", "COVIDSciOntario", 
                    "CPHO_Canada", "ctouzin", "CTV_AvisFavaro", "DeNovo_Fatima", "deonandan", "DFisman", 
                    "Dr_ChrisSimpson", "drgigiosler", "DrKaliBarrett", "drmwarner", "drsusanshaw", 
                    "DrVivianS", "egpayne", "epdevilla", "ErnieHudsonPEI", "everetthindley", "First10EM", 
                    "GermHunterMD", "glenpyle", "heysciencesam", "hgagneTVA", "IDEpiPhD", "imgrund", 
                    "iPreetBrar", "IrfanDhalla", "j_mcelroy", "jasonfherring", "jfrketich", "jkwan_md", 
                    "Johnrockdoc", "JuliaWongCBC", "juliegreenMLA", "Justin_Ling", "jyangstar", "KashPrime", 
                    "KatharineSmart", "Kevin__Parent", "KindrachukJason", "KrishanaSankar", "kwadwo777", 
                    "LaurenPelley", "LisaBarrettID", "McGillOSS", "MerrimanPaul", "MichaelSchwandt", 
                    "MLAStefanson", "moirawyton", "MPaiMD", "NaheedD", "NathanStall", 
                    "NightShiftMD", "NoLore", "OttawaHealth", "paimadhu", "PattyHajdu", "picardonhealth", 
                    "RicharLisa", "roussin_brent", "sabaeitizaz", "sabiVM", "SammyG_MD", "sarperotto", 
                    "SciChefCan", "sciencemonkeyca", "ScienceUpFirst", "sdbaral", "shandro", "SharkawyMD", 
                    "shazmamithani", "ShephardDorothy", "srinmurthy99", "SteiniBrown", "theresaboyle", 
                    "thisisourshotca", "TorontoIDDOC", "UbakaOgbogu", "VaxHuntersCan", "VeraEtches", 
                    "VikCBC", "wickdchiq", "zachchurchill", "zchagla", "DrKathleenRoss1", "twpiggott")

In [5]:
UBC_ISD_conversion <-  UBC_ISD |>
select('twitter.user/screenName', 'twitter.user/id')|>
rename(username='twitter.user/screenName', id = 'twitter.user/id')|> #renaming the columns to something more legible
filter(username %in% usernames_NFRF)|> #filitering for tweets of authors of tweets matching the list of ISD health officials
distinct()|> #removing duplicates (to just get a dataset containing each author and their respective ID)
arrange(username)

UBC_ISD_conversion$id <- as.character(UBC_ISD_conversion$id) #convering column to character vector (makes operations easier later on)


In [7]:
missing_ids <- data.frame(username=c('asapscience','ErnieHudsonPEI','MerrimanPaul','sabiVM', 'TorontoIDDOC'),
                            id=c(592912724, 1138505723623743488, 529683297, 334856992, 4866713721)) #usernames had no tweets in the ISD dataset and had to be manually added in
missing_ids$id <- as.character(missing_ids$id)
NFRF_ID_dataset <- rbind(missing_ids, UBC_ISD_conversion)
arrange(NFRF_ID_dataset, username)|>
select(id)|>
distinct()

write_csv(NFRF_ID_dataset, "nfrf_IDs.csv")

id
<chr>
1345345074
188603208
1362794760
1104229062
1107178256
237176284
1093695069176000512
141379865
1239656978495856641
1254743775357829122


**ISD analysis**

In [None]:
usernames_ISD <- c("adriandix", "AlikaMD", "AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", 
           "AntibioticDoc", "asapscience", "ASPphysician", "binhanv", 
           "blackdocscanada", "BogochIsaac", "bornk", "carlyweeks", 
           "CaulfieldTim", "CDCofBC", "cdube_sante", "ChiefSciCan", "cmcovidtf", 
           "CMOH_Alberta", "CMOH_NL", "conquercovid19", "COVID_19_Canada", "COVIDSciOntario", 
           "CPHO_Canada", "ctouzin", "CTV_AvisFavaro", "DeNovo_Fatima", "deonandan", "DFisman", 
           "Dr_ChrisSimpson", "drgigiosler", "DrKaliBarrett", "drmwarner", "drsusanshaw", 
           "DrVivianS", "egpayne", "epdevilla", "ErnieHudsonPEI", "everetthindley", "First10EM", 
           "GermHunterMD", "heysciencesam", "hgagneTVA", "IDEpiPhD", "imgrund", 
           "iPreetBrar", "IrfanDhalla", "j_mcelroy", "jasonfherring", "jfrketich", "jkwan_md", 
           "Johnrockdoc", "juliegreenMLA", "jyangstar", "KashPrime", 
            "Kevin__Parent", "KindrachukJason", "KrishanaSankar", "kwadwo777", 
           "LaurenPelley", "LisaBarrettID", "McGillOSS", "MerrimanPaul",
           "MLAStefanson", "MPaiMD", "NaheedD", "NathanStall", 
           "NightShiftMD", "NoLore", "OttawaHealth", "paimadhu", "PattyHajdu", "picardonhealth", 
           "RicharLisa", "roussin_brent", "SammyG_MD",
           "SciChefCan", "ScienceUpFirst", "sdbaral", "shandro", "SharkawyMD", 
           "shazmamithani", "ShephardDorothy", "srinmurthy99", "SteiniBrown", "theresaboyle", 
           "thisisourshotca", "TorontoIDDOC", "UbakaOgbogu", "VaxHuntersCan", "VeraEtches", 
           "VikCBC", "zachchurchill", "zchagla", "strangrobert") #creating a dataframe containing all the usernames in ISD set

In [None]:
terms <- c('fuck', 'dirty', 'stupid', 'idiot', 'asshole', 'bastard', 'nasty', 'whore', 'slut', 'skank', 'cunt', 'bitch', 'motherfucker', 'go to hell', 'wimp', 'coward', 'moron', 'traitor', 'shut up', 'shut the fuck up', 'stfu', 'piece of shit', 'ignorant', 'clown', 'ccpvirus', 'chinese coronavirus', 'chinavirus', 'china virus', 'fuckchina', 'nukechina', 'bombchina', 'deathtochina', 'chinesevirus', 'fake lab wuhan', 'china weapon', 'china bioweapon', 'zionazi', 'Soros virus', 'blood libel', 'nwovirus', 'new world order', 'sacrifice children', 'jew world order', 'liar', 'yellow star', 'israel virus', 'israelvirus', 'rothschild covid', 'kalergi plan', 'jail', 'locked up', 'evil.', 'is evil', 'so evil', 'pure evil', 'an evil', 'evil wom', 'evil man', 'evil witch', 'is evil', 'are evil', 'that evil', 'very evil', 'tyrant', 'tyranny', 'dictator', 'tv doctor', 'twitter doctor', 'murder', 'murderer', 'eugenic', 'satan', 'Nazi', 'non-human', 'sterile', 'slave', 'crooked', 'corrupt', 'shill', 'liar', 'lying', 'killer', 'fraud', 'big pharma', 'pharma puppet', 'Nuremberg', 'crimes against humanity', 'genocide', 'child abuse', 'covid hoax', 'covid fake', 'virus hoax', 'gates virus', 'bioweapon', 'secret elite', 'depopulation', 'save our children')
terms <- paste0(" ", terms)
pattern <- paste(terms, collapse = "|")

In [None]:
# Returns a dataframe that for each tweet, contains the number of likes, followers, and creation of tweet authors account. 

UBC_ISD_text_likes <- UBC_ISD |>
select("retweet.twitter.tweet/text", "twitter.user/description", "twitter.user/createdAt", "twitter.tweet/text",'twitter.user/followersCount', 'twitter.user/screenName', 'twitter.tweet/favouriteCount')|>
rename(text="twitter.tweet/text", followers='twitter.user/followersCount', username='twitter.user/screenName', likes= 'twitter.tweet/favouriteCount',
        description="twitter.user/description", created_at="twitter.user/createdAt")|>
  mutate(`retweet.twitter.tweet/text` = na_if(`retweet.twitter.tweet/text`, "")) |>
  mutate(not_retweet = ifelse(is.na(`retweet.twitter.tweet/text`), 1, 0))|> #adding a column noting if a tweet is a retweet or not
  select(text, followers, username, likes, not_retweet, description, created_at)|>
filter(not_retweet == 1)|> #removing RTs
  select(!not_retweet)|>
  mutate(negative = as.integer(grepl(pattern, text, ignore.case = TRUE)))|>
  filter(negative==1)
  

preview <- head(UBC_ISD_text_likes, 5)
preview

Creating the full ISD information dataset:

In [None]:
#creating username column
ISD_n_users <-  UBC_ISD_text_likes|>
select(username)|>
distinct()|>
nrow()
ISD_n_users 


#creating column containing total number of toxic tweets for each user
UBC_ISD_text_negative_totals <- UBC_ISD_text_likes|>
select(negative, username)|>
group_by(username)|>
summarize(negative=sum(negative))|>
select(!username)


#creating columns containing average follower, like, maximum follower counts for each user
    # follower counts are taken at the time of each tweet's posting so differ for all tweets made by a certain author
    # max_followers is the maximum number of followers ever attained by a tweet author
    # mean_followers is the average number of followers by an author across all tweets
UBC_ISD_text_follower_average <- UBC_ISD_text_likes|>
select(followers, username, likes)|>
group_by(username)|>
mutate(mean_followers=mean(followers))|>
mutate(max_followers=max(followers))|>
mutate(mean_likes=mean(likes))|> #gives the average number of likes attained by an author
mutate(sum_likes=sum(likes))|> #gives the total amount of likes, added up, for each author
select(username, max_followers, mean_followers, mean_likes, sum_likes)|>
distinct()|>
arrange(username)|>
select(!username)

# creates a column containing the twitter "about"/description sections for each tweet author. Descriptions are taken at time of posting so vary for authors.
# each description for each author is placed in a [].
UBC_ISD_text_description <- UBC_ISD_text_likes|>
select(description, username)|>
group_by(username)|>
distinct()|>
  summarize(description = paste0("[", paste(description, collapse = "]["), "]"))|>
  mutate(description = ifelse(description == "[]", "", description))|> #gets rid of empty descriptions
  arrange(username)|>
  select(!username)

#creating created_at column (self-explanatory)
UBC_ISD_text_created_at <- UBC_ISD_text_likes|>
select(created_at, username)|>
distinct()|>
group_by(username) |>
summarise(created_at = min(created_at))|> #for some reason, three usernames had multiple creation dates? They aren't a part of the top 50 so are excluded in the end result, but perhaps due to being banned/unbanned?
select(!username)


In [None]:
#putting everything into one big dataset!

ISD_user_values <- cbind(UBC_ISD_text_follower_average, UBC_ISD_text_description, UBC_ISD_text_created_at, UBC_ISD_text_negative_totals)|>
arrange(desc(negative))|>
mutate(is_health_official = username %in% usernames_ISD)|>
filter(negative>=50)


preview <- head(ISD_user_values, 5)
preview

In [None]:
ISD_values_non_HC <- ISD_user_values |>
filter(is_health_official == FALSE)|>
select(!is_health_official) #creating separate df for non-health communicators

ISD_values_HC <- ISD_user_values |>
filter(is_health_official == TRUE)|>
select(!is_health_official) #creating separate df for health communicators

In [None]:
write_csv(ISD_user_values, "authors_of_toxic_tweets_info_ISD.csv")
write_csv(ISD_values_non_HC, "authors_of_toxic_tweets_info_ISD_non_HC.csv")
write_csv(ISD_values_HC, "authors_of_toxic_tweets_info_ISD_HC_only.csv")

In [None]:
# what % of users in the ISD_user_values were health communicators?

percentage_HC <- nrow(ISD_values_HC)/nrow(ISD_user_values)*100
percentage_HC 

Creating a dataframe containing all toxic tweets by health communicators.

In [None]:
UBC_ISD_by_HC <- UBC_ISD|>
select("twitter.tweet/text", "retweet.twitter.tweet/text", 'twitter.user/screenName')|>
rename(text = `twitter.tweet/text`, username='twitter.user/screenName') |>
  mutate(`retweet.twitter.tweet/text` = na_if(`retweet.twitter.tweet/text`, "")) |>
  mutate(not_retweet = ifelse(is.na(`retweet.twitter.tweet/text`), 1, 0))|> #removing RTs
  select(text, username, not_retweet)|>
  filter(not_retweet == 1)|>
  select(!not_retweet)|>
  filter(username %in% usernames_ISD)|>
  mutate(negative = as.integer(grepl(pattern, text, ignore.case = TRUE)))|>
  filter(negative==1)
  

In [None]:
write_csv(UBC_ISD_by_HC, "toxic_tweets_BY_hc.csv")

Creating a second dataset containing the aggregate totals of toxic tweets for each health communicator.

In [None]:
UBC_ISD_by_HC_counts <-UBC_ISD_by_HC|>
select(username, negative)|>
group_by(username)|>
summarize(count_toxicity = sum(negative))|>
arrange(desc(count_toxicity))


In [None]:
write_csv(UBC_ISD_by_HC_counts, "toxic_tweets_BY_hc_counts.csv")

Lastly, I'm creating a new dataset containing percentages and total sums of toxic tweets AT health communicators.

I use a for-loop that iterates over each user in the list of usernames and performs various calculations. It's fairly slow and has a runtime of ~20 minutes. there is probably a faster way of doing this, but I couldn't figure it out. 

In [None]:
UBC_ISD_text <- UBC_ISD |>
select("retweet.twitter.tweet/text", "twitter.tweet/text")|>
rename(text="twitter.tweet/text")|>
mutate(`retweet.twitter.tweet/text` = na_if(`retweet.twitter.tweet/text`, "")) |>
mutate(not_retweet = ifelse(is.na(`retweet.twitter.tweet/text`), 1, 0))|>
filter(not_retweet == 1)|>
select(!`retweet.twitter.tweet/text`, !not_retweet)#removing RTs

In [None]:

# Creating an empty dataframe to store the results
negative_proportions_df <- data.frame(user = character(), total_negative = integer(), percent_negative = numeric(), stringsAsFactors = FALSE)

# Filtering tweets for each user and adding columns for hate speech: value of 1 if row contains hate speech, 0 if not
filter_tweets <- function(user) {

  filtered_data <- UBC_ISD_text |>
    filter(grepl(user, text, ignore.case = TRUE)) |>
    mutate(negative = as.integer(grepl(pattern, text, ignore.case = TRUE)))

  total_user_tweets <- nrow(filtered_data)
  negative_entries <- nrow(filtered_data |> filter(negative == 1))

  percent_negative <- if (total_user_tweets > 0) (negative_entries / total_user_tweets) * 100 else 0

  return(data.frame(user = user, total_negative = negative_entries, total_tweets=total_user_tweets, percent_negative = percent_negative))
}

# Looping through each user and storing the results in the dataframe
for (user in usernames_ISD) {
  user_data <- filter_tweets(user)
  negative_proportions_df <- rbind(negative_proportions_df, user_data)
}

negative_proportions_df 


In [None]:
write_csv(negative_proportions_df, "total_tweets_total_negative_pct_negative_ISD.csv")

**NFRF analysis continued**

In [81]:

adriandix <- read_csv("csvs/twitterpho/@adriandix_tweets.csv")
AlikaMD <- read_csv("csvs/twitterpho/@AlikaMD_tweets.csv")
AmyGreerKalisz <- read_csv("csvs/twitterpho/@AmyGreerKalisz_tweets.csv")
angie_rasmussen <- read_csv("csvs/twitterpho/@angie_rasmussen_tweets.csv")
AnnaBlakney <- read_csv("csvs/twitterpho/@AnnaBlakney_tweets.csv")
AntibioticDoc <- read_csv("csvs/twitterpho/@AntibioticDoc_tweets.csv")
asapscience <- read_csv("csvs/twitterpho/@asapscience_tweets.csv")
ASPphysician <- read_csv("csvs/twitterpho/@ASPphysician_tweets.csv")
atRachelGilmore <- read_csv("csvs/twitterpho/@atRachelGilmore_tweets.csv")
binhanv <- read_csv("csvs/twitterpho/@binhanv_tweets.csv")
BirinderNarang <- read_csv("csvs/twitterpho/@BirinderNarang_tweets.csv")
blackdocscanada <- read_csv("csvs/twitterpho/@blackdocscanada_tweets.csv")
BogochIsaac <- read_csv("csvs/twitterpho/@BogochIsaac_tweets.csv")
bornk <- read_csv("csvs/twitterpho/@bornk_tweets.csv")
carlyweeks <- read_csv("csvs/twitterpho/@carlyweeks_tweets.csv")
CaulfieldTim <- read_csv("csvs/twitterpho/@CaulfieldTim_tweets.csv")
CDCofBC <- read_csv("csvs/twitterpho/@CDCofBC_tweets.csv")
cdube_sante <- read_csv("csvs/twitterpho/@cdube_sante_tweets.csv")
cfpcceo <- read_csv("csvs/twitterpho/@cfpcceo_tweets.csv")
ChiefSciCan <- read_csv("csvs/twitterpho/@ChiefSciCan_tweets.csv")
cmcovidtf <- read_csv("csvs/twitterpho/@cmcovidtf_tweets.csv")
CMOH_Alberta <- read_csv("csvs/twitterpho/@CMOH_Alberta_tweets.csv")
CMOH_NL <- read_csv("csvs/twitterpho/@CMOH_NL_tweets.csv")
conquercovid19 <- read_csv("csvs/twitterpho/@conquercovid19_tweets.csv")
COVID_19_Canada <- read_csv("csvs/twitterpho/@COVID_19_Canada_tweets.csv")
COVIDSciOntario <- read_csv("csvs/twitterpho/@COVIDSciOntario_tweets.csv")
CPHO_Canada <- read_csv("csvs/twitterpho/@CPHO_Canada_tweets.csv")
ctouzin <- read_csv("csvs/twitterpho/@ctouzin_tweets.csv")
CTV_AvisFavaro <- read_csv("csvs/twitterpho/@CTV_AvisFavaro_tweets.csv")
DeNovo_Fatima <- read_csv("csvs/twitterpho/@DeNovo_Fatima_tweets.csv")
deonandan <- read_csv("csvs/twitterpho/@deonandan_tweets.csv")
DFisman <- read_csv("csvs/twitterpho/@drfisman_tweets.csv")
Dr_ChrisSimpson <- read_csv("csvs/twitterpho/@Dr_ChrisSimpson_tweets.csv")
drgigiosler <- read_csv("csvs/twitterpho/@drgigiosler_tweets.csv")
DrKaliBarrett <- read_csv("csvs/twitterpho/@DrKaliBarrett_tweets.csv")
drmwarner <- read_csv("csvs/twitterpho/@drmwarner_tweets.csv")
drsusanshaw <- read_csv("csvs/twitterpho/@drsusanshaw_tweets.csv")
DrVivianS <- read_csv("csvs/twitterpho/@DrVivianS_tweets.csv")
egpayne <- read_csv("csvs/twitterpho/@egpayne_tweets.csv")
epdevilla <- read_csv("csvs/twitterpho/@epdevilla_tweets.csv")
ErnieHudsonPEI <- read_csv("csvs/twitterpho/@ErnieHudsonPEI_tweets.csv")
everetthindley <- read_csv("csvs/twitterpho/@everetthindley_tweets.csv")
First10EM <- read_csv("csvs/twitterpho/@First10EM_tweets.csv")
GermHunterMD <- read_csv("csvs/twitterpho/@GermHunterMD_tweets.csv")
glenpyle <- read_csv("csvs/twitterpho/@glenpyle_tweets.csv")
heysciencesam <- read_csv("csvs/twitterpho/@heysciencesam_tweets.csv")
hgagneTVA <- read_csv("csvs/twitterpho/@hgagneTVA_tweets.csv")
IDEpiPhD <- read_csv("csvs/twitterpho/@IDEpiPhD_tweets.csv")
imgrund <- read_csv("csvs/twitterpho/@imgrund_tweets.csv")
iPreetBrar <- read_csv("csvs/twitterpho/@iPreetBrar_tweets.csv")
IrfanDhalla <- read_csv("csvs/twitterpho/@IrfanDhalla_tweets.csv")
j_mcelroy <- read_csv("csvs/twitterpho/@j_mcelroy_tweets.csv")
jasonfherring <- read_csv("csvs/twitterpho/@jasonfherring_tweets.csv")
jfrketich <- read_csv("csvs/twitterpho/@jfrketich_tweets.csv")
jkwan_md <- read_csv("csvs/twitterpho/@jkwan_md_tweets.csv")
Johnrockdoc <- read_csv("csvs/twitterpho/@Johnrockdoc_tweets.csv")
JuliaWongCBC <- read_csv("csvs/twitterpho/@JuliaWongCBC_tweets.csv")
juliegreenMLA <- read_csv("csvs/twitterpho/@juliegreenMLA_tweets.csv")
Justin_Ling <- read_csv("csvs/twitterpho/@Justin_Ling_tweets.csv")
jyangstar <- read_csv("csvs/twitterpho/@jyangstar_tweets.csv")
KashPrime <- read_csv("csvs/twitterpho/@KashPrime_tweets.csv")
KatharineSmart <- read_csv("csvs/twitterpho/@KatharineSmart_tweets.csv")
Kevin__Parent <- read_csv("csvs/twitterpho/@Kevin__Parent_tweets.csv")
KindrachukJason <- read_csv("csvs/twitterpho/@KindrachukJason_tweets.csv")
KrishanaSankar <- read_csv("csvs/twitterpho/@KrishanaSankar_tweets.csv")
kwadwo777 <- read_csv("csvs/twitterpho/@kwadwo777_tweets.csv")
LaurenPelley <- read_csv("csvs/twitterpho/@LaurenPelley_tweets.csv")
LisaBarrettID <- read_csv("csvs/twitterpho/@LisaBarrettID_tweets.csv")
McGillOSS <- read_csv("csvs/twitterpho/@McGillOSS_tweets.csv")
MerrimanPaul <- read_csv("csvs/twitterpho/@MerrimanPaul_tweets.csv")
MichaelSchwandt <- read_csv("csvs/twitterpho/@MichaelSchwandt_tweets.csv")
MLAStefanson <- read_csv("csvs/twitterpho/@MLAStefanson_tweets.csv")
moirawyton <- read_csv("csvs/twitterpho/@moirawyton_tweets.csv")
MPaiMD <- read_csv("csvs/twitterpho/@MPaiMD_tweets.csv")
NaheedD <- read_csv("csvs/twitterpho/@NaheedD_tweets.csv")
NathanStall <- read_csv("csvs/twitterpho/@NathanStall_tweets.csv")
NightShiftMD <- read_csv("csvs/twitterpho/@NightShiftMD_tweets.csv")
NoLore <- read_csv("csvs/twitterpho/@NoLore_tweets.csv")
OttawaHealth <- read_csv("csvs/twitterpho/@OttawaHealth_tweets.csv")
paimadhu <- read_csv("csvs/twitterpho/@paimadhu_tweets.csv")
PattyHajdu <- read_csv("csvs/twitterpho/@PattyHajdu_tweets.csv")
picardonhealth <- read_csv("csvs/twitterpho/@picardonhealth_tweets.csv")
RicharLisa <- read_csv("csvs/twitterpho/@RicharLisa_tweets.csv")
roussin_brent <- read_csv("csvs/twitterpho/@roussin_brent_tweets.csv")
sabaeitizaz <- read_csv("csvs/twitterpho/@sabaeitizaz_tweets.csv")
sabiVM <- read_csv("csvs/twitterpho/@sabiVM_tweets.csv")
SammyG_MD <- read_csv("csvs/twitterpho/@SammyG_MD_tweets.csv")
sarperotto <- read_csv("csvs/twitterpho/@sarperotto_tweets.csv")
SciChefCan <- read_csv("csvs/twitterpho/@SciChefCan_tweets.csv")
sciencemonkeyca <- read_csv("csvs/twitterpho/@sciencemonkeyca_tweets.csv")
ScienceUpFirst <- read_csv("csvs/twitterpho/@ScienceUpFirst_tweets.csv")
sdbaral <- read_csv("csvs/twitterpho/@sdbaral_tweets.csv")
shandro <- read_csv("csvs/twitterpho/@shandro_tweets.csv")
SharkawyMD <- read_csv("csvs/twitterpho/@SharkawyMD_tweets.csv")
shazmamithani <- read_csv("csvs/twitterpho/@shazmamithani_tweets.csv")
ShephardDorothy <- read_csv("csvs/twitterpho/@ShephardDorothy_tweets.csv")
srinmurthy99 <- read_csv("csvs/twitterpho/@srinmurthy99_tweets.csv")
SteiniBrown <- read_csv("csvs/twitterpho/@SteiniBrown_tweets.csv")
theresaboyle <- read_csv("csvs/twitterpho/@theresaboyle_tweets.csv")
thisisourshotca <- read_csv("csvs/twitterpho/@thisisourshotca_tweets.csv")
TorontoIDDOC <- read_csv("csvs/twitterpho/@TorontoIDDOC_tweets.csv")
UbakaOgbogu <- read_csv("csvs/twitterpho/@UbakaOgbogu_tweets.csv")
VaxHuntersCan <- read_csv("csvs/twitterpho/@VaxHuntersCan_tweets.csv")
VeraEtches <- read_csv("csvs/twitterpho/@VeraEtches_tweets.csv")
VikCBC <- read_csv("csvs/twitterpho/@VikCBC_tweets.csv")
wickdchiq <- read_csv("csvs/twitterpho/@wickdchiq_tweets.csv")
zachchurchill <- read_csv("csvs/twitterpho/@zachchurchill_tweets.csv")
zchagla <- read_csv("csvs/twitterpho/@zchagla_tweets.csv")
DrKathleenRoss1 <- read_csv("csvs/twitterpho/DrKathleenRoss1_tweets.csv")
twpiggott <- read_csv("csvs/twitterpho/@twpiggott_tweets.csv")


[1m[22mNew names:
[36m*[39m `` -> `...1`
"[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)"
[1mRows: [22m[34m358751[39m [1mColumns: [22m[34m21[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m   (6): entities, text, attachments, clean_text, sentiment, emojis
[32mdbl[39m  (12): ...1, id, edit_history_tweet_ids, author_id, conversation_id, in_...
[33mlgl[39m   (2): possibly_sensitive, withheld
[34mdttm[39m  (1): created_at

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1m[22mNew names:
[36m*[39m `` -> `...1`
"[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)"
[1mRows: [22m[34m4462

In [86]:
NFRF_list <- list(adriandix, AlikaMD, AmyGreerKalisz, angie_rasmussen, AnnaBlakney, 
                       AntibioticDoc, asapscience, ASPphysician, atRachelGilmore, binhanv, 
                       BirinderNarang, blackdocscanada, BogochIsaac, bornk, carlyweeks, 
                       CaulfieldTim, CDCofBC, cdube_sante, cfpcceo, ChiefSciCan, cmcovidtf, 
                       CMOH_Alberta, CMOH_NL, conquercovid19, COVID_19_Canada, COVIDSciOntario, 
                       CPHO_Canada, ctouzin, CTV_AvisFavaro, DeNovo_Fatima, deonandan, DFisman, 
                       Dr_ChrisSimpson, drgigiosler, DrKaliBarrett, drmwarner, drsusanshaw, 
                       DrVivianS, egpayne, epdevilla, ErnieHudsonPEI, everetthindley, First10EM, 
                       GermHunterMD, glenpyle, heysciencesam, hgagneTVA, IDEpiPhD, imgrund, 
                       iPreetBrar, IrfanDhalla, j_mcelroy, jasonfherring, jfrketich, jkwan_md, 
                       Johnrockdoc, JuliaWongCBC, juliegreenMLA, Justin_Ling, jyangstar, KashPrime, 
                       KatharineSmart, Kevin__Parent, KindrachukJason, KrishanaSankar, kwadwo777, 
                       LaurenPelley, LisaBarrettID, McGillOSS, MerrimanPaul, MichaelSchwandt, 
                       MLAStefanson, moirawyton, MPaiMD, NaheedD, NathanStall, 
                       NightShiftMD, NoLore, OttawaHealth, paimadhu, PattyHajdu, picardonhealth, 
                       RicharLisa, roussin_brent, sabaeitizaz, sabiVM, SammyG_MD, sarperotto, 
                       SciChefCan, sciencemonkeyca, ScienceUpFirst, sdbaral, shandro, SharkawyMD, 
                       shazmamithani, ShephardDorothy, srinmurthy99, SteiniBrown, theresaboyle, 
                       thisisourshotca, TorontoIDDOC, UbakaOgbogu, VaxHuntersCan, VeraEtches, 
                       VikCBC, wickdchiq, zachchurchill, zchagla, DrKathleenRoss1, twpiggott)


In [87]:
terms <- c("fuck", "dirty", "stupid", "idiot", "asshole", "bastard", "nasty", "whore", "slut", "skank", "cunt", "bitch", "motherfucker", "go to hell", "wimp", "coward", "moron", "traitor", "shut up", "shut the fuck up", "stfu", "piece of shit", "ignorant", "clown", "ccpvirus", "chinese coronavirus", "chinavirus", "china virus", "fuckchina", "nukechina", "bombchina", "deathtochina", "chinesevirus", "fake lab wuhan", "china weapon", "china bioweapon", "zionazi", "Soros virus", "blood libel", "nwovirus", "new world order", "sacrifice children", "jew world order", "liar", "yellow star", "israel virus", "israelvirus", "rothschild covid", "kalergi plan", "jail", "locked up", "evil.", "is evil", "so evil", "pure evil", "an evil", "evil wom", "evil man", "evil witch", "is evil", "are evil", "that evil", "very evil", "tyrant", "tyranny", "dictator", "tv doctor", "twitter doctor", "murder", "murderer", "eugenic", "satan", "Nazi", "non-human", "sterile", "slave", "crooked", "corrupt", "shill", "liar", "lying", "killer", "fraud", "big pharma", "pharma puppet", "Nuremberg", "crimes against humanity", "genocide", "child abuse", "covid hoax", "covid fake", "virus hoax", "gates virus", "bioweapon", "secret elite", "depopulation", "save our children")
terms <- paste0(" ", terms)
pattern <- paste(terms, collapse = "|")

# Function that finds harassment terms and calculates proportion 
tox_check <- function(df) {
  total_entries <- nrow(df) #getting total n of entries in each df in order to calculate proportion
  
  # Selecting the text column and create the 'negative' column: value of 0 if does not contain an entry in the list, of terms, value of 1 if it does.
  df <- df |>
    filter(!startsWith(text, "RT")) |>
    select(text, author_id)|>
    mutate(negative = as.integer(grepl(pattern, text, ignore.case = TRUE)))|>
    mutate(id = as.character(author_id)) |>
    mutate(in_nfrf_ids = id %in% as.character(NFRF_ID_dataset$id)) |>
    select(!author_id)

  return(df)
}

# Applying function defined above using lapply
toxicity_scores <- lapply(NFRF_list, tox_check)

In [88]:

# Assigning the names of the datasets to list
names(toxicity_scores) <- usernames_NFRF
list2env(toxicity_scores, envir = .GlobalEnv)

<environment: R_GlobalEnv>

In [89]:
all_users_NFRF <-  bind_rows(adriandix, AlikaMD, AmyGreerKalisz, angie_rasmussen, AnnaBlakney, 
                       AntibioticDoc, asapscience, ASPphysician, atRachelGilmore, binhanv, 
                       BirinderNarang, blackdocscanada, BogochIsaac, bornk, carlyweeks, 
                       CaulfieldTim, CDCofBC, cdube_sante, cfpcceo, ChiefSciCan, cmcovidtf, 
                       CMOH_Alberta, CMOH_NL, conquercovid19, COVID_19_Canada, COVIDSciOntario, 
                       CPHO_Canada, ctouzin, CTV_AvisFavaro, DeNovo_Fatima, deonandan, DFisman, 
                       Dr_ChrisSimpson, drgigiosler, DrKaliBarrett, drmwarner, drsusanshaw, 
                       DrVivianS, egpayne, epdevilla, ErnieHudsonPEI, everetthindley, First10EM, 
                       GermHunterMD, glenpyle, heysciencesam, hgagneTVA, IDEpiPhD, imgrund, 
                       iPreetBrar, IrfanDhalla, j_mcelroy, jasonfherring, jfrketich, jkwan_md, 
                       Johnrockdoc, JuliaWongCBC, juliegreenMLA, Justin_Ling, jyangstar, KashPrime, 
                       KatharineSmart, Kevin__Parent, KindrachukJason, KrishanaSankar, kwadwo777, 
                       LaurenPelley, LisaBarrettID, McGillOSS, MerrimanPaul, MichaelSchwandt, 
                       MLAStefanson, moirawyton, MPaiMD, NaheedD, NathanStall, 
                       NightShiftMD, NoLore, OttawaHealth, paimadhu, PattyHajdu, picardonhealth, 
                       RicharLisa, roussin_brent, sabaeitizaz, sabiVM, SammyG_MD, sarperotto, 
                       SciChefCan, sciencemonkeyca, ScienceUpFirst, sdbaral, shandro, SharkawyMD, 
                       shazmamithani, ShephardDorothy, srinmurthy99, SteiniBrown, theresaboyle, 
                       thisisourshotca, TorontoIDDOC, UbakaOgbogu, VaxHuntersCan, VeraEtches, 
                       VikCBC, wickdchiq, zachchurchill, zchagla, DrKathleenRoss1, twpiggott)


In [90]:
all_users_NFRF_totals <- all_users_NFRF |>
    group_by(id, in_nfrf_ids) |>
    select(!text) |>
    summarize(toxicity_totals = sum(negative)) |>
    arrange(desc(toxicity_totals)) |>
    filter(toxicity_totals > 50) |>
    rename(is_health_communicator = in_nfrf_ids)

preview <- head(all_users_NFRF_totals, 5)
preview


[1m[22m`summarise()` has grouped output by 'id'. You can override using the `.groups`
argument.


id,is_health_communicator,toxicity_totals
<chr>,<lgl>,<int>
386007819,False,1386
1358039257198256128,False,412
2265885618,False,390
37271053,False,308
1211781779687796736,False,302


In [49]:
write.csv(all_users_NFRF_totals, 'authors_of_toxic_tweets_info_NFRF.csv')

In [91]:
all_users_NFRF_hc <- all_users_NFRF|>
 rename(is_health_communicator = in_nfrf_ids)|>
filter(is_health_communicator == TRUE)|>
group_by(id)|>
summarize(toxicity_totals=sum(negative))|>
arrange(desc(toxicity_totals))|>
 left_join(UBC_ISD_conversion, by = "id")

preview <- head(all_users_NFRF_hc, 5)
preview

id,toxicity_totals,username
<chr>,<int>,<chr>
407395156,33,CaulfieldTim
14362766,24,imgrund
360214917,15,wickdchiq
1096205507348623360,13,DFisman
97620342,12,DeNovo_Fatima


In [None]:
write.csv(all_users_NFRF_hc, "NFRF_toxicity_totals_BY_health_communicators.csv")

In [92]:


# Function that finds harassment terms and calculates proportion and total negative entries
toxicity_calculations <- function(df) {
  total_entries <- nrow(df) # getting total n of entries in each df in order to calculate proportion
  
  # Selecting the text column and create the 'negative' column: value of 0 if does not contain an entry in the list of terms, value of 1 if it does.
  df <- df %>%
    select(text) %>%
    mutate(negative = as.integer(grepl(pattern, text, ignore.case = TRUE)))

  negative_entries <- nrow(df |> filter(negative == 1))

  # Creating a dataframe with the proportion negative and total negative entries
  result_df <- data.frame(
    total_negative_entries = negative_entries,
    percent_negative = (if (total_entries > 0) (negative_entries / total_entries) * 100 else 0),
    total_entries = total_entries
  )
  
  return(result_df)
}

# Applying toxicity calculations using lapply
NFRF_toxicity_counts <- lapply(NFRF_list, toxicity_calculations)

# Converting the names of each dataset from integer values back to the original names using list2env
names(NFRF_toxicity_counts) <- usernames_NFRF

# Creating a single dataframe with results for all users
results_df <- do.call(rbind, lapply(names(NFRF_toxicity_counts), function(name) {
  df <- NFRF_toxicity_counts[[name]]
  df$user <- name
  df
}))

results_df


total_negative_entries,percent_negative,total_entries,user
<int>,<dbl>,<int>,<chr>
12190,3.3978999,358751,adriandix
658,1.4746089,44622,AlikaMD
313,1.3809839,22665,AmyGreerKalisz
4217,2.0066715,210149,angie_rasmussen
3,0.2371542,1265,AnnaBlakney
1278,2.7023027,47293,AntibioticDoc
53,1.9421033,2729,asapscience
476,1.2203877,39004,ASPphysician
47349,6.0549600,781987,atRachelGilmore
0,0.0000000,615,binhanv


In [93]:
write_csv(results_df, "total_tweets_total_negative_pct_negative_NFRF.csv")

The code below is for combined_dfs.csv

In [98]:
negative_proportions_df

user,total_negative,total_tweets,percent_negative
<chr>,<int>,<int>,<dbl>
adriandix,2246,46790,4.8001710
AlikaMD,88,2230,3.9461883
AmyGreerKalisz,106,2861,3.7049983
angie_rasmussen,1212,33674,3.5992160
AnnaBlakney,0,66,0.0000000
AntibioticDoc,404,13750,2.9381818
asapscience,27,817,3.3047736
ASPphysician,263,9576,2.7464495
binhanv,2,81,2.4691358
blackdocscanada,1,190,0.5263158


In [99]:
results_ISD <- negative_proportions_df |>
rename(total_negative_ISD = total_negative, total_tweets_ISD = total_tweets, percent_negative_ISD = percent_negative)

results_NFRF <- results_df |>
rename(total_negative_NFRF = total_negative_entries, total_tweets_NFRF = total_entries, percent_negative_NFRF = percent_negative)


combined_dfs  <- full_join(results_ISD, results_NFRF, by = "user", suffix = c("_ISD", "_NFRF"))


In [100]:
combined_dfs

user,total_negative_ISD,total_tweets_ISD,percent_negative_ISD,total_negative_NFRF,percent_negative_NFRF,total_tweets_NFRF
<chr>,<int>,<int>,<dbl>,<int>,<dbl>,<int>
adriandix,2246,46790,4.8001710,12190,3.3978999,358751
AlikaMD,88,2230,3.9461883,658,1.4746089,44622
AmyGreerKalisz,106,2861,3.7049983,313,1.3809839,22665
angie_rasmussen,1212,33674,3.5992160,4217,2.0066715,210149
AnnaBlakney,0,66,0.0000000,3,0.2371542,1265
AntibioticDoc,404,13750,2.9381818,1278,2.7023027,47293
asapscience,27,817,3.3047736,53,1.9421033,2729
ASPphysician,263,9576,2.7464495,476,1.2203877,39004
binhanv,2,81,2.4691358,0,0.0000000,615
blackdocscanada,1,190,0.5263158,13,0.5960569,2181


In [101]:
write_csv(combined_dfs, "combined_dfs_2.csv")

In [4]:
colnames(detoxify)

In [3]:
detoxify <- fread("OptimizedTweets_with_toxicity.csv",  encoding = "UTF-8")

In [7]:
fulltwitterpho <-  fread("FullTwitterPHO_NoRT_with_toxicity.csv",  encoding = "UTF-8")

In [9]:
colnames(fulltwitterpho)