In [4]:
library(tidyverse)
library(dplyr)
library(readr)
library(data.table)
library(purrr)

## ISD Dataset analysis

This notebook attempts to answer the following questions, using Chris' list of toxic terms:
- What percentage of tweets at each user of interest is toxic?
- Is there a gender disparity in rates of toxicity? I.e., does one gender receive a greater percentage of toxic tweets?
- What percentage of tweets from each category of users of interest are toxic?

Note that due to the size of the dataset some cells might take a painfully long time to run. I tried my best to cut down on execution time, but this is the best I could do. 

I exclude retweets from this analysis because they add no new textual information. It would be nice, however, to analyze what kinds of tweets are being retweeted. 

In [6]:
# Read the ISD tweet export from disk as UTF-8. The recorded output of this
# cell shows the read failing to memory-map the ~6.8 GB file.
UBC_ISD <- data.table::fread(input = "UBC-ISD.csv", encoding = "UTF-8")

ERROR: Error in fread("UBC-ISD.csv", encoding = "UTF-8"): Opened 6.814GB (7316020036 bytes) file ok but could not memory map it. This is a 64bit process. There is probably not enough contiguous virtual memory available.


In [None]:
# Reduce the raw export to a single `text` column holding one row per distinct,
# non-empty tweet text.
#
# `filter()` replaces the superseded `mutate_all(na_if, "")` + `drop_na()`
# combination: rows whose text is NA or the empty string are dropped directly.
#
# NOTE(review): `distinct()` only collapses rows whose text is byte-for-byte
# identical. Repeated copies of the same retweet collapse to one row, but
# nothing here removes a retweet whose text differs from every other row --
# confirm this is the intended definition of "removing retweets".
UBC_ISD_text <- UBC_ISD |>
  select(text = "twitter.tweet/text") |>
  filter(!is.na(text), text != "") |>
  distinct()


In [None]:
# Terms treated as markers of a toxic tweet.
terms <- c('fuck', 'dirty', 'stupid', 'idiot', 'asshole', 'bastard', 'nasty', 'whore', 'slut', 'skank', 'cunt', 'bitch', 'motherfucker', 'go to hell', 'wimp', 'coward', 'moron', 'traitor', 'shut up', 'shut the fuck up', 'stfu', 'piece of shit', 'ignorant', 'clown', 'ccpvirus', 'chinese coronavirus', 'chinavirus', 'china virus', 'fuckchina', 'nukechina', 'bombchina', 'deathtochina', 'chinesevirus', 'fake lab wuhan', 'china weapon', 'china bioweapon', 'zionazi', 'Soros virus', 'blood libel', 'nwovirus', 'new world order', 'sacrifice children', 'jew world order', 'liar', 'yellow star', 'israel virus', 'israelvirus', 'rothschild covid', 'kalergi plan', 'jail', 'locked up', 'evil.', 'is evil', 'so evil', 'pure evil', 'an evil', 'evil wom', 'evil man', 'evil witch', 'is evil', 'are evil', 'that evil', 'very evil', 'tyrant', 'tyranny', 'dictator', 'tv doctor', 'twitter doctor', 'murder', 'murderer', 'eugenic', 'satan', 'Nazi', 'non-human', 'sterile', 'slave', 'crooked', 'corrupt', 'shill', 'liar', 'lying', 'killer', 'fraud', 'big pharma', 'pharma puppet', 'Nuremberg', 'crimes against humanity', 'genocide', 'child abuse', 'covid hoax', 'covid fake', 'virus hoax', 'gates virus', 'bioweapon', 'secret elite', 'depopulation', 'save our children', 'scamdemic', 'plandemic')

# Prefix every term with a space so it only matches when preceded by a space,
# and drop repeated entries ('liar' and 'is evil' are listed twice above).
terms <- unique(paste0(" ", terms))

# The terms are literal text, not regular expressions. Escape regex
# metacharacters so that e.g. 'evil.' matches an actual full stop rather than
# "evil" followed by any character.
escaped_terms <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", terms)

# Single alternation used by grepl() to flag a tweet as toxic.
pattern <- paste(escaped_terms, collapse = "|")

In [33]:

# List of users to filter tweets
# Each entry is a Twitter handle without the leading "@"; filter_tweets() below
# searches the tweet text for it as a case-insensitive pattern.
# NOTE(review): several spellings here differ from the hard-coded category
# lists later in this notebook -- "drfisman" vs "DFisman", "GermHunterMD" vs
# "GermHunterID", "Kevin_Parent" vs "Kevin__Parent", "KindrachuckJason" vs
# "KindrachukJason", "moriartylabs" vs "MoriartyLab", "zachchurchhill" vs
# "zachchurchill", "zchangla" vs "zchagla". A misspelled handle will match no
# tweets; confirm each against the real account name.
users <- c("adriandix", "AlikaMD", "AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", 
           "AntibioticDoc", "asapscience", "ASPphysician", "atRachelGilmore", "binhanv", 
           "BirinderNarang", "blackdocscanada", "BogochIsaac", "bornk", "carlyweeks", 
           "CaulfieldTim", "CDCofBC", "cdube_sante", "cfpcceo", "ChiefSciCan", "cmcovidtf", 
           "CMOH_Alberta", "CMOH_NL", "conquercovid19", "COVID_19_Canada", "COVIDSciOntario", 
           "CPHO_Canada", "ctouzin", "CTV_AvisFavaro", "DeNovo_Fatima", "deonandan", "drfisman", 
           "Dr_ChrisSimpson", "drgigiosler", "DrKaliBarrett", "drmwarner", "drsusanshaw", 
           "DrVivianS", "egpayne", "epdevilla", "ErnieHudsonPEI", "everetthindley", "First10EM", 
           "GermHunterMD", "glenpyle", "heysciencesam", "hgagneTVA", "IDEpiPhD", "imgrund", 
           "iPreetBrar", "IrfanDhalla", "j_mcelroy", "jasonfherring", "jfrketich", "jkwan_md", 
           "Johnrockdoc", "JuliaWongCBC", "juliegreenMLA", "Justin_Ling", "jyangstar", "KashPrime", 
           "KatharineSmart", "Kevin_Parent", "KindrachuckJason", "KrishanaSankar", "kwadwo777", 
           "LaurenPelley", "LisaBarrettID", "McGillOSS", "MerrimanPaul", "MichaelSchwandt", 
           "MLAStefanson", "moirawyton", "moriartylabs", "MPaiMD", "NaheedD", "NathanStall", 
           "NightShiftMD", "NoLore", "OttawaHealth", "paimadhu", "PattyHajdu", "picardonhealth", 
           "RicharLisa", "roussin_brent", "sabaeitizaz", "sabiVM", "SammyG_MD", "sarperotto", 
           "SciChefCan", "sciencemonkeyca", "ScienceUpFirst", "sdbaral", "shandro", "SharkawyMD", 
           "shazmamithani", "ShephardDorothy", "srinmurthy99", "SteiniBrown", "theresaboyle", 
           "thisisourshotca", "TorontoIDDOC", "UbakaOgbogu", "VaxHuntersCan", "VeraEtches", 
           "VikCBC", "wickdchiq", "zachchurchhill", "zchangla", "DrKathleenRoss1")

# Create an empty list to store the dataframes
# (filled by the loop at the end of this cell)
tweet_dfs <- list()

# Percentage of the tweets mentioning `user` that contain a toxic term.
#
# Reads the globals `UBC_ISD_text` (a table with a `text` column) and `pattern`
# (the toxic-term regular expression).
#
# @param user Handle to search for; used as a case-insensitive pattern against
#   the tweet text.
# @return A one-element list, `list(proportion_negative = <percentage>)`. The
#   percentage is NA when no tweet mentions `user`.
filter_tweets <- function(user) {
  tweet_text <- UBC_ISD_text[["text"]]

  # Restrict to the tweets that mention this user at all.
  user_tweets <- tweet_text[grepl(user, tweet_text, ignore.case = TRUE)]

  # No mentions: there is no rate to report (avoids 0 / 0).
  if (length(user_tweets) == 0) {
    return(list(proportion_negative = NA_real_))
  }

  # The denominator is the number of tweets at this user, not the size of the
  # whole corpus, so the result is the share of *their* tweets that are toxic.
  is_negative <- grepl(pattern, user_tweets, ignore.case = TRUE)
  list(proportion_negative = mean(is_negative) * 100)
}

# Loop through each user and store that user's result under their own handle
for (user in users) {
  # `[[user]]` indexes by the value of `user`. `tweet_dfs$user` would instead
  # overwrite the single element literally named "user" on every iteration.
  tweet_dfs[[user]] <- filter_tweets(user)
}

# Now tweet_dfs holds one entry per handle, each being whatever
# filter_tweets() returned for that handle (e.g. tweet_dfs[["adriandix"]])


: 

In [None]:
# List of users to filter tweets
# Each entry is a Twitter handle without the leading "@"; filter_tweets() below
# searches the tweet text for it as a case-insensitive pattern.
# NOTE(review): several spellings here differ from the hard-coded category
# lists later in this notebook -- "drfisman" vs "DFisman", "GermHunterMD" vs
# "GermHunterID", "Kevin_Parent" vs "Kevin__Parent", "KindrachuckJason" vs
# "KindrachukJason", "moriartylabs" vs "MoriartyLab", "zachchurchhill" vs
# "zachchurchill", "zchangla" vs "zchagla". A misspelled handle will match no
# tweets; confirm each against the real account name.
users <- c("adriandix", "AlikaMD", "AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", 
           "AntibioticDoc", "asapscience", "ASPphysician", "atRachelGilmore", "binhanv", 
           "BirinderNarang", "blackdocscanada", "BogochIsaac", "bornk", "carlyweeks", 
           "CaulfieldTim", "CDCofBC", "cdube_sante", "cfpcceo", "ChiefSciCan", "cmcovidtf", 
           "CMOH_Alberta", "CMOH_NL", "conquercovid19", "COVID_19_Canada", "COVIDSciOntario", 
           "CPHO_Canada", "ctouzin", "CTV_AvisFavaro", "DeNovo_Fatima", "deonandan", "drfisman", 
           "Dr_ChrisSimpson", "drgigiosler", "DrKaliBarrett", "drmwarner", "drsusanshaw", 
           "DrVivianS", "egpayne", "epdevilla", "ErnieHudsonPEI", "everetthindley", "First10EM", 
           "GermHunterMD", "glenpyle", "heysciencesam", "hgagneTVA", "IDEpiPhD", "imgrund", 
           "iPreetBrar", "IrfanDhalla", "j_mcelroy", "jasonfherring", "jfrketich", "jkwan_md", 
           "Johnrockdoc", "JuliaWongCBC", "juliegreenMLA", "Justin_Ling", "jyangstar", "KashPrime", 
           "KatharineSmart", "Kevin_Parent", "KindrachuckJason", "KrishanaSankar", "kwadwo777", 
           "LaurenPelley", "LisaBarrettID", "McGillOSS", "MerrimanPaul", "MichaelSchwandt", 
           "MLAStefanson", "moirawyton", "moriartylabs", "MPaiMD", "NaheedD", "NathanStall", 
           "NightShiftMD", "NoLore", "OttawaHealth", "paimadhu", "PattyHajdu", "picardonhealth", 
           "RicharLisa", "roussin_brent", "sabaeitizaz", "sabiVM", "SammyG_MD", "sarperotto", 
           "SciChefCan", "sciencemonkeyca", "ScienceUpFirst", "sdbaral", "shandro", "SharkawyMD", 
           "shazmamithani", "ShephardDorothy", "srinmurthy99", "SteiniBrown", "theresaboyle", 
           "thisisourshotca", "TorontoIDDOC", "UbakaOgbogu", "VaxHuntersCan", "VeraEtches", 
           "VikCBC", "wickdchiq", "zachchurchhill", "zchangla", "DrKathleenRoss1")

# Create an empty list to store the negative proportions
# (filled, keyed by handle, by the loop at the end of this cell)
negative_proportions <- list()

# Percentage of the tweets mentioning `user` that contain a toxic term.
#
# Reads the globals `UBC_ISD_text` (a table with a `text` column) and `pattern`
# (the toxic-term regular expression).
#
# @param user Handle to search for; used as a case-insensitive pattern against
#   the tweet text.
# @return A single number between 0 and 100, or NA when no tweet mentions
#   `user`.
filter_tweets <- function(user) {
  tweet_text <- UBC_ISD_text[["text"]]

  # Restrict to the tweets that mention this user at all.
  user_tweets <- tweet_text[grepl(user, tweet_text, ignore.case = TRUE)]

  # No mentions: there is no rate to report (avoids 0 / 0).
  if (length(user_tweets) == 0) {
    return(NA_real_)
  }

  # The denominator is the number of tweets at this user, not the size of the
  # whole corpus, so the result is the share of *their* tweets that are toxic.
  mean(grepl(pattern, user_tweets, ignore.case = TRUE)) * 100
}

# Evaluate every handle and file each result in the list under that handle
negative_proportions[users] <- lapply(users, filter_tweets)

# A single user's value is retrieved by indexing the list with their handle
adriandix_negative_proportion <- negative_proportions[["adriandix"]]


In [None]:
# Show the entry stored for the handle "adriandix"
tweet_dfs[["adriandix"]]

**Gender bias analysis**: 

What percentage of all tweets directed at women are toxic? At men?

Run `women_df` and `men_df` for the results.

In [None]:
# Handles of the users of interest classified as men.
men <- list("adriandix", "AlikaMD", "ASPphysician", "BogochIsaac", "CaulfieldTim", 
            "cdube_sante", "deonandan", "Dr_ChrisSimpson", "drfisman", "drmwarner", 
            "ErnieHudsonPEI", "everetthindley", "First10EM", "GermHunterMD", "imgrund", 
            "IrfanDhalla", "j_mcelroy", "jasonfherring", "Johnrockdoc", "Justin_Ling", 
            "KashPrime", "Kevin_Parent", "KindrachuckJason", "kwadwo777", "MerrimanPaul", 
            "NaheedD", "NathanStall", "NightShiftMD", "paimadhu", "picardonhealth", 
            "roussin_brent", "SammyG_MD", "sciencemonkeyca", "sdbaral", "shandro", 
            "srinmurthy99", "SteiniBrown", "TorontoIDDOC", "UbakaOgbogu", "VikCBC", 
            "zachchurchhill", "zchangla")

# One row per handle with that handle's toxic-tweet percentage.
# The lookup uses `[[user]]` on `negative_proportions`, which an earlier cell
# fills keyed by handle. The previous `tweet_dfs$user` ignored the lapply
# variable and read the element literally named "user", so every row was the
# same. Handles with no stored value get NA instead of being dropped.
men_df <- data.frame(
  user = as.character(unlist(men)),
  proportion_negative = vapply(
    men,
    function(user) {
      value <- negative_proportions[[user]]
      if (is.null(value)) NA_real_ else as.numeric(value)
    },
    numeric(1)
  )
)


In [None]:
# Display the per-user table for men built in the previous cell
men_df

In [None]:
# Handles of the users of interest classified as women.
women <- list("AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", "AntibioticDoc", "binhanv", 
              "bornk", "carlyweeks", "ChiefSciCan", "CMOH_Alberta", "CMOH_NL", "CPHO_Canada", 
              "ctouzin", "CTV_AvisFavaro", "DeNovo_Fatima", "drgigiosler", "DrKaliBarrett", 
              "drsusanshaw", "DrVivianS", "egpayne", "epdevilla", "glenpyle", "heysciencesam", 
              "IDEpiPhD", "iPreetBrar", "jfrketich", "jkwan_md", "JuliaWongCBC", "juliegreenMLA", 
              "jyangstar", "KatharineSmart", "KrishanaSankar", "LaurenPelley", "LisaBarrettID", 
              "MLAStefanson", "moriartylabs", "moirawyton", "MPaiMD", "NoLore", "PattyHajdu", 
              "RicharLisa", "sabiVM", "SciChefCan", "sarperotto", "shazmamithani", "ShephardDorothy", 
              "theresaboyle", "VeraEtches", "wickdchiq", "DrKathleenRoss1")

# One row per handle with that handle's toxic-tweet percentage.
# The lookup uses `[[user]]` on `negative_proportions`, which an earlier cell
# fills keyed by handle. The previous `tweet_dfs$user` ignored the lapply
# variable and read the element literally named "user", so every row was the
# same. Handles with no stored value get NA instead of being dropped.
women_df <- data.frame(
  user = as.character(unlist(women)),
  proportion_negative = vapply(
    women,
    function(user) {
      value <- negative_proportions[[user]]
      if (is.null(value)) NA_real_ else as.numeric(value)
    },
    numeric(1)
  )
)


**Category analysis**

What percentage of tweets to each category of user are toxic? Run `category_metrics` for the result.

In [None]:
# Load the spreadsheet export that maps each communicator to a role category
health_communicators_roles <- readr::read_csv(
  file = "Health Communicators - List(Communicators).csv"
)

In [None]:
# Keep the handle and the simplified category only, exposing the latter under
# the shorter name `Category`
health_communicators_roles <- select(
  health_communicators_roles,
  Handle,
  Category = "Category - simplified"
)

In [None]:
# For each category, build one string containing that category's handles
# (without "@") separated by "', '", ready to paste between list('...').

# Join the handles of one category into a single "', '"-separated string.
#
# @param roles Table with `Handle` and `Category` columns.
# @param category_name Category value to keep (exact match).
# @return A length-one character vector; "" when the category has no rows.
collapse_category_handles <- function(roles, category_name) {
  roles |>
    filter(Category == category_name) |>
    # pull() yields the handles as a plain character vector. Passing the
    # one-column table through as.list() made paste() deparse the whole column
    # into a single 'c("...", "...")' string, so `collapse` never applied.
    pull(Handle) |>
    str_remove("@") |>
    paste(collapse = "', '")
}

medical_professional <- collapse_category_handles(
  health_communicators_roles, "Medical professional"
)
expert_university <- collapse_category_handles(
  health_communicators_roles, "Expert - university"
)
politician <- collapse_category_handles(
  health_communicators_roles, "Politician"
)
journalist <- collapse_category_handles(
  health_communicators_roles, "Journalist"
)
civil_society_groups <- collapse_category_handles(
  health_communicators_roles, "Civil society groups"
)
expert <- collapse_category_handles(
  health_communicators_roles, "Expert"
)
health_official <- collapse_category_handles(
  health_communicators_roles, "Health official"
)

In [None]:
# Hard-coded handle lists, one per category. These assignments replace the
# values the previous cell stored under the same variable names.
# NOTE(review): some handles are spelled differently from the `users` vector
# that the per-user results are computed for -- "DFisman" vs "drfisman",
# "GermHunterID" vs "GermHunterMD", "Kevin__Parent" vs "Kevin_Parent",
# "KindrachukJason" vs "KindrachuckJason", "MoriartyLab" vs "moriartylabs",
# "sabivm" vs "sabiVM", "zachchurchill" vs "zachchurchhill", "zchagla" vs
# "zchangla" -- and some do not appear in `users` at all (e.g. "StrangRobert",
# "goaliegirlmom31", "jonathanstea", "sacovidtf", "sharon_kirkey",
# "celliottability", "JWongGlobalNews"). Confirm which spelling is correct.
health_official <- list("CDCofBC", "ChiefSciCan", "CMOH_Alberta", "CMOH_NL", "CPHO_Canada", "epdevilla", "OttawaHealth", "roussin_brent", "StrangRobert", "VeraEtches")
expert <- list("BogochIsaac", "CaulfieldTim", "DeNovo_Fatima", "deonandan", "DFisman", "Dr_ChrisSimpson", "drgigiosler", "DrKaliBarrett", "drmwarner", "drsusanshaw", "DrVivianS", "First10EM", "GermHunterID", "goaliegirlmom31", "heysciencesam", "IDEpiPhD", "imgrund", "iPreetBrar", "IrfanDhalla", "jkwan_md", "jonathanstea", "KashPrime", "Kevin__Parent", "KindrachukJason", "KrishanaSankar", "kwadwo777", "LisaBarrettID", "MoriartyLab", "MPaiMD", "NaheedD", "NathanStall", "paimadhu", "RicharLisa", "sabivm", "SammyG_MD", "SciChefCan", "sdbaral", "SharkawyMD", "shazmamithani", "SteiniBrown", "TorontoIDDOC", "UbakaOgbogu", "zchagla")
civil_society_groups <- list("blackdocscanada", "cmcovidtf", "conquercovid19", "COVID_19_Canada", "COVIDSciOntario", "McGillOSS", "sacovidtf", "ScienceUpFirst", "thisisourshotca", "VaxHuntersCan")
journalist <- list("asapscience", "binhanv", "carlyweeks", "ctouzin", "CTV_AvisFavaro", "egpayne", "j_mcelroy", "jasonfherring", "jfrketich", "JWongGlobalNews", "jyangstar", "LaurenPelley", "NightShiftMD", "NoLore", "picardonhealth", "sharon_kirkey", "theresaboyle", "VikCBC")
politician <- list("adriandix", "cdube_sante", "celliottability", "ErnieHudsonPEI", "everetthindley", "Johnrockdoc", "juliegreenMLA", "MerrimanPaul", "MLAStefanson", "PattyHajdu", "shandro", "ShephardDorothy", "zachchurchill")
expert_university <- list("AmyGreerKalisz", "angie_rasmussen", "AnnaBlakney", "AntibioticDoc", "ASPphysician")
medical_professional<- list("AlikaMD")

In [None]:
# Build one table per category (one row per handle) and stack them into
# `category_metrics`.

# Table of toxic-tweet percentages for the handles of one category.
#
# Values are looked up by handle in `negative_proportions`, which an earlier
# cell fills keyed by handle. The previous `tweet_dfs$user` ignored the lapply
# variable and read the element literally named "user", so every row of every
# category was the same value.
#
# @param handles List (or character vector) of handles.
# @param type Category label stored in the `type` column, the name the
#   plotting cell refers to as `fill = type`.
# @return A data frame with columns `user`, `type`, `proportion_negative`;
#   handles with no stored value get NA.
category_frame <- function(handles, type) {
  data.frame(
    user = as.character(unlist(handles)),
    type = rep(type, length(handles)),
    proportion_negative = vapply(
      handles,
      function(user) {
        value <- negative_proportions[[user]]
        if (is.null(value)) NA_real_ else as.numeric(value)
      },
      numeric(1)
    )
  )
}

health_official_df <- category_frame(health_official, "Health official")
health_official_df
expert_df <- category_frame(expert, "Expert")
expert_df
civil_society_groups_df <- category_frame(civil_society_groups, "Civil society groups")
civil_society_groups_df
journalist_df <- category_frame(journalist, "Journalist")
journalist_df
politician_df <- category_frame(politician, "Politician")
politician_df
expert_university_df <- category_frame(expert_university, "Expert - university")
expert_university_df
medical_professional_df <- category_frame(medical_professional, "Medical professional")
medical_professional_df

# All categories stacked; `type` records which category each row came from.
category_metrics <- rbind(
  health_official_df,
  expert_df,
  civil_society_groups_df,
  journalist_df,
  politician_df,
  expert_university_df,
  medical_professional_df
)



In [None]:
# Start a ggplot object from category_metrics, mapping `count` to x and `type`
# to fill. The result is only assigned here, not displayed.
# NOTE(review): nothing visible in this notebook creates a `count` column in
# category_metrics, and no geom layer is added, so this does not yet draw the
# per-category comparison -- confirm the intended columns and geom.
category_visual <- category_metrics|>
ggplot(aes(x=count, fill=type))