In [1]:
library(tidyverse)
library(data.table)
library(dplyr)

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.5
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.0
[32mv[39m [34mggplot2  [39m 3.5.1     [32mv[39m [34mtibble   [39m 3.2.1
[32mv[39m [34mlubridate[39m 1.9.3     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.0.2     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: 'data.table'


The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    y

In [2]:
# Load the tweets CSV (including toxicity scores) into `detoxify`.
detoxify <- fread(
  input = "OptimizedTweets_with_toxicity.csv",
  encoding = "UTF-8"
)

In [None]:
# Preview the first rows of the loaded data.
head(detoxify)

In [None]:
# List the column names available in `detoxify`.
colnames(detoxify)

In [None]:
# Heavily penalise scientific notation so large numbers print in fixed form.
options(scipen = 999)

In [3]:

# Toxicity cut-off: a tweet whose toxicity_score is at or above this value is
# flagged as toxic. Change this single value to use a different threshold.
toxicity_threshold <- 0.8

# Keep only the columns used by the later cells and add a binary `toxicity`
# flag (1 = score at or above the threshold, 0 = below; NA scores stay NA).
detoxify_filtered <- detoxify |>
  select(
    text, author_id, username,
    public_metrics_impression_count, public_metrics_like_count,
    toxicity_score
  ) |>
  mutate(toxicity = if_else(toxicity_score >= toxicity_threshold, 1, 0))

# Per-username summary, restricted to usernames with at least one tweet
# flagged as toxic.
#   *_total / *_mean              : computed over all tweets of the username
#   *_total_toxic / *_mean_toxic  : computed over the toxic tweets only
# Rows with an NA in any selected column (including impression counts that
# cannot be coerced to numeric) are dropped before aggregating.
detoxify_AT_authors <- detoxify_filtered |>
  select(
    public_metrics_like_count, public_metrics_impression_count,
    username, toxicity
  ) |>
  mutate(
    public_metrics_impression_count =
      as.numeric(public_metrics_impression_count)
  ) |>
  na.omit() |>
  group_by(username) |>
  mutate(
    n_total_tweets = n(),  # total number of tweets per username
    n_toxic_tweets = sum(toxicity),
    pct_toxic_tweets = n_toxic_tweets / n_total_tweets * 100,
    likes_total = sum(public_metrics_like_count),
    likes_mean = mean(public_metrics_like_count),
    impressions_total = sum(public_metrics_impression_count),
    impressions_mean = mean(public_metrics_impression_count)
  ) |>
  # From here on only toxic tweets remain, so the aggregates below describe
  # the toxic subset of each username.
  filter(toxicity == 1) |>
  mutate(
    likes_total_toxic = sum(public_metrics_like_count),
    likes_mean_toxic = mean(public_metrics_like_count),
    impressions_total_toxic = sum(public_metrics_impression_count),
    impressions_mean_toxic = mean(public_metrics_impression_count)
  ) |>
  select(
    username, pct_toxic_tweets, n_toxic_tweets,
    likes_total, likes_mean, impressions_total, impressions_mean,
    likes_total_toxic, likes_mean_toxic,
    impressions_total_toxic, impressions_mean_toxic
  ) |>
  # All kept columns are constant within a username, so distinct() collapses
  # the per-tweet rows to one row per username.
  distinct() |>
  # Drop the grouping so later verbs do not silently operate per username.
  ungroup()

detoxify_AT_authors

[1m[22m[36mi[39m In argument: `public_metrics_impression_count =
  as.numeric(public_metrics_impression_count)`.
[33m![39m NAs introduced by coercion"


username,pct_toxic_tweets,n_toxic_tweets,likes_total,likes_mean,impressions_total,impressions_mean,likes_total_toxic,likes_mean_toxic,impressions_total_toxic,impressions_mean_toxic
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
@picardonhealth,4.3317791,4190,354727,3.667301,2937926,30.3733808,6249,1.4914081,15408,3.677326969
@AlikaMD,2.5654485,391,81723,5.362050,689182,45.2189489,1069,2.7340153,995,2.544757033
@hgagneTVA,0.3048780,2,847,1.291159,1834,2.7957317,0,0.0000000,0,0.000000000
@AntibioticDoc,3.4317485,895,88381,3.388842,674267,25.8537960,1378,1.5396648,6014,6.719553073
@ErnieHudsonPEI,2.7548209,20,2485,3.422865,29989,41.3071625,33,1.6500000,821,41.050000000
@imgrund,4.3653384,7023,527387,3.278119,265934,1.6529857,13308,1.8949167,5741,0.817456927
@shazmamithani,4.0609137,576,48525,3.421108,428484,30.2089679,921,1.5989583,4718,8.190972222
@DeNovo_Fatima,3.5545410,436,42132,3.434861,42291,3.4478233,657,1.5068807,1362,3.123853211
@GermhunterMD,0.9823308,154,57072,3.640492,753653,48.0738024,327,2.1233766,1074,6.974025974
@moirawyton,2.1928666,83,17269,4.562483,152462,40.2805812,248,2.9879518,766,9.228915663


In [5]:
# Load the reference table of accounts (columns: username, id).
# NOTE(review): `id` is parsed as a double; if these are long account IDs,
# confirm that no precision is lost before matching on them.
nfrf_IDs <- read_csv(file = "nfrf_IDs.csv")

[1mRows: [22m[34m110[39m [1mColumns: [22m[34m2[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m (1): username
[32mdbl[39m (1): id

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [6]:
# Per-author_id summary, restricted to authors with at least one tweet flagged
# as toxic.
#   *_total / *_mean              : computed over all tweets of the author
#   *_total_toxic / *_mean_toxic  : computed over the toxic tweets only
# Rows with an NA in any selected column (including impression counts that
# cannot be coerced to numeric) are dropped before aggregating.
detoxify_BY_authors <- detoxify_filtered |>
  select(
    public_metrics_like_count, public_metrics_impression_count,
    author_id, toxicity
  ) |>
  mutate(
    public_metrics_impression_count =
      as.numeric(public_metrics_impression_count)
  ) |>
  na.omit() |>
  group_by(author_id) |>
  mutate(
    n_total_tweets = n(),
    n_toxic_tweets = sum(toxicity),
    pct_toxic_tweets = n_toxic_tweets / n_total_tweets * 100,
    likes_total = sum(public_metrics_like_count),
    likes_mean = mean(public_metrics_like_count),
    impressions_total = sum(public_metrics_impression_count),
    impressions_mean = mean(public_metrics_impression_count)
  ) |>
  # From here on only toxic tweets remain, so the aggregates below describe
  # the toxic subset of each author.
  filter(toxicity == 1) |>
  mutate(
    likes_total_toxic = sum(public_metrics_like_count),
    likes_mean_toxic = mean(public_metrics_like_count),
    impressions_total_toxic = sum(public_metrics_impression_count),
    impressions_mean_toxic = mean(public_metrics_impression_count)
  ) |>
  select(
    n_toxic_tweets, author_id,
    likes_total, likes_mean, impressions_total, impressions_mean,
    likes_total_toxic, likes_mean_toxic,
    impressions_total_toxic, impressions_mean_toxic
  ) |>
  # Flag authors whose ID appears in the nfrf_IDs reference table.
  # NOTE(review): nfrf_IDs$id is read as a double; confirm author_id has a
  # compatible type so that %in% matches as intended.
  mutate(is_health_official = author_id %in% nfrf_IDs$id) |>
  # All kept columns are constant within an author, so distinct() collapses
  # the per-tweet rows to one row per author.
  distinct() |>
  # Drop the grouping; otherwise a later select() silently re-adds author_id.
  ungroup()
detoxify_BY_authors

[1m[22m[36mi[39m In argument: `public_metrics_impression_count =
  as.numeric(public_metrics_impression_count)`.
[33m![39m NAs introduced by coercion"


: 

In [None]:
# Persist both author-level summary tables as CSV files.
write_csv(x = detoxify_BY_authors, file = "detoxify_BY_authors.csv")
write_csv(x = detoxify_AT_authors, file = "detoxify_AT_authors.csv")

In [None]:
# Long-format table for plotting: keep authors with more than 50 toxic tweets,
# order them by toxic-tweet count (descending), and stack the four overall
# like/impression metrics into a `metric` / `likes_impressions` pair.
detoxify_BY_authors_filtered <- detoxify_BY_authors |>
  filter(n_toxic_tweets > 50) |>
  arrange(desc(n_toxic_tweets)) |>
  pivot_longer(
    cols = all_of(
      c("likes_mean", "likes_total", "impressions_mean", "impressions_total")
    ),
    names_to = "metric",
    values_to = "likes_impressions"
  ) |>
  select(n_toxic_tweets, likes_impressions, metric, is_health_official)

In [None]:
# Display the long-format table built above.
detoxify_BY_authors_filtered

In [None]:
# Size of the plot as rendered in the notebook.
options(repr.plot.width = 15, repr.plot.height = 15)

# Scatter plot with a smoothed trend of each like/impression metric against
# the number of toxic tweets, one facet per metric, on a log10 y axis.
BY_graph <- detoxify_BY_authors_filtered |>
  ggplot(aes(x = n_toxic_tweets, y = likes_impressions)) +
  geom_point(alpha = 0.1) +
  geom_smooth(color = "navy") +
  scale_y_log10() +
  facet_wrap(~ metric, scales = "free_y") +
  # The facets cover both like and impression metrics, so label the axis for
  # both rather than "likes" only.
  labs(y = "likes / impressions") +
  # theme_bw() is a complete theme: it must come first, otherwise it
  # overwrites the axis-text rotation set in theme().
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Render the plot.
BY_graph

In [None]:
# Sanity check: number of tweets whose toxicity_score is at least 0.5,
# returned as a one-row table with a single column `n`.
detoxify_filtered_test <- detoxify_filtered |>
  filter(toxicity_score >= 0.5) |>
  select(toxicity_score) |>
  count(name = "n")
detoxify_filtered_test

In [None]:
# NOTE: this re-creates `detoxify_filtered` without the `toxicity` flag column,
# replacing any earlier object of the same name.
detoxify_filtered <- detoxify |>
  select(
    text, author_id, username,
    public_metrics_impression_count, public_metrics_like_count,
    toxicity_score
  )

# For every candidate threshold between 0.1 and 0.9 (step 0.05), count how
# many tweets have a toxicity_score at or above it. The comparison is done on
# the whole score vector at once instead of looping tweet by tweet; NA scores
# are ignored rather than aborting the computation.
values <- data.frame(values = seq(0.1, 0.9, by = 0.05))
values$count <- vapply(
  values$values,
  function(threshold) {
    sum(detoxify_filtered$toxicity_score >= threshold, na.rm = TRUE)
  },
  numeric(1)
)

In [None]:
# Display the threshold / count table.
values

In [None]:
# Bar chart, with overlaid points and a connecting line, of the number of
# tweets at or above each toxicity threshold.
# `linewidth` replaces `size` for bar outlines and lines, since `size` is
# deprecated for line widths as of ggplot2 3.4.0.
thesholds <- values |>
  ggplot(aes(x = values, y = count)) +
  geom_col(fill = "white", color = "black", linewidth = 1) +
  geom_point(color = "red", size = 4, alpha = 0.5) +
  geom_line(color = "blue", linewidth = 1) +
  scale_x_continuous(breaks = seq(0.1, 0.9, by = 0.05)) +
  labs(
    x = "Threshold (between 0.1 and 0.9)",
    y = "Count",
    title = "Number of Toxicity Scores Above a Given Toxicity Threshold"
  ) +
  theme_bw()
thesholds