In [None]:
# author: Jana Lasser

In [1]:
import pandas as pd
from os.path import join
import numpy as np

from scipy.stats import spearmanr

# parallelisation functionality
from multiprocess import Pool
import psutil
from tqdm import tqdm

In [2]:
# output directory: every bootstrap result table in this notebook is written
# to join(dst, <name> + ".csv")
dst = "../../data/bootstrapping"

# Tweets

In [3]:
# directory and file name of the gzip-compressed tweet table
src = "../../data/tweets"
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"

# load the table and parse the tweet creation time into datetimes
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    parse_dates=["created_at"],
)

In [4]:
# keep only rows whose "retweeted" flag is False, i.e. discard retweets
tweets = tweets.loc[tweets["retweeted"] == False]

In [5]:
# a tweet is kept only if BOTH honesty component scores are present
# (scores are missing for tweets removed by the distill RoBERTa filtering)
tweets = tweets.dropna(
    subset=["avg_belief_score", "avg_truth_score"],
    how="any",
)

In [6]:
# set tweet creation date as index for easier sampling and aggregation:
# the bootstrap functions below group by tweets.index.year and
# tweets.index.month
tweets = tweets.set_index("created_at")

In [7]:
# keep tweets from 2011 onwards
# NOTE(review): presumably 2010 is dropped because the data file only starts
# on 2010-11-06, so that year is only partially covered - confirm
tweets = tweets.loc[tweets.index.year >= 2011]

## Honesty scores

In [8]:
def run_bootstrap_belief(i):
    """Compute mean belief scores for one bootstrap resample of the tweets.

    The global ``tweets`` frame (indexed by creation date) is resampled with
    replacement to its original size, and ``avg_belief_score`` is averaged
    per calendar year, calendar month and party.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, party) with the columns ``year``,
        ``month``, ``party``, ``avg_belief_score`` and ``run``.
    """
    # Seed the draw with the run index, as the other seeded bootstrap
    # functions in this notebook do. An unseeded draw is not reproducible,
    # and worker processes that inherit the same global NumPy random state
    # could repeat each other's resamples.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    belief = tweet_sample[["avg_belief_score", "party"]]\
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])\
        .mean()

    # the two date-derived grouping levels carry no useful names of their own
    belief.index.set_names(["year", "month", "party"], inplace=True)
    belief = belief.reset_index()
    belief["run"] = i
    return belief

In [9]:
fname = "belief"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    belief_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_belief,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

belief_score_bootstrap = pd.concat(belief_runs, ignore_index=True)
belief_score_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:45<00:00,  4.43it/s]


In [10]:
def run_bootstrap_truth(i):
    """Compute mean truth scores for one bootstrap resample of the tweets.

    The global ``tweets`` frame (indexed by creation date) is resampled with
    replacement to its original size, and ``avg_truth_score`` is averaged
    per calendar year, calendar month and party.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, party) with the columns ``year``,
        ``month``, ``party``, ``avg_truth_score`` and ``run``.
    """
    # Seed the draw with the run index, as the other seeded bootstrap
    # functions in this notebook do. An unseeded draw is not reproducible,
    # and worker processes that inherit the same global NumPy random state
    # could repeat each other's resamples.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    truth = tweet_sample[["avg_truth_score", "party"]]\
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])\
        .mean()

    # the two date-derived grouping levels carry no useful names of their own
    truth.index.set_names(["year", "month", "party"], inplace=True)
    truth = truth.reset_index()
    truth["run"] = i
    return truth

In [11]:
fname = "truth"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    truth_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_truth,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

truth_score_bootstrap = pd.concat(truth_runs, ignore_index=True)
truth_score_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:43<00:00,  4.48it/s]


In [12]:
# drop the reference to the filtered tweet table; the name `tweets` is bound
# again further down when the file is re-read for the LIWC section
del tweets

# URLs

In [20]:
# directory and file name of the gzip-compressed URL table
src = "../../data/urls"
fname = "US_politician_URLs_2010-11-06_to_2022-03-16.csv.gzip"

# load the table and parse the creation time into datetimes
urls = pd.read_csv(join(src, fname), compression="gzip", parse_dates=["created_at"])

In [21]:
# keep only rows whose "retweeted" flag is False, i.e. discard retweets
urls = urls.loc[urls["retweeted"] == False]

# a row is kept only if BOTH honesty component scores are present
urls = urls.dropna(
    subset=["avg_belief_score", "avg_truth_score"],
    how="any",
)

In [22]:
# boolean flags: True where the respective reliability rating is not missing.
# They are aggregated into coverage shares by the bootstrap functions below.
urls["has_NG_score"] = urls["NG_score"].notna()
urls["has_independent_score"] = urls["independent_unreliable"].notna()

In [23]:
# index by creation date: the coverage bootstrap functions below group by
# the .year and .month of the index
urls = urls.set_index("created_at")

## NewsGuard coverage

In [24]:
# Exclude links that point to large social media platforms (twitter,
# facebook, youtube, instagram) or search sites (google, yahoo) before
# computing coverage.
# NOTE(review): the original comment also named e-commerce (amazon) sites,
# but no amazon domain is in this list - confirm whether one should be added.
excluded_domains = [
    "twitter.com",
    "youtube.com",
    "facebook.com",
    "instagram.com",
    "cards.twitter.com",
    "google.com",
    "yahoo.com",
]
urls_clean = urls.loc[~urls["domain"].isin(excluded_domains)]

In [25]:
def run_bootstrap_NG_coverage(i):
    """Compute NewsGuard coverage for one bootstrap resample of the URLs.

    The global ``urls_clean`` frame (indexed by creation date) is resampled
    with replacement to its original size. Per calendar year, calendar month
    and party, the number of rows with ``has_NG_score`` set is divided by
    the number of rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``party``, ``has_NG_score_sum``,
        ``has_NG_score_count``, ``NG_coverage`` and ``run``.
    """
    # Seed the draw with the run index, as the other seeded bootstrap
    # functions in this notebook do. An unseeded draw is not reproducible,
    # and worker processes that inherit the same global NumPy random state
    # could repeat each other's resamples.
    url_sample = urls_clean.sample(frac=1, replace=True, random_state=i)
    coverage = url_sample[["has_NG_score", "party"]]\
        .groupby(by=[url_sample.index.year, url_sample.index.month, "party"])\
        .agg(["sum", "count"])

    coverage.index.set_names(["year", "month", "party"], inplace=True)
    coverage = coverage.reset_index()
    # flatten the two-level (column, statistic) header produced by .agg
    coverage.columns = ["year", "month", "party", "has_NG_score_sum", "has_NG_score_count"]
    coverage["NG_coverage"] = coverage["has_NG_score_sum"] / coverage["has_NG_score_count"]
    coverage["run"] = i
    return coverage

In [26]:
fname = "NG_coverage"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    NG_coverage_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_NG_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

NG_coverage_bootstrap = pd.concat(NG_coverage_runs, ignore_index=True)
NG_coverage_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:48<00:00, 20.49it/s]


## Independent list coverage

In [27]:
def run_bootstrap_independent_coverage(i):
    """Compute independent-list coverage for one bootstrap resample of the URLs.

    The global ``urls_clean`` frame (indexed by creation date) is resampled
    with replacement to its original size. Per calendar year, calendar month
    and party, the number of rows with ``has_independent_score`` set is
    divided by the number of rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``party``,
        ``has_independent_score_sum``, ``has_independent_score_count``,
        ``independent_coverage`` and ``run``.
    """
    # Seed the draw with the run index, as the other seeded bootstrap
    # functions in this notebook do. An unseeded draw is not reproducible,
    # and worker processes that inherit the same global NumPy random state
    # could repeat each other's resamples.
    url_sample = urls_clean.sample(frac=1, replace=True, random_state=i)
    coverage = url_sample[["has_independent_score", "party"]]\
        .groupby(by=[url_sample.index.year, url_sample.index.month, "party"])\
        .agg(["sum", "count"])

    coverage.index.set_names(["year", "month", "party"], inplace=True)
    coverage = coverage.reset_index()
    # flatten the two-level (column, statistic) header produced by .agg
    coverage.columns = ["year", "month", "party", "has_independent_score_sum", "has_independent_score_count"]
    coverage["independent_coverage"] = coverage["has_independent_score_sum"] / coverage["has_independent_score_count"]
    coverage["run"] = i
    return coverage

In [28]:
fname = "independent_coverage"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    independent_coverage_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_independent_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

independent_coverage_bootstrap = pd.concat(independent_coverage_runs, ignore_index=True)
independent_coverage_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:48<00:00, 20.48it/s]


In [29]:
# drop the references to both URL tables; neither name is used again below
del urls
del urls_clean

# Users

In [30]:
# load the account table; the correlation bootstrap below reads its
# reliability columns (NG_score_mean, NG_unreliable_share,
# independent_unreliable_share, pf_score, accuracy_mean, transparency_mean)
# NOTE(review): presumably one row per politician account - confirm
src = "../../data/users"
fname = "US_politician_accounts_2010-11-06_to_2022-03-16.csv"
users = pd.read_csv(join(src, fname))

## Politifact, NG score & unreliable correlations

In [31]:
def run_bootstrap_reliability_score_correlations(i):
    """Correlate reliability measures on one bootstrap resample of the users.

    The global ``users`` frame is resampled with replacement to its original
    size, seeded with ``i``. Pairwise correlations (``DataFrame.corr``
    defaults) between several reliability columns are computed on the
    resample.

    Parameters
    ----------
    i : int
        Index of the bootstrap run; used as the resampling seed and stored
        in the ``run`` column.

    Returns
    -------
    pandas.DataFrame
        A single row with one ``corr_*`` column per column pair plus ``run``.

    Notes
    -----
    NOTE(review): ``corr_NGScore_pf`` is stored signed while every other
    entry is stored as an absolute value - confirm this is intended.
    NOTE(review): ``corr_NGScore_accuracy`` and ``corr_NGScore_transparency``
    are computed from ``NG_unreliable_share``, not ``NG_score_mean``, despite
    their names - confirm which column is meant.
    """
    user_sample = users.sample(frac=1, replace=True, random_state=i)

    def _corr(first, second):
        # Correlation of `first` with `second`, read from the 2x2 correlation
        # matrix: row `second`, first column. The column is picked with
        # .iloc because integer keys on a string-labelled Series are
        # deprecated as positional lookups.
        return user_sample[[first, second]].corr().loc[second].iloc[0]

    pf_bootstrap = pd.DataFrame({
        "corr_NGScore_pf": [
            _corr("NG_score_mean", "pf_score")],
        "corr_ind_pf": [
            np.abs(_corr("independent_unreliable_share", "pf_score"))],
        "corr_NGScore_ind": [
            np.abs(_corr("NG_score_mean", "independent_unreliable_share"))],
        "corr_NGShare_pf": [
            np.abs(_corr("NG_unreliable_share", "pf_score"))],
        "corr_NGShare_ind": [
            np.abs(_corr("NG_unreliable_share", "independent_unreliable_share"))],
        "corr_NGScore_NGShare": [
            np.abs(_corr("NG_unreliable_share", "NG_score_mean"))],
        "corr_NGScore_accuracy": [
            np.abs(_corr("NG_unreliable_share", "accuracy_mean"))],
        "corr_NGScore_transparency": [
            np.abs(_corr("NG_unreliable_share", "transparency_mean"))],
        "run": [i]
    })
    return pf_bootstrap

In [32]:
fname = "user_reliability_score_correlations"
N_bootstrap = 10000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the single-row results in a list and concatenate once
    # afterwards; concatenating inside the loop re-copies all earlier
    # results on each of the 10000 iterations.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    correlation_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_reliability_score_correlations,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

pf_bootstrap = pd.concat(correlation_runs, ignore_index=True)
pf_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:13<00:00, 716.87it/s]


In [33]:
# drop the reference to the account table; it is not used again below
del users

# LIWC scores

In [46]:
# the LIWC section re-reads the tweet file from the tweets directory
src = "../../data/tweets"

In [47]:
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"

# only the identifier, party, timestamp, honesty score and LIWC columns are
# read; the "retweeted" column is not loaded, so no retweet filter is applied
# in this section
cols = [
    "id", "author_id", "party", "created_at",
    "avg_belief_score", "avg_truth_score",
    "LIWC_authentic", "LIWC_analytic", "LIWC_moral",
    "LIWC_emo_pos", "LIWC_emo_neg",
]

# the two ID columns are read as strings
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    parse_dates=["created_at"],
    dtype={"id": str, "author_id": str},
    usecols=cols,
)

In [48]:
# Cutoffs for labelling tweets: a tweet whose belief [truth] score lies above
# the 0.8 quantile of all loaded tweets gets the "belief" ["truth"] label.
belief_cutoff, truth_cutoff = (
    tweets[score_column].quantile(0.8)
    for score_column in ("avg_belief_score", "avg_truth_score")
)

In [49]:
# 0/1 indicator columns derived from the cutoffs. A tweet can be flagged as
# both "belief" and "truth"; it is "neutral" only if neither score exceeds
# its cutoff. Comparisons with a missing score are False, so tweets without
# scores get 0 in all three columns.
tweets["belief"] = (tweets["avg_belief_score"] > belief_cutoff).astype("int64")
tweets["truth"] = (tweets["avg_truth_score"] > truth_cutoff).astype("int64")
tweets["neutral"] = (
    (tweets["avg_belief_score"] <= belief_cutoff)
    & (tweets["avg_truth_score"] <= truth_cutoff)
).astype("int64")

In [50]:
# Single categorical label per tweet, missing (NaN) for unlabelled tweets.
# The column is created with object dtype up front: writing strings into a
# float64 NaN column relies on silent upcasting, which pandas has deprecated.
tweets["honesty_component"] = pd.Series(np.nan, index=tweets.index, dtype=object)
tweets.loc[tweets["belief"] == 1, "honesty_component"] = "belief"
# "truth" is assigned last, so a tweet flagged as both ends up as "truth"
tweets.loc[tweets["truth"] == 1, "honesty_component"] = "truth"

In [51]:
# mean and standard deviation of every LIWC measure per honesty component
# (tweets without a label have a missing group key and are left out)
cols = [
    "honesty_component",
    "LIWC_analytic",
    "LIWC_authentic",
    "LIWC_moral",
    "LIWC_emo_pos",
    "LIWC_emo_neg",
]
tweets[cols].groupby(["honesty_component"]).agg(["mean", "std"])

Unnamed: 0_level_0,LIWC_analytic,LIWC_analytic,LIWC_authentic,LIWC_authentic,LIWC_moral,LIWC_moral,LIWC_emo_pos,LIWC_emo_pos,LIWC_emo_neg,LIWC_emo_neg
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
honesty_component,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
belief,61.150287,30.29334,48.946705,33.786139,1.074518,2.108067,1.259735,2.326057,0.786381,1.794334
truth,72.596671,26.069229,42.87705,31.795235,0.643749,1.601816,0.535682,1.319021,0.349778,1.10876


In [52]:
# index by creation date: run_bootstrap_LIWC groups by the .year and .month
# of the index
tweets = tweets.set_index("created_at")

In [53]:
def run_bootstrap_LIWC(i):
    """Aggregate LIWC measures on one bootstrap resample of the tweets.

    The global ``tweets`` frame (indexed by creation date) is resampled with
    replacement to its original size, seeded with ``i``. The five LIWC
    columns are then aggregated per calendar year, calendar month and party,
    once for the whole resample and once for each of the subsets flagged by
    the 0/1 columns ``belief``, ``truth`` and ``neutral``.

    Parameters
    ----------
    i : int
        Index of the bootstrap run; used as the resampling seed and stored
        in the ``run`` column of every returned table.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(all tweets, belief subset, truth subset, neutral subset)``. Each
        table has the columns ``year``, ``month``, ``party``, and per LIWC
        column ``<col>_sum``, ``<col>_count`` and ``<col>_share`` (sum
        divided by count), plus ``run``.
    """
    cols = ["LIWC_analytic", "LIWC_authentic", "LIWC_moral",
            "LIWC_emo_pos", "LIWC_emo_neg"]
    group_names = ["year", "month", "party"]
    # flat names for the (column, statistic) header produced by .agg, in the
    # same order: <col>_sum, <col>_count for each LIWC column
    newcols = group_names + [
        f"{col}_{stat}" for col in cols for stat in ("sum", "count")
    ]

    def _aggregate(frame):
        # One aggregation pass; it was previously copy-pasted four times.
        grouped = frame[cols + ["party"]]\
            .groupby(by=[frame.index.year, frame.index.month, "party"])\
            .agg(["sum", "count"])
        grouped.index.set_names(group_names, inplace=True)
        grouped = grouped.reset_index()
        grouped.columns = newcols
        for col in cols:
            grouped[f"{col}_share"] = grouped[f"{col}_sum"] / \
                grouped[f"{col}_count"]
        grouped["run"] = i
        return grouped

    df_sample = tweets.sample(frac=1, replace=True, random_state=i)

    grouping = _aggregate(df_sample)
    belief_grouping = _aggregate(df_sample[df_sample["belief"] == 1])
    truth_grouping = _aggregate(df_sample[df_sample["truth"] == 1])
    neutral_grouping = _aggregate(df_sample[df_sample["neutral"] == 1])

    return grouping, belief_grouping, truth_grouping, neutral_grouping

In [54]:
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Each run returns a 4-tuple of tables. They are collected in a list and
    # concatenated once afterwards; concatenating inside the loop re-copies
    # all earlier results on every iteration.
    # imap_unordered yields results in completion order, so the row order of
    # the outputs is not tied to the run index (use the "run" column).
    LIWC_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_LIWC,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# regroup the per-run tuples into one sequence of tables per subset
all_runs, belief_runs, truth_runs, neutral_runs = zip(*LIWC_runs)

LIWC_bootstrap = pd.concat(list(all_runs), ignore_index=True)
LIWC_belief_bootstrap = pd.concat(list(belief_runs), ignore_index=True)
LIWC_truth_bootstrap = pd.concat(list(truth_runs), ignore_index=True)
LIWC_neutral_bootstrap = pd.concat(list(neutral_runs), ignore_index=True)

# one CSV file per subset
for fname, LIWC_table in [
    ("LIWC", LIWC_bootstrap),
    ("LIWC_belief", LIWC_belief_bootstrap),
    ("LIWC_truth", LIWC_truth_bootstrap),
    ("LIWC_neutral", LIWC_neutral_bootstrap),
]:
    LIWC_table.to_csv(join(dst, fname + ".csv"), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [10:02<00:00,  1.66it/s]


# Code graveyard

## Honesty counts

In [8]:
def run_bootstrap_belief(i):
    """Compute the share of belief-labelled tweets for one bootstrap resample.

    NOTE(review): this definition has the same name as the score-based
    ``run_bootstrap_belief`` defined earlier in the notebook and replaces it
    when this cell is run.

    The global ``tweets`` frame (indexed by creation date) is resampled with
    replacement to its original size. Per calendar year, calendar month and
    party, the sum of the 0/1 ``belief`` column is divided by the number of
    rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``party``, ``belief_sum``,
        ``belief_count``, ``belief_share`` and ``run``.
    """
    # Seed the draw with the run index, as run_bootstrap_LIWC does. An
    # unseeded draw is not reproducible, and worker processes that inherit
    # the same global NumPy random state could repeat each other's resamples.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    belief = tweet_sample[["belief", "party"]]\
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])\
        .agg(["sum", "count"])

    belief.index.set_names(["year", "month", "party"], inplace=True)
    belief = belief.reset_index()
    # flatten the two-level (column, statistic) header produced by .agg
    belief.columns = ["year", "month", "party", "belief_sum", "belief_count"]
    belief["belief_share"] = belief["belief_sum"] / belief["belief_count"]
    belief["run"] = i
    return belief

In [9]:
# NOTE(review): this cell writes to "belief.csv", the same file the
# belief-score bootstrap earlier in the notebook writes to. Running the
# notebook top to bottom therefore replaces the score table with this share
# table - confirm which output is wanted before running.
fname = "belief"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    belief_share_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_belief,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

belief_bootstrap = pd.concat(belief_share_runs, ignore_index=True)
belief_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:34<00:00,  4.67it/s]


In [10]:
def run_bootstrap_truth(i):
    """Compute the share of truth-labelled tweets for one bootstrap resample.

    NOTE(review): this definition has the same name as the score-based
    ``run_bootstrap_truth`` defined earlier in the notebook and replaces it
    when this cell is run.

    The global ``tweets`` frame (indexed by creation date) is resampled with
    replacement to its original size. Per calendar year, calendar month and
    party, the sum of the 0/1 ``truth`` column is divided by the number of
    rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. Stored in the ``run`` column and used as
        the seed of the resampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``party``, ``truth_sum``,
        ``truth_count``, ``truth_share`` and ``run``.
    """
    # Seed the draw with the run index, as run_bootstrap_LIWC does. An
    # unseeded draw is not reproducible, and worker processes that inherit
    # the same global NumPy random state could repeat each other's resamples.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    truth = tweet_sample[["truth", "party"]]\
        .groupby(by=[tweet_sample.index.year, tweet_sample.index.month, "party"])\
        .agg(["sum", "count"])

    truth.index.set_names(["year", "month", "party"], inplace=True)
    truth = truth.reset_index()
    # flatten the two-level (column, statistic) header produced by .agg
    truth.columns = ["year", "month", "party", "truth_sum", "truth_count"]
    truth["truth_share"] = truth["truth_sum"] / truth["truth_count"]
    truth["run"] = i
    return truth

In [11]:
# NOTE(review): this cell writes to "truth.csv", the same file the
# truth-score bootstrap earlier in the notebook writes to. Running the
# notebook top to bottom therefore replaces the score table with this share
# table - confirm which output is wanted before running.
fname = "truth"
N_bootstrap = 1000

# The context manager shuts the worker pool down even if a run raises.
with Pool(10) as pool:
    # Collect the per-run tables in a list and concatenate once afterwards;
    # concatenating inside the loop re-copies all earlier results each time.
    # imap_unordered yields results in completion order, so the row order of
    # the output is not tied to the run index (use the "run" column instead).
    truth_share_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_truth,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

truth_bootstrap = pd.concat(truth_share_runs, ignore_index=True)
truth_bootstrap.to_csv(join(dst, fname + ".csv"), index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:11<00:00,  3.98it/s]
