In [None]:
# author: Jana Lasser

In [1]:
import pandas as pd
from os.path import join
import numpy as np

from scipy.stats import spearmanr

# parallelisation functionality
from multiprocess import Pool
import psutil
from tqdm import tqdm

In [2]:
# output directory: every bootstrap result file in this notebook is written here
dst = "../../data/bootstrapping"

# Tweets

In [3]:
# Load the tweet corpus, restricted to the columns listed in `cols`;
# `created_at` is parsed into timestamps while reading.
src = "../../data/tweets"
fname = "tweets.csv.gzip"
cols = ["retweeted", "created_at", "avg_belief_score", "avg_truth_score", "party"]
read_options = dict(
    compression="gzip",
    parse_dates=["created_at"],
    dtype={"id":str},
    usecols=cols,
)
tweets = pd.read_csv(join(src, fname), **read_options)

In [4]:
# drop tweets without honesty component score (distill RoBERTa filtering):
# a row is removed if either avg_belief_score or avg_truth_score is missing
tweets = tweets.dropna(subset=["avg_belief_score", "avg_truth_score"])

In [5]:
# set tweet creation date as index for easier sampling and aggregation
# (the bootstrap functions below group on tweets.index.year / .month)
tweets = tweets.set_index("created_at")

## Honesty scores

In [6]:
def run_bootstrap_belief(i):
    """Compute one bootstrap replicate of the monthly mean belief score per party.

    The module-level `tweets` frame (which must carry a datetime index) is
    resampled with replacement at its full size, and `avg_belief_score` is
    averaged within every (year, month, party) group of the resample.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. It is stored in the `run` column and is
        also used as the seed of the resampling, so every run is reproducible
        and distinct runs draw distinct samples.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, party) group with the columns
        `year`, `month`, `party`, `avg_belief_score` and `run`.
    """
    # Seed the draw with the run index, as the LIWC and user bootstraps in this
    # notebook already do. An unseeded draw falls back to the global NumPy
    # RNG; NOTE(review): with a fork-based worker pool the workers presumably
    # start from identical RNG state, which would duplicate replicates.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    sample_index = tweet_sample.index

    belief = (
        tweet_sample[["avg_belief_score", "party"]]
        .groupby(by=[sample_index.year, sample_index.month, "party"])
        .mean()
    )

    # name the three grouping levels before flattening them into columns
    belief.index = belief.index.set_names(["year", "month", "party"])
    belief = belief.reset_index()
    belief["run"] = i
    return belief

In [7]:
# Run the belief-score bootstrap in parallel and store all runs in one file.
fname = "belief.csv.gzip"
N_bootstrap = 1000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order, so the row order of the output is not
# tied to the run index; use the `run` column to identify a replicate.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_belief,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Concatenate once at the end instead of growing a frame inside the loop,
# which re-copies all previous runs on every iteration.
belief_score_bootstrap = pd.concat(bootstrap_runs, ignore_index=True)
belief_score_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:24<00:00,  3.79it/s]


In [8]:
def run_bootstrap_truth(i):
    """Compute one bootstrap replicate of the monthly mean truth score per party.

    The module-level `tweets` frame (which must carry a datetime index) is
    resampled with replacement at its full size, and `avg_truth_score` is
    averaged within every (year, month, party) group of the resample.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. It is stored in the `run` column and is
        also used as the seed of the resampling, so every run is reproducible
        and distinct runs draw distinct samples.

    Returns
    -------
    pandas.DataFrame
        One row per (year, month, party) group with the columns
        `year`, `month`, `party`, `avg_truth_score` and `run`.
    """
    # Seed the draw with the run index, as the LIWC and user bootstraps in this
    # notebook already do. An unseeded draw falls back to the global NumPy
    # RNG; NOTE(review): with a fork-based worker pool the workers presumably
    # start from identical RNG state, which would duplicate replicates.
    tweet_sample = tweets.sample(frac=1, replace=True, random_state=i)
    sample_index = tweet_sample.index

    truth = (
        tweet_sample[["avg_truth_score", "party"]]
        .groupby(by=[sample_index.year, sample_index.month, "party"])
        .mean()
    )

    # name the three grouping levels before flattening them into columns
    truth.index = truth.index.set_names(["year", "month", "party"])
    truth = truth.reset_index()
    truth["run"] = i
    return truth

In [9]:
# Run the truth-score bootstrap in parallel and store all runs in one file.
fname = "truth.csv.gzip"
N_bootstrap = 1000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order, so the row order of the output is not
# tied to the run index; use the `run` column to identify a replicate.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_truth,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Concatenate once at the end instead of growing a frame inside the loop,
# which re-copies all previous runs on every iteration.
truth_score_bootstrap = pd.concat(bootstrap_runs, ignore_index=True)
truth_score_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:10<00:00,  3.98it/s]


## LIWC and VADER scores

In [10]:
# directory of the tweet data, read again below with the LIWC / VADER columns
src = "../../data/tweets"

In [11]:
# Reload the tweets, this time with the LIWC and VADER columns in addition
# to the honesty component scores. Rows with missing scores are kept here.
fname = "tweets.csv.gzip"
cols = ["party", "created_at", "avg_belief_score",
        "avg_truth_score", "LIWC_authentic", "LIWC_analytic", "LIWC_moral", 
        "VADER_neg", "VADER_pos"]
read_options = dict(
    compression="gzip",
    parse_dates=["created_at"],
    usecols=cols,
)
tweets = pd.read_csv(join(src, fname), **read_options)

In [12]:
# Score thresholds used to label tweets: the 0.8 quantile marks the top 20%
# of a score ("belief" / "truth" label), the 0.2 quantile marks the bottom
# 20% ("neutral_belief" / "neutral_truth" label).
belief_scores = tweets["avg_belief_score"]
truth_scores = tweets["avg_truth_score"]
belief_cutoff_top = belief_scores.quantile(0.8)
truth_cutoff_top = truth_scores.quantile(0.8)
belief_cutoff_bottom = belief_scores.quantile(0.2)
truth_cutoff_bottom = truth_scores.quantile(0.2)

In [13]:
# 0/1 indicator columns: strictly above the top cutoff -> "belief" / "truth",
# at or below the bottom cutoff -> "neutral_*". Rows with a missing score
# compare as False and therefore get 0 in every indicator.
flag_conditions = {
    "belief": tweets["avg_belief_score"] > belief_cutoff_top,
    "truth": tweets["avg_truth_score"] > truth_cutoff_top,
    "neutral_belief": tweets["avg_belief_score"] <= belief_cutoff_bottom,
    "neutral_truth": tweets["avg_truth_score"] <= truth_cutoff_bottom,
}
for flag, condition in flag_conditions.items():
    tweets[flag] = condition.astype("int64")

In [14]:
# Single categorical label per tweet, derived from the four indicator columns.
# The column is created with object dtype up front: filling a float64 NaN
# column with strings relies on implicit dtype upcasting, which newer pandas
# versions deprecate.
tweets["honesty_component"] = pd.Series(np.nan, index=tweets.index, dtype=object)

# NOTE(review): the indicators are not mutually exclusive, so later labels in
# this list overwrite earlier ones (e.g. a tweet flagged both "belief" and
# "neutral_truth" ends up as "neutral_truth"). The order is kept as it was.
for label in ["belief", "truth", "neutral_belief", "neutral_truth"]:
    tweets.loc[tweets[label] == 1, "honesty_component"] = label

In [15]:
# Mean and standard deviation of each LIWC / VADER measure per honesty label;
# tweets without a label (NaN) are left out by the groupby.
cols = ["honesty_component", "LIWC_analytic", "LIWC_authentic", "LIWC_moral",
        "VADER_neg", "VADER_pos"]
labelled_measures = tweets[cols]
labelled_measures.groupby(["honesty_component"]).agg(["mean", "std"])

Unnamed: 0_level_0,LIWC_analytic,LIWC_analytic,LIWC_authentic,LIWC_authentic,LIWC_moral,LIWC_moral,VADER_neg,VADER_neg,VADER_pos,VADER_pos
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
honesty_component,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
belief,63.816813,29.18205,33.540176,33.156391,0.903643,1.965382,0.085163,0.098149,0.158631,0.127333
neutral_belief,88.35784,15.783461,24.724051,28.387783,0.270979,1.129438,0.037345,0.069616,0.123823,0.110382
neutral_truth,85.195942,20.471554,24.756807,29.658462,0.54451,1.606829,0.037122,0.074383,0.15131,0.1388
truth,73.998799,26.223247,28.207291,31.461025,0.58889,1.709724,0.074363,0.094174,0.141605,0.115433


In [16]:
# index by creation time so that the bootstrap can group on index.year / index.month
tweets = tweets.set_index("created_at")

In [17]:
def _aggregate_LIWC_by_month_and_party(sample, cols, i):
    """Aggregate the columns `cols` of `sample` per (year, month, party) for run `i`."""
    flat_names = ["year", "month", "party"] + [
        f"{col}_{stat}" for col in cols for stat in ("sum", "count")
    ]

    grouped = (
        sample[cols + ["party"]]
        .groupby(by=[sample.index.year, sample.index.month, "party"])
        .agg(["sum", "count"])
    )
    grouped.index = grouped.index.set_names(["year", "month", "party"])
    grouped = grouped.reset_index()
    # flatten the two-level (measure, statistic) header into "<measure>_<stat>"
    grouped.columns = flat_names

    # "<measure>_share" is sum / count, i.e. the group mean over the
    # non-missing values ("count" ignores NaN)
    for col in cols:
        grouped[f"{col}_share"] = grouped[f"{col}_sum"] / grouped[f"{col}_count"]
    grouped["run"] = i
    return grouped


def run_bootstrap_LIWC(i):
    """Compute one bootstrap replicate of the monthly LIWC / VADER aggregates.

    The module-level `tweets` frame (which must carry a datetime index) is
    resampled with replacement at its full size, seeded with `i`. The sums,
    counts and sum/count ratios of the LIWC and VADER columns are computed per
    (year, month, party) for the whole resample and for the four subsets in
    which the `belief`, `truth`, `neutral_belief` or `neutral_truth` indicator
    equals 1.

    Parameters
    ----------
    i : int
        Index of the bootstrap run; used as random seed and stored in `run`.

    Returns
    -------
    tuple of pandas.DataFrame
        Five frames in the order (all tweets, belief, truth, neutral_belief,
        neutral_truth). Each has the columns `year`, `month`, `party`,
        `<measure>_sum`, `<measure>_count`, `<measure>_share` and `run`.
    """
    cols = ["LIWC_analytic", "LIWC_authentic", "LIWC_moral",
            "VADER_pos", "VADER_neg"]

    df_sample = tweets.sample(frac=1, replace=True, random_state=i)
    grouping = _aggregate_LIWC_by_month_and_party(df_sample, cols, i)

    # all subsets are taken from the same resample, so the five frames of one
    # run are consistent with each other
    subset_groupings = tuple(
        _aggregate_LIWC_by_month_and_party(df_sample[df_sample[flag] == 1], cols, i)
        for flag in ("belief", "truth", "neutral_belief", "neutral_truth")
    )

    return (grouping,) + subset_groupings

In [18]:
# Run the LIWC / VADER bootstrap in parallel and store the five result tables.
N_bootstrap = 1000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order; use the `run` column to identify a run.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_LIWC,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Every run yields a 5-tuple of frames; transpose the list of tuples into five
# lists and concatenate each list once instead of growing frames in the loop.
(overall_runs, belief_runs, truth_runs,
 neutral_belief_runs, neutral_truth_runs) = zip(*bootstrap_runs)

LIWC_bootstrap = pd.concat(list(overall_runs), ignore_index=True)
LIWC_belief_bootstrap = pd.concat(list(belief_runs), ignore_index=True)
LIWC_truth_bootstrap = pd.concat(list(truth_runs), ignore_index=True)
LIWC_neutral_belief_bootstrap = pd.concat(list(neutral_belief_runs), ignore_index=True)
LIWC_neutral_truth_bootstrap = pd.concat(list(neutral_truth_runs), ignore_index=True)

fname = "LIWC.csv.gzip"
LIWC_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")
fname = "LIWC_belief.csv.gzip"
LIWC_belief_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")
fname = "LIWC_truth.csv.gzip"
LIWC_truth_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")
fname = "LIWC_neutral_belief.csv.gzip"
LIWC_neutral_belief_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")
fname = "LIWC_neutral_truth.csv.gzip"
LIWC_neutral_truth_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [15:59<00:00,  1.04it/s]


In [19]:
# drop the reference to the tweet frame; it is not used below this point
del tweets

# URLs

In [19]:
# Load the URL table; `created_at` is parsed into timestamps while reading.
src = "../../data/urls"
fname = "urls.csv.gzip"
read_options = dict(compression="gzip", parse_dates=["created_at"])
urls = pd.read_csv(join(src, fname), **read_options)

In [21]:
# drop entries without an honesty component score
# (a row is removed if either avg_belief_score or avg_truth_score is missing)
urls = urls.dropna(subset=["avg_belief_score", "avg_truth_score"])

In [22]:
# Boolean coverage indicators: True where the respective rating is present
# (non-missing) for the URL, False otherwise.
urls["has_NG_score"] = urls["NG_score"].notna()
urls["has_independent_score"] = urls["independent_unreliable"].notna()

In [23]:
# index by creation time so that the bootstrap can group on index.year / index.month
urls = urls.set_index("created_at")

## NewsGuard coverage

In [24]:
def run_bootstrap_NG_coverage(i):
    """Compute one bootstrap replicate of the monthly NewsGuard coverage per party.

    The module-level `urls` frame (which must carry a datetime index) is
    resampled with replacement at its full size. Within every
    (year, month, party) group, coverage is the number of rows with
    `has_NG_score` set divided by the number of rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. It is stored in the `run` column and is
        also used as the seed of the resampling, so every run is reproducible
        and distinct runs draw distinct samples.

    Returns
    -------
    pandas.DataFrame
        Columns `year`, `month`, `party`, `has_NG_score_sum`,
        `has_NG_score_count`, `NG_coverage` and `run`.
    """
    # Seed the draw with the run index, as the LIWC and user bootstraps in this
    # notebook already do. An unseeded draw falls back to the global NumPy
    # RNG; NOTE(review): with a fork-based worker pool the workers presumably
    # start from identical RNG state, which would duplicate replicates.
    url_sample = urls.sample(frac=1, replace=True, random_state=i)
    sample_index = url_sample.index

    coverage = (
        url_sample[["has_NG_score", "party"]]
        .groupby(by=[sample_index.year, sample_index.month, "party"])
        .agg(["sum", "count"])
    )

    coverage.index = coverage.index.set_names(["year", "month", "party"])
    coverage = coverage.reset_index()
    # flatten the two-level (column, statistic) header
    coverage.columns = ["year", "month", "party", "has_NG_score_sum", "has_NG_score_count"]
    coverage["NG_coverage"] = coverage["has_NG_score_sum"] / coverage["has_NG_score_count"]
    coverage["run"] = i
    return coverage

In [25]:
# Run the NewsGuard coverage bootstrap in parallel and store all runs in one file.
fname = "NG_coverage.csv.gzip"
N_bootstrap = 1000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order; use the `run` column to identify a run.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_NG_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Concatenate once at the end instead of growing a frame inside the loop,
# which re-copies all previous runs on every iteration.
NG_coverage_bootstrap = pd.concat(bootstrap_runs, ignore_index=True)
NG_coverage_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:50<00:00, 19.94it/s]


## Independent list coverage

In [26]:
def run_bootstrap_independent_coverage(i):
    """Compute one bootstrap replicate of the monthly independent-list coverage per party.

    The module-level `urls` frame (which must carry a datetime index) is
    resampled with replacement at its full size. Within every
    (year, month, party) group, coverage is the number of rows with
    `has_independent_score` set divided by the number of rows in the group.

    Parameters
    ----------
    i : int
        Index of the bootstrap run. It is stored in the `run` column and is
        also used as the seed of the resampling, so every run is reproducible
        and distinct runs draw distinct samples.

    Returns
    -------
    pandas.DataFrame
        Columns `year`, `month`, `party`, `has_independent_score_sum`,
        `has_independent_score_count`, `independent_coverage` and `run`.
    """
    # Seed the draw with the run index, as the LIWC and user bootstraps in this
    # notebook already do. An unseeded draw falls back to the global NumPy
    # RNG; NOTE(review): with a fork-based worker pool the workers presumably
    # start from identical RNG state, which would duplicate replicates.
    url_sample = urls.sample(frac=1, replace=True, random_state=i)
    sample_index = url_sample.index

    coverage = (
        url_sample[["has_independent_score", "party"]]
        .groupby(by=[sample_index.year, sample_index.month, "party"])
        .agg(["sum", "count"])
    )

    coverage.index = coverage.index.set_names(["year", "month", "party"])
    coverage = coverage.reset_index()
    # flatten the two-level (column, statistic) header
    coverage.columns = ["year", "month", "party", "has_independent_score_sum", "has_independent_score_count"]
    coverage["independent_coverage"] = coverage["has_independent_score_sum"] / coverage["has_independent_score_count"]
    coverage["run"] = i
    return coverage

In [27]:
# Run the independent-list coverage bootstrap in parallel and store all runs in one file.
fname = "independent_coverage.csv.gzip"
N_bootstrap = 1000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order; use the `run` column to identify a run.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_independent_coverage,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Concatenate once at the end instead of growing a frame inside the loop,
# which re-copies all previous runs on every iteration.
independent_coverage_bootstrap = pd.concat(bootstrap_runs, ignore_index=True)
independent_coverage_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:50<00:00, 19.62it/s]


In [28]:
# drop the reference to the URL frame; it is not used below this point
del urls

# Users

In [29]:
# Load the user table (uncompressed CSV, default parsing); the correlation
# bootstrap below resamples its rows.
src = "../../data/users"
fname = "users.csv"
users = pd.read_csv(join(src, fname))

## Politifact, NG score & unreliable correlations

In [30]:
def _pairwise_corr(sample, first, second):
    """Return the `DataFrame.corr()` entry between columns `first` and `second`."""
    # .iloc[0] selects the `first` column by position; indexing the row with
    # a bare [0] would be a positional lookup on a label-indexed Series,
    # which newer pandas versions deprecate.
    return sample[[first, second]].corr().loc[second].iloc[0]


def run_bootstrap_reliability_score_correlations(i):
    """Compute one bootstrap replicate of the correlations between reliability measures.

    The module-level `users` frame is resampled with replacement at its full
    size, seeded with `i`, and the pairwise correlations (as computed by
    `DataFrame.corr()` with its default settings) between the user-level
    reliability measures are collected.

    Parameters
    ----------
    i : int
        Index of the bootstrap run; used as random seed and stored in `run`.

    Returns
    -------
    pandas.DataFrame
        A single row with eight `corr_*` columns and the `run` column.
    """
    user_sample = users.sample(frac=1, replace=True, random_state=i)

    # NOTE(review): only the first entry keeps its sign; all other entries are
    # absolute values. Confirm that this asymmetry is intended.
    # NOTE(review): the keys "corr_NGScore_accuracy" and
    # "corr_NGScore_transparency" suggest NG_score_mean, but the computation
    # uses NG_unreliable_share. Left unchanged; verify which one is meant.
    pf_bootstrap = pd.DataFrame({
        "corr_NGScore_pf":[
            _pairwise_corr(user_sample, "NG_score_mean", "pf_score")],
        "corr_ind_pf":[np.abs(
            _pairwise_corr(user_sample, "independent_unreliable_share", "pf_score"))],
        "corr_NGScore_ind":[np.abs(
            _pairwise_corr(user_sample, "NG_score_mean", "independent_unreliable_share"))],
        "corr_NGShare_pf":[np.abs(
            _pairwise_corr(user_sample, "NG_unreliable_share", "pf_score"))],
        "corr_NGShare_ind":[np.abs(
            _pairwise_corr(user_sample, "NG_unreliable_share", "independent_unreliable_share"))],
        "corr_NGScore_NGShare":[np.abs(
            _pairwise_corr(user_sample, "NG_unreliable_share", "NG_score_mean"))],
        "corr_NGScore_accuracy":[np.abs(
            _pairwise_corr(user_sample, "NG_unreliable_share", "accuracy_mean"))],
        "corr_NGScore_transparency":[np.abs(
            _pairwise_corr(user_sample, "NG_unreliable_share", "transparency_mean"))],
        "run":[i]
    })
    return pf_bootstrap

In [31]:
# Run the reliability-score correlation bootstrap in parallel and store all runs.
fname = "user_reliability_score_correlations.csv.gzip"
N_bootstrap = 10000

# The context manager releases the worker processes even if a run raises.
# Results arrive in completion order; use the `run` column to identify a run.
with Pool(10) as pool:
    bootstrap_runs = list(tqdm(
        pool.imap_unordered(
            func=run_bootstrap_reliability_score_correlations,
            iterable=range(N_bootstrap)),
        total=N_bootstrap))

# Concatenate once at the end: growing a frame inside the loop re-copies all
# previous rows on each of the 10000 iterations.
pf_bootstrap = pd.concat(bootstrap_runs, ignore_index=True)
pf_bootstrap.to_csv(join(dst, fname), index=False, compression="gzip")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:07<00:00, 1255.71it/s]


In [32]:
# drop the reference to the user table; nothing below uses it
del users