In [None]:
# author: Jana Lasser

In [1]:
import pandas as pd
import numpy as np
from os.path import join
from scipy.stats import linregress

# Create a URL data frame

## Expand URL lists

In [2]:
# load the cleaned timeline-data
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
cols = ["id", "author_id", "created_at", "expanded_urls",
        "retweeted", "quoted", "reply"]
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=cols)

In [3]:
# parse the URL lists
tweets["expanded_urls"] = tweets["expanded_urls"].fillna("[]")
tweets["expanded_urls"] = tweets["expanded_urls"].apply(lambda x: eval(x))
tweets["has_url"] = tweets["expanded_urls"].apply(lambda x: len(x) > 0)

In [4]:
tweets["N_urls"] = tweets["expanded_urls"].apply(lambda x: len(x))

In [5]:
# expand only entries with multiple URLs
multiple_urls = tweets[tweets["N_urls"] > 1]
expanded_urls = pd.DataFrame()
for idx, entry in multiple_urls.iterrows():
    row = {key:val for key, val in entry.items()}
    expanded_urls = pd.concat([expanded_urls, pd.DataFrame(row)])
    
expanded_urls = expanded_urls.set_index("id")
urls = tweets.copy()
urls = urls.set_index("id")
# drop entries with mutiple URLs
urls = urls.drop(multiple_urls["id"].values)
# add expanded entries with one line for each URL
urls = pd.concat([urls, expanded_urls])
urls = urls.reset_index()

In [6]:
len(urls)

5072662

In [None]:
# now, some URLs are stored as singular entries of a list, and some as string.
# empty entries are stored as empty list. Below we streamline URL entries such
# that every entry is a single string
def extract_URL_from_list(entry):
    if len(entry) == 0:
        return np.nan
    elif len(entry) == 1:
        return entry[0]
    else:
        return entry
    
urls["expanded_urls"] = urls["expanded_urls"].apply(extract_URL_from_list)
urls = urls.rename(columns={"expanded_urls":"url"})

In [9]:
# some tweets contain the same URL twice. We drop these
N = len(urls)
urls = urls.drop_duplicates(subset=["id", "url"])
print(f"dropped {N - len(urls)} duplicate URL entries")

dropped 338509 duplicate URL entries


In [10]:
del tweets

In [11]:
# save the outcome
dst = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean_urls.csv.gzip"
urls.to_csv(join(dst, fname), compression="gzip", index=False)

In [2]:
# load the data frame with the expanded URLs
src = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean_urls.csv.gzip"
cols = ["id", "author_id", "created_at", "url", "retweeted",
        "quoted", "reply", "has_url"]
urls = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=cols,
    parse_dates=["created_at"],
    dtype={"author_id":str, "id":str}
)

In [3]:
len(urls)

4734153

In [4]:
# load the public metrics information for the collected tweets
# note: this is not needed for the analysis in this publication, but might be
# handy for analyses of tweet engagement metrics
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
tweet_metrics = pd.read_csv(join(src, fname),
                 compression="gzip",
                 usecols=["id", "retweet_count",
                          "reply_count", "like_count", "quote_count"],
                dtype={"id":str})
tweet_metrics = tweet_metrics.drop_duplicates(subset="id")
# merge the tweet metrics with the tweet data frame
urls = pd.merge(urls, tweet_metrics, how="left", left_on="id", right_on="id")
del tweet_metrics

## Add unraveled URLs

In [5]:
# load the list of originally shortened URLs with their expansions to their true
# destination
src = "../../data/urls"
fname = "US_unraveled_urls.csv.xz"
unraveled_urls = pd.read_csv(
    join(src, fname), 
    compression="xz",
    usecols=["url", "unraveled_url"]
)

In [6]:
# add URL information
urls = pd.merge(
    urls,
    unraveled_urls[["url", "unraveled_url"]],
    how="left",
    left_on="url",
    right_on="url"
)

# add indicator of whether the URL was originally shortened
urls["shortened_url"] = False
urls.loc[urls["unraveled_url"].dropna().index, "shortened_url"] = True

# replace the shortened URL with the unraveled URL
urls.loc[urls["unraveled_url"].dropna().index, "url"] = \
    urls.loc[urls["unraveled_url"].dropna().index, "unraveled_url"]
urls = urls.drop(columns=["unraveled_url"])

In [7]:
# extract the domain from the URL. Note: a few "found malformed URL" warnings
# are acceptable
def extract_domain(url):
    '''Given an ULR, extracts the domain name in the form XXXXX.YY'''
    if url != url:
        return np.nan
    # reformat entries that have the domain after a general name in parantheses
    if url.find('(') > 0:
        url = url.split('(')[-1]
        url = url.strip(')')
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol
    if url.startswith(("http", "https")):
        try:
            url = url.split('//')[1]
        except IndexError:
            print(f"found malformed URL {url}")
            return np.nan
    # remove "www." 
    url = url.replace('www.', '')
    url = url.split("/")[0]
    return url

urls["domain"] = urls["url"].apply(extract_domain)

found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL https
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http

## Add NewsGuard nutrition scores

Newsguard rating threshold to label a domain as "unreliable": 60 (see [description](https://www.newsguardtech.com/ratings/rating-process-criteria/)).

In [8]:
threshold = 60

In [9]:
# load the nutrition labels
src = "../../data/utilities/"
fname = "NewsGuard_labels.csv"
cols = ["Domain", "Score", "Last Updated"]
NG_scores = pd.read_csv(join(src, fname), usecols=cols)
# if more than one score exists for the same domain, keep the most recent one
NG_scores = NG_scores.sort_values(by=["Domain","Last Updated"], ascending=False)
NG_scores = NG_scores.drop_duplicates(subset=["Domain"])
NG_scores = NG_scores.rename(columns={"Domain":"domain", "Score":"NG_score"})
NG_scores = NG_scores.drop(columns=["Last Updated"])

# threshold scores at various cutoffs to define untrustworthy domains
NG_scores["NG_unreliable"] = 0
NG_scores.loc[NG_scores[NG_scores["NG_score"] < threshold].index, "NG_unreliable"] = 1

# add the nutrition information to the tweet data table
urls = pd.merge(urls, NG_scores,
         left_on="domain", right_on="domain", how="left")
del NG_scores

## Add alternative trustworthiness labels

In [10]:
# load the list of independently compiled trustworthiness labels for 
# news sources
src = "../../data/utilities"
fname = "independent_labels.csv"
alt_labels = pd.read_csv(join(src, fname))
alt_labels = alt_labels.rename(columns = {
    "type":"independent_unreliable", 
    "url":"domain"})

# convert reliability labels to binary
alt_labels["independent_unreliable"] = alt_labels["independent_unreliable"]\
    .replace({"reliable":0, "unreliable":1})

# merge with the tweet data table
urls = pd.merge(urls, alt_labels[["accuracy", "transparency", 
        "independent_unreliable", "domain"]], how="left", left_on="domain",
         right_on="domain")
del alt_labels

## Tweet length

In [11]:
# load the cleaned timeline-data
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
cols = ["id", "text"]
texts = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=cols,
    dtype={"id":str, "text":str}
)

texts["tweet_length"] = texts["text"].apply(lambda x: len(x))

urls = pd.merge(
    urls,
    texts[["id", "tweet_length"]],
    how="left",
    left_on="id",
    right_on="id"
)

## Add truth seeking & belief speaking scores

### Glove

In [12]:
# load the embedding scores for belief-speaking and truth-seeking
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_glove.csv.gzip"
honesty_scores = pd.read_csv(
    join(src, fname),
    dtype={"id":str}, 
    compression="gzip"
).rename(columns={"avg_truth_score":"avg_truth_score_raw", "avg_belief_score":"avg_belief_score_raw"})

In [13]:
honesty_scores = pd.merge(
    honesty_scores,
    urls[["id", "tweet_length"]],
    how="left",
    left_on="id",
    right_on="id"
)
honesty_scores = honesty_scores.dropna()

In [14]:
# correct the similarity scores for tweet-length effects
slope_belief, intercept_belief, rval_belief, pval_belief, stderr_belief = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_belief_score_raw"])
print(f"belief-speaking slope: {slope_belief}, intercept: {intercept_belief}")

def predict_belief_similarity(tweet_length):
    return intercept_belief + slope_belief * tweet_length

slope_truth, intercept_truth, rval_truth, pval_truth, stderr_truth = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_truth_score_raw"])
print(f"truth-seeking slope: {slope_truth}, intercept: {intercept_truth}")

def predict_truth_similarity(tweet_length):
    return intercept_truth + slope_truth * tweet_length

honesty_scores["avg_belief_score"] = honesty_scores\
    .apply(lambda x: x["avg_belief_score_raw"] - predict_belief_similarity(x["tweet_length"]), axis=1)

honesty_scores["avg_truth_score"] = honesty_scores\
    .apply(lambda x: x["avg_truth_score_raw"] - predict_truth_similarity(x["tweet_length"]), axis=1)

belief-speaking slope: 0.0009353730302445349, intercept: 0.36703874244674656
truth-seeking slope: 0.0010236247437594504, intercept: 0.3052734641526922


In [16]:
# merge the raw and corrected scores with the url data frame
cols = ["id", "avg_truth_score_raw", "avg_truth_score", "avg_belief_score_raw", "avg_belief_score"]
urls = pd.merge(
    honesty_scores[cols], 
    urls, 
    how="right", 
    left_on="id", 
    right_on="id"
)

### Add truth seeking & belief speaking scores for dictionary bootstraps

**Note** include this code if you have generated honesty component similarities using the bootstrapped dictionaries by running `label_lexicon_loop.sh`.

In [17]:
'''
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_glove_bootstrap.csv.gzip"
honesty_scores_bootstrap = pd.read_csv(
    join(src, fname),
    compression="gzip",
    dtype={"id":str},
)
'''

'\nsrc = "../../data/tweets"\nfname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_glove_bootstrap.csv.gzip"\nhonesty_scores_bootstrap = pd.read_csv(\n    join(src, fname),\n    compression="gzip",\n    dtype={"id":str},\n)\n'

In [18]:
'''
honesty_scores_bootstrap = pd.merge(
    honesty_scores_bootstrap,
    urls[["id", "tweet_length"]],
    how="left",
    left_on="id",
    right_on="id"
)
honesty_scores_bootstrap = honesty_scores_bootstrap.dropna()
honesty_scores_bootstrap = honesty_scores_bootstrap.drop_duplicates(subset=["id"])
'''

'\nhonesty_scores_bootstrap = pd.merge(\n    honesty_scores_bootstrap,\n    urls[["id", "tweet_length"]],\n    how="left",\n    left_on="id",\n    right_on="id"\n)\nhonesty_scores_bootstrap = honesty_scores_bootstrap.dropna()\nhonesty_scores_bootstrap = honesty_scores_bootstrap.drop_duplicates(subset=["id"])\n'

In [None]:
'''
for i in range(100):
    print(i)
    honesty_scores_bootstrap[f"avg_belief_score_{i}"] = honesty_scores_bootstrap\
        .apply(lambda x: x[f"avg_belief_score_{i}"] - predict_belief_similarity(x["tweet_length"]), axis=1)    
    honesty_scores_bootstrap[f"avg_truth_score_{i}"] = honesty_scores_bootstrap\
        .apply(lambda x: x[f"avg_truth_score_{i}"] - predict_truth_similarity(x["tweet_length"]), axis=1)

honesty_scores_bootstrap = honesty_scores_bootstrap.drop(columns=["tweet_length"])
'''

In [29]:
'''
urls = pd.merge(
    honesty_scores_bootstrap, 
    urls, 
    how="right", 
    left_on="id", 
    right_on="id"
)
del honesty_scores_bootstrap
'''

### word2vec

In [19]:
# load the embedding scores for belief-speaking and truth-seeking
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_word2vec.csv.gzip"
honesty_scores = pd.read_csv(
    join(src, fname),
    dtype={"id":str}, 
    compression="gzip"
)
honesty_scores = honesty_scores.rename(columns={
    "avg_truth_score":"avg_truth_score_word2vec_raw",
    "avg_belief_score":"avg_belief_score_word2vec_raw"
})

In [20]:
honesty_scores = pd.merge(
    honesty_scores,
    urls[["id", "tweet_length"]],
    how="left",
    left_on="id",
    right_on="id"
)
honesty_scores = honesty_scores.dropna()

In [21]:
# correct the similarity scores for tweet-length effects
slope_belief, intercept_belief, rval_belief, pval_belief, stderr_belief = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_belief_score_word2vec_raw"])
print(f"belief-speaking slope: {slope_belief}, intercept: {intercept_belief}")

def predict_belief_similarity(tweet_length):
    return intercept_belief + slope_belief * tweet_length

slope_truth, intercept_truth, rval_truth, pval_truth, stderr_truth = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_truth_score_word2vec_raw"])
print(f"truth-seeking slope: {slope_truth}, intercept: {intercept_truth}")

def predict_truth_similarity(tweet_length):
    return intercept_truth + slope_truth * tweet_length

honesty_scores["avg_belief_score_word2vec"] = honesty_scores\
    .apply(lambda x: x["avg_belief_score_word2vec_raw"] - predict_belief_similarity(x["tweet_length"]), axis=1)

honesty_scores["avg_truth_score_word2vec"] = honesty_scores\
    .apply(lambda x: x["avg_truth_score_word2vec_raw"] - predict_truth_similarity(x["tweet_length"]), axis=1)

belief-speaking slope: 0.0005525060956405117, intercept: 0.5790686291835863
truth-seeking slope: 0.0006087550972715038, intercept: 0.516270415882894


In [22]:
cols = ["id", "avg_truth_score_word2vec_raw", "avg_truth_score_word2vec", "avg_belief_score_word2vec_raw", "avg_belief_score_word2vec"]
urls = pd.merge(
    honesty_scores[cols], 
    urls, 
    how="right", 
    left_on="id", 
    right_on="id"
)
del honesty_scores

### fasttext

In [23]:
# load the embedding scores for belief-speaking and truth-seeking
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_fasttext.csv.gzip"
honesty_scores = pd.read_csv(
    join(src, fname),
    dtype={"id":str}, 
    compression="gzip"
)
honesty_scores = honesty_scores.rename(columns={
    "avg_truth_score":"avg_truth_score_fasttext_raw",
    "avg_belief_score":"avg_belief_score_fasttext_raw"
})

In [24]:
honesty_scores = pd.merge(
    honesty_scores,
    urls[["id", "tweet_length"]],
    how="left",
    left_on="id",
    right_on="id"
)
honesty_scores = honesty_scores.dropna()

In [25]:
honesty_scores = honesty_scores.drop_duplicates("id")

In [26]:
# correct the similarity scores for tweet-length effects
slope_belief, intercept_belief, rval_belief, pval_belief, stderr_belief = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_belief_score_fasttext_raw"])
print(f"belief-speaking slope: {slope_belief}, intercept: {intercept_belief}")

def predict_belief_similarity(tweet_length):
    return intercept_belief + slope_belief * tweet_length

slope_truth, intercept_truth, rval_truth, pval_truth, stderr_truth = \
    linregress(honesty_scores["tweet_length"], honesty_scores["avg_truth_score_fasttext_raw"])
print(f"truth-seeking slope: {slope_truth}, intercept: {intercept_truth}")

def predict_truth_similarity(tweet_length):
    return intercept_truth + slope_truth * tweet_length

honesty_scores["avg_belief_score_fasttext"] = honesty_scores\
    .apply(lambda x: x["avg_belief_score_fasttext_raw"] - predict_belief_similarity(x["tweet_length"]), axis=1)

honesty_scores["avg_truth_score_fasttext"] = honesty_scores\
    .apply(lambda x: x["avg_truth_score_fasttext_raw"] - predict_truth_similarity(x["tweet_length"]), axis=1)

belief-speaking slope: 0.0005501307210788646, intercept: 0.3800728900605277
truth-seeking slope: 0.0005520439469570941, intercept: 0.34329250235666736


In [27]:
cols = ["id", "avg_truth_score_fasttext_raw", "avg_truth_score_fasttext", "avg_belief_score_fasttext_raw", "avg_belief_score_fasttext"]
urls = pd.merge(
    honesty_scores[cols], 
    urls, 
    how="right", 
    left_on="id", 
    right_on="id"
)
del honesty_scores

## Add party affiliation

In [28]:
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
party_affiliation = pd.read_csv(
    join(src, fname), 
    usecols=["author_id", "party", "name", "handle"],
    dtype={"author_id":str}
)
urls = pd.merge(urls, party_affiliation, how="left", left_on="author_id",
    right_on="author_id")
del party_affiliation

## Export URLs for article scraping & statistical modelling

In [29]:
# export the list of all URLs for article text straping
dst = "../../data/articles/"
fname = "url_list_for_article_scraping.csv.gzip"
url_export = urls[["domain", "url", "party"]].copy()
url_export = url_export.drop_duplicates().dropna(subset=["url"])
url_export.to_csv(join(dst, fname), index=False, compression="gzip")

fname = "url_NG_scores.csv.gzip"
url_export = urls[["url", "NG_score", "party"]].copy()
url_export = url_export.drop_duplicates().dropna(subset=["url"])
url_export.to_csv(join(dst, fname), index=False, compression="gzip")

fname = "url_independent_scores.csv.gzip"
url_export = urls[["url", "accuracy", "transparency"]].copy()
url_export = url_export.drop_duplicates().dropna(subset=["url"])
url_export.to_csv(join(dst, fname), index=False, compression="gzip")

# Create a tweet data frame

In [30]:
# the current "url" data frame contains one row per URL, i.e. the same
# tweet can be present more than once. To calculate the share of tweets with
# unreliable information, we first calculate the mean NewsGuard score (and 
# mean accuracy and transparency) per tweet by averaging over all scores 
# of URLs that are present in a given tweet and then assigning "fishy" and
# "unreliable" labels on the tweet level

# columns that are defined on the tweet level
#tweet_cols = ["id", "author_id", "created_at", "retweeted", "quoted", "reply",
#              "has_url", "handle", "name", "party", "tweet_length"] + \
#             ["avg_belief_score", "avg_truth_score"] + \
#             ["avg_belief_score_word2vec", "avg_truth_score_word2vec"] + \
#             ["avg_belief_score_fasttext", "avg_truth_score_fasttext"] + \
#             [f"avg_belief_score_{i}" for i in range(100)] + \
#             [f"avg_truth_score_{i}" for i in range(100)]

# note: use above columns if you run the script including the dictionary 
# robustness data
tweet_cols = ["id", "author_id", "created_at", "retweeted", "quoted", "reply",
              "has_url", "handle", "name", "party", "tweet_length"] + \
             ["avg_belief_score", "avg_truth_score"] + \
             ["avg_belief_score_word2vec", "avg_truth_score_word2vec"] + \
             ["avg_belief_score_fasttext", "avg_truth_score_fasttext"]
tweets = urls[tweet_cols].drop_duplicates(subset=["id"]).copy()

## Add LIWC scores

In [31]:
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean_LIWC.csv.gzip"
cols = ["id", "Analytic", "Authentic", "moral"]
LIWC_scores = pd.read_csv(
    join(src, fname), 
    compression="gzip",
    usecols=cols,
    dtype={"id":str},
)
LIWC_scores = LIWC_scores.rename(columns={
    "Analytic":"LIWC_analytic",
    "Authentic":"LIWC_authentic",
    "moral":"LIWC_moral",
})
tweets = pd.merge(tweets, LIWC_scores, how="left", left_on="id", right_on="id")

## Add VADER scores

In [32]:
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean_VADER.csv.gzip"
VADER_scores = pd.read_csv(
    join(src, fname), 
    compression="gzip",
    dtype={"id":str},
)
VADER_scores = VADER_scores.rename(columns={
    "neg":"VADER_neg",
    "pos":"VADER_pos",
    "neu":"VADER_neu",
    "compound":"VADER_compound"
})
tweets = pd.merge(tweets, VADER_scores, how="left", left_on="id", right_on="id")

## Calculate average NewsGuard score and misinfo components

In [33]:
average_scores = urls[["id", "NG_score", "transparency", "accuracy"]]\
    .groupby("id")\
    .agg("mean")

average_scores["NG_unreliable"] = np.nan
average_scores.loc[average_scores[\
            average_scores["NG_score"] < 60].index, "NG_unreliable"] = 1
average_scores.loc[average_scores[\
            average_scores["NG_score"] >= 60].index, "NG_unreliable"] = 0

## Calculate average accuracy & transparency score and unreliable domains

In [34]:
average_scores["independent_unreliable"] = np.nan
# original definition: sources with transparency = 1 are unreliable
# since transparency can have non-integer values after averaging, we decide
# to label tweets with an average domain transparency value of links of
# <= 1.5 as "unreliable", since that means that the majority of domains 
# linked to in the tweet are unreliable. If one domain with transparency 1
# and one domain with transparency 2 are linked, the tweet is unreliable
average_scores.loc[average_scores[\
            average_scores["transparency"] <= 1.5].index, "independent_unreliable"] = 1
average_scores.loc[average_scores[\
            average_scores["transparency"] > 1.5].index, "independent_unreliable"] = 0
# original defintion: sources with accuracy = 1 or 2 are unreliable
# since accuracy can have non-integer values after averaging, we decide to
# label tweets with an average domain accuracy value of links of <= 2.5 as
# "unreliable", since that means that the majority of domains linked to in 
# the tweet are unreliable. If one domain with accuracy 2 and one domain 
# with accuracy 3 are linked, the tweet is unreliable.
average_scores.loc[average_scores[\
            average_scores["accuracy"] <= 2.5].index, "independent_unreliable"] = 1
average_scores.loc[average_scores[\
            average_scores["accuracy"] > 2.5].index, "independent_unreliable"] = 0

tweets = pd.merge(tweets, average_scores, how="left", left_on="id", right_on="id")
del average_scores

# Create a user data frame

In [35]:
users = tweets[["author_id", "handle", "name", "party", "id"]]\
    .groupby(["author_id", "handle", "name", "party"])\
    .agg("count")\
    .reset_index()\
    .rename(columns={"id":"N_tweets"})

## Add account stats

In [36]:
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
cols = ["followers_count", "following_count", "tweet_count", "created_at", 
        "author_id"]
account_stats = pd.read_csv(
    join(src, fname),
    parse_dates=["created_at"],
    usecols=cols,
    dtype={"author_id":str}
)

users = pd.merge(users, account_stats, how="left", left_on="author_id", right_on="author_id")
del account_stats

## Add Congress information

In [37]:
src = "../../data/users/clean"
fname = "congress-member-twitter-handles_114-118.csv"
congress_twitter_handles = pd.read_csv(join(src, fname))
congress_twitter_handles = congress_twitter_handles\
    .sort_values(by="congress", ascending=False)\
    .drop_duplicates(subset="handle")\
    .reset_index(drop=True)

users = pd.merge(users, congress_twitter_handles, how="left", left_on="handle",
                 right_on="handle")
del congress_twitter_handles

## Add share of untrustworthy domains (NewsGuard)

In [38]:
cols = ["author_id", "NG_unreliable"]
unreliable_user_count = tweets[tweets["retweeted"] == False][cols]\
    .groupby("author_id")\
    .agg(["sum", "count"])

unreliable_user_count["NG_unreliable_share"] = \
    unreliable_user_count["NG_unreliable"]["sum"] / \
    unreliable_user_count["NG_unreliable"]["count"]
    
# flatten the hierarchical indices
unreliable_user_count = unreliable_user_count.reset_index()
unreliable_user_count.columns = ['_'.join(col).strip("_") \
                            for col in unreliable_user_count.columns.values]

unreliable_user_count.head(2)

Unnamed: 0,author_id,NG_unreliable_sum,NG_unreliable_count,NG_unreliable_share
0,1002630999052865536,0.0,100,0.0
1,1004891731,1.0,249,0.004016


In [39]:
cols = ["NG_unreliable_share", "author_id"]
users = pd.merge(
    users, 
    unreliable_user_count[cols],
    how="left",
    left_on="author_id",
    right_on="author_id"
)

## Add average NewsGuard score

In [40]:
average_NG_scores = tweets[tweets["retweeted"] == False][["author_id", "NG_score"]]\
    .groupby("author_id")\
    .mean()\
    .reset_index()\
    .rename(columns={"NG_score":"NG_score_mean"})
users = pd.merge(
    users, 
    average_NG_scores, 
    how="left", 
    left_on="author_id", 
    right_on="author_id"
)

## Add average accuracy & transparency score

In [41]:
average_accuracy_transparency = tweets[tweets["retweeted"] == False]\
    [["author_id", "accuracy", "transparency"]]\
    .groupby("author_id")\
    .mean()\
    .reset_index()\
    .rename(columns={
        "accuracy":"accuracy_mean",
        "transparency":"transparency_mean"
    })
users = pd.merge(
    users, 
    average_accuracy_transparency, 
    how="left", 
    left_on="author_id", 
    right_on="author_id"
)

## Add share of unstrustworthy domains (independent list)

In [42]:
unreliable_user_count = tweets[tweets["retweeted"] == False]\
    [["author_id", "independent_unreliable"]]\
    .groupby("author_id")\
    .agg(["sum", "count"])

unreliable_user_count["independent_unreliable_share"] = \
    unreliable_user_count["independent_unreliable"]["sum"] / \
    unreliable_user_count["independent_unreliable"]["count"]
    
# flatten the hierarchical indices
unreliable_user_count = unreliable_user_count.reset_index()
unreliable_user_count.columns = ['_'.join(col).strip("_") \
                            for col in unreliable_user_count.columns.values]

users = pd.merge(
    users, 
    unreliable_user_count[["author_id", "independent_unreliable_share"]],
    how="left", 
    left_on="author_id", 
    right_on="author_id"
)
del unreliable_user_count

## Add average belief-speaking and truth-seeking score

In [43]:
honesty_tweets_score = tweets[tweets["retweeted"] == False]\
    [["author_id", "avg_belief_score", "avg_truth_score", "created_at"]]\
    .dropna(subset=["avg_belief_score", "avg_truth_score"])\
    .copy()

In [44]:
# all honesty component tweets
honesty_score_average = honesty_tweets_score\
    [["author_id", "avg_belief_score", "avg_truth_score"]]\
    .groupby("author_id")\
    .mean()

honesty_score_average = honesty_score_average.reset_index()

In [45]:
honesty_tweets_score = honesty_tweets_score.set_index("created_at")

In [46]:
# only first 4 years
honesty_score_average_first = honesty_tweets_score[honesty_tweets_score.index.year <= 2013]\
    .groupby("author_id")\
    .mean()

honesty_score_average_first = honesty_score_average_first.reset_index()
cols = ["avg_belief_score", "avg_truth_score"]
honesty_score_average_first = honesty_score_average_first\
    .rename(columns={col:col + "_2010_to_2013" for col in cols}) 

In [47]:
# only last 4 years
honesty_score_average_last = honesty_tweets_score[honesty_tweets_score.index.year >= 2019]\
    .groupby("author_id")\
    .mean()
honesty_score_average_last = honesty_score_average_last.reset_index()
cols = ["avg_belief_score", "avg_truth_score"]
honesty_score_average_last = honesty_score_average_last\
    .rename(columns={col:col + "_2019_to_2022" for col in cols}) 

In [48]:
users = users.merge(honesty_score_average[["author_id", "avg_belief_score", 
                    "avg_truth_score"]], how="left", left_on="author_id", 
                    right_on="author_id")
del honesty_score_average

users = users.merge(
    honesty_score_average_first,
    how="left",
    left_on="author_id", 
    right_on="author_id"
)
del honesty_score_average_first

users = users.merge(
    honesty_score_average_last,
    how="left",
    left_on="author_id", 
    right_on="author_id"
)
del honesty_score_average_last

## Add average LIWC and VADER scores

In [49]:
cols = ['LIWC_analytic', 'LIWC_authentic', 'LIWC_moral',
        'VADER_neg', 'VADER_pos', 'VADER_neu', 'VADER_compound']
average_scores = tweets\
    [["author_id"] + cols]\
    .groupby("author_id")\
    .mean()\
    .reset_index()
score_map = {score:f"{score}_mean" for score in cols}
average_scores = average_scores.rename(columns=score_map)

users = pd.merge(
    users, 
    average_scores, 
    how="left", 
    left_on="author_id", 
    right_on="author_id"
)

## Add ideology scores and states

In [50]:
src = "../../data/utilities"
fname = "govtrack-stats-{}-{}-ideology.csv"
ideology_scores = pd.DataFrame()
# note: scorecards for 2021 are missing in govtrack
for year in list(range(2013, 2021)) + [2022]:
    for chamber in ["house", "senate"]:
        tmp = pd.read_csv(join(src, "ideology_scores",
                               fname.format(year, chamber)))
        tmp["year"] = year
        tmp["name"] = tmp["name"].apply(lambda x: x.replace("b'", ""))
        tmp["name"] = tmp["name"].apply(lambda x: x.replace("'", "").lower())
        ideology_scores = pd.concat([ideology_scores, tmp])

In [51]:
# match politician Twitter account names to govtrack politician names

# a single politician can have at maximum 9 entries for 9 different years
# 2013 to 2020 plus 2022
counts = ideology_scores["name"].value_counts()
unique_names = list(counts[counts <= 8].index)

unique_scores = ideology_scores[ideology_scores["name"].isin(unique_names)]\
    .sort_values(by="year", ascending=False)\
    .drop_duplicates(subset=["name"])\
    .set_index("name")
unique_names = list(set(unique_scores.index))

def match_score(account_name):
    '''Matches govtrack politician names to Twitter account names.'''
    if account_name == account_name:
        account_name = set(account_name.lower().split(" "))
        for name in unique_names:
            # hard matching: if the govtrack name string is completely included
            # in the Twitter account name string, record a match
            if name in account_name:
                return unique_scores.loc[name]["id"]
    else:
        return np.nan
    
users["ideology_score_id"] = users["name"].apply(match_score)

In [52]:
# add hand-matched missing scores
src = "../../data/utilities"
fname = "missing_govtrack_ideology_scores.csv"
missing_scores = pd.read_csv(join(src, fname))
missing_scores = {row["handle"]:row["ideology_score_id"] \
                  for i, row in missing_scores.iterrows()}

# index on the handle since this seems to be the most consistent index between
# the two datasets
users = users.set_index("handle")
for handle, score_id in missing_scores.items():
    if handle in users.index:
        users.loc[handle, "ideology_score_id"] = score_id
users = users.reset_index()

In [53]:
# for many accounts, there is more than one ideology score since they were 
# active over many years. We calculate the mean, std and count of the ideology
# score for each user and add this information to the user_df
ideology_scores_agg = ideology_scores[["id", "ideology"]]\
    .groupby("id")\
    .agg(["mean", "std", "count"])
ideology_scores_agg = ideology_scores_agg.reset_index()
ideology_scores_agg.columns = ['_'.join(col).strip("_") \
                            for col in ideology_scores_agg.columns.values]

# add a state for each politician
states = ideology_scores[["id", "state"]].drop_duplicates()
ideology_scores_agg = ideology_scores_agg.merge(states, how="left",
                left_on="id", right_on="id")

In [54]:
users = users.merge(ideology_scores_agg, how="left", 
                      left_on="ideology_score_id", right_on="id")

In [55]:
# add hand-matched missing states
missing_states = pd.read_csv(
    join(src, "missing_states.csv"),
    usecols=["handle", "state"]
).set_index("handle")

users = users.set_index("handle")
users.loc[missing_states.index, "state"] = missing_states
users = users.reset_index()

In [56]:
del ideology_scores
del ideology_scores_agg
del states
del missing_states

## Add Politifact scores

In [57]:
src = "../../data/utilities"
fname = "misinfo_score_politifact.csv"
pf_scores = pd.read_csv(join(src, fname), 
        usecols=["pf_score", "elite_account"])\
    .rename(columns={"elite_account":"handle"})

users = pd.merge(users, pf_scores, how="left", left_on="handle", right_on="handle")
del pf_scores

# Data exports

## URL, tweet and user data frames

**Note**: if you are running this code including the data for the dictionary robustness ananlysis, saving the files takes a while because they are pretty large.

In [58]:
dst = "../../data/"

In [59]:
# URL data frame
fname = "US_politician_URLs_2010-11-06_to_2022-12-31.csv.gzip"
urls = urls[urls["has_url"] == True]
urls = urls.drop(columns=["url", "handle", "name", "has_url"])
urls.to_csv(join(dst, "urls", fname), index=False, compression="gzip")

In [60]:
# user data frame
fname = "users.csv"
users = users.drop(columns=["ideology_score_id", "id"])
users.to_csv(join(dst, "users", fname), index=False)

In [61]:
# tweet data frame
fname = "US_politician_tweets_2010-11-06_to_2022-12-31.csv.gzip"
tweets = tweets.drop(columns=["handle", "name"])
tweets.to_csv(join(dst, "tweets", fname), index=False, compression="gzip")

## Data for linear mixed effects modelling

In [62]:
src = "../../data"

In [63]:
fname = "US_politician_tweets_2010-11-06_to_2022-12-31.csv.gzip"
cols = [
    "author_id", # data grouping: independent random variable
    "party", # characteristic of author: independent fixed variable
    "avg_belief_score", # fixed variable
    "avg_truth_score", # fixed variable
    "NG_score", # dependent variable
    "accuracy", # dependent variable
    "transparency", # dependent variable
]
tweets = pd.read_csv(
    join(src, "tweets", fname),
    compression="gzip", 
    usecols=cols, 
    dtype={"author_id":str},
)

In [64]:
fname = "users.csv"
cols = ["author_id", "ideology_mean"]
users = pd.read_csv(join(src, "users", fname), dtype={"author_id":str}, usecols=cols)

In [65]:
tweets = tweets[tweets["party"].isin(["Democrat", "Republican"])] # remove independents
tweets = tweets.dropna(subset=["avg_belief_score", "avg_truth_score"]) # remove tweets without NG, belief or truth score
len(tweets)

3876333

In [66]:
# filter out authors with only a single tweet
tweet_counts = tweets["author_id"]\
    .value_counts()\
    .reset_index()\
    .rename(columns={"index":"author_id", "author_id":"count"})

tweets = tweets[tweets["author_id"].isin(tweet_counts[tweet_counts["count"] > 1]["author_id"])]
len(tweets)

3876332

In [67]:
tweets = pd.merge(
    tweets,
    users,
    how="left"
)

In [68]:
# center similarity scores
tweets["belief"] = tweets["avg_belief_score"]# - tweets["avg_belief_score"].mean()
tweets["truth"] = tweets["avg_truth_score"]# - tweets["avg_truth_score"].mean()
# normalize trustworthiness scores by maximum scale value
tweets["NG"] = tweets["NG_score"] / 100
tweets["accuracy"] = tweets["accuracy"] / 5
tweets["transparency"] = tweets["transparency"] / 3

In [69]:
cols = ["author_id", "party", "belief", "truth"]
# only tweets that have a NewsGuard score
tweets_NG = tweets[cols + ["NG"]].dropna()
# only tweets that have an accuracy/transparency score
tweets_independent = tweets[cols + ["accuracy", "transparency"]].dropna()

dst = "../../data/tweets"
tweets_NG.to_csv(join(dst, "tweets_for_lme_modelling_NG.csv"), index=False)
tweets_independent.to_csv(join(dst, "tweets_for_lme_modelling_independent.csv"), index=False)

## Data for linear mixed effects modelling other embeddings

In [70]:
src = "../../data"

In [71]:
fname = "US_politician_tweets_2010-11-06_to_2022-12-31.csv.gzip"
cols = [
    "author_id", # data grouping: independent random variable
    "party", # characteristic of author: independent fixed variable
    "avg_belief_score_word2vec", # fixed variable
    "avg_truth_score_word2vec", # fixed variable
    "avg_belief_score_fasttext", # fixed variable
    "avg_truth_score_fasttext", # fixed variable
    "NG_score", # dependent variable
]
tweets = pd.read_csv(
    join(src, "tweets", fname),
    compression="gzip", 
    usecols=cols, 
    dtype={"author_id":str},
)

In [72]:
tweets = tweets[tweets["party"].isin(["Democrat", "Republican"])] # remove independents
tweets = tweets.dropna(subset=["avg_belief_score_word2vec", "avg_truth_score_word2vec",
                               "avg_belief_score_fasttext", "avg_truth_score_fasttext"]) # remove tweets without NG, belief or truth score
len(tweets)

3876333

In [73]:
# filter out authors with only a single tweet
tweet_counts = tweets["author_id"]\
    .value_counts()\
    .reset_index()\
    .rename(columns={"index":"author_id", "author_id":"count"})

tweets = tweets[tweets["author_id"].isin(tweet_counts[tweet_counts["count"] > 1]["author_id"])]
len(tweets)

3876332

In [74]:
# clean up column names
tweets = tweets.rename(columns={
    "avg_belief_score_word2vec":"belief_word2vec",
    "avg_belief_score_fasttext":"belief_fasttext",
    "avg_truth_score_word2vec":"truth_word2vec",
    "avg_truth_score_fasttext":"truth_fasttext"
})

# normalize trustworthiness scores by maximum scale value
tweets["NG"] = tweets["NG_score"] / 100

In [75]:
cols = ["author_id", "party", "belief_word2vec", "truth_word2vec", "belief_fasttext", "truth_fasttext"]
# only tweets that have a NewsGuard score
tweets = tweets[cols + ["NG"]].dropna()

dst = "../../data/tweets"
tweets.to_csv(join(dst, "tweets_for_lme_modelling_other_embeddings.csv"), index=False)

## Data for topic modelling

In [91]:
# read the tweet data
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
cols = ["id", "author_id", "text"]
tweets = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=cols,
    dtype={"id":str, "author_id":str}
)
tweets["id"] = tweets["id"].str.replace('"', '')
tweets["author_id"] = tweets["author_id"].str.replace('"', '')

# read the honesty scores
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_honesty_component_scores_glove.csv.gzip"
cols = ["id", "avg_belief_score", "avg_truth_score"]
honesty_scores = pd.read_csv(
    join(src, fname),
    compression="gzip",
    usecols=cols,
    dtype={"id":str}
)

# add honesty component scores
tweets = pd.merge(
    tweets,
    honesty_scores,
    how="left",
    left_on="id",
    right_on="id"
)

# add party information
tweets = pd.merge(
    tweets,
    users[["author_id", "party"]],
    how="left",
    left_on="author_id",
    right_on="author_id"
)

# drop all tweets without party and honesty score information (retweets and
# tweets with too short text that don't have honesty scores)
tweets = tweets.dropna()

# drop all tweets that are not from Democrats or Republicans
tweets = tweets[tweets["party"].isin(["Democrat", "Republican"])].copy()

In [97]:
# lemmatize text - note: this takes a while
from pandarallel import pandarallel
import spacy
pandarallel.initialize(nb_workers=20)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
tweets["lemmatized"] = tweets['text']\
    .parallel_apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))

tweets['lemmatized'] = tweets['lemmatized'].str.replace(r'\s+|\\n', ' ', regex=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Calculate quantiles

In [98]:
# identify tweets that are in the top belief-speaking similarity and 
# truth-seeking similarity quantiles and assign them to the "belief" and "truth"
# categories. If a tweet is in both top quantiles, assign them to the category
# with the higher similarity
belief_quant = tweets["avg_belief_score"].quantile(0.8)
truth_quant = tweets["avg_truth_score"].quantile(0.8)
tweets["belief"] = 0
tweets["truth"] = 0

# assign categories based on quantiles
tweets.loc[tweets[tweets["avg_belief_score"] >= belief_quant].index, "belief"] = 1
tweets.loc[tweets[tweets["avg_truth_score"] >= truth_quant].index, "truth"] = 1

# tweets that are in both categories
tweets.loc[tweets[
    (tweets["belief"] == 1) & (tweets["truth"] == 1) & \
    (tweets["avg_belief_score"] > tweets["avg_truth_score"])].index, "truth"] = 0
tweets.loc[tweets[
    (tweets["belief"] == 1) & (tweets["truth"] == 1) & \
    (tweets["avg_truth_score"] > tweets["avg_belief_score"])].index, "belief"] = 0

In [99]:
# assign tweets to honesty component X party categories
tweets["classes_quant"] = np.nan
tweets.loc[tweets[(tweets["party"] == "Democrat") & (tweets["belief"] == 1)].index, "classes_quant"] = "db"
tweets.loc[tweets[(tweets["party"] == "Democrat") & (tweets["truth"] == 1)].index, "classes_quant"] = "dt"
tweets.loc[tweets[(tweets["party"] == "Republican") & (tweets["belief"] == 1)].index, "classes_quant"] = "rb"
tweets.loc[tweets[(tweets["party"] == "Republican") & (tweets["truth"] == 1)].index, "classes_quant"] = "rt"

tweets.loc[tweets[(tweets["party"] == "Democrat") & (tweets["belief"] == 0) & (tweets["truth"] == 0)].index, "classes_quant"] = "dn"
tweets.loc[tweets[(tweets["party"] == "Republican") & (tweets["belief"] == 0) & (tweets["truth"] == 0)].index, "classes_quant"] = "rn"

In [100]:
# save the data
dst = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_lemma.csv.gzip"
cols = ["id", "author_id", "lemmatized", "party", "avg_belief_score",
        "avg_truth_score", "classes_quant"]
tweets[cols].to_csv(
    join(dst, fname),
    compression="gzip",
    index=False
)

In [64]:
# save the data
dst = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_lemma.csv.gzip"
tweets = pd.read_csv(
    join(dst, fname),
    compression="gzip",
    dtype={"id":str, "author_id":str},
    usecols=["id", "author_id", "lemmatized", "party", "classes_quant"]
)

# Articles

In [41]:
! ls ../../data/articles

article_corpus_clean.csv.gzip	  remaining_urls.csv.gzip
article_corpus_combined.csv.gzip  url_independent_scores.csv.gzip
article_corpus_raw_0.csv.gzip	  url_list_for_article_scraping.csv.gzip
article_corpus_raw.csv.gzip	  url_NG_scores.csv.gzip
raw_scrape_020323.csv


In [5]:
df1 = pd.read_csv("../../data/articles/article_corpus_clean.csv.gzip", compression="gzip")

In [6]:
df2 = pd.read_csv("../../data/articles/raw_scrape_020323.csv")

In [30]:
df = pd.concat([df1[["url", "link_text"]], df2])

In [43]:
df[["url", "link_text"]].to_csv("../../data/articles/article_corpus_combined.csv.gzip", index=False)

In [31]:
len(df)

611511

In [32]:
df = df.drop_duplicates(subset=["url"])

In [33]:
len(df)

610057

In [45]:
fname = "url_list_for_article_scraping.csv.gzip"
urls = pd.read_csv(join(dst, "articles", fname), compression="gzip")

In [35]:
df = pd.merge(df, urls, how="left", left_on="url", right_on="url").dropna()

In [37]:
len(df)

428815

In [38]:
df = df[df["link_text"] != ""]

In [39]:
len(df)

428815

In [None]:
str1 = "unfortunately our website is currently unavailable in most European countries due to GDPR rules"

In [44]:
urls

Unnamed: 0,domain,url,party
0,twitter.com,https://twitter.com/RepBillJohnson/status/1608...,Republican
1,twitter.com,https://twitter.com/RepBillJohnson/status/1608...,Republican
2,twitter.com,https://twitter.com/RepBillJohnson/status/1608...,Republican
3,twitter.com,https://twitter.com/RepBillJohnson/status/1608...,Republican
4,twitter.com,https://twitter.com/RepBillJohnson/status/1606...,Republican
...,...,...,...
2885053,thehill.com,https://thehill.com/blogs/congress-blog/econom...,Democrat
2885054,blog.chron.com,http://blog.chron.com/txpotomac/2012/05/the-be...,Democrat
2885055,cuellar.house.gov,https://cuellar.house.gov/news/rss.aspx,Democrat
2885056,cuellar.house.gov,https://cuellar.house.gov/news/documentsingle....,Democrat


In [48]:
df = pd.read_csv("../../data/articles/article_corpus_clean.csv.gzip", compression="gzip")

In [49]:
df.head()

Unnamed: 0,url,link_text,remove,wc,party,NG_score,fishy
0,https://www.washingtonpost.com/politics/2018/1...,"\n\nDays before the midterm elections, Preside...",keep,775,Democrat,100.0,non_fishy
1,https://apnews.com/eb82c8597cef4c95a473e525402...,Residents look at a home damaged by a magnitud...,keep,715,Democrat,95.0,non_fishy
2,https://www.usatoday.com/story/news/nation/201...,Federal judge blocks Trump from deporting hund...,keep,1116,Democrat,100.0,non_fishy
3,https://www.cnn.com/2018/10/03/politics/senate...,Washington (CNN) The Senate on Wednesday passe...,keep,144,Democrat,80.0,non_fishy
4,https://www.usatoday.com/story/travel/flights/...,Senate approves bill that would regulate airli...,keep,506,Democrat,100.0,non_fishy


In [50]:
len(df)

354335