In [None]:
# author: Jana Lasser

In [2]:
import pandas as pd
import numpy as np
from os.path import join

# Create a URL data frame

In [3]:
# Read the per-URL table of the politician timelines. Only the columns listed
# in `cols` are loaded, tweet and author IDs are read as strings and
# "created_at" is parsed into timestamps.
src = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.gzip"
cols = [
    "id", "author_id", "created_at", "url",
    "retweeted", "quoted", "reply", "has_url",
]
urls = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str, "author_id": str},
    parse_dates=["created_at"],
    compression="gzip",
)

## Add unraveled URLs

In [4]:
# Read the xz-compressed lookup table that maps originally shortened URLs to
# the destination they expand to.
src = "../../data/urls"
fname = "US_unraveled_urls.csv.xz"
unraveled_urls = pd.read_csv(
    join(src, fname),
    compression="xz",
)

In [5]:
# left-join the expansion table onto the URL table via the "url" column
urls = urls.merge(unraveled_urls, how="left", on="url")

# rows that received an expansion are the ones that were originally shortened
expanded_idx = urls["unraveled_url"].dropna().index
urls["shortened_url"] = False
urls.loc[expanded_idx, "shortened_url"] = True

# overwrite the shortened URL with its expansion and discard the helper column
urls.loc[expanded_idx, "url"] = urls.loc[expanded_idx, "unraveled_url"]
urls = urls.drop(columns=["unraveled_url"])

In [6]:
# extract the domain from the URL
def extract_domain(url):
    '''Given a URL, extracts the domain name in the form XXXXX.YY.

    Parameters
    ----------
    url : str or float
        The URL to process. A missing value (NaN) is passed through.

    Returns
    -------
    str or float
        The lowercase host without protocol, leading "www.", path, query
        string and fragment. NaN if the URL is missing or consists of a
        protocol only (e.g. "http"); the latter case is also reported on
        stdout.
    '''
    # NaN is the only value that is not equal to itself
    if url != url:
        return np.nan
    # reformat entries that have the domain after a general name in
    # parentheses, e.g. "Some News (somenews.com)". A "(" that comes after a
    # "/" is part of the path of a real URL and must be left alone.
    paren = url.find('(')
    if paren > 0 and '/' not in url[:paren]:
        url = url.split('(')[-1]
        url = url.strip(')')
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol. Only a real "http:"/"https:" prefix (or
    # a bare "http"/"https" remnant) counts as a protocol, so that hosts which
    # merely start with these letters (e.g. "httpbin.org") are kept.
    if url.startswith(("http:", "https:")) or url in ("http", "https"):
        try:
            url = url.split('//')[1]
        except IndexError:
            print(f"found malformed URL {url}")
            return np.nan
    # remove a leading "www." only; "www." inside a host name is kept
    if url.startswith('www.'):
        url = url[len('www.'):]
    # keep only the host: cut off path, query string and fragment
    for separator in ('/', '?', '#'):
        url = url.split(separator)[0]
    return url

# derive the domain of every row from its URL; extract_domain returns NaN for
# missing URLs and for URLs it reports as malformed
urls["domain"] = urls["url"].apply(extract_domain)

found malformed URL https
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http


## Add NewsGuard nutrition scores

NewsGuard rating cutoff: 60. Domains with a score below 60 are labelled as unreliable (see [description](https://www.newsguardtech.com/ratings/rating-process-criteria/)).

In [7]:
# load the nutrition labels. "Last Updated" is parsed as a date so that the
# sort below is chronological rather than alphabetical (if pandas cannot parse
# the column it stays a string and the sort falls back to string order).
src = "../../data/utilities/"
fname = "NewsGuard_labels.csv"
cols = ["Domain", "Score", "Last Updated"]
NG_scores = pd.read_csv(
    join(src, fname),
    usecols=cols,
    parse_dates=["Last Updated"]
)
# if more than one score exists for the same domain, keep the most recent one:
# after the descending sort the newest entry of each domain comes first and
# drop_duplicates keeps the first occurrence
NG_scores = NG_scores.sort_values(by=["Domain","Last Updated"], ascending=False)
NG_scores = NG_scores.drop_duplicates(subset=["Domain"])
NG_scores = NG_scores.rename(columns={"Domain":"domain", "Score":"NG_score"})
NG_scores = NG_scores.drop(columns=["Last Updated"])

# threshold the score at the cutoff of 60 to define untrustworthy domains.
# Domains without a score get no label (NaN) instead of being counted as
# reliable, mirroring the labelling that is later done on the tweet level.
NG_scores["NG_unreliable"] = np.nan
NG_scores.loc[NG_scores["NG_score"] < 60, "NG_unreliable"] = 1
NG_scores.loc[NG_scores["NG_score"] >= 60, "NG_unreliable"] = 0

# add the nutrition information to the URL data table
urls = pd.merge(urls, NG_scores,
         left_on="domain", right_on="domain", how="left")
del NG_scores

In [8]:
# write the distinct (url, domain, NG_score) combinations to a gzip-compressed
# CSV for article text scraping; rows without a URL are left out
dst = "../../data/urls/"
fname = "url_list_for_article_scraping.csv.gzip"
url_export = (
    urls[["url", "domain", "NG_score"]]
    .copy()
    .drop_duplicates()
    .dropna(subset=["url"])
)
url_export.to_csv(join(dst, fname), compression="gzip", index=False)
del url_export

## Add alternative trustworthiness labels

In [9]:
# read the independently compiled trustworthiness labels for news sources and
# give the columns the names used in the URL table
src = "../../data/utilities"
fname = "independent_labels.csv"
alt_labels = pd.read_csv(join(src, fname)).rename(
    columns={"type": "independent_unreliable", "url": "domain"}
)

# recode the textual reliability label as 0 (reliable) / 1 (unreliable)
label_to_binary = {"reliable": 0, "unreliable": 1}
alt_labels["independent_unreliable"] = \
    alt_labels["independent_unreliable"].replace(label_to_binary)

# left-join the labels onto the URL table via the domain
label_cols = ["accuracy", "transparency", "independent_unreliable", "domain"]
urls = urls.merge(alt_labels[label_cols], how="left", on="domain")
del alt_labels

## Add truth seeking & belief speaking scores for different embeddings

In [21]:
src = "../../data/tweets"
cols = ["id", "avg_belief_score", "avg_truth_score"]


def load_honesty_scores(fname, compression, embedding,
                        src="../../data/tweets",
                        cols=("id", "avg_belief_score", "avg_truth_score")):
    '''Loads the belief-speaking / truth-seeking scores of one embedding.

    Parameters
    ----------
    fname : str
        Name of the compressed CSV file inside `src`.
    compression : str
        Compression of the file as understood by pandas.read_csv.
    embedding : str
        Suffix appended to the two score columns, e.g. "glove".
    src : str
        Directory containing the file.
    cols : sequence of str
        Columns to read from the file.

    Returns
    -------
    pandas.DataFrame
        Columns "id", "avg_belief_score_<embedding>" and
        "avg_truth_score_<embedding>".
    '''
    scores = pd.read_csv(
        join(src, fname),
        usecols=list(cols),
        compression=compression,
        # read IDs as text so that the quote removal below works for every
        # file, including ones in which the IDs are not quoted
        dtype={"id": str}
    )
    # the IDs contain literal double quotes that have to be removed before
    # they can be matched against the IDs of the URL table
    scores["id"] = scores["id"].str.replace('"', '', regex=False)
    return scores.rename(columns={
        "avg_belief_score": f"avg_belief_score_{embedding}",
        "avg_truth_score": f"avg_truth_score_{embedding}"
    })


honesty_scores_glove = load_honesty_scores(
    "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_p0.05_swapped_label_DDR_glove840B.csv.zip",
    "zip",
    "glove"
)
honesty_scores_word2vec = load_honesty_scores(
    "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_p0.05_swapped_label_DDR_word2vec.csv.xz",
    "xz",
    "word2vec"
)
honesty_scores_fasttext = load_honesty_scores(
    "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_p0.05_swapped_label_DDR_fasttext.csv.xz",
    "xz",
    "fasttext"
)

In [22]:
# attach the scores of each embedding to the URL table, one embedding after
# the other; the right join keeps every row of the URL table
embedding_scores = {
    "glove": honesty_scores_glove,
    "word2vec": honesty_scores_word2vec,
    "fasttext": honesty_scores_fasttext,
}
for embedding, embedding_frame in embedding_scores.items():
    score_cols = [
        "id",
        f"avg_belief_score_{embedding}",
        f"avg_truth_score_{embedding}",
    ]
    urls = embedding_frame[score_cols].merge(urls, how="right", on="id")

# drop every reference to the score tables so that their memory can be freed
del embedding_scores, embedding_frame
del honesty_scores_glove, honesty_scores_word2vec, honesty_scores_fasttext

## Add truth seeking & belief speaking scores for dictionary bootstraps

In [32]:
src = "../../data/tweets"

# read the scores from the "_loop" file, strip the literal double quotes from
# the tweet IDs and drop the author and conversation ID columns
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_p0.05_swapped_label_DDR_glove840B_loop.csv.xz"
honesty_scores_bootstrap = pd.read_csv(join(src, fname), compression="xz")
honesty_scores_bootstrap["id"] = [
    tweet_id.replace('"', '') for tweet_id in honesty_scores_bootstrap["id"]
]
excluded_cols = ["author_id", "conversation_id"]
cols = [
    col for col in honesty_scores_bootstrap.columns
    if col not in excluded_cols
]
honesty_scores_bootstrap = honesty_scores_bootstrap[cols]

In [33]:
# right join on the tweet ID: every row of the URL table is kept and receives
# the bootstrap scores of its tweet where available
urls = honesty_scores_bootstrap.merge(urls, how="right", on="id")
del honesty_scores_bootstrap

## Add party affiliation

In [34]:
# read the party affiliation table (author IDs as strings, like in the URL
# table) and left-join it onto the URL table
src = "../../data/utilities"
fname = "party_affiliations_complete.csv"
party_affiliation = pd.read_csv(
    join(src, fname),
    dtype={"author_id": str},
)
urls = urls.merge(party_affiliation, how="left", on="author_id")
del party_affiliation

In [36]:
# display the column names of the URL table after all merges
urls.columns

Index(['id', 'avg_truth_score_0', 'avg_truth_score_1', 'avg_truth_score_2',
       'avg_truth_score_3', 'avg_truth_score_4', 'avg_truth_score_5',
       'avg_truth_score_6', 'avg_truth_score_7', 'avg_truth_score_8',
       ...
       'shortened_url', 'domain', 'NG_score', 'NG_unreliable', 'accuracy',
       'transparency', 'independent_unreliable', 'handle', 'name', 'party'],
      dtype='object', length=225)

# Create a tweet data frame

In [37]:
# The URL table holds one row per URL, so a tweet with several URLs occurs in
# several rows. The tweet table built here keeps one row per tweet ID and only
# those columns that are defined on the tweet level; URL-level scores are
# averaged per tweet in a later step.

# tweet and account metadata
meta_cols = ["id", "author_id", "created_at", "retweeted", "quoted", "reply",
             "has_url", "handle", "name", "party"]
# scores of the 100 dictionary bootstraps: all truth scores, then all belief
# scores
bootstrap_cols = [f"avg_{kind}_score_{i}"
                  for kind in ("truth", "belief")
                  for i in range(100)]
# scores of the three embeddings: all truth scores, then all belief scores
embedding_cols = [f"avg_{kind}_score_{embedding}"
                  for kind in ("truth", "belief")
                  for embedding in ("glove", "word2vec", "fasttext")]

tweet_cols = meta_cols + bootstrap_cols + embedding_cols
tweets = urls[tweet_cols].drop_duplicates(subset=["id"]).copy()

## Add LIWC scores

In [38]:
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean_mask_LIWC.csv.gzip"
# LIWC column name in the file -> column name used in the tweet table
liwc_names = {
    "WC": "word_count",
    "Analytic": "LIWC_analytic",
    "Authentic": "LIWC_authentic",
    "moral": "LIWC_moral",
    "emo_pos": "LIWC_emo_pos",
    "emo_neg": "LIWC_emo_neg",
}
cols = ["id"] + list(liwc_names)
LIWC_scores = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str},
    compression="gzip",
)
LIWC_scores = LIWC_scores.rename(columns=liwc_names)
# left join: every tweet is kept, LIWC scores are added where available
tweets = tweets.merge(LIWC_scores, how="left", on="id")

## Calculate average NewsGuard score and misinfo components

In [39]:
# mean NewsGuard score, transparency and accuracy over all URLs of a tweet
average_scores = (
    urls[["id", "NG_score", "transparency", "accuracy"]]
    .groupby("id")
    .mean()
)

# tweet-level NewsGuard label: 1 below the cutoff of 60, 0 at or above it and
# NaN if the tweet has no NewsGuard score at all
below_cutoff = average_scores["NG_score"] < 60
at_or_above_cutoff = average_scores["NG_score"] >= 60
average_scores["NG_unreliable"] = np.nan
average_scores.loc[below_cutoff, "NG_unreliable"] = 1
average_scores.loc[at_or_above_cutoff, "NG_unreliable"] = 0

## Calculate average accuracy & transparency score and unreliable domains

In [40]:
def label_independent_unreliable(scores):
    '''Adds the tweet-level "independent_unreliable" label to `scores`.

    Original (domain-level) definition: a source is unreliable if its
    transparency is 1 or if its accuracy is 1 or 2. Since both values can be
    non-integer after averaging over the links of a tweet, the cutoffs are
    placed half-way:

    * average transparency <= 1.5 -> unreliable (one domain with
      transparency 1 and one with transparency 2 make the tweet unreliable)
    * average accuracy <= 2.5 -> unreliable (one domain with accuracy 2 and
      one with accuracy 3 make the tweet unreliable)

    A tweet is unreliable (1) if at least one of the two criteria is met,
    reliable (0) if it has at least one of the two values and none of the
    available ones meets its criterion, and unlabelled (NaN) if it has
    neither value. The two criteria are combined explicitly so that a passing
    accuracy value can no longer overwrite a failing transparency value.

    Parameters
    ----------
    scores : pandas.DataFrame
        Needs the columns "transparency" and "accuracy"; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object with the added column "independent_unreliable".
    '''
    transparency = scores["transparency"]
    accuracy = scores["accuracy"]
    # comparisons with NaN are False, so a missing value neither passes nor
    # fails its criterion
    passes_any = (transparency > 1.5) | (accuracy > 2.5)
    fails_any = (transparency <= 1.5) | (accuracy <= 2.5)

    scores["independent_unreliable"] = np.nan
    scores.loc[passes_any, "independent_unreliable"] = 0
    # assigned last: one failed criterion is enough to mark the tweet
    scores.loc[fails_any, "independent_unreliable"] = 1
    return scores


average_scores = label_independent_unreliable(average_scores)

tweets = pd.merge(tweets, average_scores, how="left", left_on="id", right_on="id")
del average_scores

# Create a user data frame

In [41]:
# one row per account, with the number of its tweets in the tweet table
# NOTE(review): groupby drops rows with a missing value in any of the key
# columns by default, so accounts without handle, name or party would not
# show up here - confirm that this is intended
user_keys = ["author_id", "handle", "name", "party"]
users = (
    tweets[user_keys + ["id"]]
    .groupby(user_keys)
    .count()
    .reset_index()
    .rename(columns={"id": "N_tweets"})
)

## Add account stats

In [42]:
# read follower / following / tweet counts and the account creation date
# (author IDs as strings) and left-join them onto the user table
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
cols = ["followers_count", "following_count", "tweet_count", "created_at",
        "author_id"]
account_stats = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"author_id": str},
    parse_dates=["created_at"],
)

users = users.merge(account_stats, how="left", on="author_id")
del account_stats

## Add Congress information

In [43]:
src = "../../data/users/clean"
fname = "congress-member-twitter-handles_114-117.csv"
# sort by congress in descending order so that the de-duplication keeps, for
# every handle, the entry with the highest congress number
congress_twitter_handles = (
    pd.read_csv(join(src, fname))
    .sort_values(by="congress", ascending=False)
    .drop_duplicates(subset="handle")
    .reset_index(drop=True)
)

users = users.merge(congress_twitter_handles, how="left", on="handle")
del congress_twitter_handles

## Add share of untrustworthy domains (NewsGuard)

In [44]:
# per account: sum and number of the non-missing NewsGuard labels of all
# tweets that are not retweets
cols = ["author_id", "NG_unreliable"]
unreliable_user_count = (
    tweets.loc[tweets["retweeted"] == False, cols]
    .groupby("author_id")
    .agg(["sum", "count"])
)

# share of labelled tweets that are unreliable (NaN if there is no labelled
# tweet)
unreliable_user_count["NG_unreliable_share"] = (
    unreliable_user_count[("NG_unreliable", "sum")]
    / unreliable_user_count[("NG_unreliable", "count")]
)

# collapse the two column levels into single names such as "NG_unreliable_sum"
unreliable_user_count = unreliable_user_count.reset_index()
unreliable_user_count.columns = [
    '_'.join(level_names).strip("_")
    for level_names in unreliable_user_count.columns
]

unreliable_user_count.head(2)

Unnamed: 0,author_id,NG_unreliable_sum,NG_unreliable_count,NG_unreliable_share
0,1009269193,0.0,221,0.0
1,1011053278304592000,0.0,0,


In [45]:
# add the share of unreliable tweets to the user table
cols = ["NG_unreliable_share", "author_id"]
users = users.merge(unreliable_user_count[cols], how="left", on="author_id")

## Add average NewsGuard score

In [46]:
# mean tweet-level NewsGuard score per account, retweets excluded
average_NG_scores = (
    tweets.loc[tweets["retweeted"] == False, ["author_id", "NG_score"]]
    .groupby("author_id")
    .mean()
    .reset_index()
    .rename(columns={"NG_score": "NG_score_mean"})
)
users = users.merge(average_NG_scores, how="left", on="author_id")

## Add average accuracy & transparency score

In [47]:
# mean tweet-level accuracy and transparency per account, retweets excluded
mean_names = {
    "accuracy": "accuracy_mean",
    "transparency": "transparency_mean",
}
average_accuracy_transparency = (
    tweets.loc[tweets["retweeted"] == False,
               ["author_id", "accuracy", "transparency"]]
    .groupby("author_id")
    .mean()
    .reset_index()
    .rename(columns=mean_names)
)
users = users.merge(average_accuracy_transparency, how="left", on="author_id")

## Add share of untrustworthy domains (independent list)

In [48]:
# per account: sum and number of the non-missing independent labels of all
# tweets that are not retweets
unreliable_user_count = (
    tweets.loc[tweets["retweeted"] == False,
               ["author_id", "independent_unreliable"]]
    .groupby("author_id")
    .agg(["sum", "count"])
)

# share of labelled tweets that are unreliable
unreliable_user_count["independent_unreliable_share"] = (
    unreliable_user_count[("independent_unreliable", "sum")]
    / unreliable_user_count[("independent_unreliable", "count")]
)

# collapse the two column levels into single names
unreliable_user_count = unreliable_user_count.reset_index()
unreliable_user_count.columns = [
    '_'.join(level_names).strip("_")
    for level_names in unreliable_user_count.columns
]

share_cols = ["author_id", "independent_unreliable_share"]
users = users.merge(unreliable_user_count[share_cols], how="left",
                    on="author_id")
del unreliable_user_count

## Add average belief-speaking and truth-seeking scores for different embeddings

In [49]:
# belief scores of the three embeddings first, then the truth scores
scores = [f"avg_{kind}_score_{embedding}"
          for kind in ("belief", "truth")
          for embedding in ("glove", "word2vec", "fasttext")]
cols = ["author_id", "created_at"] + scores

# tweets that are not retweets and have all six scores
honesty_tweets_score = (
    tweets.loc[tweets["retweeted"] == False, cols]
    .dropna(subset=scores)
    .copy()
)

In [50]:
# all honesty component tweets
honesty_score_average = honesty_tweets_score\
    [["author_id"] + scores]\
    .groupby("author_id")\
    .mean()
honesty_score_average = honesty_score_average.reset_index()

In [52]:
# add the per-account means to the user table and free the helper tables
users = pd.merge(
    users,
    honesty_score_average[["author_id"] + scores],
    how="left",
    on="author_id",
)
del honesty_score_average, honesty_tweets_score

## Add average belief-speaking and truth-seeking scores for dictionary bootstraps

In [54]:
# truth scores of the 100 bootstraps first, then the belief scores
scores = [f"avg_{kind}_score_{i}"
          for kind in ("truth", "belief")
          for i in range(100)]
cols = ["author_id", "created_at"] + scores

# tweets that are not retweets and have all 200 scores
honesty_tweets_score = (
    tweets.loc[tweets["retweeted"] == False, cols]
    .dropna(subset=scores)
    .copy()
)

In [55]:
# all honesty component tweets
honesty_score_average = honesty_tweets_score\
    [["author_id"] + scores]\
    .groupby("author_id")\
    .mean()
honesty_score_average = honesty_score_average.reset_index()

In [58]:
# add the per-account means to the user table and free the helper tables
users = pd.merge(
    users,
    honesty_score_average[["author_id"] + scores],
    how="left",
    on="author_id",
)
del honesty_score_average, honesty_tweets_score

## Add average emotion scores

In [59]:
# per-account mean of the LIWC columns over all tweets that are not retweets;
# the resulting columns get the suffix "_mean"
emotions = ['LIWC_analytic', 'LIWC_authentic', 'LIWC_emo_pos',
            'LIWC_emo_neg', 'LIWC_moral']
emotion_map = {em: f"{em}_mean" for em in emotions}
average_emotion_scores = (
    tweets.loc[tweets["retweeted"] == False, ["author_id"] + emotions]
    .groupby("author_id")
    .mean()
    .reset_index()
)
average_emotion_scores = average_emotion_scores.rename(columns=emotion_map)

users = users.merge(average_emotion_scores, how="left", on="author_id")

## Add ideology scores

In [60]:
def clean_govtrack_names(names):
    '''Normalizes the politician names of a govtrack ideology file.

    The names are stored as the text of a bytes literal, e.g. "b'Lamb'". The
    wrapper (the leading b and the enclosing quotes) is removed only at the
    two ends of the string, so that names ending in "b" keep their last
    letter. Afterwards all remaining apostrophes are removed and the name is
    converted to lowercase.

    Parameters
    ----------
    names : pandas.Series
        Raw names (strings).

    Returns
    -------
    pandas.Series
        Cleaned, lowercase names; missing values stay missing.
    '''
    unwrapped = names.str.replace(r"""^b(['"])(.*)\1$""", r"\2", regex=True)
    return unwrapped.str.replace("'", "", regex=False).str.lower()


src = "../../data/utilities"
fname = "govtrack-stats-{}-{}-ideology.csv"
# collect the yearly tables in a list and concatenate them once at the end
# instead of growing a data frame inside the loop
yearly_scores = []
for year in range(2013, 2021):
    for chamber in ["house", "senate"]:
        tmp = pd.read_csv(join(src, "ideology_scores",
                               fname.format(year, chamber)))
        tmp["year"] = year
        tmp["name"] = clean_govtrack_names(tmp["name"])
        yearly_scores.append(tmp)
ideology_scores = pd.concat(yearly_scores)
del yearly_scores

In [61]:
# match politician Twitter account names to govtrack politician names

# a single politician can have at maximum 8 entries for 8 different years
# 2013 to 2020
# NOTE(review): a name shared by several politicians can still have <= 8
# entries in total - consider checking the number of distinct IDs per name
counts = ideology_scores["name"].value_counts()
unique_names = list(counts[counts <= 8].index)

# for every unique name keep the entry of the most recent year
unique_scores = ideology_scores[ideology_scores["name"].isin(unique_names)]\
    .sort_values(by="year", ascending=False)\
    .drop_duplicates(subset=["name"])\
    .set_index("name")
# sorted, so that the outcome of the matching does not depend on the
# (per-process) iteration order of a set of strings
unique_names = sorted(set(unique_scores.index))
# govtrack name -> govtrack ID, for constant-time lookups
name_to_id = dict(zip(unique_scores.index, unique_scores["id"]))

def match_score(account_name):
    '''Matches govtrack politician names to Twitter account names.

    Parameters
    ----------
    account_name : str or float
        Display name of the Twitter account; may be NaN.

    Returns
    -------
    The govtrack ID of the matching politician, or NaN if the account name
    is missing or no govtrack name matches. If several govtrack names match,
    the alphabetically first one is used.
    '''
    # NaN is the only value that is not equal to itself
    if account_name != account_name:
        return np.nan
    tokens = set(account_name.lower().split(" "))
    # hard matching: a govtrack name matches if it is identical to one of
    # the space-separated parts of the lowercase Twitter account name
    matches = sorted(tokens.intersection(name_to_id))
    if not matches:
        return np.nan
    return name_to_id[matches[0]]

users["ideology_score_id"] = users["name"].apply(match_score)

In [62]:
# read the hand-matched govtrack IDs as a mapping handle -> ID
src = "../../data/utilities"
fname = "missing_govtrack_ideology_scores.csv"
missing_scores = pd.read_csv(join(src, fname))
missing_scores = dict(zip(missing_scores["handle"],
                          missing_scores["ideology_score_id"]))

# the handle is used as the lookup key between the two data sets; it is
# turned into the index for the assignment and back into a column afterwards
users = users.set_index("handle")
known_handles = [handle for handle in missing_scores if handle in users.index]
for handle in known_handles:
    users.loc[handle, "ideology_score_id"] = missing_scores[handle]
users = users.reset_index()

In [63]:
# a govtrack ID can occur in several yearly files; compute mean, standard
# deviation and number of its ideology scores
ideology_scores_agg = (
    ideology_scores[["id", "ideology"]]
    .groupby("id")
    .agg(["mean", "std", "count"])
    .reset_index()
)
# collapse the two column levels into single names such as "ideology_mean"
ideology_scores_agg.columns = [
    '_'.join(level_names).strip("_")
    for level_names in ideology_scores_agg.columns
]

In [64]:
# attach the aggregated ideology scores via the matched govtrack ID
users = pd.merge(
    users,
    ideology_scores_agg,
    how="left",
    left_on="ideology_score_id",
    right_on="id",
)
del ideology_scores, ideology_scores_agg

## Add Politifact scores

In [65]:
# read the Politifact scores, rename the account column to "handle" and
# left-join the scores onto the user table
src = "../../data/utilities"
fname = "misinfo_score_politifact.csv"
pf_scores = pd.read_csv(join(src, fname),
                        usecols=["pf_score", "elite_account"])
pf_scores = pf_scores.rename(columns={"elite_account": "handle"})

users = users.merge(pf_scores, how="left", on="handle")
del pf_scores

# Data exports

In [67]:
# root directory of the exports; the tables below are written to its
# sub-directories "urls", "users" and "tweets"
dst = "../../data/"

In [70]:
# URL table: keep only rows flagged as having a URL, remove the raw URL,
# domain and account name columns and write a gzip-compressed CSV
fname = "US_politician_URLs_2010-11-06_to_2022-03-16_rb.csv.gzip"
dropped_cols = ["url", "domain", "status_code", "handle", "name", "has_url"]
urls = urls.loc[urls["has_url"] == True].drop(columns=dropped_cols)
urls.to_csv(join(dst, "urls", fname), compression="gzip", index=False)

In [71]:
# user table: remove the two govtrack ID helper columns and write a plain CSV
fname = "US_politician_accounts_2010-11-06_to_2022-03-16_rb.csv"
helper_cols = ["ideology_score_id", "id"]
users = users.drop(columns=helper_cols)
users.to_csv(join(dst, "users", fname), index=False)

In [72]:
# tweet table: remove the account handle and name and write a gzip-compressed
# CSV
fname = "US_politician_tweets_2010-11-06_to_2022-03-16_rb.csv.gzip"
account_name_cols = ["handle", "name"]
tweets = tweets.drop(columns=account_name_cols)
tweets.to_csv(join(dst, "tweets", fname), compression="gzip", index=False)