In [None]:
# author: Jana Lasser

In [1]:
import ast
from os.path import join

import numpy as np
import pandas as pd

# Create a URL data frame

## Expand URL lists

In [91]:
# read the cleaned timeline data, restricted to the columns needed for the
# URL table
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
cols = [
    "id",
    "author_id",
    "created_at",
    "expanded_urls",
    "retweeted",
    "quoted",
    "reply",
]
# keep a single row per tweet ID
tweets = (
    pd.read_csv(join(src, fname), usecols=cols, compression="gzip")
    .drop_duplicates(subset="id")
)

In [92]:
# parse the URL lists: missing entries are treated as empty lists and every
# entry is converted from its string form into a Python object.
# ast.literal_eval only accepts Python literals, so (unlike eval) it cannot
# execute code that might be embedded in the data file.
tweets["expanded_urls"] = (
    tweets["expanded_urls"]
    .fillna("[]")
    .apply(ast.literal_eval)
)
# flag tweets that contain at least one URL
tweets["has_url"] = tweets["expanded_urls"].apply(lambda x: len(x) > 0)

In [93]:
# number of URLs contained in each tweet
tweets["N_urls"] = tweets["expanded_urls"].apply(len)

In [None]:
# expand only entries with multiple URLs: DataFrame.explode creates one row
# per URL and repeats all other columns. This replaces growing a data frame
# with one pd.concat call per tweet (quadratic runtime) and also works when
# no tweet contains more than one URL.
multiple_urls = tweets[tweets["N_urls"] > 1]
expanded_urls = multiple_urls.explode("expanded_urls").set_index("id")

urls = tweets.copy()
urls = urls.set_index("id")
# drop entries with multiple URLs
urls = urls.drop(multiple_urls["id"].values)
# add expanded entries with one line for each URL
urls = pd.concat([urls, expanded_urls])
urls = urls.reset_index()

In [None]:
# number of rows in the URL table after the expansion above
len(urls)

In [None]:
# now, some URLs are stored as singular entries of a list, and some as string.
# empty entries are stored as empty list. Below we streamline URL entries such
# that every entry is a single string
def extract_URL_from_list(entry):
    '''Collapses a URL entry to a single value.

    Entries of length zero become NaN, entries of length one are replaced by
    their only element, and longer entries (e.g. URL strings) are returned
    unchanged.
    '''
    n_items = len(entry)
    if n_items > 1:
        return entry
    if n_items == 1:
        return entry[0]
    return np.nan
    
urls["expanded_urls"] = urls["expanded_urls"].apply(extract_URL_from_list)
# "urls" and "entities.urls" are not part of the column selection used when
# the timelines are read in this notebook; ignore them when they are absent
# instead of failing with a KeyError
urls = urls.drop(columns=["urls", "entities.urls"], errors="ignore")
urls = urls.rename(columns={"expanded_urls":"url"})

In [None]:
# a tweet can contain the same URL more than once: keep the first row of
# every (tweet, URL) pair and report how many rows were removed
N = len(urls)
is_repeated = urls.duplicated(subset=["id", "url"])
urls = urls[~is_repeated]
print(f"dropped {N - len(urls)} duplicate URL entries")

In [None]:
# drop the reference to the raw tweet table; a tweet-level table is rebuilt
# from the URL table further below
del tweets

In [None]:
# write the one-row-per-URL table to disk
dst = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.gzip"
urls.to_csv(
    join(dst, fname),
    index=False,
    compression="gzip",
)

In [44]:
# read the saved URL table back in; IDs are read as strings and the creation
# time is parsed as a date
src = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.gzip"
cols = [
    "id", "author_id", "created_at", "url",
    "retweeted", "quoted", "reply", "has_url",
]
id_dtypes = {"author_id": str, "id": str}
urls = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype=id_dtypes,
    parse_dates=["created_at"],
    compression="gzip",
)

In [3]:
# load the public metrics information for the collected tweets
# note: this is not needed for the analysis in this publication, but might be
# handy for analyses of tweet engagement metrics
# The code below is wrapped in a string literal and is therefore not executed;
# remove the enclosing triple quotes to run it.
'''
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
tweet_metrics = pd.read_csv(join(src, fname),
                 compression="gzip",
                 usecols=["id", "retweet_count",
                          "reply_count", "like_count", "quote_count"],
                dtype={"id":str})
tweet_metrics = tweet_metrics.drop_duplicates(subset="id")
# merge the tweet metrics with the tweet data frame
urls = pd.merge(urls, tweet_metrics, how="left", left_on="id", right_on="id")
del tweet_metrics
'''

'\nsrc = "../../data/tweets"\nfname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"\ntweet_metrics = pd.read_csv(join(src, fname),\n                 compression="gzip",\n                 usecols=["id", "retweet_count",\n                          "reply_count", "like_count", "quote_count"],\n                dtype={"id":str})\ntweet_metrics = tweet_metrics.drop_duplicates(subset="id")\n# merge the tweet metrics with the tweet data frame\nurls = pd.merge(urls, tweet_metrics, how="left", left_on="id", right_on="id")\ndel tweet_metrics\n'

## Add unraveled URLs

In [45]:
# read the table that maps originally shortened URLs to their true
# destination
src = "../../data/urls"
fname = "US_unraveled_urls.csv.xz"
unraveled_path = join(src, fname)
unraveled_urls = pd.read_csv(unraveled_path, compression="xz")

In [46]:
# attach the unraveled destination (where one exists) to every URL row
urls = urls.merge(unraveled_urls, on="url", how="left")

# rows with an unraveled destination are the ones that were originally
# shortened
was_shortened = urls["unraveled_url"].notna()
urls["shortened_url"] = was_shortened

# replace the shortened URL with the unraveled URL
urls.loc[was_shortened, "url"] = urls.loc[was_shortened, "unraveled_url"]
urls = urls.drop(columns=["unraveled_url"])

In [47]:
# extract the domain from the URL. Note: a few "found malformed URL" warnings
# are acceptable
def extract_domain(url):
    '''Given a URL, extracts the domain name in the form XXXXX.YY.

    Parameters
    ----------
    url : str or float
        URL string. Missing values (NaN) are passed through.

    Returns
    -------
    str or float
        The lower-cased text between the protocol and the first "/", with a
        leading "www." removed. NaN is returned for missing values and for
        entries that start with "http" but contain no "//" (a message is
        printed for the latter).
    '''
    # NaN is the only value that is not equal to itself
    if url != url:
        return np.nan
    # reformat entries that have the domain after a general name in parentheses
    # NOTE(review): this also applies to URLs that contain "(" in their path;
    # confirm that such entries do not occur or are handled as intended
    if url.find('(') > 0:
        url = url.split('(')[-1]
        url = url.strip(')')
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol
    if url.startswith("http"):
        parts = url.split('//')
        if len(parts) < 2:
            print(f"found malformed URL {url}")
            return np.nan
        url = parts[1]
    # remove a leading "www." only: replacing every occurrence would corrupt
    # domains that merely contain the sequence (e.g. "awww.com" -> "acom")
    if url.startswith('www.'):
        url = url[len('www.'):]
    # drop the path
    url = url.split("/")[0]
    return url

# derive the domain of every URL (missing URLs stay missing)
urls["domain"] = urls["url"].map(extract_domain)

found malformed URL https
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http


## Add NewsGuard nutrition scores

NewsGuard rating threshold: domains with a score below 60 are labelled "unreliable" (see the [rating process description](https://www.newsguardtech.com/ratings/rating-process-criteria/)).

In [48]:
# NewsGuard scores strictly below this value are flagged as unreliable
threshold = 60

In [49]:
# load the nutrition labels
src = "../../data/utilities/"
fname = "NewsGuard_labels.csv"
cols = ["Domain", "Score", "Last Updated"]
NG_scores = pd.read_csv(join(src, fname), usecols=cols)

# if more than one score exists for the same domain, keep the most recent
# one: sort in descending order so that drop_duplicates keeps the first row
# of every domain
# NOTE(review): "Last Updated" is not parsed as a date when reading the file,
# so rows are ordered by the raw column values; confirm that this matches
# chronological order
NG_scores = (
    NG_scores
    .sort_values(by=["Domain", "Last Updated"], ascending=False)
    .drop_duplicates(subset=["Domain"])
    .drop(columns=["Last Updated"])
    .rename(columns={"Domain": "domain", "Score": "NG_score"})
)

# flag domains with a score below the threshold (1 = unreliable, 0 otherwise)
NG_scores["NG_unreliable"] = (NG_scores["NG_score"] < threshold).astype("int64")

# add the nutrition information to the URL table
urls = urls.merge(NG_scores, on="domain", how="left")
del NG_scores

## Add alternative trustworthiness labels

In [50]:
# read the independently compiled trustworthiness labels for news sources
# and align the column names with the URL table
src = "../../data/utilities"
fname = "independent_labels.csv"
alt_labels = pd.read_csv(join(src, fname)).rename(
    columns={"type": "independent_unreliable", "url": "domain"}
)

# convert reliability labels to binary
label_codes = {"reliable":0, "unreliable":1}
alt_labels["independent_unreliable"] = (
    alt_labels["independent_unreliable"].replace(label_codes)
)

# merge with the URL table
label_cols = ["accuracy", "transparency", "independent_unreliable", "domain"]
urls = urls.merge(alt_labels[label_cols], how="left", on="domain")
del alt_labels

## Add truth seeking & belief speaking scores

In [51]:
# read the belief-speaking and truth-seeking embedding scores; tweet IDs are
# read as strings
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_honesty_component_scores_glove.csv.gzip"
honesty_path = join(src, fname)
honesty_scores = pd.read_csv(
    honesty_path,
    compression="gzip",
    dtype={"id": str},
)

In [52]:
# right join: keep every row of the URL table and add the honesty scores
# where a tweet has them
urls = honesty_scores.merge(urls, how="right", on="id")
del honesty_scores

## Add truth seeking & belief speaking scores for dictionary bootstraps

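**Note**: include this code if you have generated honesty component similarities using the bootstrapped dictionaries by running `label_lexicon_loop.sh`. The cell below is disabled by wrapping its code in a string literal; remove the enclosing triple quotes to run it.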

In [172]:
'''
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_honesty_component_scores_glove_bootstrap.csv.gzip"
honesty_scores_bootstrap = pd.read_csv(
    join(src, fname),
    compression="gzip",
    dtype={"id":str}
)

urls = pd.merge(
    honesty_scores_bootstrap, 
    urls, 
    how="right", 
    left_on="id", 
    right_on="id"
)
del honesty_scores_bootstrap
'''

## Add party affiliation

In [53]:
# add the party affiliation of every author to the URL table
src = "../../data/utilities"
fname = "party_affiliations_complete.csv"
party_affiliation = pd.read_csv(
    join(src, fname),
    dtype={"author_id": str},
)
urls = urls.merge(party_affiliation, how="left", on="author_id")
del party_affiliation

## Export URLs for article scraping & statistical modelling

In [54]:
# export the list of all URLs for article text scraping, followed by the
# per-URL NewsGuard scores and the per-URL accuracy / transparency scores.
# Every export holds the distinct rows of the selected columns, without
# rows that have no URL.
dst = "../../data/articles/"
exports = [
    ("url_list_for_article_scraping.csv.gzip", ["url", "party"]),
    ("url_NG_scores.csv.gzip", ["url", "NG_score"]),
    ("url_independent_scores.csv.gzip", ["url", "accuracy", "transparency"]),
]
for fname, export_cols in exports:
    url_export = urls[export_cols].copy()
    url_export = url_export.drop_duplicates().dropna(subset=["url"])
    url_export.to_csv(join(dst, fname), index=False, compression="gzip")

# Create a tweet data frame

In [55]:
# The "url" data frame holds one row per URL, so a tweet can occur more than
# once. To calculate the share of tweets with unreliable information, the
# NewsGuard score (and accuracy and transparency) is later averaged over all
# URLs of a tweet and the "unreliable" labels are assigned on the tweet
# level. Here the tweet-level table is created: one row per tweet ID with
# the columns that are defined on the tweet level.

# note: if you run the script including the dictionary robustness data, use
# the following column selection instead
#tweet_cols = ["id", "author_id", "created_at", "retweeted", "quoted", "reply",
#              "has_url", "handle", "name", "party"] + \
#             ["avg_belief_score", "avg_truth_score"] + \
#             [f"avg_belief_score_{i}" for i in range(100)] + \
#             [f"avg_truth_score_{i}" for i in range(100)]

meta_cols = ["id", "author_id", "created_at", "retweeted", "quoted", "reply",
             "has_url", "handle", "name", "party"]
honesty_cols = ["avg_belief_score", "avg_truth_score"]
tweet_cols = meta_cols + honesty_cols
tweets = urls.drop_duplicates(subset=["id"])[tweet_cols].copy()

## Add LIWC scores

In [56]:
# read the selected LIWC columns, give them descriptive names and attach
# them to the tweet table
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean_mask_LIWC.csv.gzip"
cols = ["id", "WC", "Analytic", "Authentic", "moral", "emo_pos", "emo_neg"]
liwc_names = {
    "WC": "word_count",
    "Analytic": "LIWC_analytic",
    "Authentic": "LIWC_authentic",
    "moral": "LIWC_moral",
    "emo_pos": "LIWC_emo_pos",
    "emo_neg": "LIWC_emo_neg",
}
LIWC_scores = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str},
    compression="gzip",
).rename(columns=liwc_names)
tweets = tweets.merge(LIWC_scores, how="left", on="id")

## Calculate average NewsGuard score and misinfo components

In [57]:
# average the domain-level scores over all URLs contained in the same tweet
average_scores = urls[["id", "NG_score", "transparency", "accuracy"]]\
    .groupby("id")\
    .agg("mean")

# label tweets with the notebook-wide NewsGuard `threshold` instead of a
# second hard-coded 60, so that the URL-level and tweet-level labels cannot
# diverge. Tweets without an average NewsGuard score keep NaN.
average_scores["NG_unreliable"] = np.nan
average_scores.loc[average_scores["NG_score"] < threshold, "NG_unreliable"] = 1
average_scores.loc[average_scores["NG_score"] >= threshold, "NG_unreliable"] = 0

## Calculate average accuracy & transparency score and unreliable domains

In [58]:
# tweet-level label based on the independent list; tweets without an average
# transparency and accuracy value keep NaN
average_scores["independent_unreliable"] = np.nan
# original definition: sources with transparency = 1 are unreliable
# since transparency can have non-integer values after averaging, we decide
# to label tweets with an average domain transparency value of links of
# <= 1.5 as "unreliable", since that means that the majority of domains
# linked to in the tweet are unreliable. If one domain with transparency 1
# and one domain with transparency 2 are linked, the tweet is unreliable
average_scores.loc[average_scores[\
            average_scores["transparency"] <= 1.5].index, "independent_unreliable"] = 1
average_scores.loc[average_scores[\
            average_scores["transparency"] > 1.5].index, "independent_unreliable"] = 0
# original definition: sources with accuracy = 1 or 2 are unreliable
# since accuracy can have non-integer values after averaging, we decide to
# label tweets with an average domain accuracy value of links of <= 2.5 as
# "unreliable", since that means that the majority of domains linked to in
# the tweet are unreliable. If one domain with accuracy 2 and one domain
# with accuracy 3 are linked, the tweet is unreliable.
# NOTE(review): the two accuracy-based assignments below run after the
# transparency-based ones and overwrite them for every tweet with a
# non-missing average accuracy. A tweet with transparency <= 1.5 but
# accuracy > 2.5 therefore ends up labelled 0 - confirm that this
# precedence is intended.
average_scores.loc[average_scores[\
            average_scores["accuracy"] <= 2.5].index, "independent_unreliable"] = 1
average_scores.loc[average_scores[\
            average_scores["accuracy"] > 2.5].index, "independent_unreliable"] = 0

# attach the averaged scores and labels to the tweet table
tweets = pd.merge(tweets, average_scores, how="left", left_on="id", right_on="id")
del average_scores

# Create a user data frame

In [59]:
# one row per account (author ID, handle, name, party) together with the
# number of its tweets in the tweet table
account_cols = ["author_id", "handle", "name", "party"]
users = (
    tweets[account_cols + ["id"]]
    .groupby(account_cols)
    .count()
    .reset_index()
    .rename(columns={"id": "N_tweets"})
)

## Add account stats

In [60]:
# read the account statistics and attach them to the user table
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
cols = [
    "followers_count",
    "following_count",
    "tweet_count",
    "created_at",
    "author_id",
]
account_stats = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"author_id": str},
    parse_dates=["created_at"],
)

users = users.merge(account_stats, how="left", on="author_id")
del account_stats

## Add Congress information

In [61]:
# read the congress member handles; if a handle occurs for several
# congresses, the row with the highest "congress" value is kept
src = "../../data/users/clean"
fname = "congress-member-twitter-handles_114-117.csv"
congress_twitter_handles = (
    pd.read_csv(join(src, fname))
    .sort_values(by="congress", ascending=False)
    .drop_duplicates(subset="handle")
    .reset_index(drop=True)
)

users = users.merge(congress_twitter_handles, how="left", on="handle")
del congress_twitter_handles

## Add share of untrustworthy domains (NewsGuard)

In [62]:
# per author: sum and number of non-missing NG_unreliable labels over all
# tweets that are not retweets, and the resulting share
cols = ["author_id", "NG_unreliable"]
unreliable_user_count = (
    tweets.loc[tweets["retweeted"] == False, cols]
    .groupby("author_id")["NG_unreliable"]
    .agg(NG_unreliable_sum="sum", NG_unreliable_count="count")
    .reset_index()
)
unreliable_user_count["NG_unreliable_share"] = (
    unreliable_user_count["NG_unreliable_sum"]
    / unreliable_user_count["NG_unreliable_count"]
)

unreliable_user_count.head(2)

Unnamed: 0,author_id,NG_unreliable_sum,NG_unreliable_count,NG_unreliable_share
0,1009269193,0.0,221,0.0
1,1011053278304592000,0.0,0,


In [63]:
# attach the share of unreliable tweets to the user table
cols = ["NG_unreliable_share", "author_id"]
users = users.merge(unreliable_user_count[cols], how="left", on="author_id")

## Add average NewsGuard score

In [64]:
# mean NewsGuard score per author over all tweets that are not retweets
average_NG_scores = (
    tweets.loc[tweets["retweeted"] == False, ["author_id", "NG_score"]]
    .groupby("author_id")
    .mean()
    .reset_index()
    .rename(columns={"NG_score": "NG_score_mean"})
)
users = users.merge(average_NG_scores, how="left", on="author_id")

## Add average accuracy & transparency score

In [65]:
# mean accuracy and transparency per author over all tweets that are not
# retweets
mean_names = {
    "accuracy": "accuracy_mean",
    "transparency": "transparency_mean",
}
average_accuracy_transparency = (
    tweets.loc[
        tweets["retweeted"] == False,
        ["author_id", "accuracy", "transparency"],
    ]
    .groupby("author_id")
    .mean()
    .reset_index()
    .rename(columns=mean_names)
)
users = users.merge(average_accuracy_transparency, how="left", on="author_id")

## Add share of untrustworthy domains (independent list)

In [66]:
# per author: sum and number of non-missing independent_unreliable labels
# over all tweets that are not retweets, and the resulting share
unreliable_user_count = (
    tweets.loc[
        tweets["retweeted"] == False,
        ["author_id", "independent_unreliable"],
    ]
    .groupby("author_id")["independent_unreliable"]
    .agg(
        independent_unreliable_sum="sum",
        independent_unreliable_count="count",
    )
    .reset_index()
)
unreliable_user_count["independent_unreliable_share"] = (
    unreliable_user_count["independent_unreliable_sum"]
    / unreliable_user_count["independent_unreliable_count"]
)

users = users.merge(
    unreliable_user_count[["author_id", "independent_unreliable_share"]],
    how="left",
    on="author_id",
)
del unreliable_user_count

## Add average belief-speaking and truth-seeking score

In [67]:
# tweets that are not retweets and have both honesty component scores
honesty_tweets_score = (
    tweets.loc[
        tweets["retweeted"] == False,
        ["author_id", "avg_belief_score", "avg_truth_score", "created_at"],
    ]
    .dropna(subset=["avg_belief_score", "avg_truth_score"])
    .copy()
)

In [68]:
# mean belief-speaking and truth-seeking score per author over all tweets
# with honesty component scores
honesty_score_average = (
    honesty_tweets_score[["author_id", "avg_belief_score", "avg_truth_score"]]
    .groupby("author_id")
    .mean()
    .reset_index()
)

In [69]:
# index by timestamp so that the following cells can select tweets by year
honesty_tweets_score = honesty_tweets_score.set_index("created_at")

In [70]:
# only first 4 years: per-author mean over tweets created in 2013 or earlier
in_first_years = honesty_tweets_score.index.year <= 2013
honesty_score_average_first = (
    honesty_tweets_score[in_first_years]
    .groupby("author_id")
    .mean()
    .reset_index()
)
cols = ["avg_belief_score", "avg_truth_score"]
first_names = {col: col + "_2010_to_2013" for col in cols}
honesty_score_average_first = honesty_score_average_first.rename(columns=first_names)

In [71]:
# only last 4 years: per-author mean over tweets created in 2019 or later
in_last_years = honesty_tweets_score.index.year >= 2019
honesty_score_average_last = (
    honesty_tweets_score[in_last_years]
    .groupby("author_id")
    .mean()
    .reset_index()
)
cols = ["avg_belief_score", "avg_truth_score"]
last_names = {col: col + "_2019_to_2022" for col in cols}
honesty_score_average_last = honesty_score_average_last.rename(columns=last_names)

In [72]:
# attach the overall per-author honesty component means ...
overall_cols = ["author_id", "avg_belief_score", "avg_truth_score"]
users = users.merge(honesty_score_average[overall_cols], how="left", on="author_id")
del honesty_score_average

# ... the means of the first years ...
users = users.merge(honesty_score_average_first, how="left", on="author_id")
del honesty_score_average_first

# ... and the means of the last years
users = users.merge(honesty_score_average_last, how="left", on="author_id")
del honesty_score_average_last

## Add average emotion scores

In [73]:
# mean of the selected LIWC scores per author over all tweets that are not
# retweets; the columns get the suffix "_mean"
emotions = ['LIWC_analytic', 'LIWC_authentic', 'LIWC_emo_pos',
            'LIWC_emo_neg', 'LIWC_moral']
emotion_map = {em:f"{em}_mean" for em in emotions}
average_emotion_scores = (
    tweets.loc[tweets["retweeted"] == False, ["author_id"] + emotions]
    .groupby("author_id")
    .mean()
    .reset_index()
    .rename(columns=emotion_map)
)

users = users.merge(average_emotion_scores, how="left", on="author_id")

## Add ideology scores

In [74]:
# read the govtrack ideology tables for the years 2013 to 2020 and both
# chambers. The tables are collected in a list and concatenated once, instead
# of repeatedly concatenating onto an (initially empty) data frame.
src = "../../data/utilities"
fname = "govtrack-stats-{}-{}-ideology.csv"
yearly_scores = []
for year in range(2013, 2021):
    for chamber in ["house", "senate"]:
        tmp = pd.read_csv(join(src, "ideology_scores",
                               fname.format(year, chamber)))
        tmp["year"] = year
        # remove the "b'" prefix and remaining "'" characters from the names
        # and convert them to lowercase
        tmp["name"] = tmp["name"].apply(
            lambda x: x.replace("b'", "").replace("'", "").lower()
        )
        yearly_scores.append(tmp)
ideology_scores = pd.concat(yearly_scores)

In [75]:
# match politician Twitter account names to govtrack politician names

# a single politician can have at maximum 8 entries for 8 different years
# 2013 to 2020; names that occur more often are excluded from the matching
counts = ideology_scores["name"].value_counts()
unique_names = list(counts[counts <= 8].index)

# keep the entry of the most recent year for every remaining name
unique_scores = ideology_scores[ideology_scores["name"].isin(unique_names)]\
    .sort_values(by="year", ascending=False)\
    .drop_duplicates(subset=["name"])\
    .set_index("name")
# sorted instead of set order: if several govtrack names match one account,
# the same name wins in every run
unique_names = sorted(unique_scores.index)

def match_score(account_name):
    '''Matches govtrack politician names to Twitter account names.

    The account name is lower-cased and split at spaces. The first name in
    `unique_names` that equals one of the resulting words is a match.

    Returns the govtrack "id" of the matched name, or NaN if the account
    name is missing or no name matches.
    '''
    # NaN is the only value that is not equal to itself
    if account_name != account_name:
        return np.nan
    words = set(account_name.lower().split(" "))
    for name in unique_names:
        # hard matching: the govtrack name has to be identical to one of the
        # words of the Twitter account name
        if name in words:
            return unique_scores.loc[name, "id"]
    # no govtrack name matched: return NaN explicitly instead of None
    return np.nan

users["ideology_score_id"] = users["name"].apply(match_score)

In [76]:
# add hand-matched missing scores: mapping of handle -> ideology score ID
src = "../../data/utilities"
fname = "missing_govtrack_ideology_scores.csv"
missing_scores = pd.read_csv(join(src, fname))
missing_scores = dict(
    zip(missing_scores["handle"], missing_scores["ideology_score_id"])
)

# index on the handle since this seems to be the most consistent index between
# the two datasets
users = users.set_index("handle")
for handle, score_id in missing_scores.items():
    # handles that are not part of the user table are skipped
    if handle not in users.index:
        continue
    users.loc[handle, "ideology_score_id"] = score_id
users = users.reset_index()

In [77]:
# for many accounts, there is more than one ideology score since they were
# active over many years. We calculate the mean, std and count of the ideology
# score for each govtrack ID; the result is merged into the user table below
ideology_scores_agg = (
    ideology_scores
    .groupby("id")["ideology"]
    .agg(ideology_mean="mean", ideology_std="std", ideology_count="count")
    .reset_index()
)

In [78]:
# attach the aggregated ideology scores via the matched govtrack ID
users = pd.merge(
    users,
    ideology_scores_agg,
    how="left",
    left_on="ideology_score_id",
    right_on="id",
)
del ideology_scores, ideology_scores_agg

## Add Politifact scores

In [79]:
# read the Politifact scores, align the handle column name and attach the
# scores to the user table
src = "../../data/utilities"
fname = "misinfo_score_politifact.csv"
pf_scores = pd.read_csv(
    join(src, fname),
    usecols=["pf_score", "elite_account"],
)
pf_scores = pf_scores.rename(columns={"elite_account": "handle"})

users = users.merge(pf_scores, how="left", on="handle")
del pf_scores

# Data exports

## URL, tweet and user data frames

**Note**: if you are running this code including the data for the dictionary robustness analysis, saving the files takes a while because they are pretty large.

In [80]:
# root output folder; the export cells below append the sub-folder and the
# file name
dst = "../../data/"

In [81]:
# URL data frame: keep only rows that have a URL and remove the columns that
# are not exported
fname = "US_politician_URLs_2010-11-06_to_2022-03-16.csv.gzip"
not_exported = ["url", "status_code", "handle", "name", "has_url"]
urls = urls[urls["has_url"] == True].drop(columns=not_exported)
urls.to_csv(
    join(dst, "urls", fname),
    compression="gzip",
    index=False,
)

In [82]:
# user data frame: remove the ID columns that were only needed for matching
# the ideology scores
fname = "users.csv"
matching_cols = ["ideology_score_id", "id"]
users = users.drop(columns=matching_cols)
users.to_csv(
    join(dst, "users", fname),
    index=False,
)

In [83]:
# tweet data frame: exported without the account handle and name
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"
account_name_cols = ["handle", "name"]
tweets = tweets.drop(columns=account_name_cols)
tweets.to_csv(
    join(dst, "tweets", fname),
    compression="gzip",
    index=False,
)

## Data for linear mixed effects modelling

In [65]:
cols = [
    "retweeted", # used to filter out retweets
    "author_id", # data grouping: independent random variable
    "party", # characteristic of author: independent fixed variable
    "avg_belief_score", # fixed variable
    "avg_truth_score", # fixed variable
    "NG_score", # dependent variable
    "accuracy", # dependent variable
    "transparency", # dependent variable
]
is_not_retweet = tweets["retweeted"] == False # remove retweets
is_major_party = tweets["party"].isin(["Democrat", "Republican"]) # remove independents
tweets_lme = (
    tweets.loc[is_not_retweet & is_major_party, cols]
    .drop(columns=["retweeted"])
    # remove tweets without belief or truth score (tweets without
    # trustworthiness scores are removed when the data is exported)
    .dropna(subset=["avg_belief_score", "avg_truth_score"])
)
len(tweets_lme)

1791859

In [66]:
# filter out authors with only a single tweet
# The (author_id, count) table is built with explicit names: the labels that
# value_counts().reset_index() produces differ between pandas 1.x and 2.x,
# and renaming them by their 1.x names yields two "count" columns on 2.x.
tweet_counts = tweets_lme["author_id"]\
    .value_counts()\
    .rename_axis("author_id")\
    .reset_index(name="count")

multi_tweet_authors = tweet_counts.loc[tweet_counts["count"] > 1, "author_id"]
tweets_lme = tweets_lme[tweets_lme["author_id"].isin(multi_tweet_authors)]
len(tweets_lme)

1791858

In [67]:
# per-author mean ideology score for the mixed effects data
cols = ["author_id", "ideology_mean"]
users_lme = users.loc[:, cols].copy()

In [68]:
# add the mean ideology score of the author to every tweet; "author_id" is
# the only column shared by the two tables
tweets_lme = tweets_lme.merge(users_lme, how="left", on="author_id")

In [69]:
# center the similarity scores on their mean and normalize the
# trustworthiness scores by the maximum scale value
belief_mean = tweets_lme["avg_belief_score"].mean()
truth_mean = tweets_lme["avg_truth_score"].mean()
tweets_lme = tweets_lme.assign(
    belief=tweets_lme["avg_belief_score"] - belief_mean,
    truth=tweets_lme["avg_truth_score"] - truth_mean,
    NG=tweets_lme["NG_score"] / 100,
    accuracy=tweets_lme["accuracy"] / 5,
    transparency=tweets_lme["transparency"] / 3,
)

In [72]:
cols = ["author_id", "party", "belief", "truth"]
dst = "../../data/tweets"

# only tweets that have a NewsGuard score
NG_cols = cols + ["NG"]
tweets_NG = tweets_lme[NG_cols].dropna()
# only tweets that have an accuracy/transparency score
independent_cols = cols + ["accuracy", "transparency"]
tweets_independent = tweets_lme[independent_cols].dropna()

tweets_NG.to_csv(
    join(dst, "tweets_for_lme_modelling_NG.csv"),
    index=False,
)
tweets_independent.to_csv(
    join(dst, "tweets_for_lme_modelling_independent.csv"),
    index=False,
)

## Data for topic modelling

In [77]:
# read the tweet text together with the tweet and author IDs (as strings)
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
cols = ["id", "author_id", "text"]
tweets = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str, "author_id": str},
    compression="gzip",
)
# remove double quotes from the IDs
for id_col in ["id", "author_id"]:
    tweets[id_col] = tweets[id_col].str.replace('"', '')

# read the honesty scores
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_honesty_component_scores_glove.csv.gzip"
cols = ["id", "avg_belief_score", "avg_truth_score"]
honesty_scores = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str},
    compression="gzip",
)

# add honesty component scores
tweets = tweets.merge(honesty_scores, how="left", on="id")

# add party information
tweets = tweets.merge(users[["author_id", "party"]], how="left", on="author_id")

# drop all tweets without party and honesty score information (retweets and
# tweets with too short text that don't have honesty scores)
tweets = tweets.dropna()

# drop all tweets that are not from Democrats or Republicans
from_major_party = tweets["party"].isin(["Democrat", "Republican"])
tweets = tweets[from_major_party].copy()

In [78]:
# lemmatize text - note: this takes a while
# NOTE(review): these two imports live in this cell rather than in the import
# cell at the top of the notebook, so the packages are only needed when this
# section is run
from pandarallel import pandarallel
import spacy
# apply the lemmatization in parallel with 8 workers
pandarallel.initialize(nb_workers=8)
# spaCy pipeline with the "ner" and "parser" components disabled
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# replace every token by its lemma and join the lemmas with single spaces
tweets["lemmatized"] = tweets['text']\
    .parallel_apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))

# replace runs of whitespace and literal backslash-n sequences by one space
tweets['lemmatized'] = tweets['lemmatized'].str.replace(r'\s+|\\n', ' ', regex=True)

## Calculate quantiles

In [144]:
# Tweets whose belief-speaking (truth-seeking) similarity is at or above the
# 0.8 quantile are assigned to the "belief" ("truth") category. A tweet that
# is in both top quantiles is only kept in the category with the higher
# similarity.
belief_quant = tweets["avg_belief_score"].quantile(0.8)
truth_quant = tweets["avg_truth_score"].quantile(0.8)

# assign categories based on quantiles (1 = in the top quantile, 0 otherwise)
tweets["belief"] = (tweets["avg_belief_score"] >= belief_quant).astype("int64")
tweets["truth"] = (tweets["avg_truth_score"] >= truth_quant).astype("int64")

# tweets that are in both categories
in_both = (tweets["belief"] == 1) & (tweets["truth"] == 1)
belief_is_higher = tweets["avg_belief_score"] > tweets["avg_truth_score"]
truth_is_higher = tweets["avg_truth_score"] > tweets["avg_belief_score"]
tweets.loc[in_both & belief_is_higher, "truth"] = 0
tweets.loc[in_both & truth_is_higher, "belief"] = 0

In [147]:
# assign tweets to honesty component X party categories:
# first letter = party (d / r), second letter = belief (b), truth (t) or
# neither (n).
# The column is created with object dtype: writing strings into a float
# column (initialised with NaN) relies on silent upcasting, which recent
# pandas versions deprecate.
tweets["classes_quant"] = pd.Series(np.nan, index=tweets.index, dtype="object")

is_democrat = tweets["party"] == "Democrat"
is_republican = tweets["party"] == "Republican"
has_belief = tweets["belief"] == 1
has_truth = tweets["truth"] == 1
has_neither = (tweets["belief"] == 0) & (tweets["truth"] == 0)

# the labels are assigned in this order, i.e. later labels overwrite earlier
# ones for tweets that fulfil more than one condition
class_masks = [
    (is_democrat & has_belief, "db"),
    (is_democrat & has_truth, "dt"),
    (is_republican & has_belief, "rb"),
    (is_republican & has_truth, "rt"),
    (is_democrat & has_neither, "dn"),
    (is_republican & has_neither, "rn"),
]
for class_mask, class_label in class_masks:
    tweets.loc[class_mask, "classes_quant"] = class_label

In [162]:
# write the lemmatized tweets with party, honesty scores and class labels
dst = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_lemma.csv.gzip"
cols = [
    "id",
    "author_id",
    "lemmatized",
    "party",
    "avg_belief_score",
    "avg_truth_score",
    "classes_quant",
]
lemma_export = tweets[cols]
lemma_export.to_csv(join(dst, fname), index=False, compression="gzip")