In [None]:
# author: Jana Lasser

In [1]:
import ast
from os.path import join

import numpy as np
import pandas as pd

# Create a URL data frame

## Expand URL lists

In [2]:
# Read the cleaned timeline data, restricted to the columns needed for the
# URL analysis, and keep only the first row for every tweet ID.
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
cols = [
    "id",
    "author_id",
    "created_at",
    "expanded_urls",
    "retweeted",
    "quoted",
    "reply",
]
tweets = pd.read_csv(
    join(src, fname), usecols=cols, compression="gzip"
).drop_duplicates(subset="id")

In [3]:
# Parse the URL lists. Each non-missing cell is expected to hold the textual
# form of a Python list (presumably written by an earlier export step - TODO
# confirm); missing cells become empty lists.
# ast.literal_eval only accepts Python literals, so a corrupted or crafted cell
# raises an error instead of executing arbitrary code as eval() would.
tweets["expanded_urls"] = tweets["expanded_urls"].fillna("[]")
tweets["expanded_urls"] = tweets["expanded_urls"].apply(ast.literal_eval)
# flag tweets that contain at least one URL
tweets["has_url"] = tweets["expanded_urls"].apply(lambda x: len(x) > 0)

In [4]:
# number of URLs contained in each tweet
tweets["N_urls"] = tweets["expanded_urls"].map(len)

In [5]:
# Expand only the tweets with multiple URLs into one row per URL.
# DataFrame.explode repeats all other columns for every list element in a
# single pass. The previous implementation appended one small frame per tweet
# with pd.concat, which copies the growing frame on every iteration and fails
# with a KeyError on "id" when no tweet has more than one URL.
multiple_urls = tweets[tweets["N_urls"] > 1]
expanded_urls = multiple_urls.explode("expanded_urls").set_index("id")

urls = tweets.copy()
urls = urls.set_index("id")
# drop the original (list-valued) entries of tweets with multiple URLs
urls = urls.drop(multiple_urls["id"].values)
# add the expanded entries with one row for each URL
urls = pd.concat([urls, expanded_urls])
urls = urls.reset_index()

In [6]:
# number of rows after expanding multi-URL tweets to one row per URL
len(urls)

2377899

In [None]:
# now, some URLs are stored as singular entries of a list, and some as string.
# empty entries are stored as empty list. Below we streamline URL entries such
# that every entry is a single string
def extract_URL_from_list(entry):
    '''Reduce a URL entry to a single value.

    An entry of length zero yields NaN, an entry of length one yields its
    only element, and any longer entry (e.g. a URL that is already stored as
    a string) is returned unchanged.
    '''
    n_items = len(entry)
    if n_items > 1:
        return entry
    return entry[0] if n_items == 1 else np.nan
    
# turn every URL entry into a single string (or NaN for tweets without URL)
urls["expanded_urls"] = urls["expanded_urls"].apply(extract_URL_from_list)
# "urls" and "entities.urls" are not among the columns requested when the
# timelines are loaded, so a plain drop raises a KeyError. errors="ignore"
# removes the columns if they are present and is a no-op otherwise.
urls = urls.drop(columns=["urls", "entities.urls"], errors="ignore")
urls = urls.rename(columns={"expanded_urls":"url"})

In [10]:
# A tweet can contain the same URL more than once. Keep one row per
# (tweet, URL) pair and report how many rows were removed.
rows_before = len(urls)
urls = urls.drop_duplicates(subset=["id", "url"])
rows_dropped = rows_before - len(urls)
print(f"dropped {rows_dropped} duplicate URL entries")

dropped 182435 duplicate URL entries


In [12]:
# release the raw tweet table; a tweet-level table is rebuilt from `urls` below
del tweets

In [None]:
# Write the one-row-per-URL table to disk as a gzip-compressed CSV.
# NOTE(review): the file name ends the date range at 2021-03-16 while the
# input timelines run to 2022-03-16. The loading cell uses the same name, so
# the pipeline is consistent, but this may be a typo - confirm before renaming.
dst = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.gzip"
urls.to_csv(join(dst, fname), index=False, compression="gzip")

In [14]:
# Read the one-row-per-URL table back in. IDs are read as strings and the
# creation time is parsed to datetimes.
src = "../../data/urls"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2021-03-16_clean_urls.csv.gzip"
cols = [
    "id",
    "author_id",
    "created_at",
    "url",
    "retweeted",
    "quoted",
    "reply",
    "has_url",
]
urls = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str, "author_id": str},
    parse_dates=["created_at"],
    compression="gzip",
)

In [15]:
# load the public metrics information for the collected tweets
'''
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
tweet_metrics = pd.read_csv(join(src, fname),
                 compression="gzip",
                 usecols=["id", "retweet_count",
                          "reply_count", "like_count", "quote_count"],
                dtype={"id":str})
tweet_metrics = tweet_metrics.drop_duplicates(subset="id")
# merge the tweet metrics with the tweet data frame
urls = pd.merge(urls, tweet_metrics, how="left", left_on="id", right_on="id")
del tweet_metrics
'''

'\nsrc = "../../data/tweets"\nfname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"\ntweet_metrics = pd.read_csv(join(src, fname),\n                 compression="gzip",\n                 usecols=["id", "retweet_count",\n                          "reply_count", "like_count", "quote_count"],\n                dtype={"id":str})\ntweet_metrics = tweet_metrics.drop_duplicates(subset="id")\n# merge the tweet metrics with the tweet data frame\nurls = pd.merge(urls, tweet_metrics, how="left", left_on="id", right_on="id")\ndel tweet_metrics\n'

## Add unraveled URLs

In [16]:
# Read the table that maps originally shortened URLs to the destination they
# resolve to (xz-compressed CSV).
src = "../../data/urls"
fname = "US_unraveled_urls.csv.xz"
unraveled_urls = pd.read_csv(
    join(src, fname),
    compression="xz",
)

In [17]:
# attach the unraveled destination (where one exists) to every URL row
urls = urls.merge(unraveled_urls, on="url", how="left")

# a URL counts as "originally shortened" if an unraveled version was found
was_shortened = urls["unraveled_url"].notna()
urls["shortened_url"] = was_shortened

# overwrite shortened URLs with their unraveled destination and discard the
# helper column
urls.loc[was_shortened, "url"] = urls.loc[was_shortened, "unraveled_url"]
urls = urls.drop(columns=["unraveled_url"])

In [18]:
# extract the domain from the URL
def extract_domain(url):
    '''Given a URL, extracts the domain name in the form XXXXX.YY.

    Parameters
    ----------
    url : str or float
        A URL, a bare domain, or an entry of the form "Name (domain)".
        NaN (missing value) is passed through.

    Returns
    -------
    str or float
        The lower-cased host without protocol and leading "www.", or NaN if
        the input is missing or consists of a protocol name without "//".
    '''
    # NaN is the only value that is not equal to itself
    if url != url:
        return np.nan
    # reformat entries that have the domain after a general name in
    # parentheses, e.g. "Some Outlet (outlet.com)". A "/" before the "(" means
    # the parenthesis belongs to the path of a real URL (e.g. Wikipedia
    # articles such as ".../wiki/Mercury_(planet)"), which must be left alone.
    paren_pos = url.find('(')
    if paren_pos > 0 and '/' not in url[:paren_pos]:
        url = url.split('(')[-1]
        url = url.strip(')')
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol
    # NOTE(review): hosts that merely start with "http" (e.g. "httpbin.org")
    # and carry no protocol are still reported as malformed here.
    if url.startswith("http"):
        try:
            url = url.split('//')[1]
        except IndexError:
            print(f"found malformed URL {url}")
            return np.nan
    # remove a leading "www." only; replacing every occurrence would also
    # damage hosts that contain "www." elsewhere (e.g. "awww.example.org")
    if url.startswith('www.'):
        url = url[len('www.'):]
    # drop the path, keep the host
    url = url.split("/")[0]
    return url

# derive the domain of every URL; rows without URL keep a missing domain
urls["domain"] = urls["url"].apply(extract_domain)

found malformed URL https
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http
found malformed URL http


## Add NewsGuard nutrition scores

Newsguard rating cutoff: 60 (see [description](https://www.newsguardtech.com/ratings/rating-process-criteria/)).

In [19]:
# load the nutrition labels
src = "../../data/utilities/"
fname = "NewsGuard_labels.csv"
cols = ["Domain", "Score", "Last Updated"]
NG_scores = pd.read_csv(join(src, fname), usecols=cols)
# if more than one score exists for the same domain, keep the most recent one
# NOTE(review): "Last Updated" is not parsed as a date, so this sorts the raw
# strings; that only yields the most recent entry if the format sorts
# chronologically (e.g. ISO dates) - confirm against the file.
NG_scores = NG_scores.sort_values(by=["Domain","Last Updated"], ascending=False)
NG_scores = NG_scores.drop_duplicates(subset=["Domain"])
NG_scores = NG_scores.rename(columns={"Domain":"domain", "Score":"NG_score"})
NG_scores = NG_scores.drop(columns=["Last Updated"])

# threshold scores at the NewsGuard cutoff of 60 to define untrustworthy
# domains. Domains without a score stay NaN instead of being labelled as
# reliable, in line with how the tweet-level NG_unreliable label is derived
# from the averaged score further below.
NG_scores["NG_unreliable"] = np.nan
NG_scores.loc[NG_scores["NG_score"] < 60, "NG_unreliable"] = 1
NG_scores.loc[NG_scores["NG_score"] >= 60, "NG_unreliable"] = 0

# add the nutrition information to the URL data table
urls = pd.merge(urls, NG_scores,
         left_on="domain", right_on="domain", how="left")
del NG_scores

In [20]:
# Export the distinct (url, domain, NG_score) combinations, excluding rows
# without URL, as input for article text scraping.
dst = "../../data/urls/"
fname = "url_list_for_article_scraping.csv.gzip"
url_export = (
    urls[["url", "domain", "NG_score"]]
    .drop_duplicates()
    .dropna(subset=["url"])
)
url_export.to_csv(join(dst, fname), compression="gzip", index=False)
del url_export

## Add alternative trustworthiness labels

In [21]:
# Read the independently compiled trustworthiness labels for news sources and
# align the column names with the URL table.
src = "../../data/utilities"
fname = "independent_labels.csv"
alt_labels = pd.read_csv(join(src, fname)).rename(
    columns={"type": "independent_unreliable", "url": "domain"}
)

# encode the reliability label as 0 (reliable) / 1 (unreliable)
alt_labels["independent_unreliable"] = alt_labels[
    "independent_unreliable"
].replace({"reliable": 0, "unreliable": 1})

# attach accuracy, transparency and the binary label to every URL row
urls = urls.merge(
    alt_labels[["accuracy", "transparency", "independent_unreliable", "domain"]],
    how="left",
    on="domain",
)
del alt_labels

## Add truth seeking & belief speaking scores

In [41]:
# Read the per-tweet word match counts for the belief-speaking and
# truth-seeking components.
src = "../../data/tweets"
#fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_honesty_component_labels.csv.gzip"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_p0.05_swapped_label_wn_word_definition_sbert_avgsim_nolemma.csv.gzip"
cols = ["id", "belief_count", "truth_count"]
honesty_labels = pd.read_csv(
    join(src, fname),
    compression="gzip",
    dtype={"id": str},
    usecols=cols,
)
# strip double quotes from the tweet IDs
honesty_labels["id"] = honesty_labels["id"].map(
    lambda tweet_id: tweet_id.replace('"', '')
)

In [50]:
# Belief-speaking and truth-seeking labels are assigned per tweet by majority
# vote over the words matching each component. A tie with at least one match
# assigns the tweet to both components; a tweet without any match is neutral.
# This results in
# 1602160 neutral tweets
# 91280 unambiguous belief tweets
# 106355 unambiguous truth tweets
# 7050 ties with count > 0 including 86 ties with count > 1 and
# 1 tie with count > 2
belief_count = honesty_labels["belief_count"]
truth_count = honesty_labels["truth_count"]

# unambiguous majority votes
more_belief = belief_count > truth_count
more_truth = truth_count > belief_count
# ties with at least one matching word count for both components
nonzero_tie = (truth_count == belief_count) & (truth_count > 0)
# no matching word for either component
no_match = (truth_count == 0) & (belief_count == 0)

honesty_labels["belief"] = (more_belief | nonzero_tie).astype(int)
honesty_labels["truth"] = (more_truth | nonzero_tie).astype(int)
honesty_labels["neutral"] = no_match.astype(int)

In [51]:
# attach the honesty labels to the URL table, keeping every URL row
urls = honesty_labels[["id", "belief", "truth", "neutral"]].merge(
    urls, how="right", on="id"
)
del honesty_labels

## Add party affiliation

In [52]:
# attach the party affiliation table to every URL row via the author ID
src = "../../data/utilities"
fname = "party_affiliations_complete.csv"
party_affiliation = pd.read_csv(
    join(src, fname),
    dtype={"author_id": str},
)
urls = urls.merge(party_affiliation, how="left", on="author_id")
del party_affiliation

# Create a tweet data frame

In [53]:
# The "urls" data frame holds one row per URL, so a tweet can appear more than
# once. Scores of all URLs in a tweet are averaged per tweet further below;
# here the tweet-level table is created by keeping the first row of every
# tweet ID, restricted to the columns that are defined on the tweet level.
tweet_cols = [
    "id", "belief", "truth", "neutral",
    "author_id", "created_at",
    "retweeted", "quoted", "reply", "has_url",
    "handle", "name", "party",
]
tweets = urls.drop_duplicates(subset=["id"]).loc[:, tweet_cols].copy()

## Add LIWC scores

In [54]:
# Read the selected LIWC columns, give them descriptive names and attach them
# to the tweet table via the tweet ID.
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean_mask_LIWC.csv.gzip"
cols = ["id", "WC", "Analytic", "Authentic", "moral", "emo_pos", "emo_neg"]
LIWC_scores = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"id": str},
    compression="gzip",
).rename(
    columns={
        "WC": "word_count",
        "Analytic": "LIWC_analytic",
        "Authentic": "LIWC_authentic",
        "moral": "LIWC_moral",
        "emo_pos": "LIWC_emo_pos",
        "emo_neg": "LIWC_emo_neg",
    }
)
tweets = tweets.merge(LIWC_scores, how="left", on="id")

## Calculate average NewsGuard score and misinfo components

In [55]:
# mean NewsGuard score, transparency and accuracy over all URLs of a tweet
average_scores = urls.groupby("id")[
    ["NG_score", "transparency", "accuracy"]
].mean()

# tweet-level NewsGuard label: 1 below the cutoff of 60, 0 at or above it,
# NaN if the tweet has no scored URL
average_scores["NG_unreliable"] = np.nan
average_scores.loc[average_scores["NG_score"] < 60, "NG_unreliable"] = 1
average_scores.loc[average_scores["NG_score"] >= 60, "NG_unreliable"] = 0

## Calculate average accuracy & transparency score and unreliable domains

In [56]:
# Tweet-level label based on the independent list.
#
# original definition: sources with transparency = 1 are unreliable.
# Since transparency can have non-integer values after averaging, tweets with
# an average domain transparency of <= 1.5 are labelled "unreliable", since
# that means that the majority of domains linked to in the tweet are
# unreliable. If one domain with transparency 1 and one domain with
# transparency 2 are linked, the tweet is unreliable.
transparency_unreliable = average_scores["transparency"] <= 1.5
# original definition: sources with accuracy = 1 or 2 are unreliable.
# Since accuracy can have non-integer values after averaging, tweets with an
# average domain accuracy of <= 2.5 are labelled "unreliable". If one domain
# with accuracy 2 and one domain with accuracy 3 are linked, the tweet is
# unreliable.
accuracy_unreliable = average_scores["accuracy"] <= 2.5
# tweets for which at least one of the two ratings is available
has_rating = average_scores[["transparency", "accuracy"]].notna().any(axis=1)

# A tweet is unreliable if EITHER criterion is met. Previously the accuracy
# assignments were applied after the transparency assignments and overwrote
# them, so the transparency criterion only took effect when accuracy was
# missing.
average_scores["independent_unreliable"] = np.nan
average_scores.loc[has_rating, "independent_unreliable"] = 0
average_scores.loc[transparency_unreliable | accuracy_unreliable,
                   "independent_unreliable"] = 1

tweets = pd.merge(tweets, average_scores, how="left", left_on="id", right_on="id")
del average_scores

# Create a user data frame

In [57]:
# one row per account with the number of tweets in the tweet table
# NOTE(review): groupby drops groups with a missing key by default, so
# accounts lacking handle, name or party do not appear in `users` - confirm
# that this is intended.
users = (
    tweets.groupby(["author_id", "handle", "name", "party"])["id"]
    .count()
    .reset_index(name="N_tweets")
)

## Add account stats

In [58]:
# attach follower / following / tweet counts and the account creation date
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
cols = [
    "followers_count",
    "following_count",
    "tweet_count",
    "created_at",
    "author_id",
]
account_stats = pd.read_csv(
    join(src, fname),
    usecols=cols,
    dtype={"author_id": str},
    parse_dates=["created_at"],
)

users = users.merge(account_stats, how="left", on="author_id")
del account_stats

## Add Congress information

In [59]:
# Read the Congress membership table and keep, for every handle, the row with
# the highest value of "congress" before attaching it to the user table.
src = "../../data/users/clean"
fname = "congress-member-twitter-handles_114-117.csv"
congress_twitter_handles = (
    pd.read_csv(join(src, fname))
    .sort_values(by="congress", ascending=False)
    .drop_duplicates(subset="handle")
    .reset_index(drop=True)
)

users = users.merge(congress_twitter_handles, how="left", on="handle")
del congress_twitter_handles

## Add share of untrustworthy domains (NewsGuard)

In [60]:
# Per account: number of non-retweet tweets labelled unreliable ("sum"),
# number of non-retweet tweets with a label ("count") and their ratio.
unreliable_user_count = (
    tweets.loc[tweets["retweeted"] == False]
    .groupby("author_id")["NG_unreliable"]
    .agg(["sum", "count"])
    .add_prefix("NG_unreliable_")
)
unreliable_user_count["NG_unreliable_share"] = (
    unreliable_user_count["NG_unreliable_sum"]
    / unreliable_user_count["NG_unreliable_count"]
)
# turn the author ID back into a regular column
unreliable_user_count = unreliable_user_count.reset_index()

unreliable_user_count.head(2)

Unnamed: 0,author_id,NG_unreliable_sum,NG_unreliable_count,NG_unreliable_share
0,1009269193,0.0,221,0.0
1,1011053278304592000,0.0,0,


In [61]:
# attach the share of unreliable tweets to the user table
cols = ["NG_unreliable_share", "author_id"]
users = users.merge(unreliable_user_count[cols], how="left", on="author_id")

## Add average NewsGuard score

In [62]:
# mean tweet-level NewsGuard score per account, excluding retweets
average_NG_scores = (
    tweets.loc[tweets["retweeted"] == False]
    .groupby("author_id")["NG_score"]
    .mean()
    .rename("NG_score_mean")
    .reset_index()
)
users = users.merge(average_NG_scores, how="left", on="author_id")

## Add average accuracy & transparency score

In [63]:
# mean tweet-level accuracy and transparency per account, excluding retweets
average_accuracy_transparency = (
    tweets.loc[tweets["retweeted"] == False]
    .groupby("author_id")[["accuracy", "transparency"]]
    .mean()
    .add_suffix("_mean")
    .reset_index()
)
users = users.merge(average_accuracy_transparency, how="left", on="author_id")

## Add share of untrustworthy domains (independent list)

In [64]:
# Per account: number of non-retweet tweets labelled unreliable by the
# independent list ("sum"), number of labelled tweets ("count") and the ratio.
unreliable_user_count = (
    tweets.loc[tweets["retweeted"] == False]
    .groupby("author_id")["independent_unreliable"]
    .agg(["sum", "count"])
    .add_prefix("independent_unreliable_")
)
unreliable_user_count["independent_unreliable_share"] = (
    unreliable_user_count["independent_unreliable_sum"]
    / unreliable_user_count["independent_unreliable_count"]
)
# turn the author ID back into a regular column
unreliable_user_count = unreliable_user_count.reset_index()

users = users.merge(
    unreliable_user_count[["author_id", "independent_unreliable_share"]],
    how="left",
    on="author_id",
)
del unreliable_user_count

## Add share of belief-speaking and truth-seeking

In [65]:
# non-retweet tweets for which both honesty labels are available
honesty_tweets = (
    tweets.loc[
        tweets["retweeted"] == False,
        ["author_id", "belief", "truth", "created_at"],
    ]
    .dropna(subset=["belief", "truth"])
    .copy()
)

In [66]:
# Per account over all years: number of belief / truth tweets ("sum"), number
# of labelled tweets ("count") and the resulting shares.
honesty_label_count = honesty_tweets\
    .groupby("author_id")[["belief", "truth"]]\
    .agg(["sum", "count"])
# flatten the hierarchical column index to "<label>_<statistic>"
honesty_label_count.columns = [
    "_".join(pair) for pair in honesty_label_count.columns.values
]

for col in ["belief", "truth"]:
    honesty_label_count[f"{col}_share"] = (
        honesty_label_count[f"{col}_sum"] / honesty_label_count[f"{col}_count"]
    )

honesty_label_count = honesty_label_count.reset_index()

In [67]:
# index by creation time so that tweets can be selected by year via the index
honesty_tweets = honesty_tweets.set_index("created_at")

In [68]:
# Same statistics restricted to tweets created up to and including 2013
# (the first four years of the observation period).
honesty_label_count_first = honesty_tweets[honesty_tweets.index.year <= 2013]\
    .groupby("author_id")[["belief", "truth"]]\
    .agg(["sum", "count"])
# flatten the hierarchical column index and append the period to every name
honesty_label_count_first.columns = [
    "_".join(pair) + "_2010_to_2013"
    for pair in honesty_label_count_first.columns.values
]

for col in ["belief", "truth"]:
    honesty_label_count_first[f"{col}_share_2010_to_2013"] = (
        honesty_label_count_first[f"{col}_sum_2010_to_2013"]
        / honesty_label_count_first[f"{col}_count_2010_to_2013"]
    )

honesty_label_count_first = honesty_label_count_first.reset_index()

In [69]:
# Same statistics restricted to tweets created in 2019 or later
# (the last four years of the observation period).
honesty_label_count_last = honesty_tweets[honesty_tweets.index.year >= 2019]\
    .groupby("author_id")[["belief", "truth"]]\
    .agg(["sum", "count"])
# flatten the hierarchical column index and append the period to every name
honesty_label_count_last.columns = [
    "_".join(pair) + "_2019_to_2022"
    for pair in honesty_label_count_last.columns.values
]

for col in ["belief", "truth"]:
    honesty_label_count_last[f"{col}_share_2019_to_2022"] = (
        honesty_label_count_last[f"{col}_sum_2019_to_2022"]
        / honesty_label_count_last[f"{col}_count_2019_to_2022"]
    )

honesty_label_count_last = honesty_label_count_last.reset_index()

In [70]:
# attach the belief / truth shares for all years to the user table
cols = ["author_id", "belief_share", "truth_share"]
users = pd.merge(users, honesty_label_count[cols], how="left", on="author_id")
del honesty_label_count

# ... for the first four years
cols = ["author_id", "belief_share_2010_to_2013", "truth_share_2010_to_2013"]
users = pd.merge(users, honesty_label_count_first[cols], how="left",
                 on="author_id")
del honesty_label_count_first

# ... and for the last four years
cols = ["author_id", "belief_share_2019_to_2022", "truth_share_2019_to_2022"]
users = pd.merge(users, honesty_label_count_last[cols], how="left",
                 on="author_id")
del honesty_label_count_last

## Add share of neutral tweets

In [71]:
# Share of neutral tweets (neither belief-speaking nor truth-seeking) per
# account. The share is computed as the mean of a neutral indicator over the
# same tweets that enter belief_share and truth_share (non-retweets with
# honesty labels). Previously the number of neutral tweets was divided by
# N_tweets, which counts all tweets of the account, so numerator and
# denominator referred to different sets of tweets and accounts without any
# neutral tweet received NaN instead of 0.
honesty_tweets = honesty_tweets.reset_index()
honesty_tweets["neutral"] = \
    honesty_tweets[["belief", "truth"]].sum(axis=1) == 0
neutral_share = honesty_tweets\
    .groupby("author_id")["neutral"]\
    .mean()\
    .rename("neutral_share")\
    .reset_index()

users = pd.merge(
    users,
    neutral_share,
    how="left",
    left_on="author_id",
    right_on="author_id"
)
del honesty_tweets
del neutral_share

## Add ideology scores

In [72]:
# Read the govtrack ideology scores for both chambers and the years 2013 to
# 2020. The yearly tables are collected in a list and concatenated once;
# growing a frame with pd.concat inside the loop copies all previous rows on
# every iteration and starts from an empty DataFrame, whose handling in
# concatenation is deprecated in recent pandas versions.
src = "../../data/utilities"
fname = "govtrack-stats-{}-{}-ideology.csv"
yearly_scores = []
for year in range(2013, 2021):
    for chamber in ["house", "senate"]:
        tmp = pd.read_csv(join(src, "ideology_scores",
                               fname.format(year, chamber)))
        tmp["year"] = year
        # strip the "b'...'" wrapper around the names and lowercase them
        tmp["name"] = tmp["name"].apply(
            lambda x: x.replace("b'", "").replace("'", "").lower())
        yearly_scores.append(tmp)
ideology_scores = pd.concat(yearly_scores)
del yearly_scores

In [73]:
# match politician Twitter account names to govtrack politician names

# a single politician can have at maximum 8 entries for 8 different years
# 2013 to 2020; names that occur more often are treated as ambiguous
counts = ideology_scores["name"].value_counts()
unique_names = list(counts[counts <= 8].index)

# keep the most recent entry for every unambiguous name
unique_scores = ideology_scores[ideology_scores["name"].isin(unique_names)]\
    .sort_values(by="year", ascending=False)\
    .drop_duplicates(subset=["name"])\
    .set_index("name")
# sorted, so that the outcome of the matching does not depend on the
# arbitrary iteration order of a set
unique_names = sorted(set(unique_scores.index))

def match_score(account_name):
    '''Matches govtrack politician names to Twitter account names.

    The account name is lowercased and split at spaces; a govtrack name
    matches if it is equal to one of the resulting words. If several govtrack
    names match, the alphabetically first one is used.

    Returns the govtrack "id" of the matched entry, or NaN if the account
    name is missing or no govtrack name matches.
    '''
    # NaN is the only value that is not equal to itself
    if account_name != account_name:
        return np.nan
    words = set(account_name.lower().split(" "))
    matches = words.intersection(unique_names)
    if not matches:
        return np.nan
    return unique_scores.loc[min(matches)]["id"]

users["ideology_score_id"] = users["name"].apply(match_score)

In [74]:
# Read the hand-matched govtrack IDs and turn them into a mapping
# handle -> ideology score ID.
src = "../../data/utilities"
fname = "missing_govtrack_ideology_scores.csv"
missing_scores = pd.read_csv(join(src, fname))
missing_scores = {
    row["handle"]: row["ideology_score_id"]
    for _, row in missing_scores.iterrows()
}

# The handle is used as index while the IDs are written, since it seems to be
# the most consistent key between the two datasets. Handles that do not occur
# in the user table are skipped.
users = users.set_index("handle")
for handle, score_id in missing_scores.items():
    if handle not in users.index:
        continue
    users.loc[handle, "ideology_score_id"] = score_id
users = users.reset_index()

In [75]:
# Many politicians have more than one ideology score because they were active
# over several years. Compute mean, standard deviation and number of scores
# per govtrack ID; the columns are named ideology_mean, ideology_std and
# ideology_count.
ideology_scores_agg = (
    ideology_scores.groupby("id")["ideology"]
    .agg(["mean", "std", "count"])
    .add_prefix("ideology_")
    .reset_index()
)

In [76]:
# attach the aggregated ideology scores via the matched govtrack ID
users = pd.merge(
    users,
    ideology_scores_agg,
    how="left",
    left_on="ideology_score_id",
    right_on="id",
)
del ideology_scores
del ideology_scores_agg

## Add Politifact scores

In [77]:
# attach the Politifact score to the user table via the account handle
src = "../../data/utilities"
fname = "misinfo_score_politifact.csv"
pf_scores = pd.read_csv(
    join(src, fname),
    usecols=["pf_score", "elite_account"],
).rename(columns={"elite_account": "handle"})

users = users.merge(pf_scores, how="left", on="handle")
del pf_scores

# Data exports

In [78]:
# root directory for the exports; the cells below append the sub-directories
# "urls", "users" and "tweets"
dst = "../../data/"

In [79]:
# URL data frame: keep only rows of tweets that contain a URL and remove the
# raw URL, the domain and the account-identifying columns before export.
fname = "US_politician_URLs_2010-11-06_to_2022-03-16.csv.gzip"
urls = urls[urls["has_url"] == True].drop(
    columns=["url", "domain", "status_code", "handle", "name", "has_url"]
)
urls.to_csv(join(dst, "urls", fname), compression="gzip", index=False)

In [80]:
# user data frame: remove the govtrack ID columns used for matching, then
# write an uncompressed CSV
fname = "US_politician_accounts_2010-11-06_to_2022-03-16.csv"
matching_cols = ["ideology_score_id", "id"]
users = users.drop(columns=matching_cols)
users.to_csv(join(dst, "users", fname), index=False)

In [81]:
# tweet data frame: remove the account handle and name, then write a
# gzip-compressed CSV
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"
tweets = tweets.drop(columns=["handle", "name"])
tweets.to_csv(join(dst, "tweets", fname), compression="gzip", index=False)