In [1]:
import pandas as pd
from pathlib import Path

In [55]:
# Root data folder (relative path); every section below reads its inputs from,
# and writes its reduced export to, a subfolder of this directory.
src = "../../data"

# Tweets

In [64]:
# Load the full tweet table. The two ID columns are kept as strings instead of
# being parsed as numbers. Compression is passed explicitly because pandas
# infers it from suffixes such as ".gz", and this file ends in ".gzip".
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"
tweets = pd.read_csv(
    Path(src) / "tweets" / fname,
    dtype=dict(id=str, author_id=str),
    compression="gzip",
)

In [65]:
# Columns kept in the reduced tweet export (order is the output column order).
cols = (
    # tweet-level flags and timestamp
    ["retweeted", "quoted", "reply", "created_at"]
    + ["party", "has_url", "word_count"]
    # NG / accuracy / transparency scores
    + ["NG_score", "accuracy", "transparency"]
    # belief and truth scores
    + ["avg_belief_score", "avg_truth_score"]
    # LIWC columns
    + ["LIWC_analytic", "LIWC_authentic", "LIWC_emo_pos"]
    + ["LIWC_emo_neg", "LIWC_moral"]
    # author identifier
    + ["author_id"]
)

In [None]:
# note: if running the dictionary robustness analysis, you will also have to
# include the following columns
# cols.extend([f"avg_belief_score_{i}" for i in range(100)])
# cols.extend([f"avg_truth_score_{i}" for i in range(100)])

In [66]:
# Write only the selected columns, gzip-compressed and without the row index,
# next to the raw tweet file.
tweets.loc[:, cols].to_csv(
    Path(src) / "tweets" / "tweets.csv.gzip",
    index=False,
    compression="gzip",
)

# URLs

In [13]:
# Load the full URL table; the two ID columns are kept as strings instead of
# being parsed as numbers.
fname = "US_politician_URLs_2010-11-06_to_2022-03-16.csv.gzip"
urls = pd.read_csv(
    Path(src) / "urls" / fname,
    dtype=dict(id=str, author_id=str),
    compression="gzip",
)

In [15]:
# Drop every row whose domain is one of the large social media sites (twitter,
# including its cards subdomain, youtube, facebook, instagram) or search
# engines (google, yahoo) listed below. Matching is exact, so any other
# subdomain of these sites is not removed.
# NOTE(review): this comment previously also mentioned e-commerce (amazon),
# but no amazon domain is in the list -- confirm whether one should be added.
excluded_domains = [
    "twitter.com",
    "youtube.com",
    "facebook.com",
    "instagram.com",
    "cards.twitter.com",
    "google.com",
    "yahoo.com",
]
urls = urls.loc[~urls["domain"].isin(excluded_domains)]

In [17]:
# Columns kept in the reduced URL export (order is the output column order).
cols = (
    # tweet-level flags and timestamp
    ["retweeted", "quoted", "reply", "created_at"]
    # party plus NG / accuracy / transparency scores
    + ["party", "NG_score", "accuracy", "transparency"]
    # unreliability columns
    + ["NG_unreliable", "independent_unreliable"]
    # belief and truth scores
    + ["avg_belief_score", "avg_truth_score"]
    + ["shortened_url"]
)

In [18]:
# Write only the selected columns of the filtered URL table, gzip-compressed
# and without the row index.
urls.loc[:, cols].to_csv(
    Path(src) / "urls" / "urls.csv.gzip",
    index=False,
    compression="gzip",
)

# Articles

In [40]:
# Three gzip-compressed CSV tables are read from the "articles" subfolder.

# honesty component scores of the article corpus
fname = "article_corpus_clean_honesty_component_scores_glove.csv.gzip"
article_honesty_scores = pd.read_csv(Path(src) / "articles" / fname, compression="gzip")

# NG scores per URL
fname = "url_NG_scores.csv.gzip"
article_NG_scores = pd.read_csv(Path(src) / "articles" / fname, compression="gzip")

# independent scores per URL
fname = "url_independent_scores.csv.gzip"
article_independent_scores = pd.read_csv(Path(src) / "articles" / fname, compression="gzip")

In [41]:
# Attach party / NG score and accuracy / transparency to the honesty scores.
# Both joins are left joins on "url", so every honesty-score row is kept and
# URLs without a match get missing values in the added columns. The final step
# keeps only rows whose party is "Democrat" or "Republican", which also drops
# rows with a missing party.
articles = (
    article_honesty_scores
    .merge(
        article_NG_scores[["url", "party", "NG_score"]],
        how="left",
        on="url",
    )
    .merge(
        article_independent_scores[["url", "accuracy", "transparency"]],
        how="left",
        on="url",
    )
    .loc[lambda merged: merged["party"].isin(["Democrat", "Republican"])]
)

In [42]:
# One row per URL with the number of non-null "party" entries found for it.
# NOTE(review): a count above 1 presumably means the URL was shared by more
# than one party -- verify against the layout of the NG score table.
party_counts = (
    articles
    .groupby("url")["party"]
    .count()
    .reset_index(name="party_count")
)

In [43]:
# Keep only articles whose URL has exactly one party entry.
single_count_urls = party_counts.loc[party_counts["party_count"] == 1, "url"]
articles = articles[articles["url"].isin(single_count_urls)]

In [44]:
# Row count of `articles` after the party filter and the one-entry-per-URL filter.
len(articles)

125220

In [45]:
# Columns kept in the reduced article export (order is the output column order).
cols = (
    # party plus NG / accuracy / transparency scores
    ["party", "NG_score", "accuracy", "transparency"]
    # belief and truth scores
    + ["avg_belief_score", "avg_truth_score"]
)

In [46]:
# Write only the selected columns of the filtered article table,
# gzip-compressed and without the row index.
articles.loc[:, cols].to_csv(
    Path(src) / "articles" / "articles.csv.gzip",
    index=False,
    compression="gzip",
)

In [47]:
# Remove the `articles` name from the namespace now that it has been exported.
del articles

# NYT

In [48]:
# Honesty component scores of the NYT abstracts.
fname = "NYT_abstracts_honesty_component_scores_glove.csv.gzip"
honesty_scores = pd.read_csv(Path(src) / "NYT" / fname, compression="gzip")

# Abstract sections: only the join key "id" and "section" are read from the
# abstracts file. The left join on "id" keeps every abstract row, including
# those without a matching score row.
fname = "NYT_abstracts.csv.gzip"
cols = ["id", "section"]
abstracts = (
    pd.read_csv(Path(src) / "NYT" / fname, usecols=cols, compression="gzip")
    .merge(honesty_scores, how="left", on="id")
)

In [50]:
# Columns kept in the reduced NYT export: the section label followed by the
# belief and truth scores.
cols = ["section"] + ["avg_belief_score", "avg_truth_score"]

In [53]:
# Write only the selected columns of the merged abstracts table,
# gzip-compressed and without the row index.
abstracts.loc[:, cols].to_csv(
    Path(src) / "NYT" / "abstracts.csv.gzip",
    index=False,
    compression="gzip",
)