In [36]:
import pandas as pd
from pathlib import Path
from scipy.stats import linregress

In [2]:
# Root of the data directory (relative path); every input and output path
# in this notebook is built from it.
src = "../../data"

# Tweets

In [21]:
# Read the politician tweet corpus. Both ID columns are parsed as strings so
# that they are never coerced to numbers.
fname = "US_politician_tweets_2010-11-06_to_2022-12-31.csv.gzip"
tweets = pd.read_csv(
    Path(src) / "tweets" / fname,
    dtype={"author_id": str, "id": str},
    compression="gzip",
)

In [22]:
# Read the cleaned timeline data, keeping only the tweet ID and the three
# engagement counters. The ID is parsed as a string.
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
interactions = pd.read_csv(
    Path(src) / "tweets" / fname,
    dtype={"id": str},
    usecols=["id", "like_count", "retweet_count", "reply_count"],
    compression="gzip",
)
# Strip stray double quotes from the IDs *before* de-duplicating: an ID that
# occurs both with and without quotes only collapses to a single row once the
# quotes are gone. De-duplicating first would keep both rows, and a later left
# merge on "id" would then duplicate the matching tweet.
# `.str.replace(..., regex=False)` is a literal replacement and, unlike a
# Python-level `apply`, passes missing IDs through instead of raising.
interactions = (
    interactions
    .assign(id=interactions["id"].str.replace('"', "", regex=False))
    .drop_duplicates(subset="id")
)

In [23]:
# Attach the engagement counters to every tweet. The left join keeps tweets
# without an entry in `interactions`; their counters are then missing.
tweets = tweets.merge(interactions, on="id", how="left")

In [24]:
# Columns written to the tweet export, grouped by topic.
cols = (
    # tweet type and timestamp
    ["retweeted", "quoted", "reply", "created_at"]
    # engagement counters
    + ["like_count", "retweet_count", "reply_count"]
    # author party and basic tweet properties
    + ["party", "has_url", "tweet_length"]
    # source-quality scores
    + ["NG_score", "accuracy", "transparency"]
    # honesty-component scores (default embedding)
    + ["avg_belief_score", "avg_truth_score"]
    # honesty-component scores (alternative embeddings)
    + ["avg_belief_score_word2vec", "avg_truth_score_word2vec"]
    + ["avg_belief_score_fasttext", "avg_truth_score_fasttext"]
    # LIWC and VADER text features
    + ["LIWC_analytic", "LIWC_authentic", "LIWC_moral"]
    + ["VADER_neg", "VADER_pos", "VADER_neu", "VADER_compound"]
    # author identifier
    + ["author_id"]
)

In [25]:
# note: if running the dictionary robustness analysis, you will also have to
# include the following columns
#cols.extend([f"avg_belief_score_{i}" for i in range(100)])
#cols.extend([f"avg_truth_score_{i}" for i in range(100)])

In [26]:
# Write only the selected columns as a gzip-compressed CSV, without the index.
tweets.loc[:, cols].to_csv(
    Path(src) / "tweets" / "tweets.csv.gzip",
    index=False,
    compression="gzip",
)

# URLs

In [27]:
# Read the table of URLs shared by politicians. Both ID columns are parsed as
# strings.
fname = "US_politician_URLs_2010-11-06_to_2022-12-31.csv.gzip"
urls = pd.read_csv(
    Path(src) / "urls", fname) if False else pd.read_csv(
    Path(src) / "urls" / fname,
    dtype={"author_id": str, "id": str},
    compression="gzip",
)

In [28]:
# Drop every entry whose domain is one of the large social-media sites
# (twitter, facebook, youtube, instagram) or search sites (google, yahoo)
# listed below. Matching is exact on the "domain" column.
# NOTE(review): the original comment also mentioned e-commerce (amazon), but
# no amazon domain is in this list -- confirm which of the two is intended.
excluded_domains = [
    "twitter.com",
    "youtube.com",
    "facebook.com",
    "instagram.com",
    "cards.twitter.com",
    "google.com",
    "yahoo.com",
]
urls = urls.loc[~urls["domain"].isin(excluded_domains)]

In [29]:
# Columns written to the URL export. Every name appears exactly once:
# selecting a label twice would put two identical columns into the CSV, and
# pandas renames the second one (e.g. "party.1") when the file is read back.
cols = [
    "retweeted", "quoted", "reply", "created_at", "party",
    "like_count", "retweet_count", "reply_count",
    "NG_score", "accuracy", "transparency",
    "NG_unreliable", "independent_unreliable",
    "avg_belief_score", "avg_truth_score",
    "shortened_url"
]

In [30]:
# Write only the selected columns as a gzip-compressed CSV, without the index.
urls.loc[:, cols].to_csv(
    Path(src) / "urls" / "urls.csv.gzip",
    index=False,
    compression="gzip",
)

# Articles

In [28]:
# Honesty-component scores per article; fully identical rows are dropped.
fname = "article_corpus_clean_honesty_component_scores_glove.csv.gzip"
article_honesty_scores = pd.read_csv(
    Path(src) / "articles" / fname,
    compression="gzip",
)
article_honesty_scores = article_honesty_scores.drop_duplicates()

# Per-URL table that provides the "party" and "NG_score" columns used below.
fname = "url_NG_scores.csv.gzip"
article_NG_scores = pd.read_csv(
    Path(src) / "articles" / fname,
    compression="gzip",
)

# Per-URL table that provides the "accuracy" and "transparency" columns used
# below.
fname = "url_independent_scores.csv.gzip"
article_independent_scores = pd.read_csv(
    Path(src) / "articles" / fname,
    compression="gzip",
)

# Only the URL and the word count are needed from the article corpus; "wc" is
# renamed to the more descriptive "word_count".
fname = "article_corpus_clean.csv.gzip"
texts = pd.read_csv(
    Path(src) / "articles" / fname,
    usecols=["url", "wc"],
    compression="gzip",
)
texts = texts.rename(columns={"wc": "word_count"})

In [29]:
# Assemble the article table with three successive left joins on "url":
# honesty scores <- party / NG score <- accuracy / transparency <- word count.
# A left join repeats a row once per matching row on the right, so a URL that
# occurs several times in a score table yields several article rows.
articles = (
    article_honesty_scores
    .merge(
        article_NG_scores[["url", "party", "NG_score"]],
        on="url",
        how="left",
    )
    .merge(
        article_independent_scores[["url", "accuracy", "transparency"]],
        on="url",
        how="left",
    )
    .merge(texts, on="url", how="left")
)

# Keep only rows attributed to one of the two major parties; rows with any
# other or a missing party value are dropped.
articles = articles.loc[articles["party"].isin(["Democrat", "Republican"])]

In [30]:
# Number of rows with a non-missing "party" for every URL.
# NOTE(review): this counts rows, not distinct parties -- a URL duplicated by
# the preceding joins is counted once per resulting row. Confirm that this is
# the intended meaning of "party_count".
party_counts = (
    articles
    .groupby("url")["party"]
    .count()
    .reset_index(name="party_count")
)

In [31]:
# Add the per-URL row count to every article row, then reduce the table to one
# row per URL. `drop_duplicates` keeps the first row it encounters, so for a
# URL with several rows the party and scores of that first row are retained.
articles = (
    articles
    .merge(party_counts, on="url", how="left")
    .drop_duplicates(subset=["url"])
)

In [32]:
# Distribution of the per-URL row counts after de-duplication: how many
# articles had exactly one row and how many had several.
articles["party_count"].value_counts()

1    348662
4      2833
Name: party_count, dtype: int64

In [33]:
# Ratio of articles with more than one row per URL to articles with exactly
# one row. Computed from the data rather than from numbers copied by hand out
# of a previous output, so it stays correct when the inputs change.
float((articles["party_count"] > 1).sum() / (articles["party_count"] == 1).sum())

0.008125347758000586

In [34]:
# Mark the uncorrected scores with a "_raw" suffix so that the original column
# names are free for the length-corrected scores.
articles = articles.rename(
    columns={
        name: f"{name}_raw"
        for name in ("avg_belief_score", "avg_truth_score")
    }
)

In [37]:
# Length correction: regress each raw honesty score on the article word count
# and keep the residual (raw score minus the score the linear fit predicts for
# that length) as the corrected score.
# NOTE(review): linregress does not drop missing values; a NaN in either
# column would propagate into the fit -- confirm the inputs are complete.
slope_belief, intercept_belief, rval_belief, pval_belief, stderr_belief = \
    linregress(articles["word_count"], articles["avg_belief_score_raw"])
print(f"belief-speaking slope: {slope_belief}, intercept: {intercept_belief}")

def predict_belief_similarity(tweet_length):
    """Return the belief-speaking score the linear fit predicts for a length.

    `tweet_length` is the text length in the unit the fit was made on (here
    the article word count; the parameter name is kept for backward
    compatibility). Works element-wise on scalars, arrays and pandas Series.
    """
    return intercept_belief + slope_belief * tweet_length

slope_truth, intercept_truth, rval_truth, pval_truth, stderr_truth = \
    linregress(articles["word_count"], articles["avg_truth_score_raw"])
print(f"truth-seeking slope: {slope_truth}, intercept: {intercept_truth}")

def predict_truth_similarity(tweet_length):
    """Return the truth-seeking score the linear fit predicts for a length.

    `tweet_length` is the text length in the unit the fit was made on (here
    the article word count; the parameter name is kept for backward
    compatibility). Works element-wise on scalars, arrays and pandas Series.
    """
    return intercept_truth + slope_truth * tweet_length

# Vectorised residuals: the same arithmetic as a row-wise `apply`, evaluated
# on whole columns at once instead of one Python call per article.
articles["avg_belief_score"] = (
    articles["avg_belief_score_raw"]
    - predict_belief_similarity(articles["word_count"])
)
articles["avg_truth_score"] = (
    articles["avg_truth_score_raw"]
    - predict_truth_similarity(articles["word_count"])
)

belief-speaking slope: 9.305211874234313e-06, intercept: 0.7575757653517651
truth-seeking slope: 4.458930029948073e-06, intercept: 0.7095668342641687


In [39]:
# Columns written to the article export, grouped by topic.
cols = (
    # party and source-quality scores
    ["party", "NG_score", "accuracy", "transparency"]
    # length-corrected honesty-component scores
    + ["avg_belief_score", "avg_truth_score"]
    # uncorrected honesty-component scores
    + ["avg_belief_score_raw", "avg_truth_score_raw"]
    # bookkeeping columns
    + ["party_count", "word_count"]
)

In [40]:
# Write only the selected columns as a gzip-compressed CSV, without the index.
articles.loc[:, cols].to_csv(
    Path(src) / "articles" / "articles.csv.gzip",
    index=False,
    compression="gzip",
)

# NYT

In [49]:
# Honesty-component scores of the NYT abstracts.
fname = "NYT_abstracts_honesty_component_scores_glove.csv.gzip"
honesty_scores = pd.read_csv(Path(src) / "NYT" / fname, compression="gzip")

# Section (category) of every abstract; only the ID and the section are read.
fname = "NYT_abstracts.csv.gzip"
cols = ["id", "section"]
abstracts = pd.read_csv(
    Path(src) / "NYT" / fname,
    usecols=cols,
    compression="gzip",
)

# Left join on "id": every abstract is kept, and abstracts without a matching
# score row end up with missing scores.
abstracts = abstracts.merge(honesty_scores, on="id", how="left")

In [50]:
# Columns written to the NYT export: the section plus both honesty scores.
cols = ["section"] + ["avg_belief_score", "avg_truth_score"]

In [51]:
# Write only the selected columns as a gzip-compressed CSV, without the index.
abstracts.loc[:, cols].to_csv(
    Path(src) / "NYT" / "abstracts.csv.gzip",
    index=False,
    compression="gzip",
)