In [1]:
import pandas as pd
from pathlib import Path

In [None]:
# Root data directory, given as a relative path. The loading cells join it
# with the "tweets" sub-folder and a file name via Path(src, "tweets", fname).
src = "../../data/"

In [66]:
# Read the tweet metadata from the gzip-compressed CSV in the "tweets" folder.
# Both ID columns are read as strings and "created_at" is parsed as datetimes.
fname = "US_politician_tweets_2010-11-06_to_2022-03-16.csv.gzip"
tweets = pd.read_csv(
    Path(src) / "tweets" / fname,
    compression="gzip",
    dtype=dict.fromkeys(["id", "author_id"], str),
    parse_dates=["created_at"],
)

In [None]:
# Clean the tweets. Each step narrows `tweets` and prints how many rows it removed.

# Step 1: keep only rows whose "retweeted" flag equals False (drops retweets).
N = len(tweets)
tweets = tweets.loc[tweets["retweeted"].eq(False)]
print(f"dropped {N - len(tweets)} retweets")

# Step 2: keep only rows where both score columns are present, i.e. drop
# tweets without honesty component (distill RoBERTa filtering).
N = len(tweets)
tweets = tweets.dropna(subset=["avg_belief_score", "avg_truth_score"], how="any")
print(f"dropped {N - len(tweets)} tweets without an honesty score")

# Step 3: derive a "year" column, move "created_at" into the index, and keep
# only tweets from 2011 onwards (drops the tweets from 2010).
tweets = tweets.assign(
    year=pd.to_datetime(tweets["created_at"]).dt.year
).set_index("created_at")
N = len(tweets)
tweets = tweets.loc[tweets.index.year >= 2011]
print(f"dropped {N - len(tweets)} tweets from before 2011")

In [74]:
# Load the tweet texts (only the "id" and "text" columns) from the
# gzip-compressed CSV in the "tweets" folder.
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
cols = ["id", "text"]
texts = pd.read_csv(
    Path(src, "tweets", fname),
    compression="gzip",
    usecols=cols,
    # Read IDs as strings so they are never inferred as numbers and the
    # string clean-up below cannot hit a non-string value.
    dtype={"id": str},
)
# Strip double-quote characters from the IDs BEFORE de-duplicating. IDs that
# differ only by stray quotes would otherwise survive drop_duplicates and then
# collide, so that a later join on "id" would multiply rows.
# The vectorised .str accessor also passes missing IDs through unchanged.
texts["id"] = texts["id"].str.replace('"', "", regex=False)
texts = texts.drop_duplicates(subset="id")

In [75]:
# Attach the text to each tweet with a left join on "id": every row of
# `tweets` is kept, and rows with no match in `texts` get a missing "text".
# NOTE: a column-on-column merge ignores the existing index of `tweets`, so
# the result carries a fresh integer index.
tweets = tweets.merge(texts, how="left", on="id")

In [85]:
# Columns carried into every subset below.
cols = ["id", "party", "author_id", "avg_belief_score", "avg_truth_score", "text"]

# Upper (75%) and lower (25%) quartile cutoffs for both scores.
belief_cutoff_bottom = tweets["avg_belief_score"].quantile(0.25)
belief_cutoff_top = tweets["avg_belief_score"].quantile(0.75)
truth_cutoff_bottom = tweets["avg_truth_score"].quantile(0.25)
truth_cutoff_top = tweets["avg_truth_score"].quantile(0.75)

# Belief score in the top quartile AND truth score in the bottom quartile.
high_belief_tweets = tweets.loc[
    tweets["avg_belief_score"].ge(belief_cutoff_top)
    & tweets["avg_truth_score"].le(truth_cutoff_bottom),
    cols,
]

# Truth score in the top quartile AND belief score in the bottom quartile.
high_truth_tweets = tweets.loc[
    tweets["avg_truth_score"].ge(truth_cutoff_top)
    & tweets["avg_belief_score"].le(belief_cutoff_bottom),
    cols,
]

# Both scores in their bottom quartile.
low_honesty_tweets = tweets.loc[
    tweets["avg_truth_score"].le(truth_cutoff_bottom)
    & tweets["avg_belief_score"].le(belief_cutoff_bottom),
    cols,
]

In [89]:
def _sample_party_tweets(frame, party, category, n=10, random_state=42):
    """Draw a reproducible random sample of one party's tweets and label it.

    Parameters
    ----------
    frame : pd.DataFrame
        Tweets to sample from; must contain a "party" column.
    party : str
        Value of the "party" column to keep before sampling.
    category : str
        Label written to the new "category" column of the result.
    n : int, default 10
        Number of rows to draw.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pd.DataFrame
        The ``n`` sampled rows with an added "category" column.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` if fewer than ``n`` rows match ``party``
        (sampling is done without replacement).
    """
    party_rows = frame[frame["party"] == party]
    picked = party_rows.sample(n=n, random_state=random_state)
    return picked.assign(category=category)


# One labelled sample per (subset, party) combination; the category label
# matches the variable name.
high_belief_dem = _sample_party_tweets(high_belief_tweets, "Democrat", "high_belief_dem")
high_belief_rep = _sample_party_tweets(high_belief_tweets, "Republican", "high_belief_rep")
high_truth_dem = _sample_party_tweets(high_truth_tweets, "Democrat", "high_truth_dem")
high_truth_rep = _sample_party_tweets(high_truth_tweets, "Republican", "high_truth_rep")
low_honesty_dem = _sample_party_tweets(low_honesty_tweets, "Democrat", "low_honesty_dem")
low_honesty_rep = _sample_party_tweets(low_honesty_tweets, "Republican", "low_honesty_rep")

In [94]:
# Stack the six labelled samples under a fresh 0..n-1 index, then shuffle all
# rows (frac=1) with a fixed seed so the row order is reproducible.
sample = pd.concat(
    [
        high_belief_dem,
        high_belief_rep,
        high_truth_dem,
        high_truth_rep,
        low_honesty_dem,
        low_honesty_rep,
    ],
    ignore_index=True,
).sample(frac=1, random_state=42)

In [98]:
# Write the shuffled validation sample to the "tweets" folder under `dst`,
# without the DataFrame index.
dst = "../../data"
sample.to_csv(Path(dst) / "tweets" / "document_validation_sample.csv", index=False)