In [1]:
import ast
import json
from os.path import join

import pandas as pd

# Hydrate tweets from tweet IDs

In [None]:
# US politician timelines: hydrate the tweet IDs listed in the *_clean_tweetIDs.txt
# file and write the hydrated tweets to a .jsonl file.
# 2,717,708 tweet IDs
# Takes about 8 hours, as the rate limit for hydrating is 360,000 tweets / hour.
# Needs about 8 GB of space on the hard drive.
! twarc2 hydrate ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean_tweetIDs.txt ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16.jsonl

In [None]:
# Convert the hydrated .jsonl file to CSV with the twarc2 csv command.
# The input records are declared as tweets via --input-data-type.
# The resulting *_hydrated.csv needs about 6 GB uncompressed.
! twarc2 csv --input-data-type tweets ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16.jsonl ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_hydrated.csv

In [3]:
# Clean up: delete the intermediate .jsonl file.
# NOTE: only run this after the csv conversion has finished successfully;
# the deletion is permanent and recreating the file means hydrating again.
! rm ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16.jsonl

In [None]:
# Compress the hydrated CSV with xz (-v: verbose output).
# The cleaning step further down reads the resulting *_hydrated.csv.xz file.
! xz -v ../../data/tweets/combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_hydrated.csv

# Clean hydrated tweets

In [11]:
def extract_urls(tweets):
    '''Extracts URLs from the JSON objects the twitter API returns.

    Parameters
    ----------
    tweets : mapping or pandas.DataFrame
        Must provide an "entities.urls" entry that iterates over one value
        per tweet: either a missing value (NaN or None) or a string holding
        a list of URL entities, each with the keys "url" and "expanded_url".

    Returns
    -------
    urls : list of list of str
        For every tweet, the "url" value of each of its URL entities
        (empty list if the tweet has no entry).
    expanded_urls : list of list of str
        For every tweet, the "expanded_url" value of each of its URL
        entities (empty list if the tweet has no entry).

    Raises
    ------
    ValueError or SyntaxError
        If a string is neither valid JSON nor a valid Python literal.
    KeyError
        If a URL entity lacks the "url" or "expanded_url" key.
    '''
    urls = []
    expanded_urls = []
    for raw in tweets["entities.urls"]:
        # Missing values show up as None or as NaN; NaN is the only value
        # that is not equal to itself.
        if raw is None or raw != raw:
            urls.append([])
            expanded_urls.append([])
            continue

        # Parse without executing code: the strings come from a data file,
        # so eval() would run whatever expression a cell happens to contain.
        # JSON is tried first (it also understands null / true / false);
        # literal_eval covers strings written as Python literals.
        try:
            entries = json.loads(raw)
        except ValueError:
            entries = ast.literal_eval(raw)

        urls.append([entry["url"] for entry in entries])
        expanded_urls.append([entry["expanded_url"] for entry in entries])

    return urls, expanded_urls

In [None]:
# Country label used in the log line below.
country = "US"

src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_hydrated.csv.xz"

# Read the IDs as strings instead of letting pandas infer a numeric dtype.
tweets = pd.read_csv(join(src, fname),
                     dtype={"id":str, "author_id":str},
                     compression="xz")

print(f"{country}: {len(tweets)} tweets")

# A tweet is a retweet / quote / reply exactly when the corresponding
# referenced-tweet ID column holds a value.
tweets["retweeted"] = tweets["referenced_tweets.retweeted.id"].notna()
tweets["quoted"] = tweets["referenced_tweets.quoted.id"].notna()
tweets["reply"] = tweets["referenced_tweets.replied_to.id"].notna()

# clean up column names
tweets = tweets.rename(columns={
    'public_metrics.like_count':'like_count',
    'public_metrics.reply_count':'reply_count',
    'public_metrics.retweet_count':'retweet_count',
    'public_metrics.quote_count':'quote_count',
})

# extract URLs (one list per tweet; empty list if the tweet has none)
urls, expanded_urls = extract_urls(tweets)
tweets["urls"] = urls
tweets["expanded_urls"] = expanded_urls

# dump data: only the columns listed in `cols` are written, so the "urls"
# column stays in the in-memory frame but is not part of the output file
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-03-16_clean.csv.gzip"
cols = ["id", "author_id", "created_at", "expanded_urls", "retweeted",
        "quoted", "reply", "text", "retweet_count", "reply_count", 
        "like_count", "quote_count"]
tweets[cols].to_csv(join(src, fname), index=False, compression="gzip")