In [None]:
# author: Jana Lasser

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from os import listdir
import os

# Get user timelines

**Note**: on 2023-02-11 we discovered that we were using the twarc2 timelines command without the --use-search parameter. This resulted in us getting only the last 3200 tweets from every account. We repeated the data collection on 2023-02-11 and 2023-02-12 to retrieve the missing tweets. The old data is in the folder ../data_2022-02-11.

In [11]:
src = "../../data/users"
fname = "US_politician_twitter_accounts_clean.csv"
users = pd.read_csv(Path(src, fname), dtype={"author_id":str})

In [20]:
# split the list of accounts into batches
dst = "../../data/users"
if not os.path.exists(Path(dst, "user_batches")): os.mkdir(Path(dst, "user_batches"))
N_keys = 11
batch_size = int(len(users) / N_keys)
for i in range(N_keys):
    batch = users["author_id"][i * batch_size : (i+ 1) * batch_size]
    np.savetxt(Path(dst, "user_batches", f"US_politician_twitter_accounts_batch_{i}.txt"), 
               batch, fmt="%s")

In [None]:
# upload the account ID batches to the server
! rsync -avze ssh ../../data/users/user_batches/US_politician_twitter_accounts_batch_* jlasser@medea:/data/honesty/corpora/Twitter/US_politician_twitter_accounts_ontology/

In [None]:
# commands on medea to download the user timelines for each batch using twarc2
cd /data/honesty/corpora/Twitter/US_politician_twitter_accounts_ontology
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_0.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_0.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_1.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_1.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_2.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_2.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_3.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_3.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_4.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_4.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_5.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_5.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_6.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_6.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_7.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_7.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_8.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_8.jsonl
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_9.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_9.jsonl

# additional accounts of the 118th congress
twarc2 timelines --use-search --no-context-annotations --start-time 2010-11-06 --end-time 2023-02-11 US_politician_twitter_accounts_batch_10.txt ../US_politician_timelines_ontology_update/US_politician_twitter_accounts_batch_10.jsonl

In [None]:
# commands on the server to convert the raw .json files to .csv
cd ../US_politician_timelines_ontology_update
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_0.jsonl US_politician_twitter_accounts_batch_0.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_1.jsonl US_politician_twitter_accounts_batch_1.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_2.jsonl US_politician_twitter_accounts_batch_2.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_3.jsonl US_politician_twitter_accounts_batch_3.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_4.jsonl US_politician_twitter_accounts_batch_4.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_5.jsonl US_politician_twitter_accounts_batch_5.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_6.jsonl US_politician_twitter_accounts_batch_6.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_7.jsonl US_politician_twitter_accounts_batch_7.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_8.jsonl US_politician_twitter_accounts_batch_8.csv
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_9.jsonl US_politician_twitter_accounts_batch_9.csv

# additional accounts of the 118th congress
twarc2 csv --extra-input-columns "public_metrics.impression_count,edit_history_tweet_ids" --input-data-type tweets US_politician_twitter_accounts_batch_10.jsonl US_politician_twitter_accounts_batch_10.csv

In [None]:
# commands on the server to compress the .csv files
xz -v US_politician_twitter_accounts_batch_0.csv
xz -v US_politician_twitter_accounts_batch_1.csv
xz -v US_politician_twitter_accounts_batch_2.csv
xz -v US_politician_twitter_accounts_batch_3.csv
xz -v US_politician_twitter_accounts_batch_4.csv
xz -v US_politician_twitter_accounts_batch_5.csv
xz -v US_politician_twitter_accounts_batch_6.csv
xz -v US_politician_twitter_accounts_batch_7.csv
xz -v US_politician_twitter_accounts_batch_8.csv
xz -v US_politician_twitter_accounts_batch_9.csv

# additional accounts of the 118th congress
xz -v US_politician_twitter_accounts_batch_10.csv

In [1]:
! mkdir ../../data/tweets/timeline_batches

In [7]:
# download the .csv files from the server
! rsync -avze ssh jlasser@medea:/data/honesty/corpora/Twitter/US_politician_timelines_ontology_update/*.xz ../../data/tweets/timeline_batches 

receiving incremental file list
US_politician_twitter_accounts_batch_10.csv.xz

sent 43 bytes  received 35,404,389 bytes  7,867,651.56 bytes/sec
total size is 1,133,057,136  speedup is 32.00


In [8]:
src = "../../data/tweets/timeline_batches"
fnames = listdir(src)
fnames = [f for f in fnames if f.endswith(".xz")]
timelines = pd.concat([pd.read_csv(Path(src, fname), compression="xz", 
                                   dtype={"id":str, "author_id":str}, low_memory=False) \
                       for fname in fnames])
timelines = timelines.reset_index(drop=True)

In [14]:
timelines = timelines[timelines["author_id"].isin(users["author_id"])]

In [16]:
dst = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2023-02-11_raw.csv.gzip"
timelines.to_csv(Path(dst, fname), compression="gzip", index=False)

# Clean the data

In [2]:
src = "../../data/tweets"
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2023-02-11_raw.csv.gzip"
timelines = pd.read_csv(
    Path(src, fname), 
    compression="gzip", 
    dtype={"id":str, "author_id":str})
timelines["created_at"] = pd.to_datetime(timelines["created_at"], errors="coerce")

  timelines = pd.read_csv(


In [3]:
# make sure all tweets are within the desired time window
start = pd.to_datetime("2010-11-06", utc="UTC")
end = pd.to_datetime("2022-12-31", utc="UTC")
N = len(timelines)
timelines = timelines[timelines["created_at"] >= start]
timelines = timelines[timelines["created_at"] <= end]
print(f"dropped {N - len(timelines)} tweets outside the desired time window")

dropped 74283 tweets outside the desired time window


In [4]:
len(timelines)

5926497

In [6]:
N = len(timelines)
timelines = timelines.drop_duplicates(subset="id")
print(f"dropped {N - len(timelines)} duplicates")

dropped 47337 duplicates


In [7]:
len(timelines)

5879160

In [8]:
# get tweet type
timelines["retweeted"] = False
timelines["quoted"] = False
timelines["reply"] = False
timelines.loc[timelines["referenced_tweets.retweeted.id"].dropna().index, "retweeted"] = True
timelines.loc[timelines["referenced_tweets.quoted.id"].dropna().index, "quoted"] = True
timelines.loc[timelines["referenced_tweets.replied_to.id"].dropna().index, "reply"] = True

In [9]:
# drop retweets
N = len(timelines)
timelines = timelines[timelines["retweeted"] == False]
print(f"dropped {N - len(timelines)} retweets")

dropped 1351346 retweets


In [10]:
# new 2010-11-06 to 2022-03-16: 5392309 tweets remaining
# original 2010-11-06 to 2022-03-16 : 2205517 tweets remaining
print(f"{len(timelines)} tweets remaining")

4527814 tweets remaining


In [11]:
print("{} quote tweets".format(len(timelines[timelines["quoted"] == True])))
print("{} replies".format(len(timelines[timelines["reply"] == True])))

529389 quote tweets
572113 replies


In [7]:
# clean up column names
timelines = timelines.rename(columns={
    'public_metrics.like_count':'like_count',
    'public_metrics.reply_count':'reply_count',
    'public_metrics.retweet_count':'retweet_count',
    'public_metrics.quote_count':'quote_count',
})

In [8]:
# extract URL objects
urls = []
expanded_urls = []
for obj in timelines["entities.urls"]:
    if obj != obj:
        urls.append([])
        expanded_urls.append([])
    else:
        obj = eval(obj)
        tmp_urls = []
        tmp_expanded_urls = []
        for entry in obj:
            tmp_urls.append(entry["url"])
            tmp_expanded_urls.append(entry["expanded_url"])
        urls.append(tmp_urls)
        expanded_urls.append(tmp_expanded_urls)

In [9]:
timelines["urls"] = urls
timelines["expanded_urls"] = expanded_urls

# Export the data

In [16]:
fname = "combined_US_politician_twitter_timelines_2010-11-06_to_2022-12-31_clean.csv.gzip"
dst = "../../data/tweets"
cols = ["id", "author_id", "created_at", "expanded_urls", "retweeted", "quoted",
        "reply", "text", "retweet_count", "reply_count", "like_count",
        "quote_count"]
timelines[cols].to_csv(Path(dst, fname), index=False, compression="gzip")

In [17]:
# export tweet IDs for hydration
fname = "tweet_IDs.txt.xz"
dst = "../../data/tweets/"
np.savetxt(Path(dst, fname), timelines["id"].values, fmt="%s")

# Export URLs to unravel

In [30]:
def extract_domain(url):
    if url != url:
        return np.nan
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol
    if url.startswith(("http", "https")):
        try:
            url = url.split('//')[1]
        except IndexError:
            print(f"found malformed URL {url}")
            return np.nan
    # remove "www." 
    url = url.replace('www.', '')
    url = url.split("/")[0]
    return url


In [31]:
URLs = []
for url_list in timelines["expanded_urls"]:
    URLs.extend(url_list)
URLs = pd.DataFrame({"url":list(set(URLs))})
URLs["domain"] = URLs["url"].apply(extract_domain)

In [32]:
# initial list of shorteners from this repo:
# https://github.com/boutetnico/url-shorteners
src = "../../data/utilities"
url_shorteners = list(np.loadtxt(Path(src, "url_shorteners.txt"), dtype=str))

# add URL shorteners based on manual inspections of all URLs that appeared >100
# times in the dataset
url_shorteners.extend([
    "fb.me", "buff.ly", "nyti.ms", "wapo.st", "youtu.be", "1.usa.gov", "fxn.ws",
    "on.fb.me", "politi.co", "trib.al", "washex.am", "hill.cm", "cnb.cx",
    "hubs.ly", "cs.pn","n.pr", "conta.cc", "mi.tt", "usat.ly", "abcn.ws",
    "reut.rs", "cbsn.ws", "huff.to", "instagr.am", "bloom.bg", "fw.to", 
    "ift.tt", "strib.mn", "lat.ms", "afs.mn", "dpo.st", "mailchi.mp",
    "dailysign.al", "tmblr.co", "rub.io", "yhoo.it", "omny.fm", "chrl.ie",
    "tulsi.to", "apne.ws", "hrc.io", "ed.gr", "ti.me", "herit.ag", "indy.st",
    "ofa.bo", "trib.in", "azc.cc", "bsun.md", "wjcf.co", "bityl.co", "go.shr.lc",
    "t1p.de", "m.bild.de", "sz.de", "m.faz.net", "zpr.io", "m.tagesspiegel.de",
    "to.welt.de", "gleft.de", "nol.is", "m.spiegel.de", "m.youtube.com", 
    "m.facebook.com", "m.focus.de", "loom.ly", "t.me", "4sq.com", "diplo.de",
    "p.dw.com", "owl.li", "tmi.me", "m.haz.de", "ly.zdf.de", "chng.it", "img.ly",
    "m.augsburger-allgemeine.de", "x.swr.de", "m.fr.de", "ebx.sh", "m.fr.de",
    "fcld.ly", "spoti.fi", "shar.es", "s.rlp.de", "m.welt.de", "bbc.in", 
    "on.ft.com", "fb.watch", "mol.im", "crowd.in ", "zcu.io", "gu.com",
    "lnkd.in", "shorturl.at", "m.huffingtonpost.co.uk", "fal.cn", "lght.ly", 
    "econ.st", "huffp.st", "l-bc.co", "wbs.wales", "aca.st ", "ind.pn", "cutt.ly",
    "dailym.ai", "ow.ly"
])

In [33]:
# original: 139972 shortened and 1160971 unshortened URLs
shortened_urls = URLs[URLs["domain"].isin(url_shorteners)]
unshortened_urls = URLs[~URLs["domain"].isin(url_shorteners)]
print(f"{len(shortened_urls)} shortened and {len(unshortened_urls)} unshortened URLs")

565035 shortened and 3269841 unshortened URLs


In [47]:
fname = "US_politician_URLs.csv.gzip"
dst = "../../data/urls"
shortened_urls["url"].to_csv(Path(dst, fname),
                    compression="gzip", index=False)

In [49]:
# copy the URLs that need unravelling to the server
! rsync -avze ssh ../../data/urls/US_politician_URLs.csv.gzip jlasser@medea:/home/jlasser/Honesty-project/data/

sending incremental file list
US_politician_URLs.csv.gzip

sent 4,305,719 bytes  received 35 bytes  1,722,301.60 bytes/sec
total size is 4,303,149  speedup is 1.00


In [None]:
# run on the server: follow all the URLs. Note: this takes about a day per 100k URLs
! python unravel_urls.py ../../data/urls/US_politician_URLs.csv.gzip -dst ../../data/urls/US_politician_unraveled_urls/ -v 1

# Load unraveled URLs

In [None]:
# load unraveled URLs from server (they are stored in batches of 1000 URLs
! rsync -avze ssh jlasser@medea:/home/jlasser/Honesty-project/data/US_politician_unraveled_urls/ ../../data/urls/US_politician_unraveled_urls/

In [2]:
src = '../../data/urls/US_politician_unraveled_urls'
files = listdir(src)
unraveled_urls = pd.DataFrame()
for i,f in enumerate(files):
    if i%1000 == 0:
        print(f"{i}/{len(files)}")
    tmp = pd.read_csv(Path(src, f), compression="gzip")
    unraveled_urls = pd.concat([unraveled_urls, tmp])
unraveled_urls = unraveled_urls.reset_index(drop=True)

0/5651
1000/5651
2000/5651
3000/5651
4000/5651
5000/5651


# Add hosts from timeouts

In [3]:
timeouts = len(unraveled_urls) - len(unraveled_urls["status_code"].dropna())
print("{} timeouts ({:1.2f}%)".format(\
        timeouts,
        (timeouts / len(unraveled_urls["status_code"].dropna()) * 100)))

45559 timeouts (8.77%)


In [4]:
def extract_host(unraveled_url):
    if unraveled_url == unraveled_url and unraveled_url.startswith("Cannot"):
        host = unraveled_url.split(" ")[4].split(":")[0]
        return host
    else:
        return unraveled_url

In [5]:
unraveled_urls["unraveled_url"] = unraveled_urls["unraveled_url"].apply(extract_host)

In [6]:
dst = '../../data/urls'
unraveled_urls.to_csv(Path(dst, "US_unraveled_urls.csv.xz"), index=False,
                      compression="xz")

# Timeline updates

In [None]:
# commands on medea to download the user timelines for each batch using twarc2
cd /data/honesty/corpora/Twitter/US_politician_twitter_accounts_ontology
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_0.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_0.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_1.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_1.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_2.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_2.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_3.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_3.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_4.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_4.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_5.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_5.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_6.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_6.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_7.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_7.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_8.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_8.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_9.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_9.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time <new_start_time> --end-time <new_end_time> US_politician_twitter_accounts_batch_10.txt ../US_politician_timelines_update/US_politician_twitter_accounts_batch_10.jsonl

In [None]:
# commands on the server to convert the raw .json files to .csv
cd ../US_politician_timelines_update
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_0.jsonl US_politician_twitter_accounts_batch_0.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_1.jsonl US_politician_twitter_accounts_batch_1.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_2.jsonl US_politician_twitter_accounts_batch_2.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_3.jsonl US_politician_twitter_accounts_batch_3.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_4.jsonl US_politician_twitter_accounts_batch_4.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_5.jsonl US_politician_twitter_accounts_batch_5.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_6.jsonl US_politician_twitter_accounts_batch_6.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_7.jsonl US_politician_twitter_accounts_batch_7.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_8.jsonl US_politician_twitter_accounts_batch_8.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_9.jsonl US_politician_twitter_accounts_batch_9.csv
twarc2 csv --input-data-type tweets US_politician_twitter_accounts_batch_10.jsonl US_politician_twitter_accounts_batch_10.csv

In [None]:
# commands on the server to compress the .csv files
xz -v US_politician_twitter_accounts_batch_0.csv
xz -v US_politician_twitter_accounts_batch_1.csv
xz -v US_politician_twitter_accounts_batch_2.csv
xz -v US_politician_twitter_accounts_batch_3.csv
xz -v US_politician_twitter_accounts_batch_4.csv
xz -v US_politician_twitter_accounts_batch_5.csv
xz -v US_politician_twitter_accounts_batch_6.csv
xz -v US_politician_twitter_accounts_batch_7.csv
xz -v US_politician_twitter_accounts_batch_8.csv
xz -v US_politician_twitter_accounts_batch_9.csv
xz -v US_politician_twitter_accounts_batch_10.csv

In [6]:
! mkdir ../../data/tweets/timeline_batches_update

In [None]:
# download the .csv files from the server
! rsync -avze ssh jlasser@medea:/data/honesty/corpora/Twitter/US_politician_timelines_update/*.xz ../../data/tweets/timeline_batches_update 

In [None]:
src = "../../data/tweets/timeline_batches_update"
fnames = listdir(src)
fnames = [f for f in fnames if f.endswith(".xz")]
timelines = pd.concat([pd.read_csv(Path(src, fname), compression="xz", 
                                   dtype={"id":str, "author_id":str}) \
                       for fname in fnames])
timelines = timelines.reset_index(drop=True)

In [9]:
timelines = timelines[timelines["author_id"].isin(users["author_id"])]

In [10]:
dst = "../../data/tweets"
new_start_time = # insert
new_end_time = # insert
fname = f"combined_US_politician_twitter_timelines_{new_start_time}_to_{new_end_time}_raw.csv.gzip"
timelines.to_csv(Path(dst, fname), compression="gzip", index=False)

## Clean the data

In [11]:
src = "../../data/tweets"
fname = f"combined_US_politician_twitter_timelines_{new_start_time}_to_{new_end_time}_raw.csv.gzip"
timelines = pd.read_csv(
    Path(src, fname), 
    compression="gzip", 
    dtype={"id":str, "author_id":str, "conversation_id":str})
timelines["created_at"] = pd.to_datetime(timelines["created_at"], errors="coerce")

  timelines = pd.read_csv(


In [12]:
# get tweet type
timelines["retweeted"] = False
timelines["quoted"] = False
timelines["reply"] = False
timelines.loc[timelines["referenced_tweets.retweeted.id"].dropna().index, "retweeted"] = True
timelines.loc[timelines["referenced_tweets.quoted.id"].dropna().index, "quoted"] = True
timelines.loc[timelines["referenced_tweets.replied_to.id"].dropna().index, "reply"] = True

In [13]:
print(f"{len(timelines)} tweets in update")

170466 tweets in update


In [14]:
# clean up column names
timelines = timelines.rename(columns={
    'public_metrics.like_count':'like_count',
    'public_metrics.reply_count':'reply_count',
    'public_metrics.retweet_count':'retweet_count',
    'public_metrics.quote_count':'quote_count',
})

In [15]:
# extract URL objects
urls = []
expanded_urls = []
for obj in timelines["entities.urls"]:
    if obj != obj:
        urls.append([])
        expanded_urls.append([])
    else:
        obj = eval(obj)
        tmp_urls = []
        tmp_expanded_urls = []
        for entry in obj:
            tmp_urls.append(entry["url"])
            tmp_expanded_urls.append(entry["expanded_url"])
        urls.append(tmp_urls)
        expanded_urls.append(tmp_expanded_urls)

In [16]:
timelines["urls"] = urls
timelines["expanded_urls"] = expanded_urls

## Export the data

In [17]:
fname = f"combined_US_politician_twitter_timelines_{new_start_time}_to_{new_end_time}_clean.csv.gzip"
dst = "../../data/tweets"
cols = ["id", "author_id", "created_at", "expanded_urls", "retweeted", "quoted",
        "reply", "text", "retweet_count", "reply_count", "like_count",
        "quote_count"]
timelines[cols].to_csv(Path(dst, fname), index=False, compression="gzip")

In [18]:
# export tweet IDs for hydration
fname = "tweet_IDs_update.txt.xz"
dst = "../../data/tweets/"
np.savetxt(Path(dst, fname), timelines["id"].values, fmt="%s")