In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from os import listdir

# Get user timelines

In [2]:
src = "../data"
dst = "../data"

In [3]:
fname = "candidate_twitter_profiles.csv"
users = pd.read_csv(
    Path(src, "tmp", fname),
    dtype={"author_id":str},
    parse_dates=["created_at"]
)

In [4]:
# split the list of accounts into batches
if not os.path.exists(Path(dst, "tmp", "user_batches")): os.mkdir(Path(dst, "tmp", "user_batches"))

N_keys = 7  
batch_size = int(len(users) / N_keys)
for i in range(N_keys):
    batch = users["author_id"][i * batch_size : (i+ 1) * batch_size]
    np.savetxt(Path(dst, "tmp", "user_batches", f"candidate_twitter_accounts_batch_{i}.txt"), 
               batch, fmt="%s")

In [None]:
# upload the account ID batches to the remote server
! rsync -avze ssh ../data/tmp/user_batches/candidate_twitter_accounts_batch_* jlasser@medea:/data/honesty/corpora/Twitter/midterm_candidate_accounts/

In [None]:
# commands on remote server to download the user timelines for each batch using 
# twarc2 - only works with a valid bearer token
cd /data/honesty/corpora/Twitter/midterm_candidate_accounts
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_0.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_0.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_1.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_1.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_2.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_2.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_3.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_3.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_4.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_4.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_5.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_5.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_6.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_6.jsonl
twarc2 --bearer-token XXX timelines --no-context-annotations --start-time 2022-01-01 --end-time 2023-05-01 --use-search candidate_twitter_accounts_batch_7.txt ../midterm_candidate_timelines/midterm_candidate_timelines_batch_7.jsonl


In [None]:
# commands on the server to convert the raw .json files to .csv
cd /data/honesty/corpora/Twitter/midterm_candidate_timelines
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_0.jsonl midterm_candidate_timelines_batch_0.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_1.jsonl midterm_candidate_timelines_batch_1.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_2.jsonl midterm_candidate_timelines_batch_2.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_3.jsonl midterm_candidate_timelines_batch_3.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_4.jsonl midterm_candidate_timelines_batch_4.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_5.jsonl midterm_candidate_timelines_batch_5.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_6.jsonl midterm_candidate_timelines_batch_6.csv
twarc2 csv --extra-input-columns "edit_history_tweet_ids,public_metrics.impression_count" --input-data-type tweets midterm_candidate_timelines_batch_7.jsonl midterm_candidate_timelines_batch_7.csv

In [None]:
# commands on the server to compress the .csv files
xz -v midterm_candidate_timelines_batch_0.csv
xz -v midterm_candidate_timelines_batch_0.jsonl
xz -v midterm_candidate_timelines_batch_1.csv
xz -v midterm_candidate_timelines_batch_1.jsonl
xz -v midterm_candidate_timelines_batch_2.csv
xz -v midterm_candidate_timelines_batch_2.jsonl
xz -v midterm_candidate_timelines_batch_3.csv
xz -v midterm_candidate_timelines_batch_3.jsonl
xz -v midterm_candidate_timelines_batch_4.csv
xz -v midterm_candidate_timelines_batch_4.jsonl
xz -v midterm_candidate_timelines_batch_5.csv
xz -v midterm_candidate_timelines_batch_5.jsonl
xz -v midterm_candidate_timelines_batch_6.csv
xz -v midterm_candidate_timelines_batch_6.jsonl
xz -v midterm_candidate_timelines_batch_7.csv
xz -v midterm_candidate_timelines_batch_7.jsonl

In [7]:
! mkdir -p ../data/tmp/midterm_candidate_timelines

In [None]:
# download the .csv files from the server
! rsync -avze ssh jlasser@medea:/data/honesty/corpora/Twitter/midterm_candidate_timelines/*csv.xz ../data/tmp/midterm_candidate_timelines --progress 

# Clean the data

In [2]:
fnames = listdir(Path(src, "tmp", "midterm_candidate_timelines"))
fnames = [f for f in fnames if f.endswith("csv.xz")]
timelines = pd.concat([pd.read_csv(
    Path(src, "tmp", "midterm_candidate_timelines", fname), 
    compression="xz", 
    dtype={"id":str, "author_id":str}, 
    low_memory=False
) for fname in fnames])
timelines = timelines.reset_index(drop=True)

In [5]:
N = len(timelines)
timelines = timelines[timelines["author_id"].isin(users["author_id"])]
print(f"dropped {N - len(timelines)} tweets from unknown users")

dropped 1060 tweets from unknown users


In [6]:
timelines["created_at"] = pd.to_datetime(timelines["created_at"], errors="coerce")

In [7]:
# get tweet type
timelines["retweeted"] = False
timelines["quoted"] = False
timelines["reply"] = False
timelines.loc[timelines["referenced_tweets.retweeted.id"].dropna().index, "retweeted"] = True
timelines.loc[timelines["referenced_tweets.quoted.id"].dropna().index, "quoted"] = True
timelines.loc[timelines["referenced_tweets.replied_to.id"].dropna().index, "reply"] = True

In [8]:
# make sure all tweets are within the desired time window
start = pd.to_datetime("2022-01-01", utc="UTC")
end = pd.to_datetime("2023-05-01", utc="UTC")
N = len(timelines)
timelines = timelines[timelines["created_at"] >= start]
timelines = timelines[timelines["created_at"] < end]
print(f"dropped {N - len(timelines)} tweets outside the desired time window")

dropped 0 tweets outside the desired time window


In [10]:
# drop duplicates
N = len(timelines)
timelines = timelines.sort_values("__twarc.retrieved_at", ascending=False)
timelines = timelines.drop_duplicates(subset=["id"])
print(f"dropped {N - len(timelines)} duplicates")

dropped 166389 duplicates


In [11]:
# drop retweets
N = len(timelines)
timelines = timelines[timelines["retweeted"] == False]
print(f"dropped {N - len(timelines)} retweets")

dropped 436280 retweets


In [12]:
print(f"{len(timelines)} tweets remaining")

1084776 tweets remaining


In [13]:
# clean up column names
timelines = timelines.rename(columns={
    'public_metrics.like_count':'like_count',
    'public_metrics.reply_count':'reply_count',
    'public_metrics.retweet_count':'retweet_count',
    'public_metrics.quote_count':'quote_count',
})

In [14]:
# extract URL objects
urls = []
expanded_urls = []
for obj in timelines["entities.urls"]:
    if obj != obj:
        urls.append([])
        expanded_urls.append([])
    else:
        obj = eval(obj)
        tmp_urls = []
        tmp_expanded_urls = []
        for entry in obj:
            if type(entry) == dict:
                tmp_urls.append(entry["url"])
            elif type(entry) == str:
                tmp_urls.append(entry)
            else:
                print("unknown entry type!")
                
            if type(entry) == dict:
                tmp_expanded_urls.append(entry["expanded_url"])
            elif type(entry) == str:
                tmp_expanded_urls.append(entry)
            else:
                print("unknown entry type!")
                
        urls.append(tmp_urls)
        expanded_urls.append(tmp_expanded_urls)

In [15]:
timelines["urls"] = urls
timelines["expanded_urls"] = expanded_urls

# Export the data

In [16]:
fname = "combined_midterm_candidate_timelines_2022-01-01_to_2023-05-01_clean.csv.gzip"
cols = ["id", "author_id", "created_at", "expanded_urls", "retweeted", "quoted",
        "reply", "text", "retweet_count", "reply_count", "like_count",
        "quote_count"]
timelines[cols].to_csv(Path(dst, "raw", fname), index=False, compression="gzip")

In [17]:
# export tweet IDs for hydration
fname = "combined_midterm_candidate_timelines_2022-01-01_to_2023-05-01_clean_tweetIDs.txt"
np.savetxt(Path(dst, "raw", fname), timelines["id"].values, fmt="%s")

# Exploratory analysis: resolve URLs

## Export URLs to unravel

In [18]:
def extract_domain(url):
    if url != url:
        return np.nan
    # trailing "/" and spaces
    url = url.strip('/').strip()
    # transform all domains to lowercase
    url = url.lower()
    # remove any white spaces
    url = url.replace(' ', '')
    # if present: remove the protocol
    if url.startswith(("http", "https")):
        try:
            url = url.split('//')[1]
        except IndexError:
            print(f"found malformed URL {url}")
            return np.nan
    # remove "www." 
    url = url.replace('www.', '')
    url = url.split("/")[0]
    return url

In [19]:
URLs = []
for url_list in timelines["expanded_urls"]:
    URLs.extend(url_list)
URLs = pd.DataFrame({"url":list(set(URLs))})
URLs["domain"] = URLs["url"].apply(extract_domain)

In [20]:
# initial list of shorteners from this repo:
# https://github.com/boutetnico/url-shorteners
url_shorteners = list(np.loadtxt(Path("utilities", "url_shorteners.txt"), dtype=str))

# add URL shorteners based on manual inspections of all URLs that appeared >100
# times in the dataset
url_shorteners.extend([
    "fb.me", "buff.ly", "nyti.ms", "wapo.st", "youtu.be", "1.usa.gov", "fxn.ws",
    "on.fb.me", "politi.co", "trib.al", "washex.am", "hill.cm", "cnb.cx",
    "hubs.ly", "cs.pn","n.pr", "conta.cc", "mi.tt", "usat.ly", "abcn.ws",
    "reut.rs", "cbsn.ws", "huff.to", "instagr.am", "bloom.bg", "fw.to", 
    "ift.tt", "strib.mn", "lat.ms", "afs.mn", "dpo.st", "mailchi.mp",
    "dailysign.al", "tmblr.co", "rub.io", "yhoo.it", "omny.fm", "chrl.ie",
    "tulsi.to", "apne.ws", "hrc.io", "ed.gr", "ti.me", "herit.ag", "indy.st",
    "ofa.bo", "trib.in", "azc.cc", "bsun.md", "wjcf.co", "bityl.co", "go.shr.lc",
    "t1p.de", "m.bild.de", "sz.de", "m.faz.net", "zpr.io", "m.tagesspiegel.de",
    "to.welt.de", "gleft.de", "nol.is", "m.spiegel.de", "m.youtube.com", 
    "m.facebook.com", "m.focus.de", "loom.ly", "t.me", "4sq.com", "diplo.de",
    "p.dw.com", "owl.li", "tmi.me", "m.haz.de", "ly.zdf.de", "chng.it", "img.ly",
    "m.augsburger-allgemeine.de", "x.swr.de", "m.fr.de", "ebx.sh", "m.fr.de",
    "fcld.ly", "spoti.fi", "shar.es", "s.rlp.de", "m.welt.de", "bbc.in", 
    "on.ft.com", "fb.watch", "mol.im", "crowd.in ", "zcu.io", "gu.com",
    "lnkd.in", "shorturl.at", "m.huffingtonpost.co.uk", "fal.cn", "lght.ly", 
    "econ.st", "huffp.st", "l-bc.co", "wbs.wales", "aca.st ", "ind.pn", "cutt.ly",
    "dailym.ai", "ow.ly"
])

In [21]:
shortened_urls = URLs[URLs["domain"].isin(url_shorteners)]
unshortened_urls = URLs[~URLs["domain"].isin(url_shorteners)]
print(f"{len(shortened_urls)} shortened and {len(unshortened_urls)} unshortened URLs")

16625 shortened and 506786 unshortened URLs


In [22]:
fname = "midterm_candidates_URLs.csv.gzip"
shortened_urls["url"].to_csv(Path(dst, "tmp", fname),
                    compression="gzip", index=False)

In [None]:
# copy the URLs that need unravelling to the remote server
! rsync -avze ssh ../data/tmp/midterm_candidates_URLs.csv.gzip jlasser@medea:/home/jlasser/Honesty-project/data/

In [None]:
# run on the server: follow all the URLs. Note: this takes about a day per 100k URLs
#cd Honesty-project/code/
! mkdir -p ../data/midterm_candidates_unraveled_urls
! python ../../../../utilities/unravel_urls/unravel_urls.py midterm_candidates_URLs.csv.gzip -dst ../data/midterm_candidates_unraveled_urls/ -v 1

## Load unraveled URLs

In [None]:
# load unraveled URLs from server (they are stored in batches of 1000 URLs
! rsync -avze ssh jlasser@medea:/home/jlasser/Honesty-project/data/midterm_candidates_unraveled_urls/ ../data/tmp/midterm_candidates_unraveled_urls/

In [2]:
files = listdir(Path(src, "tmp", "midterm_candidates_unraveled_urls"))
unraveled_urls = pd.DataFrame()
for i,f in enumerate(files):
    if i%1000 == 0:
        print(f"{i}/{len(files)}")
    tmp = pd.read_csv(Path(src, f), compression="gzip")
    unraveled_urls = pd.concat([unraveled_urls, tmp])
unraveled_urls = unraveled_urls.reset_index(drop=True)

0/167


## Add hosts from timeouts

In [3]:
timeouts = len(unraveled_urls) - len(unraveled_urls["status_code"].dropna())
print("{} timeouts ({:1.2f}%)".format(\
        timeouts,
        (timeouts / len(unraveled_urls["status_code"].dropna()) * 100)))

689 timeouts (4.32%)


In [4]:
def extract_host(unraveled_url):
    if unraveled_url == unraveled_url and unraveled_url.startswith("Cannot"):
        host = unraveled_url.split(" ")[4].split(":")[0]
        return host
    else:
        return unraveled_url

In [5]:
unraveled_urls["unraveled_url"] = unraveled_urls["unraveled_url"].apply(extract_host)

In [6]:
fname = "midterm_candidates_unraveled_urls.csv.xz"

unraveled_urls.to_csv(
    Path(dst, "tmp", fname),
    index=False,
    compression="xz"
)