In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [2]:
# Root folders for reading inputs (src) and writing outputs (dst).
# Both currently point at the same shared data directory.
src = dst = "../data"

## Download current account information from the Twitter API

In [3]:
# Read the cleaned KnowWho candidate profiles, i.e. the output of the
# notebook clean_KnowWho_data.ipynb, from the tmp folder.
fname = "KnowWho_profiles_clean.csv"
knowwho_profiles = pd.read_csv(Path(src) / "tmp" / fname)

In [4]:
# Dump the KnowWho screen names to a plain text file (one value per row,
# formatted as strings) so the handles can be passed to twarc2 below.
handles_path = Path(dst) / "tmp" / "midterm_candidate_profiles" \
    / "candidate_twitter_handles_knowwho.txt"
np.savetxt(handles_path, knowwho_profiles["screen_name"].to_numpy(), fmt="%s")

In [10]:
# Get Twitter account information for the handles listed in
# candidate_twitter_handles_knowwho.txt and write the result to
# candidate_twitter_profiles_knowwho.jsonl (both in tmp/midterm_candidate_profiles).
# Only works with a valid bearer token: "XXX" is a placeholder that has to be
# replaced before running. Never commit a real token together with the notebook.
# last executed: 2023-05-12
! twarc2 --bearer-token XXX users --usernames ../data/tmp/midterm_candidate_profiles/candidate_twitter_handles_knowwho.txt ../data/tmp/midterm_candidate_profiles/candidate_twitter_profiles_knowwho.jsonl

100%|████████| Processed 1703/1703 lines of input file [00:15<00:00, 111.50it/s]


In [11]:
# transform account information from JSON to csv
! twarc2 csv --input-data-type users ../data/tmp/midterm_candidate_profiles/candidate_twitter_accounts_knowwho.jsonl ../data/raw/candidate_twitter_profiles_knowwho.csv

100%|██████████████| Processed 2.50M/2.50M of input file [00:00<00:00, 10.6MB/s]

ℹ️
Parsed 1437 users objects from 18 lines in the input file.
23 were duplicates. Wrote 1414 rows and output 27 columns in the CSV.



# Clean & save user data

In [5]:
# Load the CSV export of the candidates' Twitter profiles. Account ids are
# read as strings and the two timestamp columns are parsed as dates.
fname = "candidate_twitter_profiles_knowwho.csv"
users = pd.read_csv(
    Path(src) / "raw" / fname,
    dtype={"id": str},
    parse_dates=["created_at", "__twarc.retrieved_at"],
)

# Shorter names for the columns used in the rest of the notebook.
column_names = {
    "id": "author_id",
    "username": "handle",
    "public_metrics.following_count": "following_count",
    "public_metrics.followers_count": "followers_count",
    "public_metrics.tweet_count": "tweet_count",
}
users = users.rename(columns=column_names)

# De-duplicate on the lower-cased handle. Sorting newest-first means that
# drop_duplicates (which keeps the first occurrence) retains the most
# recently retrieved entry of every handle.
users = (
    users
    .assign(handle=users["handle"].str.lower())
    .sort_values(by="__twarc.retrieved_at", ascending=False)
    .drop_duplicates(subset=["handle"])
)

In [6]:
# Handles that are in the KnowWho list but missing from the retrieved
# Twitter profiles; both sides are compared in lower case.
knowwho_handles = set(knowwho_profiles["screen_name"].str.lower())
retrieved_handles = set(users["handle"].str.lower())
diff = list(knowwho_handles - retrieved_handles)
print(f"{len(diff)} user profiles from knowwho could not be retrieved")

240 user profiles from knowwho could not be retrieved


In [7]:
# Keep only accounts whose "protected" flag is False and report how many
# rows were removed by the filter.
N = len(users)
users = users.loc[users["protected"].eq(False)]
print(f"{N - len(users)} user profiles were protected")

23 user profiles were protected


In [8]:
# Store the cleaned user table in the tmp folder, without the index column.
fname = "candidate_twitter_profiles.csv"
users.to_csv(Path(dst) / "tmp" / fname, index=False)