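This notebook uses the Tweepy package to call the Twitter API and collect profile features (creation date, name, description, tweet/follower/friend counts, verification status, etc.) for two sets of accounts: the authors of a sample of live-streamed tweets, and the accounts listed in the varol-2017 dataset.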

In [1]:
import config
import os
#must be Tweepy version 4 or later (this was tested on 4.10)
import tweepy
from tweepy.errors import Forbidden, NotFound
import csv
import pandas as pd

# Twitter API credentials, taken from the imported `config` module.
consumer_key, consumer_secret = config.CONSUMER_KEY, config.CONSUMER_SECRET
access_token, access_token_secret = config.ACCESS_TOKEN, config.ACCESS_TOKEN_SECRET

In [None]:
# Streaming cell: keep it running for as long as you want in order to collect
# a sample of current tweets. Every received id is appended to this file.
stream_output = "./data_csv/intermediate/stream_output.txt"


class TweetPrinter(tweepy.StreamingClient):
    """Streaming client that echoes each received id and logs it to `stream_output`."""

    def on_tweet(self, user):
        """Print the `id` of the received object and append it to `stream_output`.

        NOTE(review): the parameter is named `user`, but the ids saved here are
        later passed to `api.get_status`, so this is presumably a tweet and
        `user.id` a tweet id -- confirm against the Tweepy callback docs.
        """
        # Append mode: ids accumulate across tweets and across re-runs of the cell.
        with open(stream_output, 'a') as log_file:
            id_text = str(user.id)
            print(id_text)
            log_file.write(f"{id_text}\n")


# Build the streaming client from the bearer token (with wait_on_rate_limit
# enabled) and start consuming the sample stream.
printer = TweetPrinter(config.BEARER_TOKEN, wait_on_rate_limit=True)
printer.sample(
    expansions="author_id",
    user_fields="created_at,description,entities,id,name,profile_image_url,public_metrics,username,verified",
)


In [None]:
#get the features of the users from the sampled current tweets

# Read the ids from the same file the streaming cell appends to
# (previously a bare "stream_output.txt" in the working directory).
tweet_input = "./data_csv/intermediate/stream_output.txt"
stream_users_output = "./data_csv/intermediate/stream_users.csv"

# Refuse to overwrite an existing result file.
assert not os.path.exists(stream_users_output), f"{stream_users_output} already exists"

# Timestamp written into the "sampled_at" column of every row.
sampled_time = "2022-09-28 4:48:00+00:00"

with open(tweet_input, "r") as f:
    # One tweet id per line; skip blank lines so they are not sent to the API.
    tweet_ids = [line.strip() for line in f if line.strip()]

# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# set access to user's access key and access secret
auth.set_access_token(access_token, access_token_secret)
# calling the api; wait_on_rate_limit=True (as in the varol cell) makes Tweepy
# sleep when the rate limit is hit instead of raising an error that the
# except clause below does not catch
api = tweepy.API(auth, wait_on_rate_limit=True)

# newline="" is what the csv module requires of the file object (fields such
# as descriptions may contain line breaks); utf-8 keeps non-ASCII names and
# descriptions writable regardless of the platform default encoding.
with open(stream_users_output, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    writer.writerow(["id", "created_at", "sampled_at", "name", "screen_name", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "listed_count", "default_profile", "profile_banner_url", "verified"])
    for _id in tweet_ids:
        try:
            # Look the tweet up and take its author's profile.
            status = api.get_status(_id)
            user = status.user
            # Fall back to "N/A" when the profile has no banner attribute.
            profile_banner_url = getattr(user, "profile_banner_url", "N/A")

            writer.writerow([user.id, user.created_at, sampled_time, user.name, user.screen_name, user.description, user.statuses_count, user.followers_count, user.friends_count, user.favourites_count, user.listed_count, user.default_profile, profile_banner_url, user.verified])
        except (Forbidden, NotFound) as e:
            # Tweets that can no longer be fetched are reported and skipped.
            print(e)

In [None]:
#get the features of the users in the varol-2017 dataset from their ids

varol_input = "./original_datasets/varol/varol-2017.tsv"
varol_output = "./data_csv/intermediate/varol-17.csv"

# Refuse to overwrite an existing result file.
assert not os.path.exists(varol_output), f"{varol_output} already exists"

# Timestamp written into the "sampled_at" column of every row.
sampled_time = "2022-09-28 4:48:00+00:00"

# Two whitespace-separated, header-less columns: column 0 holds the account id,
# column 1 the value written out as "is_bot".
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv(varol_input, sep=r"\s+", names=range(2))
ids = df[0]
labels = df[1]

# authorization of consumer key and consumer secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# set access to user's access key and access secret
auth.set_access_token(access_token, access_token_secret)
# calling the api
api = tweepy.API(auth, wait_on_rate_limit=True)

# newline="" is what the csv module requires of the file object (fields such
# as descriptions may contain line breaks); utf-8 keeps non-ASCII names and
# descriptions writable regardless of the platform default encoding.
with open(varol_output, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')

    writer.writerow(["id", "created_at", "sampled_at", "name", "screen_name", "description", "statuses_count", "followers_count", "friends_count", "favourites_count", "listed_count", "default_profile", "profile_banner_url", "verified", "is_bot"])

    # Walk ids and labels together rather than re-indexing the frame by position.
    for _id, is_bot in zip(ids, labels):
        try:
            user = api.get_user(user_id=_id)
            # Fall back to "N/A" when the profile has no banner attribute.
            profile_banner_url = getattr(user, "profile_banner_url", "N/A")

            writer.writerow([user.id, user.created_at, sampled_time, user.name, user.screen_name, user.description, user.statuses_count, user.followers_count, user.friends_count, user.favourites_count, user.listed_count, user.default_profile, profile_banner_url, user.verified, is_bot])
            print(f"Got {_id}")
        except (Forbidden, NotFound) as e:
            # Accounts that can no longer be fetched are reported and skipped.
            print(f"Error for {_id}: {e}")
