In [None]:
import os
import time
import json
import datetime
import tweepy

In [None]:
# Names handed to the collection helpers; presumably Twitter screen names
# (their use is outside this cell -- confirm against the calling cells).
similar_users = ["", ""]
different_users = ["", ""]

# API credentials are read from the environment so that secrets do not have
# to be pasted into (and accidentally committed with) the notebook. When the
# variables are unset the values fall back to the empty string.
twitter_api_key = os.environ.get("TWITTER_API_KEY", "")
twitter_api_secret = os.environ.get("TWITTER_API_SECRET", "")

# Directory where one JSON file per collection is cached.
collections_dir = "collections"
# Upper bound on the timeline page number requested per user.
max_pages = 100
# How many times an empty page is re-requested before it is skipped.
max_page_fail_retries = 3
# Retrieval stops once this many pages in a row have been skipped.
max_consecutive_pages_failed = 3
# Seconds to sleep after a successful page request.
request_wait = 1
# Seconds to sleep after a failed page request.
page_fail_wait = 30

In [None]:
# Names of the status attributes that copy_tweet copies verbatim (through
# copy_attributes) into the simplified tweet dictionary.
tweet_attributes = [
    "display_text_range",
    "entities",
    "favorite_count",
    "full_text",
    "id",
    "retweet_count",
    "truncated"
]

def copy_attributes(dest, src, attributes):
    """Copy the listed keys from ``src`` into ``dest``.

    A key that is missing from ``src`` is not copied; instead a
    ``KeyError: <key>, <id>`` line is printed, where ``<id>`` is ``src["id"]``
    or ``None`` when ``src`` has no ``"id"`` entry either.

    Args:
        dest: Mapping that receives the copied values (mutated in place).
        src: Mapping the values are read from.
        attributes: Iterable of keys to copy.
    """
    for key in attributes:
        try:
            dest[key] = src[key]
        except KeyError:
            # Look up the source id only for the diagnostic message.
            try:
                source_id = src["id"]
            except KeyError:
                source_id = None
            print("KeyError: {}, {}".format(key, source_id))

def copy_tweet(tweet):
    """Build a plain dictionary from a tweepy status object.

    The attributes named in ``tweet_attributes`` are copied from the status,
    followed by a few derived fields.

    Args:
        tweet: Status object exposing ``author.screen_name``, ``created_at``
            and, optionally, ``retweeted_status`` / ``extended_entities``.

    Returns:
        dict with the copied attributes plus ``author_screen_name``,
        ``created_at_timestamp`` (POSIX seconds), ``retweeted_status``
        (bool: whether the status has that attribute) and, when present,
        ``extended_entities``.
    """
    dest = {}
    copy_attributes(dest, vars(tweet), tweet_attributes)
    dest["author_screen_name"] = tweet.author.screen_name

    created_at = tweet.created_at
    if created_at.tzinfo is None:
        # datetime.timestamp() interprets a naive datetime as *local* time,
        # which shifts the result by the machine's UTC offset. Twitter
        # reports creation times in UTC (tweepy 3.x yields them as naive
        # datetimes -- NOTE(review): confirm for the installed version), so
        # attach UTC explicitly before converting.
        created_at = created_at.replace(tzinfo=datetime.timezone.utc)
    dest["created_at_timestamp"] = created_at.timestamp()

    # Only the presence of a retweeted status is recorded, not its content.
    dest["retweeted_status"] = hasattr(tweet, "retweeted_status")
    if hasattr(tweet, "extended_entities"):
        dest["extended_entities"] = tweet.extended_entities
    return dest

In [None]:
def retrieve_comments(twitter_client, name):
    """Download a user's timeline page by page as simplified tweet dicts.

    Pages 1 through ``max_pages`` (inclusive) are requested. An empty page is
    treated as a failure: it is re-requested up to ``max_page_fail_retries``
    times and then skipped. Retrieval stops early once
    ``max_consecutive_pages_failed`` pages in a row have been skipped.
    Exceptions raised by the client are not handled here and propagate to
    the caller.

    Args:
        twitter_client: Object providing ``user_timeline(id=..., tweet_mode=...,
            page=...)``.
        name: User identifier passed as ``id`` to ``user_timeline``.

    Returns:
        list of dictionaries produced by ``copy_tweet``, in retrieval order.
    """

    tweets = []
    page = 1
    page_fail_retries = 0
    consecutive_pages_failed = 0

    print("Retrieving {}".format(name))

    # Page numbers start at 1, so the bound must be inclusive for max_pages
    # pages to actually be requested.
    while page <= max_pages and consecutive_pages_failed < max_consecutive_pages_failed:

        statuses = twitter_client.user_timeline(id=name, tweet_mode="extended", page=page)

        if statuses:
            tweets.extend(copy_tweet(status) for status in statuses)

            # Report the page that was just fetched, before advancing.
            print("Retrieved Tweets: {} (Pages: {}/{})".format(len(tweets), page, max_pages), end="\r")

            page += 1
            page_fail_retries = 0
            consecutive_pages_failed = 0

            time.sleep(request_wait)

        else:

            if page_fail_retries < max_page_fail_retries:
                # Retry the same page after the wait below.
                page_fail_retries += 1

                print("Failed to retrieve page {} attempts {}/{}".format(page, page_fail_retries, max_page_fail_retries))
                print("Sleeping {} seconds".format(page_fail_wait))
                print("Retrieved Tweets: {} (Pages: {}/{})".format(len(tweets), page, max_pages), end="\r")

            else:
                # Retries exhausted: give up on this page and move on.
                consecutive_pages_failed += 1

                print("Failed to retrieve page {} attempts {}/{}".format(page, max_page_fail_retries, max_page_fail_retries))
                print("{} consecutive failed attempts on page {}, skipping page {}, ( Consecutive pages failed {}/{})".format(
                    max_page_fail_retries,
                    page, page,
                    consecutive_pages_failed,
                    max_consecutive_pages_failed))
                print("Sleeping {} seconds".format(page_fail_wait))
                print("Retrieved Tweets: {} (Pages: {}/{})".format(len(tweets), page, max_pages), end="\r")

                page += 1
                page_fail_retries = 0

            time.sleep(page_fail_wait)

    return tweets

def get_collection(twitter_client, name):
    """Retrieve a user's tweets and wrap them in a collection record.

    Args:
        twitter_client: Client forwarded to ``retrieve_comments``.
        name: User identifier; also stored as the collection name.

    Returns:
        dict with keys ``collection_name``, ``tweets`` and ``date`` (the
        local ISO-8601 time taken once retrieval has finished).
    """
    retrieved = retrieve_comments(twitter_client, name)
    # Stamp the record after the download completes.
    retrieved_at = datetime.datetime.now().isoformat()

    return {
        "collection_name": name,
        "tweets": retrieved,
        "date": retrieved_at,
    }

In [None]:
def get_collection_path(collection_dir, name):
    """Return the path of the JSON file for collection ``name`` inside ``collection_dir``."""
    filename = "{}.json".format(name)
    return os.path.join(collection_dir, filename)

def get_collections(names):
    """Load or download one tweet collection per name.

    For each name the cached JSON file under ``collections_dir`` is used when
    it exists; otherwise the collection is retrieved from Twitter and saved
    there. The Twitter client is created lazily, only if at least one
    collection has to be downloaded.

    Args:
        names: Iterable of collection (user) names.

    Returns:
        dict mapping each name to its collection dictionary.
    """

    twitter_client = None
    collections = {}

    os.makedirs(collections_dir, exist_ok=True)

    for collection_name in names:

        collection_filename = get_collection_path(collections_dir, collection_name)

        if os.path.isfile(collection_filename):
            # read collection
            print("Using saved collection: {}".format(collection_filename))
            with open(collection_filename) as collection_file:
                collection = json.load(collection_file)

            print("  Tweets: {}".format(len(collection["tweets"])))

        else:
            # retrieve collection

            if twitter_client is None:
                twitter_auth = tweepy.AppAuthHandler(twitter_api_key, twitter_api_secret)
                twitter_client = tweepy.API(twitter_auth)

            collection = get_collection(twitter_client, collection_name)

            print("Saving collection: {}".format(collection_filename))
            # Write to a temporary file and rename it into place, so that an
            # interrupted or failed dump never leaves a truncated file that a
            # later run would mistake for a valid cached collection.
            partial_filename = collection_filename + ".tmp"
            with open(partial_filename, "w") as collection_file:
                json.dump(collection, collection_file)
            os.replace(partial_filename, collection_filename)

        collections[collection_name] = collection

    return collections