In [1]:
import os
import json
import csv

from tqdm import tqdm
import pandas as pd
import tweepy

In [2]:
with open('keys.json', 'rb') as f:
    keys = json.load(f)

In [3]:
auth = tweepy.OAuthHandler(
    keys['API_key'], 
    keys['API_key_secret']
)
auth.set_access_token(
    keys['access_token'], 
    keys['access_token_secret']
)

api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [4]:
def hydrate(file_id, last_index = 0):
    
    tweet_ids = pd.read_csv(os.path.join(
        'data',
        'dataverse_files',
        f'election-filter{file_id}.txt'
    ), header = None)[0].values
    
    l = len(tweet_ids)
    rounded = int(l - l % 100)
    tweet_ids = tweet_ids[:rounded].reshape(l // 100, 100)
    
    headers1 = [
        'author_id',
        'author_screen_name',
        'status_id',
        'created_at',
        'body',
        'lang',
        'favorite_count',
        'retweet_count'
    ]
    
    headers2 = [
        'user_id',
        'user_screename',
        'follower_count',
        'verified',
        'statuses_count'
    ]
    
    filename = os.path.join('data', f'tweets_{file_id}.csv')
    write_mode = 'w'
    write_header = True
    if filename in os.listdir('data'):
        write_mode = 'a'
        write_header = False

    with open(filename, write_mode, newline = '', encoding = 'utf-8') as csv_file_tweets:
        writer1 = csv.DictWriter(csv_file_tweets, fieldnames = headers1)
        if write_header:
            writer1.writeheader()
        with open(os.path.join('data',f'users_{file_id}.csv'), write_mode, newline = '') as csv_file_users:
            writer2 = csv.DictWriter(csv_file_users, fieldnames = headers2)
            if write_header:    
                writer2.writeheader()
            for i, id_list in tqdm(enumerate(tweet_ids)):
                if i > last_index:
                    try:
                        statuses = api.statuses_lookup(
                            list(id_list)
                        )            
                    except tweepy.error.TweepError:
                        continue
                    else:
                        for status in statuses:
                            author = status.user

                            author_id = author.id
                            author_screen_name = author.screen_name

                            status_id = status.id
                            created_at = status.created_at

                            if hasattr(status, 'retweeted_status'):
                                try:
                                    body = status.retweeted_status.extended_tweet['full_text']
                                except AttributeError:
                                    body = status.retweeted_status.text
                            else:
                                try:
                                    body = status.extended_tweet['full_text']
                                except AttributeError:
                                    body = status.text

                            lang = status.lang

                            favorite_count = status.favorite_count
                            retweet_count = status.retweet_count

                            writer1.writerow({
                                'author_id' : author_id,
                                'author_screen_name' : author_screen_name,
                                'status_id' : status_id,
                                'created_at' : created_at,
                                'body' : body,
                                'lang' : lang,
                                'favorite_count' : favorite_count,
                                'retweet_count' : retweet_count
                            })

                            writer2.writerow({
                                'user_id' : author_id,
                                'user_screename' : author_screen_name,
                                'follower_count' : author.followers_count,
                                'verified' : author.verified,
                                'statuses_count' : author.statuses_count
                            })


In [None]:
hydrate(1, 0)

57358it [285:21:10,  1.70s/it]   