In [1]:
import tweepy
import pandas as pd
import os
import glob
from datetime import datetime
from keys import *

In [2]:
# define scraper class
class twitter_scraper:
    """
    Small wrapper around a tweepy API client that pulls a user's original
    tweets (no retweets, no replies) and tabulates them with pandas.
    """

    def __init__(self, consumer_key, consumer_secret, access_token, access_secret, wait_on_rate_limit = False):
        """
        Class for scraping tweets from Twitter's REST API.
        Pass a set of consumer and access keys.

        consumer_key, consumer_secret: application credentials
        access_token, access_secret: user access credentials
        wait_on_rate_limit: forwarded to tweepy.API; the default (False) keeps
            tweepy's own default behaviour. Set it to True for long pulls so the
            client waits for the rate limit window instead of raising.
        """
        self.auth = tweepy.OAuthHandler(consumer_key = consumer_key, consumer_secret = consumer_secret)
        self.auth.set_access_token(access_token, access_secret)
        self.api = tweepy.API(self.auth, wait_on_rate_limit = wait_on_rate_limit)

    def original_tweet(self, status):
        """
        Check if a tweet is original or a retweet/reply.

        Returns False when the status carries a `retweeted_status` attribute or
        when any of its `in_reply_to_*` fields is set; True otherwise.
        """
        if hasattr(status, 'retweeted_status'):
            return False

        # Identity checks against None; short-circuits in the same order as the fields are listed
        is_reply = (
            status.in_reply_to_status_id is not None
            or status.in_reply_to_screen_name is not None
            or status.in_reply_to_user_id is not None
        )
        return not is_reply

    def pull_tweets(self, twitter_id, count = 200, since = None, tweet_mode = 'extended'):
        """
        Scrapes the Twitter timeline of the given twitter id for original tweets
        and returns them as a list of status objects.
        Default is set to the 200 most recent tweets (filtering happens after
        the fetch, so fewer than `count` tweets may be returned).
        If you want to pull only tweets newer than a specific tweet, pass that
        tweet's ID to the 'since' variable.
        """
        timeline = self.api.user_timeline(user_id = twitter_id, count = count, since_id = since, tweet_mode = tweet_mode)
        return [tweet for tweet in timeline if self.original_tweet(tweet)]

    def tweets_to_df(self, tweet_list):
        """
        Convert a list of tweet objects to a pandas data frame.
        Columns: tweet_id, text, created, retweet, user_id
        (`retweet` is copied from the status' `retweeted` attribute).

        An empty list yields an empty frame that still has all five columns.
        """
        columns = ["tweet_id", 'text', 'created', 'retweet', 'user_id']

        rows = []
        for tweet in tweet_list:
            # Statuses pulled with tweet_mode='extended' carry `full_text`; statuses
            # pulled in any other mode only have `text`, so fall back instead of raising.
            text = getattr(tweet, 'full_text', None)
            if text is None:
                text = tweet.text
            rows.append((tweet.id, text, tweet.created_at, tweet.retweeted, tweet.user.id))

        return pd.DataFrame(rows, columns = columns)

### Initial Pull

In [45]:
%%time
# initialize scraper
scraper = twitter_scraper(consumer_key, consumer_secret, access_token, access_secret)

# import congress twitter user IDs
congress_meta = pd.read_csv('congress_meta_data.csv')
congress_ids = congress_meta['id']

# loop through each member of congress and collect their tweets in one flat list
# (extend in place: `tweets = tweets + user_tweets` copies the whole accumulated
# list on every iteration)
tweets = []
for user in congress_ids:
    user_tweets = scraper.pull_tweets(user)
    tweets.extend(user_tweets)

# convert the tweets to a dataframe
tweets_df = scraper.tweets_to_df(tweets)

# write the pull to a csv stamped with today's date
tweets_df.to_csv('congressional_tweets_' + datetime.now().strftime('%Y_%m_%d') + '.csv', index = False)

KeyboardInterrupt: 

### Daily Pull

In [13]:
# Get the most recent tweet from all users
# Import the previously aggregated tweets
aggregated_tweets = pd.read_csv('Data/aggregated_tweets.csv')

# Largest tweet_id per user_id, computed in one grouped pass instead of
# re-filtering the whole frame once per user. The largest id is treated as the
# user's most recent tweet. sort=False keeps users in order of first appearance
# (the order Series.unique() gives); rows with a missing user_id are left out
# of the groups rather than producing an empty selection.
last_tweet_by_user = aggregated_tweets.groupby('user_id', sort = False)['tweet_id'].max()

# Unique IDs and their last tweets, kept under their original names
unique_ids = last_tweet_by_user.index.values
last_tweets = last_tweet_by_user.tolist()

# Pass to a dictionary of IDs and their most recent tweet
last_tweet_dict = last_tweet_by_user.to_dict()

In [14]:
%%time
# initialize scraper
scraper = twitter_scraper(consumer_key, consumer_secret, access_token, access_secret)

# import congress twitter user IDs
# NOTE(review): congress_ids is loaded here but the loop below iterates
# last_tweet_dict, so only users already present in the aggregated file are
# pulled - confirm that is intended.
congress_meta = pd.read_csv('Meta Data/congress_meta_data.csv')
congress_ids = congress_meta['id']

# make sure the output folder exists before starting the slow pull, so a
# missing directory cannot discard the scraped tweets at write time
os.makedirs('raw_data', exist_ok = True)

# loop through each known user and collect the tweets newer than their last stored tweet
# (extend in place: `tweets = tweets + user_tweets` copies the whole accumulated
# list on every iteration)
tweets = []
for key, value in last_tweet_dict.items():
    user_tweets = scraper.pull_tweets(key, since = value)
    tweets.extend(user_tweets)

# Convert the tweets to a dataframe and keep a timestamped raw copy of this pull
tweets_df = scraper.tweets_to_df(tweets)
tweets_df.to_csv('raw_data/congressional_tweets_' + datetime.now().strftime('%Y_%m_%d_%H_%M') + '.csv', index = False)

# Merge new tweets with previously aggregated tweets and write to csv
merged_tweets = pd.concat([aggregated_tweets, tweets_df])
merged_tweets.to_csv('Data/aggregated_tweets.csv', index = False)

CPU times: user 23.3 s, sys: 626 ms, total: 23.9 s
Wall time: 3min 13s
