In [43]:
import tweepy
import pandas as pd
import os
import glob
from datetime import datetime
from keys import *

In [44]:
# define scraper class
class twitter_scraper:
    def __init__(self, consumer_key, consumer_secret, access_token, access_secret):
        """
        Class for scraping tweets from twitter's resting API.
        Pass a set of consumer and access keys
        """
        self.auth = tweepy.OAuthHandler(consumer_key = consumer_key, consumer_secret = consumer_secret)
        self.auth.set_access_token(access_token, access_secret)
        self.api = tweepy.API(self.auth)
        
    def original_tweet(self, status):
        """
        Check if a tweet is original or a retweet/reply. 
        
        """
        if hasattr(status, 'retweeted_status'):
            return False
        elif status.in_reply_to_status_id != None:
            return False
        elif status.in_reply_to_screen_name != None:
            return False
        elif status.in_reply_to_user_id != None:
            return False
        else:
            return True
    
    def pull_tweets(self, twitter_id, count = 200, since = None, tweet_mode = 'extended'):
        """
        Scrapes twitter the twitter timeline for original tweets from the given twitter id and returns them as a list of status objects
        Default is set to 200 most recent tweets.
        If you want to pull from a specific tweet pass a tweet ID to the 'since' variable
        """
        user_tweets = self.api.user_timeline(user_id = twitter_id, count = count, since_id = since, tweet_mode = tweet_mode)
        user_tweets = [tweet for tweet in user_tweets if self.original_tweet(tweet) == True]
        return user_tweets
    
    def tweets_to_df(self, tweet_list):
        """
        Convert a list of tweet objects to a pandas data frame
        columns include: tweet id, tweet text, date created, retweet, and user id
        """
        tweet_ids = []
        tweet_texts = []
        created_at = []
        retweet = []
        user_id = []

        for tweet in tweet_list:
            tweet_ids.append(tweet.id)
            tweet_texts.append(tweet.full_text)
            created_at.append(tweet.created_at)
            retweet.append(tweet.retweeted)
            user_id.append(tweet.user.id)

        tweets_df = pd.DataFrame(zip(tweet_ids, tweet_texts, created_at, retweet, user_id), columns = ["tweet_id", 'text', 'created', 'retweet', 'user_id'])
        return tweets_df

### Initial Pull

In [45]:
%%time
# initialize scraper
scraper = twitter_scraper(consumer_key, consumer_secret, access_token, access_secret)

# import congress twitter user IDs 
congress_meta = pd.read_csv('congress_meta_data.csv')
congress_ids = congress_meta['id']

# loop through each member of congress and append their tweets to a list
tweets = []
for user in congress_ids:
    user_tweets = scraper.pull_tweets(user)
    tweets = tweets + user_tweets

# convert the tweets to a dataframe 
tweets_df = scraper.tweets_to_df(tweets)

tweets_df.to_csv('congressional_tweets_' + datetime.now().strftime('%Y_%m_%d') + '.csv', index = False)

KeyboardInterrupt: 

### Daily Pull

In [46]:
# Get the most recent tweet from all users
# Import unique IDs
aggregated_tweets = pd.read_csv('Data/aggregated_tweets.csv')
unique_ids = aggregated_tweets['user_id'].unique()

# Get the last tweet for each unique ID
last_tweets = []
for user in unique_ids:
    last_tweet = max(aggregated_tweets[aggregated_tweets['user_id'] == user]['tweet_id'])
    last_tweets.append(last_tweet)

# Pass to a dictionary of IDs and their most recent tweet
last_tweet_dict = dict(zip(unique_ids, last_tweets))

In [47]:
%%time
# initialize scraper
scraper = twitter_scraper(consumer_key, consumer_secret, access_token, access_secret)

# import congress twitter user IDs 
congress_meta = pd.read_csv('congress_meta_data.csv')
congress_ids = congress_meta['id']

# loop through each member of congress and append their tweets to a list
tweets = []
for key, value in last_tweet_dict.items():
    user_tweets = scraper.pull_tweets(key, since = value)
    tweets = tweets + user_tweets

# Convert the tweets to a dataframe 
tweets_df = scraper.tweets_to_df(tweets)
tweets_df.to_csv('Data/congressional_tweets_' + datetime.now().strftime('%Y_%m_%d_%H_%M') + '.csv', index = False)

# Merge new tweets with previously aggregated tweets and write to csv
merged_tweets = pd.concat([aggregated_tweets, tweets_df])
merged_tweets.to_csv('Data/aggregated_tweets.csv', index = False)

CPU times: user 30.8 s, sys: 887 ms, total: 31.7 s
Wall time: 2min 46s


In [31]:
text = merged_tweets['text']

In [42]:
import re
re.sub()

## Development

# To do
1. implement pull since
2. implement concat all tweets
3. Automate for daily pull

In [8]:
test_id = congress_ids[0]

In [17]:
test_pull = scraper.pull_tweets(twitter_id = test_id)

In [20]:
scraper.tweets_to_df(test_pull)

Unnamed: 0,tweet_id,text,created,retweet,user_id
0,1190038962376720384,That didn’t take long. After a vote to make th...,2019-10-31 22:52:53,False,1090328229548826627
1,1189921234592636928,Looking forward to showing @RepMarkGreen aroun...,2019-10-31 15:05:04,False,1090328229548826627
2,1189909515032055808,A yes vote on this resolution gives a stamp of...,2019-10-31 14:18:30,False,1090328229548826627
3,1189896903640723456,The House will vote on a resolution that will ...,2019-10-31 13:28:23,False,1090328229548826627
4,1189627626484310022,My DC team and I were happy to help @the_USO t...,2019-10-30 19:38:23,False,1090328229548826627
...,...,...,...,...,...
107,1148630022581829634,This week we'll be looking at the NDAA. What d...,2019-07-09 16:28:32,False,1090328229548826627
108,1144355855757979653,"After a day's worth of unnecessary drama, the ...",2019-06-27 21:24:31,False,1090328229548826627
109,1144277617945788416,There are 50+ reps in line w/me &amp; all want...,2019-06-27 16:13:38,False,1090328229548826627
110,1144226917102641153,It's PTSD Awareness Day. Many vets are suffer...,2019-06-27 12:52:10,False,1090328229548826627


In [3]:
auth = tweepy.OAuthHandler(consumer_key = consumer_key, consumer_secret = consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth)

In [7]:
#import congress IDs 
congress_meta = pd.read_csv('congress_meta_data.csv')
congress_ids = congress_meta['id']

In [5]:
# get rid of and just tag original tweets?
def original_tweet(status):
    if hasattr(status, 'retweeted_status'):
        return False
    elif status.in_reply_to_status_id != None:
        return False
    elif status.in_reply_to_screen_name != None:
        return False
    elif status.in_reply_to_user_id != None:
        return False
    else:
        return True

In [27]:
# Does an initial pull of the 200 most recent tweets
def pull_tweets(twitter_id, since = None):
    user_tweets = api.user_timeline(user_id = twitter_id, count = 200, since_id = since, tweet_mode='extended')
    user_tweets = [tweet for tweet in user_tweets if original_tweet(tweet) == True]
    return user_tweets

In [12]:
%%time
# loop through all users and append tweets to master tweet list
tweets = []
for user in congress_ids:
    user_tweets = pull_tweets(user)
    tweets = tweets + user_tweets

CPU times: user 1min 7s, sys: 2.55 s, total: 1min 9s
Wall time: 8min 4s


In [29]:
def tweets_to_df(tweet_list):
    tweet_ids = []
    tweet_texts = []
    created_at = []
    retweet = []
    user_id = []

    for tweet in tweet_list:
        tweet_ids.append(tweet.id)
        tweet_texts.append(tweet.full_text)
        created_at.append(tweet.created_at)
        retweet.append(tweet.retweeted)
        user_id.append(tweet.user.id)
    
    tweets_df = pd.DataFrame(zip(tweet_ids, tweet_texts, created_at, retweet, user_id), columns = ["tweet_id", 'text', 'created', 'retweet', 'user_id'])
    return tweets_df

In [23]:
tweets_to_df(tweet_list)
tweets_df.to_csv("congressional_tweets.csv", index = False)

In [27]:
# Pull all unique user id's from the data set
unique_ids = tweets_df['user_id'].unique()

In [8]:
# Gets the tweet id from the most recent tweet in the database
def get_latest_tweet(user_id):
    latest_tweet = max(tweets_df[tweets_df['user_id'] == user]['tweet_id'])
    return latest_tweet

In [None]:
# get the most recent tweet from each user
last_tweets = []
for user in unique_ids:
    last_tweet = get_latest_tweet(user)
    last_tweets.append(last_tweet)

In [64]:
# create a dictionary of the user ids and the associated most recent tweet id
last_tweet_dict = dict(zip(unique_ids, last_tweets))

In [74]:
%%time
# search from last tweet
user_tweet_list = []
for key, value in last_tweet_dict.items():
    user_tweets = pull_tweets(key, since = value)
    user_tweet_list = user_tweet_list + user_tweets

CPU times: user 30.8 s, sys: 1.44 s, total: 32.2 s
Wall time: 3min 9s


In [80]:
latest_tweets = tweets_to_df(user_tweet_list)
latest_tweets.head()

Unnamed: 0,tweet_id,text,created,retweet,user_id
0,1189251026081243136,20.\n\nThat's the number of legislative days l...,2019-10-29 18:41:54,False,1090328229548826627
1,1189206238460035073,"It’s too little, too late. The Democrat’s moti...",2019-10-29 15:43:56,False,1090328229548826627
2,1189226346419048453,I signed a letter to the EPA advising that its...,2019-10-29 17:03:50,False,1083474782602125318
3,1188908497578139648,"Actually, I did sign that letter. Another of e...",2019-10-28 20:00:49,False,1083474782602125318
4,1188616010124943360,Such an honor to spend several hours with Mank...,2019-10-28 00:38:34,False,1083474782602125318


In [85]:
# combine original tweets and latest tweets
latest_tweet_df = pd.merge(tweets_df, latest_tweets, how='outer')

In [103]:
# writes the latest tweets to a csv with the date recorded.
latest_tweet_df.to_csv('congressional_tweets_' + datetime.now().strftime('%Y_%m_%d') + '.csv')

## Repeated scrape
1. Read in the most recent csv
2. get a dictionary of most recent tweets for users
3. Search from the most recent tweet
4. Combine recent tweets with all data and save

In [10]:
import os
import glob
recent_tweets_filename = max(glob.iglob("*.csv"), key=os.path.getmtime)

In [28]:
%%time
# Import most recent data
tweets_df = pd.read_csv(recent_tweets_filename)
# Get unique user IDs
unique_ids = tweets_df['user_id'].unique()

# Get a list of the most recent tweets from each user
last_tweets = []
for user in unique_ids:
    last_tweet = get_latest_tweet(user)
    last_tweets.append(last_tweet)

# Create a dictionary of users and their most recent tweets
last_tweet_dict = dict(zip(unique_ids, last_tweets))

# Pull the most recent tweets from the API
user_tweet_list = []
for key, value in last_tweet_dict.items():
    user_tweets = pull_tweets(key, since = value)
    user_tweet_list = user_tweet_list + user_tweets

# Convert to a data frame
latest_tweets = tweets_to_df(user_tweet_list)

# Write to csv
latest_tweets.to_csv('congressional_tweets_' + datetime.now().strftime('%Y_%m_%d') + '.csv', index = False)



RateLimitError: [{'message': 'Rate limit exceeded', 'code': 88}]

Still to do:
1. trouble shoot above daily run script
2. specify pathways so I save data in a csv folder
3. Organize above script