In [16]:
# import libraries 
import pandas as pd
from collections import namedtuple
import os 
import datetime
import tweepy
import json 
import csv
import time
import glob

In [17]:
# Get all the Twitter API keys and secrets from environment variables
# These are used to make calls to all the Twitter APIs used below

# Twitter credentials, read once from the process environment.
# Each value is None when its variable is not set.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET")
access_key = os.environ.get("TWITTER_ACCESS_KEY")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET")



In [161]:
def get_timeline(screen_name, state_name):
    """Scrape every retrievable tweet of one account and summarise the result.

    Parameters
    ----------
    screen_name : str
        Handle of the account to scrape.
    state_name : str
        Not used by this function.

    Returns
    -------
    dict
        ``{'tweets': ..., 'metadata': ...}``.  ``'tweets'`` is the list of
        collected status objects (newest first), or the string ``'NA'`` when
        nothing was collected.  ``'metadata'`` is a ``Data`` namedtuple with
        the fields handle, handle_id, date_scraped, latest_tweet_id,
        total_tweets, tweets_per_week, account_created_date and
        account_status.  For every outcome other than ``'active'`` all
        fields between the handle and the status are the string ``'NA'``.
        account_status is ``'protected'``, ``'no tweets'``, ``'inactive'``,
        ``'active'`` or, for API error codes 50 and 63, the error message
        returned by the API.

    Raises
    ------
    tweepy.error.TweepError
        For any API error whose code is neither 50 nor 63.
    """
    # Authenticated client; the two wait_* flags are passed straight to tweepy
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_key, access_secret)
    api = tweepy.API(handler, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # Record type for the per-account summary
    Data = namedtuple("Data", ["handle", "handle_id", "date_scraped",
                               "latest_tweet_id", "total_tweets",
                               "tweets_per_week", "account_created_date",
                               "account_status"])

    def placeholder(status):
        # Summary row for an account that yielded no tweets
        return Data(screen_name, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', status)

    # Arguments shared by the first request and every follow-up page
    timeline_query = dict(screen_name=screen_name, count=200,
                          exclude_replies=False, tweet_mode="extended")

    alltweets = []  # Filled in place by collect()

    def collect():
        # Runs the whole scrape and returns the metadata row
        account = api.get_user(screen_name)
        if account.protected == True:
            print('protected')
            return placeholder('protected')

        print('not protected')
        page = api.user_timeline(**timeline_query)
        if len(page) == 0:
            print('zero tweets')
            return placeholder('no tweets')

        # Skip accounts whose newest tweet is more than 365 days old
        days_silent = (datetime.date.today() - page[0].created_at.date()).days
        if days_silent > 365:
            print('account inactive')
            return placeholder('inactive')

        # Walk backwards through the timeline until a page comes back empty;
        # max_id is always one below the oldest tweet collected so far
        alltweets.extend(page)
        max_id = alltweets[-1].id - 1
        while len(page) > 0:
            print("getting tweets before %s" % (max_id))
            page = api.user_timeline(max_id=max_id, **timeline_query)
            alltweets.extend(page)
            max_id = alltweets[-1].id - 1
            print("...%s tweets downloaded so far" % (len(alltweets)))

        # Count the leading (newest) tweets dated on or after the cutoff.
        # NOTE(review): the cutoff is 8 days back and compares calendar
        # dates, so the window spans more than 7 days -- confirm intended.
        cutoff = datetime.date.today() - datetime.timedelta(days=8)
        recent_count = next(
            (position for position, tweet in enumerate(alltweets)
             if tweet.created_at.date() < cutoff),
            len(alltweets))

        newest = alltweets[0]
        return Data(screen_name,
                    newest.user.id,
                    datetime.date.today(),
                    newest.id,
                    len(alltweets),
                    recent_count,
                    newest.user.created_at.date(),
                    'active')

    try:
        metadata = collect()
    except tweepy.error.TweepError as e:
        # Only API codes 50 and 63 are recorded as a status; anything else
        # is re-raised
        if e.api_code not in (50, 63):
            raise
        print(e)
        metadata = placeholder(e.args[0][0]['message'])

    return {'tweets': alltweets if len(alltweets) > 0 else 'NA',
            'metadata': metadata}
                               
    

In [160]:
# continue timeline collection 

def get_timeline_since(screen_name, since_id):
    """Fetch the tweets an account posted after ``since_id``.

    Parameters
    ----------
    screen_name : str
        Handle of the account to scrape.
    since_id : int
        Id passed to the timeline request as ``since_id``; it is also
        reported as ``latest_tweet_id`` whenever no tweet is collected.

    Returns
    -------
    dict
        ``{'tweets': list, 'metadata': Data}``.  ``'tweets'`` is always a
        list (possibly empty).  ``Data`` is a namedtuple with the fields
        handle, date_scraped, latest_tweet_id, total_tweets and active.
        ``active`` is ``'protected'``, ``'active'`` or, for a handled API
        error, the exception object itself; ``total_tweets`` is ``'NA'`` for
        protected accounts and handled errors.

    Raises
    ------
    tweepy.error.TweepError
        For API errors other than code 50, code 63 or a reason containing
        ``'404'``.
    """
    # Authenticated client; the two wait_* flags are passed straight to tweepy
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_key, access_secret)
    api = tweepy.API(handler, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    date_scraped = datetime.date.today()

    # Record type for the per-account summary
    Data = namedtuple("Data", ["handle", "date_scraped",
                               "latest_tweet_id", "total_tweets", "active"])

    alltweets = []

    try:
        account = api.get_user(screen_name)

        if account.protected == True:
            print('protected')
            metadata = Data(screen_name, date_scraped, since_id,
                            'NA', 'protected')
        else:
            # Page through everything newer than since_id
            pages = tweepy.Cursor(api.user_timeline,
                                  screen_name=screen_name,
                                  since_id=since_id,
                                  count=200,
                                  exclude_replies=False,
                                  tweet_mode="extended")
            for status in pages.items():
                alltweets.append(status)

            collected = len(alltweets)
            print(collected)

            # With nothing new, the caller's since_id stays the high-water mark
            latest_tweet_id = alltweets[0].id if collected > 0 else since_id
            metadata = Data(screen_name, date_scraped, latest_tweet_id,
                            collected, 'active')

    except tweepy.error.TweepError as e:
        # Recorded instead of raised: API code 50, API code 63, or a reason
        # mentioning 404
        if not (e.api_code in (50, 63) or '404' in e.reason):
            raise
        print(e)
        metadata = Data(screen_name, date_scraped, since_id, 'NA', e)

    # Tweets and metadata travel together in one dictionary
    return {'tweets': alltweets, 'metadata': metadata}


In [102]:
# Define a function for collecting the ids of accounts that follow our target account 
# Makes use of Twitter's followers_ids API and grabs up to 5000 follower ids in every request
# 15 requests/15 minutes are allowed 

# Assumes you are in the main directory (the state folder) where subfolders of each account are present
# Example: Set the directory to 'California' to collect followers of legislators from California 


def get_follower_ids(screen_name):
    """Save the ids of all accounts that follow ``screen_name`` to a CSV file.

    The ids are requested through ``api.followers_ids`` (5000 per request)
    and written, one id per line with no header, to
    ``<screen_name>/<screen_name>_FollowerIds_<MonthDD>_<count>.csv``
    relative to the current working directory.

    Parameters
    ----------
    screen_name : str
        Handle of the target account; also the name of the output folder.

    Returns
    -------
    None
        The function is best-effort: any exception is printed, not raised.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    try:
        followerids = list(tweepy.Cursor(api.followers_ids,
                                         screen_name=screen_name,
                                         count=5000).items())

        # Create file name
        date_scraped = datetime.date.today().strftime('%B%d')
        file_name = '%s/%s_FollowerIds_%s_%s.csv' % (
            screen_name, screen_name, date_scraped, len(followerids))

        # Make sure the account folder exists. Without this, a missing
        # folder makes the write fail only after the whole rate-limited
        # download has finished, and the handler below would discard
        # every id collected.
        os.makedirs(screen_name, exist_ok=True)

        # Write to file
        pd.Series(followerids).to_csv(file_name, index=False, header=False)

    except Exception as e:
        # Deliberate best-effort: report the problem and carry on
        print(e)
    

In [103]:
# Define a function for collecting the ids of accounts that are followed by our target account 
# Makes use of Twitter's friends_ids API and grabs up to 5000 friend ids in every request
# 15 requests/15 minutes are allowed 

# Assumes you are in the main directory (the state folder) where subfolders of each account are present
# Example: Set the directory to 'California' to collect friends of legislators from California 

def get_friend_ids(screen_name):
    """Save the ids of all accounts that ``screen_name`` follows to a CSV file.

    The ids are requested through ``api.friends_ids`` (5000 per request)
    and written, one id per line with no header, to
    ``<screen_name>/<screen_name>_FriendsIds_<MonthDD>_<count>.csv``
    relative to the current working directory.

    Parameters
    ----------
    screen_name : str
        Handle of the target account; also the name of the output folder.

    Returns
    -------
    None
        The function is best-effort: any exception is printed, not raised.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    try:
        friendsids = list(tweepy.Cursor(api.friends_ids,
                                        screen_name=screen_name,
                                        count=5000).items())

        # Create file name
        date_scraped = datetime.date.today().strftime('%B%d')
        file_name = '%s/%s_FriendsIds_%s_%s.csv' % (
            screen_name, screen_name, date_scraped, len(friendsids))

        # Make sure the account folder exists. Without this, a missing
        # folder makes the write fail only after the whole rate-limited
        # download has finished, and the handler below would discard
        # every id collected.
        os.makedirs(screen_name, exist_ok=True)

        # Write to file
        pd.Series(friendsids).to_csv(file_name, index=False, header=False)

    except Exception as e:
        # Deliberate best-effort: report the problem and carry on
        print(e)
    
    

In [104]:
# Full name -> two-letter abbreviation for the 50 states, the District of
# Columbia and five territories (American Samoa, Guam, Northern Mariana
# Islands, Puerto Rico, Virgin Islands), listed alphabetically by name.

us_state_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}


In [13]:
# # Create metadata file 

# def create_metadata(StateName, metadata_num, col, handles):
    
#         df = pd.DataFrame(columns= cols,  
#                             index= handles)  # create it 

#         today = datetime.date.today().strftime("%B%d") 
#         fname = '%s_' % state_name + metadata_num + '.csv'
#         df.to_csv(fname)  

#         status = 'Created file named: %s'%fname)
        
        
#     else: 
#         print('File already exists: %s' %path[0]) 
#         return path[0]
    

