In [4]:
import os 
import datetime
import tweepy
import json 
import csv
import time
import pandas as pd
import numpy as np
from collections import namedtuple

In [5]:
os.chdir('/Users/ishitagopal/Box/Projects/')

In [6]:
# Helper function for writing to the metadata file.
# Inputs: the metadata file to write to, and the row to write.

def metadata_writer(metadata_filename, row):
    """Append a single row to a CSV metadata file.

    The file is opened in append mode, so it is created on first use and
    existing rows are never overwritten.

    Parameters
    ----------
    metadata_filename : str or path-like
        Path of the CSV file to append to.
    row : iterable
        The field values to write as one CSV record.
    """
    # newline='' is required by the csv module: without it the writer's own
    # '\r\n' terminator gets translated again on Windows, producing a blank
    # line after every record. utf8 matches the encoding used for the tweet
    # JSON dumps elsewhere in this notebook.
    with open(metadata_filename, 'a', newline='', encoding='utf8') as f:
        csv.writer(f).writerow(row)
    

In [7]:
# Get all the Twitter API keys and secrets from environment variables

# Each credential is looked up in the process environment; a variable that is
# not set yields None rather than raising.
consumer_key, consumer_secret, access_key, access_secret = (
    os.environ.get(variable_name)
    for variable_name in (
        "TWITTER_CONSUMER_KEY",
        "TWITTER_CONSUMER_SECRET",
        "TWITTER_ACCESS_KEY",
        "TWITTER_ACCESS_SECRET",
    )
)



In [8]:
def get_timeline(screen_name, state_name):
    """Download an account's timeline and build a metadata record for it.

    The account is first checked for being protected, having no tweets, or
    having no tweet within the last 365 days; in those cases no tweets are
    collected and only a status record is produced. Otherwise the timeline
    is paged through 200 tweets at a time until the API returns an empty
    page.

    Parameters
    ----------
    screen_name : str
        Twitter handle to scrape.
    state_name : str
        Accepted so existing callers keep working; not used by this function.

    Returns
    -------
    dict
        ``{'tweets': ..., 'metadata': ...}``. ``'tweets'`` is the list of
        status objects in the order the API returned them (index 0 is treated
        as the newest), or the string ``'NA'`` when nothing was collected.
        ``'metadata'`` is a ``Data`` namedtuple with the fields ``handle``,
        ``handle_id``, ``date_scraped``, ``latest_tweet_id``,
        ``total_tweets``, ``tweets_per_week``, ``account_created_date`` and
        ``account_status``; fields that do not apply hold ``'NA'``.

    Raises
    ------
    tweepy.error.TweepError
        For any API error whose code is not 50 or 63. Those two codes (user
        does not exist / user suspended) are reported through
        ``account_status`` instead of being raised.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # Assign the name of fields to be stored as metadata
    Data = namedtuple("Data", ["handle", "handle_id", "date_scraped",
                               "latest_tweet_id", "total_tweets",
                               "tweets_per_week", "account_created_date",
                               "account_status"])

    def status_only(account_status):
        # Record for an account that yielded no tweets: every field except
        # the handle and the status is the 'NA' placeholder.
        return Data(screen_name, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA',
                    account_status)

    alltweets = []  # Will store all the scraped tweets

    try:
        # Make initial call
        user = api.get_user(screen_name)

        # Check if the account is protected
        if user.protected:
            print('protected')
            metadata = status_only('protected')

        # If not, make initial request
        else:
            print('not protected')
            new_tweets = api.user_timeline(screen_name=screen_name,
                                           count=200,
                                           exclude_replies=False,
                                           tweet_mode="extended")

            # Check if the account posted any tweet
            if len(new_tweets) == 0:
                print('zero tweets')
                metadata = status_only('no tweets')

            # Check if the account has been active in the last year
            elif (datetime.date.today()
                  - new_tweets[0].created_at.date()).days > 365:
                print('account inactive')
                metadata = status_only('inactive')

            else:
                # The account is active: keep the first page and page back
                # through older tweets until an empty page comes back.
                alltweets.extend(new_tweets)

                while len(new_tweets) > 0:
                    # max_id is inclusive, so ask for ids strictly below the
                    # oldest tweet already held to avoid duplicates.
                    oldest = alltweets[-1].id - 1
                    print("getting tweets before %s" % (oldest))

                    new_tweets = api.user_timeline(
                        screen_name=screen_name,
                        count=200, max_id=oldest,
                        exclude_replies=False,
                        tweet_mode="extended")

                    alltweets.extend(new_tweets)
                    print("...%s tweets downloaded so far" %
                          (len(alltweets)))

                # Count the leading run of tweets that are not older than the
                # eight-days-ago cutoff; stops at the first older tweet.
                eight_days_ago = (datetime.date.today()
                                  - datetime.timedelta(days=8))
                counter = 0
                for tweet in alltweets:
                    if tweet.created_at.date() < eight_days_ago:
                        break
                    counter += 1

                # Collect fields for the metadata
                newest = alltweets[0]
                metadata = Data(screen_name, newest.user.id,
                                datetime.date.today(), newest.id,
                                len(alltweets), counter,
                                newest.user.created_at.date(), 'active')

    # The user does not exist (50) or is suspended (63)
    except tweepy.error.TweepError as e:
        if e.api_code not in (50, 63):
            # Any other API failure is unexpected. Re-raise it rather than
            # falling through with `metadata` unbound, which would surface
            # as an unrelated UnboundLocalError.
            raise
        print(e)
        try:
            message = e.args[0][0]['message']
        except (IndexError, KeyError, TypeError):
            # The error payload did not have the expected list-of-dicts shape
            message = str(e)
        metadata = status_only(message)

    # Return results
    if len(alltweets) > 0:
        results_dic = {'tweets': alltweets, 'metadata': metadata}
    else:
        results_dic = {'tweets': 'NA', 'metadata': metadata}

    return results_dic
                               
    

In [10]:
results = get_timeline('izzigopal', 'Ten')

not protected
getting tweets before 497597381399949311
...62 tweets downloaded so far


In [13]:
results['metadata']

Data(handle='izzigopal', handle_id=2502041252, date_scraped=datetime.date(2020, 8, 6), latest_tweet_id=1283661986656657411, total_tweets=62, tweets_per_week=0, account_created_date=datetime.date(2014, 5, 17), account_status='active')

In [38]:
results['metadata'][0:8]

('izzigopal',
 2502041252,
 datetime.date(2020, 8, 6),
 1283661986656657411,
 62,
 0,
 datetime.date(2014, 5, 17),
 'active')

In [20]:
handles = ['izzigopal', 'ganeshgorti']

In [55]:
# Create an empty metadata table with a fixed set of entries: one row per
# handle (the handles form the index) and one column per metadata field.
# The table is then saved as '<state_name>_metadata_<MonthDay>.csv'.
# NOTE(review): `state_name` is not assigned anywhere in the visible notebook;
# it presumably has to be set in an earlier cell before this one runs.
metadata = df = pd.DataFrame(
    index=handles,
    columns=[
        'handle_id',
        'date_scraped',
        'latest_tweet_id',
        'total_tweets',
        'tweets_per_week',
        'account_created_date',
        'account_status',
    ],
)

today = format(datetime.date.today(), "%B%d")
fname = ''.join(['%s_metadata_%s' % (state_name, today), '.csv'])

metadata.to_csv(fname)


In [56]:
metadata.loc['izzigopal'] = results['metadata'][1:8]

In [57]:
metadata

Unnamed: 0,handle_id,date_scraped,latest_tweet_id,total_tweets,tweets_per_week,account_created_date,account_status
izzigopal,2502041252.0,2020-08-06,1.2836619866566574e+18,62.0,0.0,2014-05-17,active
ganeshgorti,,,,,,,


In [45]:
metadata.loc['izzigopal'] 


Unnamed: 0,handle,handle_id,date_scraped,latest_tweet_id,total_tweets,tweets_per_week,account_created_date,account_status
izzigopal,,,,,,,,


In [58]:
# # Download tweets and write to a text file 
# # Assumes you are in the state folder 

# # Open metadata file 


# metadata = pd.read_csv('')

# for handle in handles:

#     results = get_timeline(handle, state_name)

#     # Edit/Add to metadata file
    
#     metadata.loc[handle] = results['metadata'][1:8]
    
#     if results['tweets'] != 'NA':
        
#         # Make a directory for the account if it doesnt already exist
#         # Assumes you are in the state directory 
#         if not os.path.exists(handle):
#             os.mkdir(handle)
            
#             # Dump tweets to file
#             file_name =  '%s/' % handle + '%s_Tweets_' % handle + '.json'
#             print(file_name)
#             with open(file_name , 'w', encoding='utf8') as file:
#                 json.dump([tweet._json for tweet in results['tweets']], file)
#                 print("writen to file")
                
        
# metadata.to_csv('')   
    


FileNotFoundError: [Errno 2] File b'' does not exist: b''

In [74]:
# get_all_tweets('izzigopal', 'New_York')

not protected
getting tweets before 497597381399949311
...63 tweets downloaded so far
writen to file


In [None]:
#         # if the call goes through, check if the user is protected 
#         if user.protected == True:
#             print('protected')
#             # save protected in metadata
#             row = (screen_name, 'NA','NA','NA','NA','NA','NA','protected')
#             metadata_writer(metadata, row)  
            
#         # if not, make initial request     
#         else:
#             print('not protected')
#             new_tweets = api.user_timeline(screen_name = screen_name, count=200, exclude_replies=False, tweet_mode="extended")    

#             #check if there is at least 1 tweet 
#             if len(new_tweets)==0:
#                 print('zero tweets')
                
#                 row = (screen_name, 'NA','NA','NA','NA','NA','NA','no tweets')
#                 metadata_writer(metadata, row)   
       

#             #check if the account is active
#             else:    
#                 time_delta = datetime.date.today() - new_tweets[0].created_at.date()

#                 if time_delta.days > 365:

#                     print('account inactive')
                    
#                     row = (screen_name, 'NA', 'NA','NA','NA','NA','NA','inactive')
#                     metadata_writer(metadata, row)   
        
                        
#                 else:
                    
#                     #If account is active, save the most recent tweets
#                     alltweets.extend(new_tweets)

#                     #save the id of the oldest tweet less one
#                     oldest = alltweets[-1].id - 1
                    
#                     #keep grabbing tweets until there are no tweets left to grab
#                     while len(new_tweets) > 0:
                        
#                         print ("getting tweets before %s" % (oldest))
                        
#                         new_tweets = api.user_timeline(screen_name = screen_name, count=200, max_id=oldest, exclude_replies=False,
#                                                    tweet_mode="extended")

#                         #save most recent tweets

#                         alltweets.extend(new_tweets)

#                         #update the id of the oldest tweet less one
                        
#                         oldest = alltweets[-1].id - 1

#                         print ("...%s tweets downloaded so far" % (len(alltweets)))
                        
#                     # check how many tweets in the last 7 days
                    
#                     # date 8 days ago
#                     eight_days_ago = datetime.date.today() - datetime.timedelta(days = 8)
                    
#                     i = 0
#                     for tweet in alltweets:
#                         if tweet.created_at.date() < eight_days_ago:
#                             break
#                         else:
#                             i+=1
                            
#                     account_id = alltweets[0].user.id
#                     latest_tweet_id = alltweets[0].id
#                     total_tweets = len(alltweets)
#                     last_week_total_tweets = i
#                     active_since = alltweets[0].user.created_at.date()
                
                
#                     # Make directory for each legislator in the list if it doesnt already exist
#                     if not os.path.exists(screen_name):
#                         os.mkdir(screen_name)
    

#                     # save all tweets 
#                     # file_name = 'scren_name/screen_name_Tweets_(date_words)_(latest_tweet_id).json'

#                     file_name =  '%s/' % screen_name + '%s_Tweets_' % screen_name + date_in_words + '_%s' % latest_tweet_id + '.json'                  
#                     with open(file_name , 'w', encoding='utf8') as file:
#                         json.dump([tweet._json for tweet in alltweets], file)

#                     print("writen to file")

#                     # write metadata
#                     row = (screen_name, account_id, date_scraped, latest_tweet_id, total_tweets, last_week_total_tweets, active_since, 'active')
#                     metadata_writer(metadata, row)   


        
        

In [102]:
def get_friend_ids(screen_name):
    """Download the friend ids of an account and save them to a CSV file.

    The ids are written one per line, without header or index, to
    ``<screen_name>/<screen_name>_FriendsIds_<MonthDay>_<count>.csv``
    relative to the current working directory. The account directory is
    created if it does not exist yet.

    Parameters
    ----------
    screen_name : str
        Twitter handle whose friend ids are collected.

    Returns
    -------
    str or None
        The path of the file that was written, or ``None`` if anything went
        wrong. Failures are printed and followed by a 5 second pause rather
        than raised, so a caller can simply try the account again later.
    """
    # Authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    try:
        cursor = tweepy.Cursor(api.friends_ids, screen_name=screen_name,
                               count=5000)
        friendsids = list(cursor.items())

        date_scraped = datetime.date.today().strftime('%B%d')
        friends_len = len(friendsids)

        # Without this the write below fails for every account that has no
        # folder yet, and the failure would only show up as a printed error.
        os.makedirs(screen_name, exist_ok=True)

        file_name = '%s/%s_FriendsIds_%s_%s.csv' % (
            screen_name, screen_name, date_scraped, friends_len)
        pd.Series(friendsids).to_csv(file_name, index=False, header=False)
        return file_name

    except Exception as e:
        # Deliberately best-effort: report the problem and back off briefly.
        print(e)
        time.sleep(5)
        return None
    


'/Users/ishitagopal/Desktop'

In [7]:
import os
import glob
os.chdir('/Users/ishitagopal/Box/Projects/state_covid_policy/Data/North_Carolina')

In [8]:
# for account in active_accounts:
#     print(account)
#     get_friend_ids(account)
# Accounts for which friend Ids were scraped successfully 

# Accounts for which friend Ids were scraped successfully 




def collect_friends(accounts_list):
    """Collect friend ids for every account that has no friends file yet.

    An account counts as already collected when a file matching
    ``<account>/*FriendsIds*.csv`` exists under the current working
    directory. When no such files exist at all, every account in
    ``accounts_list`` is collected.

    Parameters
    ----------
    accounts_list : iterable of str
        Twitter handles to make sure friend ids exist for.

    Returns
    -------
    list of str
        The accounts for which collection was attempted in this call, in
        their original order. Empty when everything was already collected.
    """
    filepath_collected_friends = glob.glob('*/*FriendsIds*.csv')

    # The folder that holds a friends file is named after the account.
    completed_friend_collection = {
        os.path.basename(os.path.dirname(path))
        for path in filepath_collected_friends
    }

    pending = [account for account in accounts_list
               if account not in completed_friend_collection]

    for account in pending:
        print(account)
        get_friend_ids(account)

    return pending


# Accounts for which to retry collection: every active account that does not
# yet have a friends file under its own folder.
# NOTE(review): `active_accounts` is not defined in the visible notebook (the
# recorded output of this cell is a NameError for it); it has to be assigned
# in an earlier cell before this one can run.

# Upper bound on retry passes, so an account that can never be collected does
# not keep this loop running forever.
MAX_RETRY_ROUNDS = 5

# Work out what has already been scraped BEFORE computing the retry list; the
# list comprehension below reads `scarped_friends`.
filepath_friends = glob.glob('*/*FriendsIds*.csv')
scarped_friends = [name.split('/')[0] for name in filepath_friends]
retry_friends = [x for x in active_accounts if x not in scarped_friends]

retry_round = 0
while len(retry_friends) > 0 and retry_round < MAX_RETRY_ROUNDS:
    retry_round += 1
    print(len(retry_friends))
    for account in retry_friends:
        print(account)
        get_friend_ids(account)

    # Re-scan the folders to see which accounts now have a friends file.
    filepath_friends = glob.glob('*/*FriendsIds*.csv')
    scarped_friends = [name.split('/')[0] for name in filepath_friends]
    retry_friends = [x for x in active_accounts if x not in scarped_friends]


#Find duplicates
# from collections import Counter

# [k for k,v in Counter(scarped_friends).items() if v>1]

# Number of accounts still missing a friends file after the retries.
print(len(retry_friends))

NameError: name 'active_accounts' is not defined

In [None]:
def continue_timeline_collection(screen_name, last_tweet_id, update_number):
    """Fetch tweets posted after a known tweet and record the update.

    Up to 200 tweets newer than ``last_tweet_id`` are requested. One row is
    appended to ``<state_name>_metadata_Round<update_number>_<MonthDay>.csv``
    via ``metadata_writer``, and, if any tweets came back, they are dumped as
    JSON to
    ``<screen_name>/<screen_name>_Round<update_number>_Tweets_<MonthDay>_<latest id>.json``.

    Parameters
    ----------
    screen_name : str
        Twitter handle to update.
    last_tweet_id : int
        Id of the newest tweet already collected; only newer tweets are
        requested.
    update_number : int or str
        Round label used in both output file names.

    Notes
    -----
    Reads the module-level name ``state_name`` for the metadata file name.
    NOTE(review): that name is not assigned in the visible notebook and must
    be set before calling this function.
    """
    # Authorize twitter, initialize tweepy (same setup as the other scrapers)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)

    # tweet_mode="extended" matches get_timeline, so that update rounds store
    # the same tweet representation as the initial scrape.
    cursor = tweepy.Cursor(api.user_timeline, screen_name=screen_name,
                           since_id=last_tweet_id, tweet_mode="extended")
    alltweets = list(cursor.items(200))

    # Date the account was scraped
    date_scraped = datetime.date.today()
    date_in_words = date_scraped.strftime("%B%d")      # format = MonthDay

    # Assign metadata file name
    metadata = '%s_metadata_Round%s_%s' % (state_name, update_number, date_in_words) + '.csv'

    if alltweets:
        # Index 0 is treated as the newest tweet, as in get_timeline.
        newest = alltweets[0]
        latest_tweet_id = newest.id

        # Count the leading run of tweets not older than the cutoff.
        eight_days_ago = date_scraped - datetime.timedelta(days=8)
        last_week_total_tweets = 0
        for tweet in alltweets:
            if tweet.created_at.date() < eight_days_ago:
                break
            last_week_total_tweets += 1

        row = (screen_name, newest.user.id, date_scraped, latest_tweet_id,
               len(alltweets), last_week_total_tweets,
               newest.user.created_at.date(), 'active')
    else:
        # Nothing new: keep the previously known tweet id as the latest one.
        latest_tweet_id = last_tweet_id
        row = (screen_name, 'NA', date_scraped, latest_tweet_id, 0, 0, 'NA',
               'no new tweets')

    metadata_writer(metadata, row)

    # Dump tweets to file
    # File path = 'screen_name/screenname_Round<N>_Tweets_date_tweetid.json'
    if alltweets:
        os.makedirs(screen_name, exist_ok=True)
        file_name = ('%s/' % screen_name
                     + '%s_Round%s_Tweets_' % (screen_name, update_number)
                     + date_in_words + '_%s' % latest_tweet_id + '.json')
        with open(file_name, 'w', encoding='utf8') as file:
            json.dump([tweet._json for tweet in alltweets], file)




In [None]:
# Scratch cell: run a scrape, list the working directory, then open the first
# metadata CSV found there.
# NOTE(review): `get_all_tweets` is not defined in the visible notebook; the
# timeline scraper defined above is named `get_timeline`. Confirm which one
# is intended.
get_all_tweets('izzigopal', 'New_Delhi')
os.listdir()
# First file in the working directory whose name matches '*_metadata_*.csv'.
metadata = glob.glob('*_metadata_*.csv')[0]
# header=None: the file is read without treating its first line as a header.
pd.read_csv(metadata, header = None)
# NOTE(review): continue_timeline_collection is defined with three required
# parameters (screen_name, last_tweet_id, update_number), so this call raises
# TypeError as written.
continue_timeline_collection()



In [None]:
## How to open saved json tweets

#json_data = 'Adam_Morfeld/Adam_Morfeld_Tweets_June24_1275617128461164551.json'
#with open(json_data, 'r') as f:
#    distros_dict = json.load(f)
#distros_dict[1]