In [592]:
import datetime
import glob
import json
import os

import pandas as pd



In [593]:
# Import helper functions
# The helper notebook is named by a relative file name, so the working
# directory is switched to the code folder before the IPython %run magic
# executes it.
# NOTE(review): get_timeline_since (called in the download loop further down)
# is not defined anywhere in this notebook, so it presumably comes from this
# helper notebook -- confirm.
os.chdir('/Users/ishitagopal/Box/Projects/state_covid_policy/code/')
%run twitter_api_helper_functions.ipynb

In [594]:
# Set to main directory
# The relative 'Data/*/*_metadata_*.csv' pattern used to find the metadata
# files is resolved against this working directory.
os.chdir('/Users/ishitagopal/Box/Projects/state_covid_policy/')

In [595]:
def _read_state_metadata(path):
    """Load one header-less metadata CSV and tag every row with its state.

    The state is the second '/'-separated component of ``path``, i.e. the
    folder name directly under ``Data`` for paths produced by the glob below.
    """
    frame = pd.read_csv(path, header=None)
    frame['state'] = path.split('/')[1]
    return frame


# Find every per-state metadata file and load each one into its own frame
meta_files = glob.glob('Data/*/*_metadata_*.csv')
meta_li = [_read_state_metadata(path) for path in meta_files]

In [596]:

# Stack the per-file frames into one table with a fresh 0..n-1 index
all_meta = pd.concat(meta_li, axis=0, ignore_index=True, sort=False)

# Columns in the first metadata file, plus the 'state' column that was
# appended to each frame when the files were loaded
all_meta.columns = ['screen_name',
                    'account_id',
                    'date_scraped',
                    'latest_tweet_id',
                    'total_tweets',
                    'last_week_total_tweets',
                    'active_since',
                    'active',
                    'state']

# Settings for the metadata file of the current collection round. The
# per-state cells further down read both names, so they must be real
# assignments rather than comments for the notebook to run on a fresh kernel.
#   metadata_num : file name suffix, giving '<state>_<metadata_num>.csv'
#   cols         : columns of that file (its index holds the screen names)
metadata_num = 'metadata_R2'
cols = ["date_scraped", "latest_tweet_id", "total_tweets", "active"]

In [602]:
# Keep only the rows whose status is exactly 'active'; the copy keeps later
# column assignments from touching all_meta
active_meta = all_meta[all_meta['active'] == 'active'].copy()

In [604]:
# Assign data types
# The id columns are cast to int64 explicitly: plain `int` maps to the
# platform's default integer, which can be 32-bit on some setups and is then
# too narrow for tweet ids.
active_meta['date_scraped'] = pd.to_datetime(active_meta['date_scraped'])
active_meta['account_id'] = active_meta['account_id'].astype('int64')
active_meta['latest_tweet_id'] = active_meta['latest_tweet_id'].astype('int64')

active_meta.dtypes

screen_name                       object
account_id                         int64
date_scraped              datetime64[ns]
latest_tweet_id                    int64
total_tweets                      object
last_week_total_tweets           float64
active_since                      object
active                            object
state                             object
dtype: object

In [605]:
# Row count per status value in the filtered table
active_meta['active'].value_counts()

active    3816
Name: active, dtype: int64

In [606]:
# Active accounts per state, most frequent state first
active_meta['state'].value_counts()

Pennsylvania      184
New_York          167
Massachusetts     157
Texas             156
Minnesota         148
New_Hampshire     138
Georgia           135
Florida           132
Maryland          128
Missouri          126
North_Carolina    118
Illinois          116
California        115
Virginia          105
South_Carolina    101
Ohio              100
Michigan           93
Kentucky           87
Connecticut        83
Colorado           83
Tennessee          83
New_Jersy          82
Arizona            78
Louisiana          78
Washington         77
Arkansas           77
Mississippi        74
Indiana            74
Kansas             71
Rhode_Island       68
Utah               63
New_Mexico         60
Nevada             56
Alabama            54
Oregon             53
Montana            47
Maine              44
Idaho              31
Vermont            31
North_Dakota       28
Hawaii             25
Alaska             24
South_Dakota       23
Nebraska           22
Delaware           21
Name: stat

In [607]:
# Highest average number of tweets per day over the last week
# (largest weekly total divided by 7).
# Series.max() skips missing values; the built-in max() compares element by
# element and can return NaN or a wrong value depending on where NaNs sit.
active_meta['last_week_total_tweets'].max() / 7

197.28571428571428

In [610]:
# How many legislators tweeted more than 100 times in the last week
# (rows above the threshold that have a state recorded)
active_meta.loc[active_meta['last_week_total_tweets'] > 100, 'state'].count()

87

In [611]:
# Per-state count of accounts with more than 100 tweets in the last week
active_meta.loc[active_meta['last_week_total_tweets'] > 100, 'state'].value_counts()

Arizona           9
Texas             7
Minnesota         7
New_Hampshire     7
New_Mexico        6
New_York          4
Colorado          4
California        3
Florida           3
Massachusetts     3
Kentucky          3
Tennessee         3
Delaware          2
Ohio              2
Georgia           2
Arkansas          2
North_Carolina    2
Alabama           2
Maryland          2
Pennsylvania      2
Nebraska          1
Nevada            1
Washington        1
Oregon            1
Indiana           1
South_Carolina    1
Vermont           1
Missouri          1
Virginia          1
Illinois          1
Mississippi       1
Rhode_Island      1
Name: state, dtype: int64

In [612]:
# State names, ordered by how many metadata rows each state has
States = all_meta['state'].value_counts().index.tolist()


In [556]:
# Identify States for which to collect tweets: those with at least one active
# account whose last scrape is more than DAYS_BETWEEN_DOWNLOADS days old.
DAYS_BETWEEN_DOWNLOADS = 15

# `today` is also used later to date-stamp the tweet files
today = datetime.datetime.today()

# active_meta is the active-accounts table built above; its date_scraped
# column has already been converted to datetimes
days_elapsed = today - active_meta['date_scraped']
df_collect = active_meta[days_elapsed.dt.days > DAYS_BETWEEN_DOWNLOADS]

# When no account is due, States keeps the value assigned earlier
if not df_collect.empty:
    States = sorted(df_collect['state'].value_counts().index.to_list())
    

In [557]:
# Show the states currently selected for collection
States

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New_Hampshire',
 'New_Jersy',
 'New_Mexico',
 'New_York',
 'North_Carolina',
 'North_Dakota',
 'Ohio']

In [659]:
# Pick the state to process and make its data folder the working directory
state = 'New_York'
path_state = os.path.join("/Users/ishitagopal/Box/Projects/state_covid_policy/Data/", state)
os.chdir(path_state)

In [673]:
# Check if metadata file exists, else create one, and work out which handles
# still need downloading for this state.
#
# The round file '<state>_<metadata_num>.csv' is indexed by screen name and is
# created with empty columns; the download loop fills in one row per handle,
# so a row whose 'active' value is still missing has not been processed.
# `metadata_num` and `cols` are notebook-level settings defined outside this cell.
fname = '%s_' % state + metadata_num + '.csv'
print(fname)

# Subset to state dataframe
state_meta = active_meta.loc[active_meta['state'] == state, :]

if os.path.exists(fname):
    print('File already exists')
    # Read the very file whose existence was just checked
    df = pd.read_csv(fname, index_col=0)
    unprocessed = df.index[df['active'].isnull()]
    # Take handles and ids from the same filtered rows so that handles[i]
    # always pairs with since_id[i] and both lists have equal length.
    # A handle listed in the file but absent from state_meta is left out.
    todo = state_meta.loc[state_meta['screen_name'].isin(unprocessed),
                          ['screen_name', 'latest_tweet_id']]
    handles = todo['screen_name'].to_list()
    since_id = todo['latest_tweet_id'].to_list()
    print(len(handles))

else:
    print('Creating file')
    handles = state_meta['screen_name'].to_list()
    since_id = state_meta['latest_tweet_id'].to_list()
    # Empty table with one row per handle, filled in by the download loop
    df = pd.DataFrame(columns=cols, index=handles)
    df.to_csv(fname)
    print(len(handles))

New_York_metadata_R2.csv
File already exists
2


In [672]:
# Download each pending handle's tweets since its last stored tweet id.
# The metadata CSV is re-read at the start of every iteration and written back
# at the end, so the file on disk covers every handle finished so far.

# Fail before any download starts rather than pairing handles with the wrong ids
if len(handles) != len(since_id):
    raise ValueError('handles and since_id must have the same length')

# The date stamp is identical for every file written in this run
date = today.strftime("%B%d")

for i, (handle, last_id) in enumerate(zip(handles, since_id)):

    print('Handle index: %s, Handle name: %s' % (i, handle))

    # Read metadata csv
    metadata = pd.read_csv(fname, index_col=0)

    # Download tweets
    results = get_timeline_since(handle, last_id)
    # Elements 1-4 of the returned metadata fill this handle's row
    metadata.loc[handle] = results['metadata'][1:5]

    if len(results['tweets']) > 0:

        # Dump tweets to file, creating the handle's folder if it is missing
        os.makedirs(handle, exist_ok=True)
        file_name = '%s/' % handle + '%s_Tweets_%s' % (handle, date) + '.json'
        print(file_name)
        with open(file_name, 'w', encoding='utf8') as file:
            json.dump([tweet._json for tweet in results['tweets']], file)
            print("writen to file")

    metadata.to_csv(fname)

SyntaxError: invalid syntax (<ipython-input-672-68c7c0f9015d>, line 4)

In [481]:
# New York
# NOTE(review): bare tuple left over from an interactive session; what the two
# numbers refer to is not stated anywhere in the notebook -- confirm or delete.
30, 64

(30, 64)

In [499]:
# Number of rows in the round metadata whose 'active' value is missing
metadata.active.isna().sum() 

AttributeError: 'DataFrame' object has no attribute 'active'

In [483]:
# Tally of the values stored in the 'active' column of the round metadata
# (the recorded output shows 'active' plus an API error payload, so this is
# a per-status count rather than only the number of active accounts)
metadata.active.value_counts()


active                                          99
[{'code': 50, 'message': 'User not found.'}]     1
Name: active, dtype: int64

In [498]:
# Number of accounts with 0 tweets.
# The column values themselves are compared with 0: value_counts() only lists
# values that occur in a non-categorical column, so its counts are never 0 and
# testing them against 0 always gave 0.
# to_numeric guards against the column having been read back as text.
pd.to_numeric(metadata['total_tweets'], errors='coerce').eq(0).sum()

0