# Twitter data collection

Note that you will need API keys from Twitter to use this notebook.

## Imports
- `pandas` is a package for handling and manipulating data.
- `tweepy` is a package that helps you use the Twitter API in Python.
- `time` is a package that lets you work with time in Python.
- `json` is used to read the saved API keys and to write the collected data to disk.

In [None]:
import json
import os
import sys
import time

import pandas as pd
import tweepy

## Keys
- You'll need API keys from Twitter to use their API. Once you have them, they can go here.

In [None]:
# Placeholder credentials: replace each 'xxx' with the value issued by Twitter.
keys = dict(
    CONSUMER_KEY='xxx',
    CONSUMER_SECRET='xxx',
)

In [None]:
# Load the credentials from the local 'twitter_auth' JSON file; this rebinds
# `keys`, replacing any value assigned to it earlier.
with open('twitter_auth', 'r') as f:
    keys = json.load(f)
# Display only the names of the loaded entries. Echoing `keys` itself would
# store the secret values in the notebook output.
sorted(keys)

In [None]:
# Application-only authentication. tweepy 3.x requests the bearer token inside
# the AppAuthHandler constructor and raises TweepError when the credentials are
# rejected, so the failure has to be caught here: the API object built below is
# always truthy and cannot signal a failed login.
try:
    auth = tweepy.AppAuthHandler(keys['CONSUMER_KEY'],
                                 keys['CONSUMER_SECRET'])
except tweepy.TweepError:
    print ("Can't Authenticate :(")
    # Re-raise so the notebook stops here instead of failing later on a
    # missing `api` object.
    raise

# Wait out rate limits instead of failing, and print a notice while waiting.
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

print('Authenticated successfully!')

## User account names we want to get tweets for

In [None]:
# Screen names of the accounts to collect data for, one per line.
users = [
    'nyctaxi',
    'NYTWA',
    'NYC_TMODA',
    'YellowCabNYC',
    'NYC_DOT',
    'NYCTSubway',
]

## Some functions to help us

In [None]:
def get_user_tweets(name, api):
    """Collect the timeline statuses of one account as raw JSON payloads.

    Args:
        name: Screen name of the account whose timeline is fetched.
        api: Client whose ``user_timeline`` method is paged through with
            ``tweepy.Cursor`` (called with ``count=200``).

    Returns:
        A list holding the ``_json`` attribute of every status retrieved.
        Collection is best effort: if fetching fails part-way, the error is
        reported with ``print`` and whatever was gathered so far is returned
        (possibly an empty list).
    """
    tweets = []
    try:
        cursor = tweepy.Cursor(api.user_timeline, screen_name=name, count=200)
        for status in cursor.items():
            try:
                tweets.append(status._json)
            except AttributeError:
                # Skip an item that carries no raw payload; keep the rest.
                continue
    except Exception as err:
        # `Exception` rather than a bare `except`, so Ctrl-C still interrupts
        # a long collection, and the failure is visible instead of silent.
        print("Could not fetch all tweets for", name, "-", repr(err))
    return tweets

def get_user_followers(name, api, limit=5000):
    """Collect the followers of one account as raw JSON payloads.

    Args:
        name: Screen name of the account whose followers are fetched.
        api: Client whose ``followers`` method is paged through with
            ``tweepy.Cursor`` (called with ``count=200``).
        limit: Positive number of followers after which collection stops.
            Defaults to 5000.

    Returns:
        A list holding the ``_json`` attribute of each follower retrieved, at
        most ``limit`` entries long. Collection is best effort: if fetching
        fails part-way, the error is reported with ``print`` and whatever was
        gathered so far is returned (possibly an empty list).
    """
    followers = []
    try:
        cursor = tweepy.Cursor(api.followers, screen_name=name, count=200)
        for user in cursor.items():
            try:
                followers.append(user._json)
            except AttributeError:
                # Skip an item that carries no raw payload; keep the rest.
                continue
            if len(followers) >= limit:
                print(name, "has more than {0:,} followers. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # `Exception` rather than a bare `except`, so Ctrl-C still interrupts
        # a long collection, and the failure is visible instead of silent.
        print("Could not fetch all followers for", name, "-", repr(err))
    return followers

def get_user_followers_ids(name, api, limit=5000):
    """Collect the follower IDs of one account.

    Args:
        name: Screen name of the account whose follower IDs are fetched.
        api: Client whose ``followers_ids`` method is paged through with
            ``tweepy.Cursor`` (called with ``count=5000``).
        limit: Positive number of IDs after which collection stops.
            Defaults to 5000.

    Returns:
        A dict ``{'name': name, 'followers': [...]}`` whose list holds the
        items yielded by the cursor, at most ``limit`` entries long.
        Collection is best effort: if fetching fails part-way, the error is
        reported with ``print`` and whatever was gathered so far is returned.
    """
    followers = []
    try:
        cursor = tweepy.Cursor(api.followers_ids, screen_name=name, count=5000)
        for follower_id in cursor.items():
            followers.append(follower_id)
            if len(followers) >= limit:
                print(name, "has more than {0:,} followers. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # `Exception` rather than a bare `except`, so Ctrl-C still interrupts
        # a long collection, and the failure is visible instead of silent.
        print("Could not fetch all follower IDs for", name, "-", repr(err))
    return {'name': name, 'followers': followers}


def get_user_friends(name, api, limit=5000):
    """Collect the friends of one account as raw JSON payloads.

    Args:
        name: Screen name of the account whose friends are fetched.
        api: Client whose ``friends`` method is paged through with
            ``tweepy.Cursor`` (called with ``count=200``).
        limit: Positive number of friends after which collection stops.
            Defaults to 5000.

    Returns:
        A list holding the ``_json`` attribute of each friend retrieved, at
        most ``limit`` entries long. Collection is best effort: if fetching
        fails part-way, the error is reported with ``print`` and whatever was
        gathered so far is returned (possibly an empty list).
    """
    friends = []
    try:
        cursor = tweepy.Cursor(api.friends, screen_name=name, count=200)
        for user in cursor.items():
            try:
                friends.append(user._json)
            except AttributeError:
                # Skip an item that carries no raw payload; keep the rest.
                continue
            if len(friends) >= limit:
                print(name, "has more than {0:,} friends. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # `Exception` rather than a bare `except`, so Ctrl-C still interrupts
        # a long collection, and the failure is visible instead of silent.
        print("Could not fetch all friends for", name, "-", repr(err))
    return friends

def get_user_friends_ids(name, api, limit=5000):
    """Collect the friend IDs of one account.

    Args:
        name: Screen name of the account whose friend IDs are fetched.
        api: Client whose ``friends_ids`` method is paged through with
            ``tweepy.Cursor`` (called with ``count=5000``).
        limit: Positive number of IDs after which collection stops.
            Defaults to 5000.

    Returns:
        A dict ``{'name': name, 'friends': [...]}`` whose list holds the
        items yielded by the cursor, at most ``limit`` entries long.
        Collection is best effort: if fetching fails part-way, the error is
        reported with ``print`` and whatever was gathered so far is returned.
    """
    friends = []
    try:
        cursor = tweepy.Cursor(api.friends_ids, screen_name=name, count=5000)
        for friend_id in cursor.items():
            friends.append(friend_id)
            if len(friends) >= limit:
                print(name, "has more than {0:,} friends. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # `Exception` rather than a bare `except`, so Ctrl-C still interrupts
        # a long collection, and the failure is visible instead of silent.
        print("Could not fetch all friend IDs for", name, "-", repr(err))
    return {'name': name, 'friends': friends}

In [None]:
# Fetch each account's timeline in turn and pool the raw tweets in one list.
all_tweets = []

for u in users:
    print("Getting tweets for user", u)
    tweets = get_user_tweets(u, api)
    all_tweets += tweets  # in-place append of this account's tweets
    print("Found", len(tweets), 'tweets.')

print('Done!')

## What does the data look like?

In [None]:
# Show the first collected tweet. The collection step is best effort and may
# leave the list empty, so guard the index instead of raising IndexError.
all_tweets[0] if all_tweets else None

## Save our data

In [None]:
# Write the tweets as JSON Lines: one serialized tweet per line.
with open('tweets_raw.json', 'w') as out_file:
    for t in all_tweets:
        out_file.write(json.dumps(t) + '\n')

## Convert it to pandas

In [None]:
# Build a DataFrame from the collected tweets and preview its first five rows.
df = pd.DataFrame(data=all_tweets)
df.head(n=5)

## Friends

In [None]:
# Fetch each account's friends in turn and pool them in one list.
all_friends = []

for u in users:
    print("Getting friends for user", u)
    friends = get_user_friends(u, api)
    all_friends += friends  # in-place append of this account's friends
    print("Found", len(friends), 'friends.')

print('Done!')

In [None]:
# Show the first collected friend. The collection step is best effort and may
# leave the list empty, so guard the index instead of raising IndexError.
all_friends[0] if all_friends else None

In [None]:
# open() does not create missing directories, so make sure 'data' exists
# before writing into it.
os.makedirs('data', exist_ok=True)

# Write the friends as JSON Lines: one serialized record per line.
with open('data/friends_raw.json', 'w') as out_file:
    for t in all_friends:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers

In [None]:
# Fetch each account's followers in turn and pool them in one list.
all_followers = []

for u in users:
    print("Getting followers for user", u)
    followers = get_user_followers(u, api)
    all_followers += followers  # in-place append of this account's followers
    print("Found", len(followers), 'followers.')

print('Done!')

In [None]:
# Show the first collected follower. The collection step is best effort and
# may leave the list empty, so guard the index instead of raising IndexError.
all_followers[0] if all_followers else None

In [None]:
# open() does not create missing directories, so make sure 'data' exists
# before writing into it.
os.makedirs('data', exist_ok=True)

# Write the followers as JSON Lines: one serialized record per line.
with open('data/followers_raw.json', 'w') as out_file:
    for t in all_followers:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers IDs only

In [None]:
# Fetch follower IDs per account, keeping one {'name', 'followers'} record
# for each account rather than pooling the IDs together.
all_followers_ids = []

for u in users:
    print("Getting follower IDs for user", u)
    followers = get_user_followers_ids(u, api)
    all_followers_ids.append(followers)
    print("Found", len(followers['followers']), 'followers.')

print('Done!')

In [None]:
# Show the record for the third account. Guard the index so the cell does not
# raise IndexError when fewer than three accounts were processed.
all_followers_ids[2] if len(all_followers_ids) > 2 else None

In [None]:
# open() does not create missing directories, so make sure 'data' exists
# before writing into it.
os.makedirs('data', exist_ok=True)

# Write the follower-ID records as JSON Lines: one serialized record per line.
with open('data/followers_ids_raw.json', 'w') as out_file:
    for t in all_followers_ids:
        json.dump(t, out_file)
        out_file.write('\n')