# Twitter data collection

Note that you will need an API key from Twitter to use this notebook.

## Imports
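- `pandas` is a package for handling and manipulating data.
- `tweepy` is a package that helps you use the Twitter API in Python.
- `time` is a package that lets you work with time in Python.
- `json` is a package for reading and writing JSON; it is used here to load the credentials file and to save the collected data.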

In [3]:
import json
import os
import sys
import time

import pandas as pd
import tweepy

## keys
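- You'll need API keys from Twitter to use their API. Once you have them, they can go here, or in the `twitter_auth` JSON file that the next cell reads, which keeps them out of the notebook itself.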

In [4]:
# Placeholder credentials: replace the 'xxx' values with your own consumer
# key and secret from Twitter.
keys = dict(CONSUMER_KEY='xxx', CONSUMER_SECRET='xxx')

In [5]:
# Read the real credentials from a local JSON file so they never have to be
# typed into the notebook; this replaces the placeholder dict defined above.
with open('twitter_auth', 'r') as f:
    keys = json.load(f)

# Confirm which entries were loaded without echoing the secret values:
# a bare `keys` here would store the credentials in the saved cell output.
{name: 'xxx' for name in keys}

{'CONSUMER_KEY': 'xxx',
 'CONSUMER_SECRET': 'xxx'}

In [6]:
# App-only authentication from the consumer key/secret pair.
# NOTE(review): in tweepy 3.x AppAuthHandler requests its bearer token while it
# is being constructed and raises if the credentials are rejected, so this is
# where a failure can actually be detected. The earlier `if not api:` test
# could never fail, because an API object is always truthy.
try:
    auth = tweepy.AppAuthHandler(keys['CONSUMER_KEY'],
                                 keys['CONSUMER_SECRET'])
except Exception:
    print ("Can't Authenticate :(")
    raise

# wait_on_rate_limit makes the client sleep until the rate-limit window resets
# instead of raising; wait_on_rate_limit_notify prints a notice when it sleeps.
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

print('Authenticated successfully!')

Authenticated successfully!


## User account names we want to get tweets for

In [7]:
# Screen names of the accounts whose tweets, friends and followers we collect.
users = [
    'nyctaxi',
    'NYTWA',
    'NYC_TMODA',
    'YellowCabNYC',
    'NYC_DOT',
    'NYCTSubway',
]

## Some functions to help us

In [8]:
def get_user_tweets(name, api):
    """Collect the tweets returned for a user's timeline.

    Parameters
    ----------
    name : str
        Screen name of the account.
    api : tweepy.API
        Client whose ``user_timeline`` endpoint is paged through with
        ``count=200``.

    Returns
    -------
    list of dict
        Raw JSON payload (``status._json``) of each tweet, in the order the
        cursor yielded them. If a request fails part-way through, the tweets
        gathered so far are returned and the error is printed.
    """
    tweets = []
    try:
        cursor = tweepy.Cursor(api.user_timeline, screen_name=name, count=200)
        for status in cursor.items():
            try:
                tweets.append(status._json)
            except AttributeError:
                # Item carries no raw payload: skip it and keep going.
                continue
    except Exception as err:
        # Stay best-effort so one failing account does not abort a crawl over
        # many users, but report what went wrong. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt through, so a long
        # rate-limit sleep can still be interrupted.
        print("Stopped early while getting tweets for", name, "-", repr(err))
    return tweets

def get_user_followers(name, api, limit=5000):
    """Collect the profiles of the accounts that follow a user, up to a cap.

    Parameters
    ----------
    name : str
        Screen name of the account whose followers are wanted.
    api : tweepy.API
        Client whose ``followers`` endpoint is paged through with
        ``count=200``.
    limit : int or None, optional
        Stop once this many profiles have been collected. Defaults to 5000,
        the cut-off that was previously hard-coded. ``None`` removes the cap;
        zero or a negative number returns an empty list without any request.

    Returns
    -------
    list of dict
        Raw JSON payload (``user._json``) of each follower. If a request fails
        part-way through, the profiles gathered so far are returned and the
        error is printed.
    """
    followers = []
    if limit is not None and limit <= 0:
        return followers
    try:
        cursor = tweepy.Cursor(api.followers, screen_name=name, count=200)
        for user in cursor.items():
            try:
                followers.append(user._json)
            except AttributeError:
                # Item carries no raw payload: skip it and keep going.
                continue
            if limit is not None and len(followers) >= limit:
                print(name, "has more than {0:,} followers. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # Best-effort: keep what was collected and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long rate-limit sleep can still be interrupted.
        print("Stopped early while getting followers for", name, "-", repr(err))
    return followers

def get_user_followers_ids(name, api, limit=5000):
    """Collect the IDs of the accounts that follow a user, up to a cap.

    Parameters
    ----------
    name : str
        Screen name of the account whose follower IDs are wanted.
    api : tweepy.API
        Client whose ``followers_ids`` endpoint is paged through with
        ``count=5000``.
    limit : int or None, optional
        Stop once this many IDs have been collected. Defaults to 5000, the
        cut-off that was previously hard-coded. ``None`` removes the cap;
        zero or a negative number returns an empty list without any request.

    Returns
    -------
    list
        The items yielded by the cursor, appended unchanged. If a request
        fails part-way through, the IDs gathered so far are returned and the
        error is printed.
    """
    followers = []
    if limit is not None and limit <= 0:
        return followers
    try:
        cursor = tweepy.Cursor(api.followers_ids, screen_name=name, count=5000)
        for follower_id in cursor.items():
            followers.append(follower_id)
            if limit is not None and len(followers) >= limit:
                print(name, "has more than {0:,} followers. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # Best-effort: keep what was collected and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long rate-limit sleep can still be interrupted.
        print("Stopped early while getting follower IDs for", name, "-", repr(err))
    return followers


def get_user_friends(name, api, limit=5000):
    """Collect the profiles of a user's friends, up to a cap.

    Parameters
    ----------
    name : str
        Screen name of the account whose friends are wanted.
    api : tweepy.API
        Client whose ``friends`` endpoint is paged through with ``count=200``.
    limit : int or None, optional
        Stop once this many profiles have been collected. Defaults to 5000,
        the cut-off that was previously hard-coded. ``None`` removes the cap;
        zero or a negative number returns an empty list without any request.

    Returns
    -------
    list of dict
        Raw JSON payload (``user._json``) of each friend. If a request fails
        part-way through, the profiles gathered so far are returned and the
        error is printed.
    """
    friends = []
    if limit is not None and limit <= 0:
        return friends
    try:
        cursor = tweepy.Cursor(api.friends, screen_name=name, count=200)
        for user in cursor.items():
            try:
                friends.append(user._json)
            except AttributeError:
                # Item carries no raw payload: skip it and keep going.
                continue
            if limit is not None and len(friends) >= limit:
                print(name, "has more than {0:,} friends. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # Best-effort: keep what was collected and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long rate-limit sleep can still be interrupted.
        print("Stopped early while getting friends for", name, "-", repr(err))
    return friends

def get_user_friends_ids(name, api, limit=5000):
    """Collect the IDs of a user's friends, up to a cap.

    Parameters
    ----------
    name : str
        Screen name of the account whose friend IDs are wanted.
    api : tweepy.API
        Client whose ``friends_ids`` endpoint is paged through with
        ``count=5000``.
    limit : int or None, optional
        Stop once this many IDs have been collected. Defaults to 5000, the
        cut-off that was previously hard-coded. ``None`` removes the cap;
        zero or a negative number returns an empty list without any request.

    Returns
    -------
    list
        The items yielded by the cursor, appended unchanged. If a request
        fails part-way through, the IDs gathered so far are returned and the
        error is printed.
    """
    friends = []
    if limit is not None and limit <= 0:
        return friends
    try:
        cursor = tweepy.Cursor(api.friends_ids, screen_name=name, count=5000)
        for friend_id in cursor.items():
            friends.append(friend_id)
            if limit is not None and len(friends) >= limit:
                print(name, "has more than {0:,} friends. Stopping at {0:,}.".format(limit))
                break
    except Exception as err:
        # Best-effort: keep what was collected and report the failure.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long rate-limit sleep can still be interrupted.
        print("Stopped early while getting friend IDs for", name, "-", repr(err))
    return friends

In [9]:
# Crawl each account's timeline and pool every raw tweet in one flat list.
all_tweets = []

for u in users:
    print("Getting tweets for user", u)
    tweets = get_user_tweets(u, api)
    all_tweets += tweets  # in-place extend of the pooled list
    print("Found", len(tweets), 'tweets.')

print('Done!')

Getting tweets for user nyctaxi
Found 3195 tweets.
Getting tweets for user NYTWA
Found 3181 tweets.
Getting tweets for user NYC_TMODA
Found 30 tweets.
Getting tweets for user YellowCabNYC
Found 349 tweets.
Getting tweets for user NYC_DOT
Found 3216 tweets.
Getting tweets for user NYCTSubway
Found 3218 tweets.
Done!


## What does the data look like?

In [10]:
# Display the first collected tweet to see which fields a raw record carries.
all_tweets[0]

{'created_at': 'Fri Apr 13 16:57:57 +0000 2018',
 'id': 984838110532993024,
 'id_str': '984838110532993024',
 'text': 'RT @NYLawSchool: Meera Joshi refers to @nyctaxi ’s work with Families for Safe Streets, Transportation Alternatives, and the New York City…',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'NYLawSchool',
    'name': 'New York Law School',
    'id': 42979354,
    'id_str': '42979354',
    'indices': [3, 15]},
   {'screen_name': 'nyctaxi',
    'name': 'NYC TLC',
    'id': 94163495,
    'id_str': '94163495',
    'indices': [39, 47]}],
  'urls': []},
 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 94163495,
  'id_str': '94163495',
  'name': 'NYC TLC',
  'screen_name': 'nyctaxi',
  'location': 'New York, NY

## Save our data

In [11]:
# Persist the raw tweets as one JSON object per line.
with open('tweets_raw.json', 'w') as out_file:
    for t in all_tweets:
        out_file.write(json.dumps(t) + '\n')

## Convert it to pandas

In [12]:
# Build a table with one row per tweet and preview the first five rows.
df = pd.DataFrame(data=all_tweets)
df.head(5)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,,,Fri Apr 13 16:57:57 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,984838110532993024,984838110532993024,...,,,,2,False,{'created_at': 'Fri Apr 13 13:17:22 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @NYLawSchool: Meera Joshi refers to @nyctax...,False,"{'id': 94163495, 'id_str': '94163495', 'name':..."
1,,,Thu Apr 12 14:39:25 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",,1,False,,984440857825763328,984440857825763328,...,,,,0,False,,"<a href=""http://twitter.com/download/iphone"" r...","@sarah_stern @Uber Hi Ms. Stern, do you have t...",False,"{'id': 94163495, 'id_str': '94163495', 'name':..."
2,,,Thu Apr 12 14:36:39 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,984440162619936771,984440162619936771,...,,,,1,False,{'created_at': 'Thu Apr 12 13:53:27 +0000 2018...,"<a href=""http://twitter.com/download/iphone"" r...","RT @NYLawSchool: Tomorrow—Join us for coffee, ...",False,"{'id': 94163495, 'id_str': '94163495', 'name':..."
3,,,Mon Apr 09 17:43:50 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,983400107155566592,983400107155566592,...,,,,8,False,{'created_at': 'Mon Apr 09 16:31:06 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @NYCVotes: Our democracy is powered by acti...,False,"{'id': 94163495, 'id_str': '94163495', 'name':..."
4,,,Mon Apr 09 15:36:19 +0000 2018,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,983368016061784064,983368016061784064,...,,,,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",@djcoreycraig @Uber @lyft @juno Thank you very...,False,"{'id': 94163495, 'id_str': '94163495', 'name':..."


## Friends

In [13]:
# Fetch the friend profiles of each account and pool them in one flat list.
all_friends = []

for u in users:
    print("Getting friends for user", u)
    friends = get_user_friends(u, api)
    all_friends += friends  # in-place extend of the pooled list
    print("Found", len(friends), 'friends.')

print('Done!')

Getting friends for user nyctaxi
Found 660 friends.
Getting friends for user NYTWA
Found 405 friends.
Getting friends for user NYC_TMODA
Found 70 friends.
Getting friends for user YellowCabNYC
Found 1782 friends.
Getting friends for user NYC_DOT
Found 874 friends.
Getting friends for user NYCTSubway
Found 294 friends.
Done!


In [14]:
# Display the first collected friend profile to see which fields it carries.
all_friends[0]

{'id': 333231586,
 'id_str': '333231586',
 'name': 'Steve D Luddite',
 'screen_name': 'mrloopy52',
 'location': 'London SE12',
 'description': "Ex London Fireman, Taxi Driver since 1991. Sack Chapman. Crush touts' cars and rickshaws. GB. Proud Luddite #52%",
 'url': None,
 'entities': {'description': {'urls': []}},
 'protected': False,
 'followers_count': 1267,
 'friends_count': 420,
 'listed_count': 62,
 'created_at': 'Mon Jul 11 06:10:49 +0000 2011',
 'favourites_count': 922,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': False,
 'statuses_count': 45218,
 'lang': 'en',
 'status': {'created_at': 'Sun Apr 15 18:25:19 +0000 2018',
  'id': 985584874114469893,
  'id_str': '985584874114469893',
  'text': 'RT @steve_garelick: So I have been advised by a reliable source that Uber are about to message users about the Local elections. I am concer…',
  'truncated': False,
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [{'screen_name': 'steve_ga

In [15]:
# Create the output folder first, so the write cannot fail with
# FileNotFoundError after a long, rate-limited crawl.
os.makedirs('data', exist_ok=True)

# Persist the raw friend profiles as one JSON object per line.
with open('data/friends_raw.json', 'w') as out_file:
    for t in all_friends:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers

In [None]:
# Fetch the follower profiles of each account and pool them in one flat list.
all_followers = []

for u in users:
    print("Getting followers for user", u)
    followers = get_user_followers(u, api)
    all_followers += followers  # in-place extend of the pooled list
    print("Found", len(followers), 'followers.')

print('Done!')

Getting followers for user nyctaxi
nyctaxi has more than 5,000 followers. Stopping at 5,000.
Found 5000 followers.
Getting followers for user NYTWA
Rate limit reached. Sleeping for: 847


In [None]:
# Display the first collected follower profile to see which fields it carries.
all_followers[0]

In [None]:
# Create the output folder first, so the write cannot fail with
# FileNotFoundError after a long, rate-limited crawl.
os.makedirs('data', exist_ok=True)

# Persist the raw follower profiles as one JSON object per line.
with open('data/followers_raw.json', 'w') as out_file:
    for t in all_followers:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers IDs only

In [None]:
# Fetch only the follower IDs of each account and pool them in one flat list.
all_followers_ids = []

for u in users:
    print("Getting follower IDs for user", u)
    followers = get_user_followers_ids(u, api)
    all_followers_ids += followers  # in-place extend of the pooled list
    print("Found", len(followers), 'followers.')

print('Done!')

In [None]:
# Display the first follower ID gathered in this section. The cell used to
# show all_followers[0], which is the full-profile list from the previous
# section rather than the ID list built here.
all_followers_ids[0]

In [None]:
# Create the output folder first, so the write cannot fail with
# FileNotFoundError after a long, rate-limited crawl.
os.makedirs('data', exist_ok=True)

# Persist the follower IDs, one JSON value per line.
with open('data/followers_ids_raw.json', 'w') as out_file:
    for t in all_followers_ids:
        json.dump(t, out_file)
        out_file.write('\n')