# Twitter data collection

#### References

searchtweets API reference: https://twitterdev.github.io/search-tweets-python/  
Twitter API reference: https://developer.twitter.com/en/docs/tweets/search/api-reference/premium-search.html  
Twitter Tweet object data dictionary: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

## Imports and credentials

In [2]:
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

# general imports
import numpy as np
import pandas as pd
from textblob import TextBlob
import re
import time
import datetime

# plotting and visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Both premium endpoints read their keys from the same YAML file; only the
# yaml_key section differs.  One of these two is selected later, once the
# requested start date is known.
premium_search_args_30day = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_30day",
    env_overwrite=False,
)
premium_search_args_fullarchive = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_fullarchive",
    env_overwrite=False,
)

Grabbing bearer token from OAUTH
Grabbing bearer token from OAUTH


## Functions

In [3]:
def days_to_collect(start, end, frequency):
    '''
    Return evenly spaced timestamps covering the requested days.

    The sequence begins ``frequency`` hours before midnight of ``start`` and
    advances in ``frequency``-hour steps up to, but not including, midnight
    of the day after ``end``. The extra leading timestamp gives the first
    requested timestamp a preceding boundary, so consecutive entries can be
    paired into (from, to) windows.

    start = start date, formatted 'YYYY-mm-DD'
    end = end date, formatted 'YYYY-mm-DD'
    frequency = number of hours to step by (int or numeric string).
                For example frequency = 12 yields midnight and noon each day

    Returns a list of strings formatted 'YYYYmmDDHHMM' in increasing order;
    the list is empty when ``end`` lies far enough before ``start``.

    Raises ValueError if a date does not match 'YYYY-mm-DD' or if frequency
    is not a positive whole number of hours.
    '''
    step_hours = int(frequency)
    if step_hours <= 0:
        # a zero or negative step would never reach the end boundary
        raise ValueError("frequency must be a positive number of hours")
    step = datetime.timedelta(hours=step_hours)

    # Plain datetime arithmetic keeps this independent of the pandas version
    # (the `closed=` argument of pd.date_range no longer exists in pandas 2).
    current = datetime.datetime.strptime(start, '%Y-%m-%d') - step
    # add one day so the whole of the end date is covered; the boundary
    # itself is excluded by the strict comparison below
    stop = datetime.datetime.strptime(end, '%Y-%m-%d') + datetime.timedelta(days=1)

    formatted_dates = []
    while current < stop:
        formatted_dates.append(current.strftime('%Y%m%d%H%M'))
        current += step
    return formatted_dates

def collect_tweets(from_date, to_date, max_results, query="bitcoin",
                   results_per_call=100, search_args=None):
    '''
    Run one premium search for `query` between two timestamps and return the tweets.

    from_date = start of the search window (inclusive)
    to_date = end of the search window (non-inclusive)
    max_results = upper bound on the number of tweets to collect for this window
    query = search rule text; defaults to "bitcoin"
    results_per_call = tweets requested per API call (sent as maxResults).
                       The original author noted that sandbox accounts are
                       capped at 100 per call.
    search_args = credentials dict as returned by load_credentials. When
                  omitted, the notebook-level `premium_search_args` is used,
                  so that variable must already be defined.

    Dates are passed straight to gen_rule_payload; the callers in this
    notebook use the 'YYYYmmDDHHMM' form.
    NOTE(review): results appear to come back newest first, i.e. starting
    just before to_date and working backwards towards from_date - confirm
    against the API reference.
    '''
    if search_args is None:
        # keep the original behaviour: fall back to the globally selected endpoint
        search_args = premium_search_args
    rule = gen_rule_payload(query, results_per_call=results_per_call,
                            from_date=from_date, to_date=to_date)
    # echo the payload so each request is visible in the cell output
    print(rule)
    return collect_results(rule, max_results=max_results,
                           result_stream_args=search_args)

## Dates

In [28]:
# Settings for this collection run
example_start_date = '2018-10-11'
example_end_date = '2018-10-15'
interval = 24            # hours between consecutive timestamps
results_per_call = 100   # tweets requested per API call
max_results = 100        # tweets collected per interval

print("The intra-day hour interval is set to", interval, "Edit the code if desired to change this")
print("The number of tweets per interval is set to", max_results, "Edit the code if desired to change this")
print("please input two dates in the format below to collect dates\n\t", example_start_date, example_end_date, "\n")
user_dates = input("\t")
print()

# split() without an argument tolerates leading/trailing whitespace and
# repeated spaces, which split(' ') turned into empty extra fields
date_parts = user_dates.split()
if len(date_parts) != 2:
    raise ValueError("expected exactly two dates separated by a space, e.g. "
                     + example_start_date + " " + example_end_date)
start_date, end_date = date_parts
# days_to_collect also validates the 'YYYY-mm-DD' format of both dates
test_dates = days_to_collect(start_date, end_date, str(interval))

# Pick the endpoint from the age of the start date: anything less than
# 30 days old goes to the 30-day environment, older to the full archive.
days_since_start = (datetime.datetime.now() - datetime.datetime.strptime(start_date, '%Y-%m-%d')).days
if days_since_start < 30:
    premium_search_args = premium_search_args_30day
    print("will use 30-day dev environment")
else:
    premium_search_args = premium_search_args_fullarchive
    print("will use full-archive dev environment")

print("\ntwitter recognized dates will be collected on the closed iterval from", start_date, "to", end_date, "spaced in", str(interval), "hour intervals")

The intra-day hour interval is set to 24 Edit the code if desired to change this
The number of tweets per interval is set to 100 Edit the code if desired to change this
please input two dates in the format below to collect dates
	 2018-10-11 2018-10-15 

	2017-09-15 2017-09-15

will use full-archive dev environment

twitter recognized dates will be collected on the closed iterval from 2017-09-15 to 2017-09-15 spaced in 24 hour intervals


In [29]:
# newest timestamp first, so the collection loop walks backwards in time
test_dates = test_dates[::-1]

In [30]:
# show the reversed timestamps that will bound the search windows
test_dates

['201709150000', '201709140000']

## Twitter call

In [31]:
tweets = []
# test_dates is reversed (newest first), e.g. 2018-10-31 -> 2018-10-30, so each
# consecutive pair is (window end, window start).
# collect_tweets requires forward order: collect_tweets(from, to, max_results=100)
for call_index, (window_end, window_start) in enumerate(zip(test_dates, test_dates[1:])):
    # extend keeps `tweets` a plain list and avoids re-copying everything
    # collected so far on every iteration, as np.append did
    tweets.extend(collect_tweets(window_start, window_end, max_results=max_results))

    # Requests are limited to 30 per minute for sandbox, 60 for subscriptions
    # Requests are limited to 10 per second
    num_calls = (call_index + 1) * max_results // results_per_call
    # pause after every 5th call, except on multiples of 20
    # NOTE(review): the reason for skipping multiples of 20 is not evident - confirm
    if num_calls % 5 == 0 and num_calls % 20 != 0:
        print("waiting 10 seconds")
        time.sleep(10)

{"query": "bitcoin", "maxResults": 100, "toDate": "201709150000", "fromDate": "201709140000"}


retrying request; current status code: 429
retrying request; current status code: 429
retrying request; current status code: 429
HTTP Error code: 429: Request exceeds account’s current package request limits. Please upgrade your package and retry or contact Twitter about enterprise access.
Rule payload: {'query': 'bitcoin', 'maxResults': 100, 'toDate': '201709150000', 'fromDate': '201709140000'}


HTTPError: 

In [24]:
# restore chronological order so the rows run in increasing days
tweets = list(tweets)[::-1]

## To dataframe and csv

In [25]:
def to_df(tweets):
    '''
    Build a pandas DataFrame with one row per tweet.

    tweets = iterable of tweet mappings; each must provide 'text',
             'created_at', 'retweet_count', 'favorite_count' and a nested
             'user' mapping with 'name', 'screen_name', 'followers_count',
             'friends_count', 'verified' and 'lang'

    Returns a DataFrame indexed 0..n-1 with the columns listed below, in
    that order; an empty input yields an empty frame with the same columns.
    Column dtypes are inferred by pandas from the collected values.

    Raises KeyError if a tweet lacks one of the required fields.
    '''
    columns = ['tweets', 'date', 'user_name', 'user_screen_name', 'user_followers',
               'user_friends', 'user_verified', 'user_language', 'retweet_count', 'favorite_count']

    # Gather all rows first and construct the frame once; growing a DataFrame
    # one .loc row at a time re-allocates on every insert.
    rows = [
        [
            tweet['text'],
            tweet['created_at'],
            tweet['user']['name'],
            tweet['user']['screen_name'],
            tweet['user']['followers_count'],
            tweet['user']['friends_count'],
            tweet['user']['verified'],
            tweet['user']['lang'],
            tweet['retweet_count'],
            tweet['favorite_count'],
        ]
        for tweet in tweets
    ]
    return pd.DataFrame(rows, columns=columns)

# convert the collected tweets into a DataFrame
S2 = to_df(tweets)

In [26]:
# inspect the last rows of the frame as a quick sanity check
S2.tail()

Unnamed: 0,tweets,date,user_name,user_screen_name,user_followers,user_friends,user_verified,user_language,retweet_count,favorite_count
495,Are you traveling and want to pay in bitcoin f...,Thu Aug 10 23:59:28 +0000 2017,Keith Wellborn,kwellbor,269,429,False,en,0,2
496,"Bitcoin Investor Loses $50000 as His Wallet, B...",Thu Aug 10 23:59:31 +0000 2017,CryptoNews,betbybitcoins,11552,12134,False,en,0,1
497,@bitcoin_unnyou そうでしたか。それは残念です(´･ω･｀)\n普段はなにを触...,Thu Aug 10 23:59:31 +0000 2017,たいや,taiya_556,1444,99,False,ja,0,0
498,RT @CNBCFastMoney: '@BKBrianKelly breaks down ...,Thu Aug 10 23:59:48 +0000 2017,cashdab,thecashdab,102,446,False,en,0,0
499,RT @coindesk: The latest Bitcoin Price Index i...,Thu Aug 10 23:59:58 +0000 2017,Jenny Xuan Nguyen,maria_xuan_2910,37,59,False,vi,0,0


In [27]:
# save file to csv
import os

# Alternative layout (disabled): write the tweet text and the metadata to two
# separate files instead of one combined csv.
# S2_tweets = S2.loc[:,['tweets']]
# S2_meta = S2.drop(['tweets'], axis=1)
#
# filename = 'complete_tweets/tweets_' + start_date + '_' + end_date
# S2_tweets.to_csv(filename + '_Tweets.csv', index=False)
# S2_meta.to_csv(filename + '_Metadata.csv', index=False)
# print('saved files', filename + '_Tweets.csv', 'and', filename + '_Metadata.csv')

filename = 'complete_tweets/tweets_' + start_date + '_' + end_date + '.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists before writing
os.makedirs(os.path.dirname(filename), exist_ok=True)
S2.to_csv(filename, index=False)
print('saved file', filename)

saved file complete_tweets/tweets_2017-08-07_2017-08-11.csv
