# Twitter data collection

#### References

searchtweets API reference: https://twitterdev.github.io/search-tweets-python/  
Twitter API reference: https://developer.twitter.com/en/docs/tweets/search/api-reference/premium-search.html  
Twitter tweet object and dictionary: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

## Imports and credentials

In [40]:
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

# general imports
import numpy as np
import pandas as pd
from textblob import TextBlob
import re
import time
import datetime

# plotting and visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Load the request arguments for both premium search endpoints from the same
# credentials file; each endpoint is stored under its own `yaml_key` entry.
# Which of the two is actually used is decided later, once the dates are known.
premium_search_args_30day = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_30day",
    env_overwrite=False,
)
premium_search_args_fullarchive = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_fullarchive",
    env_overwrite=False,
)

Grabbing bearer token from OAUTH
Grabbing bearer token from OAUTH


## Functions

In [41]:
def days_to_collect(start, end, frequency):
    '''
    Return the timestamps that bound each collection window, as strings.

    The list begins one `frequency` step before midnight of `start` (so the
    first pair of consecutive entries forms a window that ends at midnight of
    `start`) and continues in `frequency`-hour steps up to, but not including,
    midnight of the day after `end`.

    start = start date, formatted 'YYYY-MM-DD'
    end = end date, formatted 'YYYY-MM-DD'
    frequency = number of hours to step by, as an int or a numeric string.
        For example frequency = 12 yields two timestamps per day: at midnight
        and at noon

    Returns a list of 'YYYYmmddHHMM' strings in ascending order (empty when
    the end date is well before the start date).
    Raises ValueError if a date does not match 'YYYY-MM-DD' or if frequency is
    not a positive whole number of hours.
    '''
    step_hours = int(frequency)
    if step_hours <= 0:
        # a zero or negative step would never reach the upper bound
        raise ValueError("frequency must be a positive number of hours")
    step = datetime.timedelta(hours=step_hours)

    # begin one step early so the first window closes at midnight of `start`
    current = datetime.datetime.strptime(start, '%Y-%m-%d') - step
    # exclusive upper bound: midnight of the day after `end`
    stop = datetime.datetime.strptime(end, '%Y-%m-%d') + datetime.timedelta(days=1)

    # Plain datetime arithmetic is used instead of
    # pd.date_range(..., freq=frequency+'H', closed='left'): the `closed`
    # keyword no longer exists in pandas 2.x, and this loop yields the same
    # left-closed sequence on every pandas version.
    formatted_dates = []
    while current < stop:
        formatted_dates.append(current.strftime('%Y%m%d%H%M'))
        current += step
    return formatted_dates

def collect_tweets(from_date, to_date, max_results, query="bitcoin", results_per_call=100, search_args=None):
    '''
    Run one premium search for `query` between two timestamps and return the results.

    from_date = start of the window (inclusive); any date string accepted by
        gen_rule_payload. This notebook passes 'YYYYmmddHHMM'
    to_date = end of the window (non-inclusive), same format
    max_results = total number of tweets to collect for this window
    query = search rule text, defaults to "bitcoin"
    results_per_call = page size requested per API call, defaults to 100
        (maxResults is capped at 100 for sandbox accounts)
    search_args = credentials dict as returned by load_credentials; when left
        as None the notebook-level `premium_search_args` is used

    Returns whatever collect_results returns for the generated rule.

    In the recorded runs of this notebook the returned tweets are the most
    recent ones just before to_date, in descending time order.
    '''
    if search_args is None:
        # fall back to the endpoint selected in the "Dates" cell
        search_args = premium_search_args

    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)
    # echo the request payload so each call can be followed in the cell output
    print(rule)

    # NOTE(review): the original author observed that max_results above the
    # sandbox page size (e.g. 500) is accepted without extra work, presumably
    # because collect_results pages through results itself. Confirm.
    return collect_results(rule, max_results=max_results, result_stream_args=search_args)

## Dates

In [49]:
example_start_date = '2018-10-11'
example_end_date = '2018-10-15'
interval = 24           # hours between consecutive collection timestamps
results_per_call = 100  # page size assumed by the rate-limit bookkeeping in the collection loop
max_results = 100       # tweets to collect per interval

print("The intra-day hour interval is set to", interval, "Edit the code if desired to change this")
print("The number of tweets per interval is set to", max_results, "Edit the code if desired to change this")
print("please input two dates in the format below to collect dates\n\t", example_start_date, example_end_date, "\n")
user_dates = input("\t")
print()

# split() without an argument tolerates leading/trailing whitespace and
# repeated spaces or tabs; fail with a readable message when the input does
# not consist of exactly two tokens.
date_parts = user_dates.split()
if len(date_parts) != 2:
    raise ValueError("expected two dates separated by a space, for example: "
                     + example_start_date + " " + example_end_date)
start_date, end_date = date_parts
test_dates = days_to_collect(start_date, end_date, str(interval))

# The first request starts one interval before start_date (days_to_collect
# steps back by `interval` hours), so that is the timestamp that must fit in
# the 30-day window. The search API interprets request dates as UTC, hence the
# comparison against the current UTC time rather than local time.
earliest_requested = datetime.datetime.strptime(start_date, '%Y-%m-%d') - datetime.timedelta(hours=interval)
utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

if (utc_now - earliest_requested).days < 30:
    premium_search_args = premium_search_args_30day
    print("will use 30-day dev environment")
else:
    premium_search_args = premium_search_args_fullarchive
    print("will use full-archive dev environment")

print("\ntwitter recognized dates will be collected on the closed iterval from", start_date, "to", end_date, "spaced in", str(interval), "hour intervals")

The intra-day hour interval is set to 24 Edit the code if desired to change this
The number of tweets per interval is set to 100 Edit the code if desired to change this
please input two dates in the format below to collect dates
	 2018-10-11 2018-10-15 

	2018-01-15 2018-03-02

will use full-archive dev environment

twitter recognized dates will be collected on the closed iterval from 2018-01-15 to 2018-03-02 spaced in 24 hour intervals


## Twitter call

In [50]:
# Accumulate in a plain list: np.append copies the whole array on every call,
# which makes a long collection run quadratic in the number of tweets.
tweets = []
# Each pair of consecutive timestamps is one (from_date, to_date) search window.
for i, (window_start, window_end) in enumerate(zip(test_dates, test_dates[1:])):
    tweets.extend(collect_tweets(window_start, window_end, max_results=max_results))

    # Requests are limited to 30 per minute for sandbox, 60 for subscriptions
    # Requests are limited to 10 per second
    # Estimated number of API calls made so far, assuming every window needed
    # max_results // results_per_call pages.
    num_calls = (i + 1) * max_results // results_per_call
    # NOTE(review): when num_calls is an odd multiple of 10 both branches fire
    # (10 s followed by 60 s); `% 20` may have been meant as `% 10`. Behaviour
    # is kept as it was.
    if num_calls % 2 == 0 and num_calls % 20 != 0:
        print("waiting 10 seconds")
        time.sleep(10)
    if num_calls % 10 == 0:
        print("waiting 60 seconds")
        time.sleep(60)

# Later cells were written against a 1-D object array of tweet dicts, so
# convert once at the end.
tweets = np.array(tweets, dtype=object)

{"query": "bitcoin", "maxResults": 100, "toDate": "201801150000", "fromDate": "201801140000"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201801160000", "fromDate": "201801150000"}
waiting 10 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201801170000", "fromDate": "201801160000"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201801180000", "fromDate": "201801170000"}
waiting 10 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201801190000", "fromDate": "201801180000"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201801200000", "fromDate": "201801190000"}
waiting 10 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201801210000", "fromDate": "201801200000"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201801220000", "fromDate": "201801210000"}
waiting 10 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201801230000", "fromDate": "201801220000"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201801240000", "fromDate": 

In [6]:
# Peek at the first two collected tweets
tweets[:2]

array([{'created_at': 'Fri Aug 31 23:59:57 +0000 2018', 'id': 1035678609157976065, 'id_str': '1035678609157976065', 'text': 'Haha @Eminem dropped that new album and name dropped $btc in it. Legit #bitcoin #notalike', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 212643233, 'id_str': '212643233', 'name': 'TyrantMFSlayer', 'screen_name': 'TyrantMFSlayer', 'location': None, 'url': 'https://www.youtube.com/channel/UCr2z8VKsuFe5cD0XeFpyhgg', 'description': 'I game on PS4 pro PSN: TyrantMFSlayer   Please subscribe to my gaming YouTube! 👇🏽', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 553, 'friends_count': 270, 'listed_count': 14, 'favourites_count': 6206, 'statuses_count': 19551, 'created_at': 'Sat Nov 06 17:0

## To dataframe and csv

In [None]:
def to_df(tweets):
    '''
    Flatten an iterable of tweet dictionaries into a pandas DataFrame.

    tweets = iterable of dicts, each providing 'text', 'created_at',
        'retweet_count', 'favorite_count' and a nested 'user' dict with
        'name', 'screen_name', 'followers_count', 'friends_count' and
        'verified' (and optionally 'lang')

    Returns a DataFrame with one row per tweet, in input order, and the
    columns: tweets, date, user_name, user_screen_name, user_followers,
    user_friends, user_verified, user_language, retweet_count, favorite_count.
    An empty input gives an empty DataFrame with those columns.
    Raises KeyError if a tweet lacks one of the required keys.
    '''
    columns = ['tweets', 'date', 'user_name', 'user_screen_name', 'user_followers',
               'user_friends', 'user_verified', 'user_language', 'retweet_count', 'favorite_count']

    # Gather all rows first and build the frame once; growing a DataFrame one
    # .loc[i] row at a time reallocates on every insert and leaves every
    # column as object dtype.
    rows = []
    for tweet in tweets:
        user = tweet['user']
        rows.append([tweet['text'],
                     tweet['created_at'],
                     user['name'],
                     user['screen_name'],
                     user['followers_count'],
                     user['friends_count'],
                     user['verified'],
                     # .get so a user object without 'lang' yields a missing
                     # value instead of aborting the whole conversion
                     user.get('lang'),
                     tweet['retweet_count'],
                     tweet['favorite_count']])

    return pd.DataFrame(rows, columns=columns)

# Convert the collected tweets into the DataFrame that the following cells inspect and save
S2 = to_df(tweets)

In [47]:
# Show the last five rows as a quick sanity check
S2.tail(5)

Unnamed: 0,tweets,date,user_name,user_screen_name,user_followers,user_friends,user_verified,user_language,retweet_count,favorite_count
195,"RT @JWilliamsFstmed: The Winklevoss brothers, ...",Wed Apr 18 23:57:45 +0000 2018,John Lin,grasta_man,85,220,False,en,0,0
196,Is Bitcoin Still a Good Investment? https://t....,Wed Apr 18 23:57:45 +0000 2018,Galeriks ✍️,galeriks,2196,1152,False,en,0,0
197,RT @politico: A man suspected of stealing hund...,Wed Apr 18 23:57:43 +0000 2018,Nick,nick_forcier,400,403,False,en,0,0
198,RT @politico: A man suspected of stealing hund...,Wed Apr 18 23:57:43 +0000 2018,Nick,nick_forcier,400,403,False,en,0,0
199,"RT @TIPMayerMultple: Apr 18, 2018: The current...",Wed Apr 18 23:57:42 +0000 2018,Deep State,HempBarrel,170,850,False,en,0,0


In [48]:
# save file to csv
import os

# Alternative export, kept for reference: write the tweet text and the
# remaining metadata columns to two separate files.
#
# S2_tweets = S2.loc[:,['tweets']]
# S2_meta = S2.drop(['tweets'], axis=1)
#
# filename = 'complete_tweets/tweets_' + start_date + '_' + end_date
# S2_tweets.to_csv(filename + '_Tweets.csv', index=False)
# S2_meta.to_csv(filename + '_Metadata.csv', index=False)
# print('saved files', filename + '_Tweets.csv', 'and', filename + '_Metadata.csv')

filename = 'complete_tweets/tweets_' + start_date + '_' + end_date + '.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout has no complete_tweets/ yet)
os.makedirs(os.path.dirname(filename), exist_ok=True)
S2.to_csv(filename, index=False)
print('saved file', filename)

saved file complete_tweets/tweets_2018-04-18_2018-04-19.csv
