# Twitter data collection

#### References

searchtweets API reference: https://twitterdev.github.io/search-tweets-python/  
Twitter API reference: https://developer.twitter.com/en/docs/tweets/search/api-reference/premium-search.html  
Twitter tweet object and dictionary: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

## Imports and credentials

In [2]:
# standard library
import datetime
import os
import re
import time

# general third-party imports
import numpy as np
import pandas as pd
from textblob import TextBlob

# Twitter premium search client
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

# plotting and visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# One credential set per premium search endpoint, both read from the same YAML
# key file; only the yaml_key selecting the entry differs.
# NOTE(review): env_overwrite=False presumably stops environment variables from
# replacing the values found in the file -- confirm against the searchtweets docs.
premium_search_args_30day = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_30day",
    env_overwrite=False,
)
premium_search_args_fullarchive = load_credentials(
    "~/.twitter_keys.yaml",
    yaml_key="search_tweets_premium_fullarchive",
    env_overwrite=False,
)

Grabbing bearer token from OAUTH
Grabbing bearer token from OAUTH


## Functions

In [3]:
def days_to_collect(start, end, frequency):
    '''
    Build the ordered list of window boundaries used for the tweet requests.

    Consecutive entries are meant to be used as (from_date, to_date) pairs.
    The first boundary lies one `frequency` step BEFORE midnight of `start`,
    and boundaries continue in `frequency`-hour steps up to, but not including,
    midnight of the day after `end`.

    start = start date, formatted 'YYYY-mm-dd'
    end = end date, formatted 'YYYY-mm-dd'
    frequency = number of hours to step by, as an int or a numeric string.
                For example frequency = 12 yields boundaries at midnight and noon

    returns a list of strings formatted 'YYYYmmddHHMM'
    raises ValueError if a date does not match 'YYYY-mm-dd' or frequency is not
    a positive whole number of hours
    '''
    step_hours = int(frequency)
    if step_hours <= 0:
        raise ValueError("frequency must be a positive number of hours")
    step = datetime.timedelta(hours=step_hours)

    # step back once so the first window ends at midnight of the start date
    first = datetime.datetime.strptime(start, '%Y-%m-%d') - step
    # exclusive upper bound: midnight following the end date
    stop = datetime.datetime.strptime(end, '%Y-%m-%d') + datetime.timedelta(days=1)

    # plain datetime arithmetic instead of pd.date_range(..., closed='left'):
    # the `closed` keyword and the 'H' alias are not accepted by every pandas release
    formatted_dates = []
    current = first
    while current < stop:
        formatted_dates.append(current.strftime('%Y%m%d%H%M'))
        current += step
    return formatted_dates

def collect_tweets(from_date, to_date, max_results, query="bitcoin", results_per_call=100, search_args=None):
    '''
    Collect up to `max_results` tweets matching `query` inside one time window.

    from_date = window start; callers in this notebook pass 'YYYYmmddHHMM' strings
    to_date = window end, same format
    max_results = total number of tweets to collect for the window
    query = search term placed in the rule payload (defaults to "bitcoin")
    results_per_call = tweets requested per API call (defaults to 100)
    search_args = credentials as returned by load_credentials; when None the
                  notebook-level `premium_search_args` is used

    returns whatever collect_results returns for the generated rule

    Notes kept from the original author:
    - maxResults is capped at 100 per call for sandbox accounts, yet
      max_results=500 is accepted without any extra paging work.
    - from_date is inclusive, to_date is non-inclusive.
    NOTE(review): the recorded output of this notebook suggests results come back
    newest-first, i.e. starting just before to_date -- confirm against the API docs.
    '''
    if search_args is None:
        # fall back to the credentials selected at notebook level
        search_args = premium_search_args
    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)
    # echo the payload so each request is visible in the cell output
    print(rule)
    return collect_results(rule, max_results=max_results, result_stream_args=search_args)

## Dates

In [21]:
# example dates, only shown to the user as a template for the expected input
example_start_date = '2018-10-11'
example_end_date = '2018-10-15'
interval = 12           # hours between consecutive window boundaries
results_per_call = 100  # used by the collection loop to estimate the number of requests
max_results = 500       # tweets collected per window

print("The intra-day hour interval is set to", interval, "Edit the code if desired to change this")
print("The number of tweets per interval is set to", max_results, "Edit the code if desired to change this")
print("please input two dates in the format below to collect dates\n\t", example_start_date, example_end_date, "\n")
user_dates = input("\t")
print()

# split() with no argument tolerates leading, trailing and repeated whitespace
date_parts = user_dates.split()
if len(date_parts) != 2:
    raise ValueError("expected exactly two dates separated by a space, e.g. "
                     + example_start_date + " " + example_end_date)
start_date, end_date = date_parts
test_dates = days_to_collect(start_date, end_date, str(interval))

# pick the endpoint from the age of the start date
# NOTE(review): the first requested window begins one interval before start_date,
# so a start date just under 30 days old may still reach past the 30-day limit -- confirm.
days_since_start = (datetime.datetime.now() - datetime.datetime.strptime(start_date, '%Y-%m-%d')).days
if days_since_start < 30:
    premium_search_args = premium_search_args_30day
    print("will use 30-day dev environment")
else:
    premium_search_args = premium_search_args_fullarchive
    print("will use full-archive dev environment")

print("\ntwitter recognized dates will be collected on the closed interval from", start_date, "to", end_date, "spaced in", str(interval), "hour intervals")

The intra-day hour interval is set to 12 Edit the code if desired to change this
The number of tweets per interval is set to 500 Edit the code if desired to change this
please input two dates in the format below to collect dates
	 2018-10-11 2018-10-15 

	2018-11-16 2018-11-30

will use 30-day dev environment

twitter recognized dates will be collected on the closed iterval from 2018-11-16 to 2018-11-30 spaced in 12 hour intervals


## Twitter call

In [22]:
# Throttle settings.
# Requests are limited to 30 per minute for sandbox, 60 for subscriptions
# Requests are limited to 10 per second
SHORT_PAUSE_EVERY = 10    # estimated requests between short pauses
LONG_PAUSE_EVERY = 20     # estimated requests between long pauses
SHORT_PAUSE_SECONDS = 2
LONG_PAUSE_SECONDS = 60

# a plain list with extend() avoids re-copying everything on each window,
# which np.append did; the cells below only iterate over and index the result
tweets = []
num_windows = len(test_dates) - 1
for i in range(num_windows):
    tweets.extend(collect_tweets(test_dates[i], test_dates[i + 1], max_results=max_results))

    # nothing left to request after the last window, so no need to wait
    if i == num_windows - 1:
        break

    # estimated number of requests issued before and after this window
    prev_calls = i * max_results // results_per_call
    num_calls = (i + 1) * max_results // results_per_call

    # pause whenever a threshold is crossed, not only when it is hit exactly,
    # so the throttle still fires when calls-per-window does not divide 10 or 20
    if num_calls // LONG_PAUSE_EVERY > prev_calls // LONG_PAUSE_EVERY:
        print("waiting 60 seconds")
        time.sleep(LONG_PAUSE_SECONDS)
    elif num_calls // SHORT_PAUSE_EVERY > prev_calls // SHORT_PAUSE_EVERY:
        print("waiting 2 seconds")
        time.sleep(SHORT_PAUSE_SECONDS)

{"query": "bitcoin", "maxResults": 100, "toDate": "201811160000", "fromDate": "201811151200"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201811161200", "fromDate": "201811160000"}
waiting 2 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201811170000", "fromDate": "201811161200"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201811171200", "fromDate": "201811170000"}
waiting 60 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201811180000", "fromDate": "201811171200"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201811181200", "fromDate": "201811180000"}
waiting 2 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201811190000", "fromDate": "201811181200"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201811191200", "fromDate": "201811190000"}
waiting 60 seconds
{"query": "bitcoin", "maxResults": 100, "toDate": "201811200000", "fromDate": "201811191200"}
{"query": "bitcoin", "maxResults": 100, "toDate": "201811201200", "fromDate": "2

In [23]:
# display the last collected tweet to inspect the raw tweet structure
tweets[-1]

{'created_at': 'Fri Nov 30 11:51:14 +0000 2018',
 'id': 1068472517948555264,
 'id_str': '1068472517948555264',
 'text': "RT @business: Bitcoin is on a downward spiral—and one economist says it's on its way to zero https://t.co/9TOh5j0t3P https://t.co/Z027PXsveV",
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 33049861,
  'id_str': '33049861',
  'name': 'ozawa',
  'screen_name': 'o_oz_za_a',
  'location': None,
  'url': None,
  'description': None,
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 211,
  'friends_count': 1355,
  'listed_count': 27,
  'favourites_count': 23,
  'statuses_count': 25863,
  'created_at': 'Sat Apr 18 23:26:56 +0000 2009',
  'utc_offset': None,
  'time_zone': None,
  'g

## To dataframe and csv

In [24]:
def to_df(tweets):
    '''
    Flatten an iterable of tweet dictionaries into a pandas DataFrame.

    tweets = iterable of dict-like tweets; each needs the keys 'text',
             'created_at', 'retweet_count', 'favorite_count' and a 'user' dict
             with 'name', 'screen_name', 'followers_count', 'friends_count'
             and 'verified'. The user's 'lang' is optional and becomes None
             when absent.

    returns a DataFrame with one row per tweet, in input order, and the columns
    listed below; an empty input gives an empty frame with the same columns
    raises KeyError if a required key is missing
    '''
    columns = ['tweets', 'date', 'user_name', 'user_screen_name', 'user_followers',
               'user_friends', 'user_verified', 'user_language', 'retweet_count', 'favorite_count']

    # collect all rows first and build the frame once; assigning S2.loc[i]
    # row by row re-allocates the frame on every insert
    records = []
    for tweet in tweets:
        user = tweet['user']
        records.append((tweet['text'],
                        tweet['created_at'],
                        user['name'],
                        user['screen_name'],
                        user['followers_count'],
                        user['friends_count'],
                        user['verified'],
                        # tolerate user objects without a language entry
                        user.get('lang'),
                        tweet['retweet_count'],
                        tweet['favorite_count']))
    return pd.DataFrame(records, columns=columns)

# flatten the collected tweets into the table that is previewed and saved below
S2 = to_df(tweets)

In [26]:
# preview the last rows of the table
S2.tail()

Unnamed: 0,tweets,date,user_name,user_screen_name,user_followers,user_friends,user_verified,user_language,retweet_count,favorite_count
14995,RT @business: Bitcoin is on a downward spiral—...,Fri Nov 30 11:51:15 +0000 2018,Mr Khan,evergreatkhan,387,4928,False,en-gb,0,0
14996,"Bitcoin Today: 2018 – ICOs, 2019 – ? https://t...",Fri Nov 30 11:51:15 +0000 2018,Bitcoin & Ethereum,bitcointrendnew,374,248,False,en,0,0
14997,Bitcoin – Here we go again https://t.co/3Faz8B...,Fri Nov 30 11:51:15 +0000 2018,Bitcoin & Ethereum,bitcointrendnew,374,248,False,en,0,0
14998,Precio actual #Bitcoin\n$BTC = U$S 4015.57\nht...,Fri Nov 30 11:51:14 +0000 2018,Cripto247,cripto247,2269,0,False,es,1,1
14999,RT @business: Bitcoin is on a downward spiral—...,Fri Nov 30 11:51:14 +0000 2018,ozawa,o_oz_za_a,211,1355,False,en,0,0


In [27]:
# save file to csv
# tweet text and metadata go to two separate files; both are written without
# the index, so rows correspond to each other by position only
S2_tweets = S2.loc[:, ['tweets']]
S2_meta = S2.drop(['tweets'], axis=1)

# create the output folder first: to_csv does not create missing directories
output_dir = 'complete_tweets'
os.makedirs(output_dir, exist_ok=True)

filename = output_dir + '/tweets_' + start_date + '_' + end_date
S2_tweets.to_csv(filename + '_Tweets.csv', index=False)
S2_meta.to_csv(filename + '_Metadata.csv', index=False)
print('saved files', filename + '_Tweets.csv', 'and', filename + '_Metadata.csv')

saved files complete_tweets/tweets_2018-11-16_2018-11-30_Tweets.csv and complete_tweets/tweets_2018-11-16_2018-11-30_Metadata.csv
