# Data Acquisition

## Introduction

The two datasets we will be dealing with are the following:

1. __Trump's Tweets__: Starting from late 2016 till April 2018. 
2. __SPY's Historical Price__: The movements of the SPY within the same time period.  

## Common Libraries

In [69]:
# For Trump's Tweets
import re
import tweepy
import csv
from nltk.sentiment.vader import SentimentIntensityAnalyzer    

# For SPY Data
from datetime import datetime
from pandas import DataFrame
import pandas_datareader.data as dr

## 1. Trump's Tweets

### User Credentials from Twitter

In [94]:
# Variables that contains the user credentials to access Twitter API 
ACCESS_TOKEN        = 'Nothing'
ACCESS_TOKEN_SECRET = 'to'
CONSUMER_KEY        = 'see'
CONSUMER_SECRET     = 'here.'

### Authentication 

In [95]:
def auth(): 
    oauth = tweepy.OAuthHandler( CONSUMER_KEY, CONSUMER_SECRET )
    oauth.set_access_token( ACCESS_TOKEN, ACCESS_TOKEN_SECRET ) 
    return oauth

### Getting All Tweets for Trump

In [99]:
def get_all_tweets( screen_name ):
    api = tweepy.API( auth() )
    all_tweets = []

    # Get the 200 Most Recent Tweets.
    new_tweets = api.user_timeline( screen_name = screen_name, 
                                    count       = 200 )
    all_tweets.extend( new_tweets )
    
    # Save the id of the oldest tweet less one
    oldest_tweet_id = all_tweets[ -1 ].id - 1

    # Let's get the most recent 4000 tweets. 
    while len( new_tweets ) > 0:
        
        # All Subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline( screen_name = screen_name, 
                                        count       = 200,
                                        max_id      = oldest_tweet_id, 
                                        tweet_mode  = 'extended' ) 
        
        all_tweets.extend( new_tweets )
        
        # Update the id of the oldest tweet less one
        oldest_tweet_id = all_tweets[ -1 ].id - 1
        
        print( f'{ len(all_tweets)} tweets downloaded so far.') 
        
    return all_tweets

donald_tweets = get_all_tweets( 'realDonaldTrump' )

400 tweets downloaded so far.
600 tweets downloaded so far.
800 tweets downloaded so far.
997 tweets downloaded so far.
1197 tweets downloaded so far.
1397 tweets downloaded so far.
1596 tweets downloaded so far.
1796 tweets downloaded so far.
1996 tweets downloaded so far.
2196 tweets downloaded so far.
2396 tweets downloaded so far.
2595 tweets downloaded so far.
2795 tweets downloaded so far.
2995 tweets downloaded so far.
3195 tweets downloaded so far.
3240 tweets downloaded so far.
3240 tweets downloaded so far.


In [101]:
import pprint

# Let's print the text of the first 5 tweets.
pprint.pprint( [ t.text  for t in donald_tweets[ : 5 ]] )

['Looks like OPEC is at it again. With record amounts of Oil all over the '
 'place, including the fully loaded ships at… https://t.co/l5MMjmtI14',
 'Nancy Pelosi is going absolutely crazy about the big Tax Cuts given to the '
 'American People by the Republicans...got… https://t.co/0REgmJNMqT',
 'So exciting! I have agreed to be the Commencement Speaker at our GREAT Naval '
 'Academy on May 25th in Annapolis, Mary… https://t.co/L9iZ6RS3ft',
 'So General Michael Flynn’s life can be totally destroyed while Shadey James '
 'Comey can Leak and Lie and make lots of… https://t.co/q1lyKyyeYI',
 'James Comey Memos just out and show clearly that there was NO COLLUSION and '
 'NO OBSTRUCTION. Also, he leaked classif… https://t.co/YfMYBrTkza']


### Tranforming the Tweets  

__Helper Functions__ 

In [107]:
sid = SentimentIntensityAnalyzer()

def analyze_sentiment( input_tweet ):
    return sid.polarity_scores( input_tweet )

def clean_tweet( input_tweet ):
    return ' '.join(re.sub( "(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                            " ", 
                            input_tweet ).split() )

def get_tweet_text( input_tweet ):
    text = ""

    if hasattr( input_tweet, 'full_text' ):
        return clean_tweet( input_tweet.full_text )

    elif hasattr( input_tweet, 'fulltext' ):
        return clean_tweet( input_tweet.fulltext )

    elif hasattr( input_tweet, 'text' ):
        return clean_tweet( input_tweet.text )

    else:
        return None

__Tranformation Logic__

In [110]:
def transform_tweets( input_tweets ):
    output_tweets = [] 
        
    for t in donald_tweets:
        compound_value = 0
        text           = get_tweet_text( t )
        
        if ( text == None ):
            print( f'No text found for: User: {t.user.name} Tweet @ {t.created_at}')
            continue
        else:
             compound_value = analyze_sentiment( text )[ 'compound' ] 
            
        output_tweets.append( [ t.created_at.date(), text, compound_value ])
    return DataFrame( data    = output_tweets, 
                      columns = [ "Created At", 
                                  "Cleaned Tweet", 
                                  "Sentiment Score" ])
            
transformed_tweets_df = transform_tweets( donald_tweets ) 
transformed_tweets_df.head()

Unnamed: 0,Created At,Cleaned Tweet,Sentiment Score
0,2018-04-20,Looks like OPEC is at it again With record amo...,0.3612
1,2018-04-20,Nancy Pelosi is going absolutely crazy about t...,-0.5984
2,2018-04-20,So exciting I have agreed to be the Commenceme...,0.8931
3,2018-04-20,So General Michael Flynn s life can be totally...,-0.7089
4,2018-04-20,James Comey Memos just out and show clearly th...,-0.6669


### Saving the Tweets

In [112]:
def write_tweets( transformed_tweets_df, file_path ):
    transformed_tweets_df.to_csv( file_path )
 
write_tweets( transformed_tweets_df = transformed_tweets_df, 
              file_path             = '../../data/realDonaldTrump_tweets.csv' )

## 2. SPY Movements

### Getting the Historical Data

In [113]:
def download_historical_prices_for_instrument( ticker ):
    try:
        now_time         = datetime.now()
        print(f"Getting historical data for: {ticker}")
        start_time       = datetime(now_time.year - 3, now_time.month , now_time.day)
        stock_df         = dr.DataReader( ticker,'iex', start_time, now_time)
        stock_df['Name'] = ticker
        return stock_df
    
    except Exception as e:
        print(f'Unable to get data for: {ticker} because of Error: {e} ')
        
spy_df = download_historical_prices_for_instrument( 'SPY' )
spy_df.head()

Getting historical data for: SPY
5y


Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-04-20,196.8031,197.9234,196.709,197.5468,92189481,SPY
2015-04-21,198.3187,198.4976,196.9726,197.3115,72559831,SPY
2015-04-22,197.6974,198.4882,196.6525,198.2811,78264616,SPY
2015-04-23,197.8292,199.5143,197.6974,198.78,102585942,SPY
2015-04-24,199.2507,199.5425,198.7329,199.2412,61327387,SPY
