# Pulling Twitter Historical Data, Feb 2019

In [None]:
# Prints a fixed greeting that records the date this notebook was created.
print("Hello World! This was created on Feb 22, 2019.")

Here, progress shall be documented for work on integrating Twitter tweet counts with predictive gambling. Aim is to find:

1. Undervalued opportunities that will correct themselves
2. Near-certain swings that will occur
3. Calculate risk "theta decay" and negative risk options

As of 2/22/2019, working outlines of the following pieces are ready:

1. Market data pulling script "__Base3.py__"

Ability to save historical data to pickle.

2. Twitter Live count - "__Twitterlive.py__"

Notify when a new tweet has occurred. Implemented sound on Linux.

3. Twitter historical data pull "__Twitter RDT r1.py__"

Weird problem occurred today with pulling Realdonaldtrump handle, only doing 1/20/80 instead of full history.

## Documentation: Twitter RDT r1.py

Preliminaries:


In [None]:
from datetime import timedelta
import tweepy
import matplotlib.pyplot as plt
import numpy as np
import pickle

# Credentials: consumer_key, consumer_secret, access_token and
# access_token_secret belong to the account "APythonTesting1" and are NOT
# defined in this cell -- they must already exist in the session before it runs.

# Build the OAuth handler from the application keys, then attach the user token.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Module-level API client and target handle; htweet() reads both as globals.
api = tweepy.API(auth)
handle = "realDonaldTrump"
# NOTE(review): newer Tweepy releases may require get_user(screen_name=handle)
# as a keyword argument -- confirm against the installed version.
User = api.get_user(handle)

### Basic Save and Load functions

* Not implemented in a class! For classes make sure to add _self_.

In [None]:
def picklesave(l, filename = "Pickle.p"):
    """Serialize ``l`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file of the same
    name is overwritten.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(l, sink)
    
def pickleread(filename = "Pickle.p"):
    """Load and return the object stored in the pickle file ``filename``.

    The file is opened inside a ``with`` block so the handle is closed even if
    unpickling raises.

    NOTE: unpickling can execute arbitrary code. Only load files that were
    produced by this project (e.g. via ``picklesave``), never untrusted input.
    """
    with open(filename, "rb") as fp:
        return pickle.load(fp)

### Downloading tweets and extracting time from tweets

In [None]:
def htweet():
    """Download the timeline of the module-level ``handle`` via Tweepy.

    Uses the module-level ``api`` client and pages through the whole timeline
    with ``tweepy.Cursor``. The original note states Twitter returns at most
    about 3200 tweets for this endpoint.

    Returns:
        list: the Status objects yielded by the cursor, in the order received.
    """
    # ``screen_name`` is used rather than the generic ``id`` keyword, which
    # newer Tweepy releases no longer accept; older releases take both.
    cursor = tweepy.Cursor(api.user_timeline, screen_name=handle)
    return list(cursor.items())

def extractdt(l):
    """Return the ``created_at`` timestamp of every item in ``l``, shifted back 5 hours.

    The fixed 5-hour shift is meant to express the times in EST; it does not
    account for daylight saving time (presumably ``created_at`` is UTC --
    confirm against the Tweepy version in use).
    """
    est_shift = timedelta(hours=5)
    return [status.created_at - est_shift for status in l]

def pullplot():
    """Fetch the historical tweets and return their shifted timestamps, oldest first.

    Chains ``htweet`` (download) and ``extractdt`` (timestamp extraction) and
    sorts the resulting list in ascending order.
    """
    return sorted(extractdt(htweet()))

Note: For PST, change line to PST = obj - timedelta(hours=8)

__pullplot()__ returns a list of _datetime_ objects.

More methods: Specialized functions for datetime to return date only, time only, time and fraction of an hour.

### Extract time data from Datetime

In [None]:
def dateonly(l):
    """Return a list holding the ``date()`` part of every datetime in ``l``."""
    return [stamp.date() for stamp in l]
        
def timeonly(l):
    """Return a list holding the ``time()`` part of every datetime in ``l``."""
    return [stamp.time() for stamp in l]
    
def timeonlydot(l):
    """Convert every datetime in ``l`` to fractional hours of the day.

    Each value is ``hour + minute / 60``, so results lie in the range
    0.0 up to (but not including) 24. Seconds are ignored.
    Example: 16:32 -> 16.5333...
    """
    return [stamp.hour + stamp.minute / 60 for stamp in l]


### Separate Datetime objects by Date or Time

In [None]:
def separatebydate(l):
    """Count how many datetimes in ``l`` fall on each day of the week.

    Returns a 7-element list indexed by ``weekday()``: position 0 is Monday,
    position 6 is Sunday.
    """
    weekdays = [stamp.weekday() for stamp in l]
    return [weekdays.count(day) for day in range(7)]
    

In [None]:
def wednesdaycount(l):
    """Count tweets per Wednesday-12:00 to Wednesday-12:00 period.

    Args:
        l: list of datetime objects sorted in ascending order (as returned by
           ``pullplot``). Times are taken as-is; no timezone conversion is done.

    Returns:
        list[int]: one count per weekly period, oldest first. The leading
        partial period (everything before the first Wednesday 12:00 that
        follows the first tweet's day) is discarded. Weeks without tweets are
        reported as 0. The last entry is the current, still-open period.
        An empty input yields an empty list.
    """
    if not l:
        return []

    first = l[0]
    # Days until the next Wednesday (weekday 2), in the range 1..7: a series
    # that starts on a Wednesday is pushed to the following Wednesday so the
    # discarded leading period never starts mid-boundary.
    days_ahead = (2 - first.weekday() - 1) % 7 + 1
    boundary = (first + timedelta(days=days_ahead)).replace(
        hour=12, minute=0, second=0, microsecond=0)
    week = timedelta(days=7)

    container = []
    counter = 0
    for stamp in l:
        # Close every period that ended before this tweet. Using a loop keeps
        # empty weeks aligned, and the tweet itself is then counted in the
        # period it actually belongs to instead of being dropped.
        while stamp >= boundary:
            container.append(counter)
            counter = 0
            boundary += week
        counter += 1
    container.append(counter)

    del container[0]  # leading period is not a full week
    return container


In [None]:
def separatewednesday(l):
    """Split tweets into Wednesday-12:00 to Wednesday-12:00 buckets.

    Args:
        l: list of datetime objects sorted in ascending order (as returned by
           ``pullplot``). Times are taken as-is; no timezone conversion is done.

    Returns:
        list[list[datetime]]: one list per weekly period, oldest first. The
        leading partial period (everything before the first Wednesday 12:00
        that follows the first tweet's day) is discarded. Weeks without tweets
        appear as empty lists. The last bucket is the current, still-open
        period. An empty input yields an empty list.
    """
    if not l:
        return []

    first = l[0]
    # Days until the next Wednesday (weekday 2), in the range 1..7: data that
    # starts on a Wednesday (morning or afternoon) is pushed to the following
    # Wednesday, so the ambiguous first period is always the discarded one.
    days_ahead = (2 - first.weekday() - 1) % 7 + 1
    boundary = (first + timedelta(days=days_ahead)).replace(
        hour=12, minute=0, second=0, microsecond=0)
    week = timedelta(days=7)

    container = []
    bucket = []
    for stamp in l:
        # Close every period that ended before this tweet. Using a loop keeps
        # empty weeks aligned, and the tweet itself then lands in the period
        # it actually belongs to instead of being dropped.
        while stamp >= boundary:
            container.append(bucket)
            bucket = []
            boundary += week
        bucket.append(stamp)
    container.append(bucket)

    del container[0]  # leading period is not a full week
    return container

__separatewednesday__ is a breakthrough in data segregation. Now all the data is sorted into single week periods by Wednesdays.
@Realdonaldtrump markets are Wednesdays.

Note: container[0] is, in general, an unsaturated list. If data BEGINS on WED MORNING, those dates will be included in container[0] even though they are not part of that period.
If data BEGINS on WED AFTERNOON, then some may be left out. To prevent problems, that whole first week is discarded and bucketing only starts from the NEXT WEDNESDAY.

The very last container is the current counter. 2/22/19: Use twitter to confirm by mousing over the "tweet" count.

In [None]:
#returns
[ 
[datetime... datetime] # One week
...
[datetime... datetime] # Another week
]

__Generalized Form__

In [None]:
def separatebyweekday(l, weekday = 0):
    """Split tweets into week-long buckets that roll over at 12:00 on ``weekday``.

    Args:
        l: list of datetime objects sorted in ascending order (as returned by
           ``pullplot``). Times are taken as-is; no timezone conversion is done.
        weekday: day on which each period ends, 0 for Monday through 6 for
           Sunday (same numbering as ``datetime.weekday()``).

    Returns:
        list[list[datetime]]: one list per weekly period, oldest first. The
        leading partial period (everything before the first ``weekday`` 12:00
        that follows the first tweet's day) is discarded. Weeks without tweets
        appear as empty lists. The last bucket is the current, still-open
        period. An empty input yields an empty list.
    """
    if not l:
        return []

    first = l[0]
    # Days until the next occurrence of ``weekday``, in the range 1..7: if the
    # data starts on that very day, start from the next week for consistency.
    days_ahead = (weekday - first.weekday() - 1) % 7 + 1
    boundary = (first + timedelta(days=days_ahead)).replace(
        hour=12, minute=0, second=0, microsecond=0)
    week = timedelta(days=7)

    container = []
    bucket = []
    for stamp in l:
        # Close every period that ended before this tweet. Using a loop keeps
        # empty weeks aligned, and the tweet itself then lands in the period
        # it actually belongs to instead of being dropped.
        while stamp >= boundary:
            container.append(bucket)
            bucket = []
            boundary += week
        bucket.append(stamp)
    container.append(bucket)

    del container[0]  # leading period is not a full week
    return container


### Plot Results

Use on parsed data only. 

Specialized functions: plotit, plotit2, plotit3.
    
* plotit plots as a basic __step__ graph.
* plotit2 plots dots only.
* plotit3 plots with y-axis (Tweet count) scale = n, default 100.

In [None]:
def plotit(list):
    """Draw a step graph of cumulative tweet count against time.

    ``list`` is sorted in place, then plotted on a new figure with the sorted
    values on the x-axis and their running index (0 .. n-1) on the y-axis.
    Every second x tick label is hidden.
    """
    list.sort()  # plotting assumes ascending order
    running_index = np.arange(len(list))
    values = np.asarray(list)
    _, axes = plt.subplots()
    plt.xlabel("Time (Wednesday 12pm to Wednesday 12pm)")
    plt.ylabel("Tweet count")
    plt.step(values, running_index)
    for tick_label in axes.get_xticklabels()[0::2]:
        tick_label.set_visible(False)

def plotit2(list):
    """Draw cumulative tweet count against time as blue dots only.

    ``list`` is sorted in place, then plotted on a new figure with the sorted
    values on the x-axis and their running index (0 .. n-1) on the y-axis.
    Every second x tick label is hidden.
    """
    list.sort()  # plotting assumes ascending order
    running_index = np.arange(len(list))
    values = np.asarray(list)
    _, axes = plt.subplots()
    plt.xlabel("Time (Wed to Wed 12pm)")
    plt.ylabel("Tweet count")
    plt.plot(values, running_index, 'b.')
    for tick_label in axes.get_xticklabels()[0::2]:
        tick_label.set_visible(False)
        
        

def plotit3(list, n = 100):
    """Draw cumulative tweet count as blue dots with a fixed y-axis top.

    ``list`` is sorted in place, then plotted on a new figure with the sorted
    values on the x-axis and their running index (0 .. n-1) on the y-axis.
    The upper y-limit is set to ``n`` (default 100) before plotting, and every
    second x tick label is hidden.
    """
    list.sort()  # plotting assumes ascending order
    running_index = np.arange(len(list))
    values = np.asarray(list)
    _, axes = plt.subplots()
    plt.ylim(top=n)

    plt.xlabel("Time (Wed to Wed 12pm)")
    plt.ylabel("Tweet count")
    plt.plot(values, running_index, 'b.')
    for tick_label in axes.get_xticklabels()[0::2]:
        tick_label.set_visible(False)

And now...

In [None]:
def start(l):
    """Plot, save and display every bucket of an already-separated list.

    For bucket number ``i`` the plot is drawn with ``plotit``, written to
    "Plot <i>.svg" in the working directory, shown on screen, and then the
    function waits for the user to press Enter before continuing.
    """
    for position, bucket in enumerate(l):
        plotit(bucket)
        target = f"Plot {position}.svg"
        plt.savefig(target, format='svg', dpi=1200)
        plt.show()
        input(f"This is plot number {position}.")


__start__ plots and saves each into directory, showing output for each.

Extraneous Functions:

In [None]:
def barplot(input, bins = 23):
    """Draw a histogram of ``input`` on the current matplotlib figure.

    input: values to bin; intended to be hour-of-day values, so the histogram
        shows how often tweets occur at each time of day.
    bins: number of histogram bins passed to ``plt.hist`` (default 23).
        The original guidance is to use multiples of 24, minus 1.
    """
    plt.hist(input, bins = bins)

In [None]:
def deltatime(data):
    """Return the gaps, in seconds, between consecutive datetimes in ``data``.

    Element ``k`` of the result is ``(data[k + 1] - data[k]).total_seconds()``,
    so the result has one element fewer than ``data`` (empty for fewer than
    two entries).
    """
    return [
        (later - earlier).total_seconds()
        for earlier, later in zip(data, data[1:])
    ]

And that concludes the current progress on the Twitter Pull.
