# This notebook is a test bed to check the functionality of the timeloop package and its efficacy for deployment of the Muskometer backend. 

## I'm going to need to have a background script running that scans for new tweets periodically and updates stock data periodically as well.  In addition, when new tweets are found, they need to be put through the anomaly detection alogrithm and the datasets used by the front end need to be updated.  The timeloop package seems to be a good way to do all this simultaneously within a single script.

In [1]:
import time
import timeloop
import datetime
from collections import defaultdict
import os, sys
import pandas as pd
import GetOldTweets3 as got
from timeloop import Timeloop
from datetime import timedelta

In [2]:
# This example code is shamelessly copied from :
# https://medium.com/greedygame-engineering/an-elegant-way-to-run-periodic-tasks-in-python-61b7c477b679
tl = Timeloop()

@tl.job(interval=timedelta(seconds=2))
def sample_job_every_2s():
    print ("2s job current time : {}".format(time.ctime()))
    
@tl.job(interval=timedelta(seconds=5))
def sample_job_every_5s():
    print ("5s job current time : {}".format(time.ctime()))
    
@tl.job(interval=timedelta(seconds=10))
def sample_job_every_10s():
    print ("10s job current time : {}".format(time.ctime()))
    
if __name__ == "__main__":
    tl.start(block=True)

[2020-06-28 13:36:28,636] [timeloop] [INFO] Starting Timeloop..
[2020-06-28 13:36:28,638] [timeloop] [INFO] Registered job <function sample_job_every_2s at 0x7fc4d7e15310>
[2020-06-28 13:36:28,639] [timeloop] [INFO] Registered job <function sample_job_every_5s at 0x7fc4d88040d0>
[2020-06-28 13:36:28,640] [timeloop] [INFO] Registered job <function sample_job_every_10s at 0x7fc4d88041f0>
[2020-06-28 13:36:28,640] [timeloop] [INFO] Timeloop now started. Jobs will run based on the interval set
2s job current time : Sun Jun 28 13:36:30 2020
2s job current time : Sun Jun 28 13:36:32 2020
5s job current time : Sun Jun 28 13:36:33 2020
2s job current time : Sun Jun 28 13:36:34 2020
2s job current time : Sun Jun 28 13:36:36 2020
10s job current time : Sun Jun 28 13:36:38 2020
5s job current time : Sun Jun 28 13:36:38 2020
2s job current time : Sun Jun 28 13:36:38 2020
2s job current time : Sun Jun 28 13:36:40 2020
2s job current time : Sun Jun 28 13:36:42 2020
5s job current time : Sun Jun 28 1

## Looks like that worked as advertised!

### Let's start building up the functionality of the backend and test it as we go.  First I'll load up the stored tweets from Elon Musk, scan for new tweets, and add them to the record.

In [45]:
def reload_tweet_data(path,username="elonmusk"):
   #note we'll have to do a .drop and set the 'Time' column to the proper values every time
    df = pd.read_csv(path+username+'_tweets.csv').drop(['Unnamed: 0'],axis='columns')
    #order by earliest first
    df['Time'] = pd.to_datetime(df['Time'])#.sort_values(by='Time',ascending=True)
    return df.sort_values(by='Time',ascending=True).reset_index().drop('index',axis='columns')
def prepend_new_tweets(df_new,df_old): #adds the new tweets to the front of the data set and resets the index
    result = pd.concat([df_old,df_new]).reset_index().drop('index',axis='columns')
    #makes sure we're sorted properly in time order
    result.sort_values(by='Time',ascending=True,inplace=True)
    return result.reset_index().drop('index',axis='columns')
def store_tweet_data(df,path,username="elonmusk"):
    df.to_csv(path+username+'_tweets.csv')
    return
def scan_for_new_tweets(path,username="elonmusk"):
    """Scans for new tweets ad adds them to the end
        of the old data frame"""
    df_old = reload_tweet_data(path,username) #get the old tweets
    #look for new tweets starting from latest date
    df_new = scrape_new_tweets(df_old['Time'].max(),username)
    if len(df_new) == 0:# No new tweets
        return df_old
    else:
        df_combined = prepend_new_tweets(df_new,df_old)
        return df_combined.drop_duplicates(subset = ['Time'])
def scrape_new_tweets(t_last_tweet,username = "elonmusk"):
    """Function to scrape the recent tweets of Elon Musk"""
    #t_last_tweet must be pandas Timestamp data
    os.makedirs('tweet_data', exist_ok=True)
    date_str = str(t_last_tweet.date().year)+"-"\
              +str(t_last_tweet.date().month)+"-"\
              +str(t_last_tweet.date().day)
    count = 0
    # Creation of query object                                                                                                                                                                                      
    tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                               .setMaxTweets(count)\
                                               .setSince(date_str)
    # Creation of list that contains all tweets                                                                                                                                                                     
    tweets = None
    for ntries in range(5):
        try:
            tweets = got.manager.TweetManager.getTweets(tweetCriteria)
        except SystemExit:
            print("Trying again in 15 minutes.")
            time.sleep(15*60)
        else:
            break
    if tweets is None:
        print("Failed after 5 tries, quitting!")
        exit(1)

    data = defaultdict(list)
    for t in tweets:
        data["username"].append(username)
        data["tweet_id"].append(t.id)
        data["reply_to"].append(t.to)
        data["date"].append(t.date)
        data["retweets"].append(t.retweets)
        data["favorites"].append(t.favorites)
        data["hashtags"].append(list(set(t.hashtags.split())))
        data["mentions"].append(t.mentions)
        data["text"].append(t.text)
        data["permalink"].append(t.permalink)
    if len(data) == 0: #no new tweets
        return None
    else:
        #make a DataFrame out of the scraped tweets
        df = pd.DataFrame(data, columns=["username","tweet_id",
                                         "reply_to","date","retweets",
                                         "favorites","hashtags","mentions",
                                         "text","permalink"])        
        # Convert 'Time' column to datetime and strip time information.
        df['Time'] = pd.to_datetime(df['date'])
        df.drop(labels=['date'],axis=1,inplace=True)
        return df.sort_values(by='Time',ascending=True)

In [37]:
elon_tweets_df = reload_tweet_data('../data/raw/',username="elonmusk")

In [5]:
store_tweet_data(elon_tweets_df,'../data/raw/',username="elonmusk")

In [40]:
elon_tweets_df.dtypes

username                  object
tweet_id                   int64
reply_to                  object
retweets                   int64
favorites                  int64
hashtags                  object
mentions                  object
text                      object
permalink                 object
Time         datetime64[ns, UTC]
dtype: object

### Okay, let's see if we can scrape the new tweets from June.

In [46]:
new_tweets_df = scrape_new_tweets(elon_tweets_df['Time'].max(),username = "elonmusk")

In [47]:
new_tweets_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
4,elonmusk,1276716075422367746,waitbutwhy,822,12936,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
3,elonmusk,1276738326804873217,slashdot,137,2280,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
2,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
1,elonmusk,1276957305842528256,BLKMDL3,325,15615,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00
0,elonmusk,1276959792876011520,GerberKawasaki,480,6735,[],,"Physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00


In [48]:
#if no new tweets, we should get back a None data type
test_tweets_df = scrape_new_tweets(new_tweets_df['Time'].max(),username = "elonmusk")

In [49]:
test_tweets_df.head()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
4,elonmusk,1276716075422367746,waitbutwhy,822,12936,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
3,elonmusk,1276738326804873217,slashdot,137,2280,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
2,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
1,elonmusk,1276957305842528256,BLKMDL3,325,15617,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00
0,elonmusk,1276959792876011520,GerberKawasaki,480,6735,[],,"Physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00


# Looks like the API only has date as it's finest time setting.

### To handle this, we'll have to eliminate duplicates after joining the new tweet data.

In [171]:
def scan_for_new_tweets(path,username="elonmusk"):
    """Scans for new tweets ad adds them to the end
        of the old data frame"""
    df_old = reload_tweet_data(path,username) #get the old tweets
    #look for new tweets starting from latest date
    df_new = scrape_new_tweets(df_old['Time'].max(),username)
    if len(df_new) == 0:# No new tweets
        return df_old,len(df_old),False
    else:
        df_combined = prepend_new_tweets(df_new,df_old)
        df_combined.drop_duplicates(subset = ['Time'],inplace=True)
    if len(df_old) == len(df_combined):
        # also no new tweets
        return df_old,len(df_old),False
    else:  
        return df_combined,len(df_old),True
        

In [172]:
check_me_df,lenold,newtweetflag = scan_for_new_tweets('../data/raw/',username="elonmusk")

In [173]:
check_me_df.iloc[9800:9810] #checks the old data vs. the join to the new data

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
9800,elonmusk,1267157474886455296,NASASpaceflight,708,14436,[],,Brought home by same person who placed it ther...,https://twitter.com/elonmusk/status/1267157474...,2020-05-31 18:14:19+00:00
9801,elonmusk,1267160409498357764,NASASpaceflight,81,2494,[],,Must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00
9802,elonmusk,1267180654896254976,SpaceX,22581,250519,[],,Nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00
9803,elonmusk,1267402337653587968,scale_banana,1598,68779,[],,Where’s the banana!?,https://twitter.com/elonmusk/status/1267402337...,2020-06-01 10:27:19+00:00
9804,elonmusk,1267409179339296768,DjKeyWay,1396,9923,['#JusticeForGeorge'],,Definitely not right that the other officers w...,https://twitter.com/elonmusk/status/1267409179...,2020-06-01 10:54:30+00:00
9805,elonmusk,1267415489111785472,mharrisonair,483,11700,[],,Well said,https://twitter.com/elonmusk/status/1267415489...,2020-06-01 11:19:34+00:00
9806,elonmusk,1267531196751323144,PPathole,2342,36211,[],,Starship is the key to making life multiplanet...,https://twitter.com/elonmusk/status/1267531196...,2020-06-01 18:59:21+00:00
9807,elonmusk,1267650659320500226,,32819,570849,[],,Off Twitter for a while,https://twitter.com/elonmusk/status/1267650659...,2020-06-02 02:54:03+00:00
9808,elonmusk,1268595216971206656,SciGuySpace,929,13348,[],,So many war stories over 18 eventful years! Bu...,https://twitter.com/elonmusk/status/1268595216...,2020-06-04 17:27:23+00:00
9809,elonmusk,1268601657501220864,PPathole,355,3540,[],,That’s when all life on Earth will be boiled o...,https://twitter.com/elonmusk/status/1268601657...,2020-06-04 17:52:59+00:00


In [174]:
lenold

10035

In [175]:
newtweetflag

True

In [177]:
check_me_df.tail(10)

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
10029,elonmusk,1276616756291747840,SamTalksTesla,87,3267,[],,Jay is awesome,https://twitter.com/elonmusk/status/1276616756...,2020-06-26 20:42:07+00:00
10030,elonmusk,1276716075422367746,waitbutwhy,823,12942,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
10032,elonmusk,1276738326804873217,slashdot,137,2281,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
10034,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
10036,elonmusk,1276957305842528256,BLKMDL3,325,15635,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00
10038,elonmusk,1276959792876011520,GerberKawasaki,494,6931,[],,"Physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00
10040,elonmusk,1277359833721655302,,22623,182453,[],,69 days after 4/20 again haha,https://twitter.com/elonmusk/status/1277359833...,2020-06-28 21:54:51+00:00
10041,elonmusk,1277382781308268545,neiltyson,542,8383,[],,Dogs rock,https://twitter.com/elonmusk/status/1277382781...,2020-06-28 23:26:02+00:00
10042,elonmusk,1277385215657177088,,295,3742,[],@jayleno,Great Model Y review by @jayleno,https://twitter.com/elonmusk/status/1277385215...,2020-06-28 23:35:42+00:00
10043,elonmusk,1277386215382110211,,190,2726,[],,"Btw, Tesla actually receives *least* subsidies...",https://twitter.com/elonmusk/status/1277386215...,2020-06-28 23:39:41+00:00


# Check that we can store and reload tweets successfully

In [97]:
pwd

'/Users/JJ/Insight/projects/Muskometer/Insight-Data-Science-Project/notebooks'

In [98]:
store_tweet_data(check_me_df,'../data/raw/',username="elonmusk")

In [188]:
elon_newandold_tweets_df = reload_tweet_data('../data/raw/',username="elonmusk")

In [190]:
len(elon_newandold_tweets_df.drop_duplicates(subset = ['Time']))

10035

In [100]:
elon_tweets_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
9801,elonmusk,1267146619562201090,SpaceX,5576,67423,[],@Space_Station,Congratulations Bob & Doug on docking & hatch ...,https://twitter.com/elonmusk/status/1267146619...,2020-05-31 17:31:11+00:00
9802,elonmusk,1267156817295085575,Rogozin,1209,7558,[],,"Спасибо, сэр, ха-ха. Мы рассчитываем на взаимо...",https://twitter.com/elonmusk/status/1267156817...,2020-05-31 18:11:42+00:00
9803,elonmusk,1267157474886455296,NASASpaceflight,708,14436,[],,Brought home by same person who placed it ther...,https://twitter.com/elonmusk/status/1267157474...,2020-05-31 18:14:19+00:00
9804,elonmusk,1267160409498357764,NASASpaceflight,81,2494,[],,Must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00
9805,elonmusk,1267180654896254976,SpaceX,22581,250519,[],,Nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00


In [106]:
elon_newandold_tweets_df[9801:]

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
9801,elonmusk,1267160409498357764,NASASpaceflight,81,2494,[],,Must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00
9802,elonmusk,1267180654896254976,SpaceX,22581,250519,[],,Nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00
9803,elonmusk,1267402337653587968,scale_banana,1598,68779,[],,Where’s the banana!?,https://twitter.com/elonmusk/status/1267402337...,2020-06-01 10:27:19+00:00
9804,elonmusk,1267409179339296768,DjKeyWay,1396,9923,['#JusticeForGeorge'],,Definitely not right that the other officers w...,https://twitter.com/elonmusk/status/1267409179...,2020-06-01 10:54:30+00:00
9805,elonmusk,1267415489111785472,mharrisonair,483,11700,[],,Well said,https://twitter.com/elonmusk/status/1267415489...,2020-06-01 11:19:34+00:00
...,...,...,...,...,...,...,...,...,...,...
10030,elonmusk,1276716075422367746,waitbutwhy,823,12942,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
10031,elonmusk,1276738326804873217,slashdot,137,2281,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
10032,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
10033,elonmusk,1276957305842528256,BLKMDL3,325,15635,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00


# Got it!

# Now we need to check the functionality constructing new tweet features

### load up the stock data scraper library and the stock to tweet library and get to work

In [107]:
import nltk
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
import re

In [178]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"RT", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),:;!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.lower()
    df = df[df[text_field].notna()]
    return df

def apply_vader(tweet,category): #gives back the float value of the vader sentiment
    return sid.polarity_scores(tweet)[category]

def tokenize_hashtags(df): #tokenize hashtags, using one hot encoding
    df['hashtags_token'] = 0. #initialize all to zero
    df['hashtags_token'].loc[df['hashtags'] != '[]'] = 1. #any field with a hashtag set to 1.
    return df

def tokenize_mentions(df): #tokenize mentions, using one hot encoding
    df['mentions_token'] = 0. #initialize all to zero
    df['mentions_token'].loc[df['mentions'].notna()] = 1. #any field with a mention set to 1.
    return df

def tokenize_reply_to(df): #tokenize mentions, using one hot encoding
    df['reply_to_token'] = 0. #initialize all to zero
    df['reply_to_token'].loc[df['reply_to'].notna()] = 1. #any field with a reply_to set to 1.
    return df

def convert_hashtag(input_txt):
    if input_txt == '[]': #return empty string if no hashtag
        return ""
    input_list = input_txt.strip("['']").split("', '") #strips out useless characters
    txt_list = re.findall('[A-Z][^A-Z]*', " ".join(input_list)) #splits hastags into words on Captial letters
    return " ".join(txt_list)

def tweet_word_count(df):
    tokenizer = RegexpTokenizer(r'\w+') #split on words
    df["tokens"] = df["tweet"].apply(tokenizer.tokenize) #returns list of individual words
    df['tweet_length'] = df.apply(lambda row : len(row['tokens']), axis=1) #creates tweet length column
    df = df.drop(['tokens'],axis='columns') #drops the temporary column
    return df   

def integral_history(df,category,length):
    #the depth back in tweet history
    result = df[category]
    
    
def construct_features(tweets):
    """Constructs features from Elon's tweet data"""
    #generate the sentiment intensity analyzer instance
    try:
        sid = SentimentIntensityAnalyzer() #returns error if no lexicon
    except:
        nltk.download('vader_lexicon') #get the bloody lexicon
    # Clean the text of the tweets
    tweets = standardize_text(tweets,"text")
    # Tokenize the hashtags
    tweets = tokenize_hashtags(tweets)
    # Tokenize the mentions
    tweets = tokenize_mentions(tweets)
    # Tokenize the reply_to
    tweets = tokenize_reply_to(tweets)
    # Clean the text of the hastags
    tweets["hashtags"] = tweets.apply(lambda row: convert_hashtag(row['hashtags']),axis=1)
    # Prepare to apply vader to the tweets
    vader_categories = ['neg','neu','pos','compound']
    # Apply vader to the tweets
    for cat in vader_categories: #iterates over the categories
        #creates new feature each iteration
        tweets['text_'+cat] = tweets.apply(lambda row : apply_vader(row['text'],cat), axis=1)
    # Apply vader to the hashtags
    for cat in vader_categories: #iterates over the categories
        #creates new feature each iteration
        tweets['hashtags_'+cat] = tweets.apply(lambda row : apply_vader(row['hashtags'],cat), axis=1)
    #Do some temporal processing
    #Hour of the day
    tweets['hour'] = tweets['Time'].dt.hour
    #Time between tweets in seconds
    tweets['delta_time'] = abs(pd.to_timedelta((tweets['Time']-tweets['Time']\
                                                     .shift()).fillna(6000.)).astype('timedelta64[s]'))\
                                                     .replace(0.,6000.)
    tweets['log10_delta_time'] = np.log10(abs(pd.to_timedelta((tweets['Time']-tweets['Time']\
                                                     .shift()).fillna(60.)).astype('timedelta64[s]')\
                                                     .replace(0.,6000.)))
    #Make some rate of sentiment change features
    tweets['dcompound_dTime'] = (tweets['text_compound']-tweets['text_compound']
                                           .shift()).fillna(0.)/(tweets['delta_time']) #change per second
    tweets['dcompound_dTweet'] = (tweets['text_compound']-tweets['text_compound']
                                            .shift()).fillna(0.) #change per tweet
    #Make some integral sentiment change features
    tweets['integral_compound_5'] = tweets['text_compound'].rolling(min_periods=1, window=5).sum()
    tweets['integral_compound_10'] = tweets['text_compound'].rolling(min_periods=1, window=10).sum()
    #Make a difference sentiment features
    tweets['delta_compound_mean'] = tweets['text_compound'] - tweets['text_compound'].mean()
    tweets['delta_compound_median'] = tweets['text_compound'] - tweets['text_compound'].median()
    #All done for now
    return tweets

def strip_down_to_features_and_rescale(df):
    #drop improperly formatted data
    df = df.drop(['username','reply_to','retweets',
                  'tweet_id','favorites','hashtags','mentions',
                  'text','permalink','Time'],axis='columns')
    # These features are on a 0 to 1 scale
    zero_to_one = ['hour','delta_time','log10_delta_time']
    # These features are on a -1 to 1 scale
    negone_to_one = ['dcompound_dTime','dcompound_dTweet','integral_compound_5',
                    'integral_compound_10','delta_compound_mean','delta_compound_median']
    # shrink the scale for the zero_to_one features
    df[zero_to_one] /= df[zero_to_one].max()
    # shrink the scale for the -1 to 1 ranges
    # need to preserve true zero, however, so no shifting the mean
    for x in negone_to_one:
        # this won't fill the entire range -1 to 1, but it preserves true 0
        df[x] /= max(abs(df[x].min()),df[x].max())
    return df

### Need to construct only the features for the newest tweets so we'll use the last 10 of the old tweets as a reference for some of the integral time features.

In [113]:
len_old = len(elon_tweets_df)
print (len_old)

9806


In [191]:
print (len(elon_newandold_tweets_df))


10035


In [192]:
print (len(elon_newandold_tweets_df.iloc[len_old-10:]))

239


In [116]:
new_tweets_df = elon_newandold_tweets_df.iloc[len_old-10:]

In [117]:
new_tweets_df.head(15)

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
9796,elonmusk,1267056905601638404,TeslaTested,1650,84762,[],,Probably,https://twitter.com/elonmusk/status/1267056905...,2020-05-31 11:34:41+00:00
9797,elonmusk,1267057495773675521,TeslaGong,81,3948,[],,Sure,https://twitter.com/elonmusk/status/1267057495...,2020-05-31 11:37:02+00:00
9798,elonmusk,1267146619562201090,SpaceX,5576,67423,[],@Space_Station,Congratulations Bob & Doug on docking & hatch ...,https://twitter.com/elonmusk/status/1267146619...,2020-05-31 17:31:11+00:00
9799,elonmusk,1267156817295085575,Rogozin,1630,11897,[],,"Спасибо, сэр, ха-ха. Мы рассчитываем на взаимо...",https://twitter.com/elonmusk/status/1267156817...,2020-05-31 18:11:42+00:00
9800,elonmusk,1267157474886455296,NASASpaceflight,708,14436,[],,Brought home by same person who placed it ther...,https://twitter.com/elonmusk/status/1267157474...,2020-05-31 18:14:19+00:00
9801,elonmusk,1267160409498357764,NASASpaceflight,81,2494,[],,Must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00
9802,elonmusk,1267180654896254976,SpaceX,22581,250519,[],,Nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00
9803,elonmusk,1267402337653587968,scale_banana,1598,68779,[],,Where’s the banana!?,https://twitter.com/elonmusk/status/1267402337...,2020-06-01 10:27:19+00:00
9804,elonmusk,1267409179339296768,DjKeyWay,1396,9923,['#JusticeForGeorge'],,Definitely not right that the other officers w...,https://twitter.com/elonmusk/status/1267409179...,2020-06-01 10:54:30+00:00
9805,elonmusk,1267415489111785472,mharrisonair,483,11700,[],,Well said,https://twitter.com/elonmusk/status/1267415489...,2020-06-01 11:19:34+00:00


### Let's test out the feature construction function

In [149]:
uf_new_tweets_df = construct_features(new_tweets_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"http\S+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"http", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"@\S+", "")
A value is trying to be set on 

In [150]:
uf_new_tweets_df.loc[uf_new_tweets_df['integral_compound_10'].isna()]

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,hashtags_compound,hour,delta_time,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median


In [151]:
uf_oldtweets_df = pd.read_csv('../data/cleaned/elonmusk_unscaled_tweet_features.csv')\
                                .drop('Unnamed: 0',axis='columns')
uf_oldtweets_df['Time'] = pd.to_datetime(uf_oldtweets_df['Time'])

In [152]:
uf_oldtweets_df.columns#.loc[uf_oldtweets_df['Integral_compound_10'].isna()]

Index(['username', 'tweet_id', 'reply_to', 'retweets', 'favorites', 'hashtags',
       'mentions', 'text', 'permalink', 'Time', 'hashtags_token',
       'mentions_token', 'reply_to_token', 'text_neg', 'text_neu', 'text_pos',
       'text_compound', 'hashtags_neg', 'hashtags_neu', 'hashtags_pos',
       'hashtags_compound', 'hour', 'delta_time', 'log10_delta_time',
       'dcompound_dTime', 'dcompound_dTweet', 'integral_compound_5',
       'integral_compound_10', 'delta_compound_mean', 'delta_compound_median'],
      dtype='object')

In [153]:
len(uf_oldtweets_df)

9707

In [154]:
len(uf_new_tweets_df)

236

In [163]:
result = pd.concat([uf_oldtweets_df,uf_new_tweets_df]).reset_index().drop('index',axis='columns')
    #makes sure we're sorted properly in time order
result.sort_values(by='Time',ascending=True,inplace=True)

In [164]:
result.iloc[9697:9717] #should be some dupes

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,hashtags_compound,hour,delta_time,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median
9697,elonmusk,1266811094527508481,,54238,862612,,,5 mins to t 0,https://twitter.com/elonmusk/status/1266811094...,2020-05-30 19:17:55+00:00,...,0.0,19,27562.0,4.440311,-3e-06,-0.0868,0.8234,2.1632,-0.184917,0.0
9698,elonmusk,1266890648587776003,NASA,4042,64610,,,dragonship endeavor,https://twitter.com/elonmusk/status/1266890648...,2020-05-31 00:34:02+00:00,...,0.0,0,18967.0,4.277999,0.0,0.0,0.2155,2.1632,-0.184917,0.0
9699,elonmusk,1267056312497721344,SpaceX,16259,149590,,@Space_Station,dragon docks with in 3 hours,https://twitter.com/elonmusk/status/1267056312...,2020-05-31 11:32:20+00:00,...,0.0,11,39498.0,4.596575,0.0,0.0,-0.1864,1.6365,-0.184917,0.0
9700,elonmusk,1267056905601638404,TeslaTested,1650,84762,,,probably,https://twitter.com/elonmusk/status/1267056905...,2020-05-31 11:34:41+00:00,...,0.0,11,141.0,2.149219,0.0,0.0,0.0868,1.0116,-0.184917,0.0
9707,elonmusk,1267056905601638404,TeslaTested,1650,84762,,,probably,https://twitter.com/elonmusk/status/1267056905...,2020-05-31 11:34:41+00:00,...,0.0,11,6000.0,3.778151,0.0,0.0,0.0,0.0,-0.184081,0.0
9701,elonmusk,1267057495773675521,TeslaGong,81,3948,,,sure,https://twitter.com/elonmusk/status/1267057495...,2020-05-31 11:37:02+00:00,...,0.0,11,141.0,2.149219,0.002257,0.3182,0.3182,1.3298,0.133283,0.3182
9708,elonmusk,1267057495773675521,TeslaGong,81,3948,,,sure,https://twitter.com/elonmusk/status/1267057495...,2020-05-31 11:37:02+00:00,...,0.0,11,141.0,2.149219,0.002257,0.3182,0.3182,0.3182,0.134119,0.3182
9709,elonmusk,1267146619562201090,SpaceX,5576,67423,,@Space_Station,congratulations bob doug on docking hatch ...,https://twitter.com/elonmusk/status/1267146619...,2020-05-31 17:31:11+00:00,...,0.0,17,21249.0,4.327338,1.3e-05,0.2812,0.9176,0.9176,0.415319,0.5994
9702,elonmusk,1267146619562201090,SpaceX,5576,67423,,@Space_Station,congratulations bob doug on docking hatch ...,https://twitter.com/elonmusk/status/1267146619...,2020-05-31 17:31:11+00:00,...,0.0,17,21249.0,4.327338,1.3e-05,0.2812,0.9176,1.741,0.414483,0.5994
9703,elonmusk,1267156817295085575,Rogozin,1209,7558,,,", , ...",https://twitter.com/elonmusk/status/1267156817...,2020-05-31 18:11:42+00:00,...,0.0,18,2431.0,3.385785,-0.000247,-0.5994,0.9176,1.1331,-0.184917,0.0


In [165]:
result.drop_duplicates(subset = ['Time'],inplace = True)

In [166]:
result.iloc[9697:9717] # no more dupes

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,hashtags_compound,hour,delta_time,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median
9700,elonmusk,1267056905601638404,TeslaTested,1650,84762,,,probably,https://twitter.com/elonmusk/status/1267056905...,2020-05-31 11:34:41+00:00,...,0.0,11,141.0,2.149219,0.0,0.0,0.0868,1.0116,-0.184917,0.0
9701,elonmusk,1267057495773675521,TeslaGong,81,3948,,,sure,https://twitter.com/elonmusk/status/1267057495...,2020-05-31 11:37:02+00:00,...,0.0,11,141.0,2.149219,0.002257,0.3182,0.3182,1.3298,0.133283,0.3182
9709,elonmusk,1267146619562201090,SpaceX,5576,67423,,@Space_Station,congratulations bob doug on docking hatch ...,https://twitter.com/elonmusk/status/1267146619...,2020-05-31 17:31:11+00:00,...,0.0,17,21249.0,4.327338,1.3e-05,0.2812,0.9176,0.9176,0.415319,0.5994
9703,elonmusk,1267156817295085575,Rogozin,1209,7558,,,", , ...",https://twitter.com/elonmusk/status/1267156817...,2020-05-31 18:11:42+00:00,...,0.0,18,2431.0,3.385785,-0.000247,-0.5994,0.9176,1.1331,-0.184917,0.0
9704,elonmusk,1267157474886455296,NASASpaceflight,708,14436,,,brought home by same person who placed it ther...,https://twitter.com/elonmusk/status/1267157474...,2020-05-31 18:14:19+00:00,...,0.0,18,157.0,2.1959,0.0,0.0,0.9176,0.7312,-0.184917,0.0
9705,elonmusk,1267160409498357764,NASASpaceflight,81,2494,,,must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00,...,0.0,18,699.0,2.844477,0.0,0.0,0.9176,1.0044,-0.184917,0.0
9706,elonmusk,1267180654896254976,SpaceX,22581,250519,,,nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00,...,0.0,19,4827.0,3.683677,0.0,0.0,0.5994,0.9176,-0.184917,0.0
9714,elonmusk,1267402337653587968,scale_banana,1598,68779,,,where s the banana!?,https://twitter.com/elonmusk/status/1267402337...,2020-06-01 10:27:19+00:00,...,0.0,10,52854.0,4.723078,0.0,0.0,-1.110223e-16,0.9176,-0.184081,0.0
9715,elonmusk,1267409179339296768,DjKeyWay,1396,9923,Justice For George,,definitely not right that the other officers w...,https://twitter.com/elonmusk/status/1267409179...,2020-06-01 10:54:30+00:00,...,0.5267,10,1631.0,3.212454,-0.000181,-0.296,-0.296,0.6216,-0.480081,-0.296
9716,elonmusk,1267415489111785472,mharrisonair,483,11700,,,well said,https://twitter.com/elonmusk/status/1267415489...,2020-06-01 11:19:34+00:00,...,0.0,11,1504.0,3.177248,0.000378,0.5692,-0.0228,0.8948,0.089119,0.2732


### Okay!  Looks good, let's write a new funtion that reads in the stored unscaled tweet data, and combines it with the data from the newest tweets.

In [167]:
def combine_with_old_unscaled_tweet_features_and_store(df,username = 'elonmusk'):
    """Function to take the new tweets +10 data frame, process it for features,
        and combine it with the old unscaled features data."""
    # load old unscaled tweet data
    uf_oldtweets_df = pd.read_csv('../data/cleaned/'+username+'_unscaled_tweet_features.csv')\
                                .drop('Unnamed: 0',axis='columns')
    uf_oldtweets_df['Time'] = pd.to_datetime(uf_oldtweets_df['Time'])
    # compute the new tweet features
    uf_newtweets_df = construct_features(df)
    # combine the two data frames
    result = pd.concat([uf_oldtweets_df,uf_new_tweets_df]).reset_index().drop('index',axis='columns')
    # makes sure we're sorted properly in time order
    result.sort_values(by='Time',ascending=True,inplace=True)
    # eliminates duplicate entries
    result.drop_duplicates(subset = ['Time'],inplace = True)
    # store the results
    result.to_csv('../data/cleaned/'+username+'_unscaled_tweet_features.csv')
    return result

In [168]:
new_tweets_df = elon_newandold_tweets_df.iloc[len_old-10:]
uf_combined_df = combine_with_old_unscaled_tweet_features_and_store(new_tweets_df,username = 'elonmusk')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"http\S+", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"http", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_field] = df[text_field].str.replace(r"@\S+", "")
A value is trying to be set on 

In [170]:
uf_combined_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,hashtags_compound,hour,delta_time,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median
9938,elonmusk,1276716075422367746,waitbutwhy,823,12942,,,it s a tough one that s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00,...,0.0,3,23680.0,4.374382,-1.784628e-05,-0.4226,1.286,3.136,0.018219,0.2023
9939,elonmusk,1276738326804873217,slashdot,137,2281,,,"verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00,...,0.0,4,5305.0,3.724685,-3.813384e-05,-0.2023,0.8272,3.136,-0.184081,0.0
9940,elonmusk,1276747755914842112,CathieDWood,104,2718,,,for sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00,...,0.0,5,2248.0,3.351796,0.000141548,0.3182,1.1454,2.8293,0.134119,0.3182
9941,elonmusk,1276957305842528256,BLKMDL3,325,15635,,,major supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00,...,0.0,19,49961.0,4.698631,8.606713e-07,0.043,1.5066,2.4909,0.177119,0.3612
9942,elonmusk,1276959792876011520,GerberKawasaki,480,6749,,,"physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00,...,0.0,19,593.0,2.773055,0.0001947723,0.1155,1.3584,2.4421,0.292619,0.4767


### Looks good, let's make sure that the scaling feature functions.  I'll need to rescale the tweet features every time because new values may lie outside previously established ranges.

In [179]:
sf_combined_df = strip_down_to_features_and_rescale(uf_combined_df)

In [181]:
sf_combined_df.tail()

Unnamed: 0,hashtags_token,mentions_token,reply_to_token,text_neg,text_neu,text_pos,text_compound,hashtags_neg,hashtags_neu,hashtags_pos,hashtags_compound,hour,delta_time,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median
9938,0.0,0.0,1.0,0.069,0.826,0.106,0.2023,0.0,0.0,0.0,0.0,0.130435,0.000503,0.570128,-3e-05,-0.222996,0.352194,0.501704,0.015861,0.206703
9939,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173913,0.000113,0.485451,-6.4e-05,-0.106749,0.226543,0.501704,-0.160249,0.0
9940,0.0,0.0,1.0,0.0,0.303,0.697,0.3182,0.0,0.0,0.0,0.0,0.217391,4.8e-05,0.436851,0.000236,0.167907,0.313688,0.452637,0.116756,0.325125
9941,0.0,0.0,1.0,0.0,0.783,0.217,0.3612,0.0,0.0,0.0,0.0,0.826087,0.001062,0.612389,1e-06,0.02269,0.412609,0.398499,0.154189,0.369061
9942,0.0,0.0,1.0,0.0,0.728,0.272,0.4767,0.0,0.0,0.0,0.0,0.826087,1.3e-05,0.361422,0.000325,0.060947,0.372022,0.390692,0.254736,0.487075


# Alright!  Nextstep is to tag the anomalies.

In [182]:
import numpy as np
import pandas as pd
import pyod
from pyod.models.vae import VAE
from pyod.models.iforest import IForest

def fit_VAE_direct(df):
    """This is the function that performs unsupervised anomaly detection\
        on the scaled tweet data from a user."""
    #dataframe to array
    X = df.values
    ndim = X.shape[1] #the number of features
    random_state = np.random.RandomState(81)#Random seed
    outlier_fraction = 0.007 #.7% of all tweets are outliers (best fit)
    classifiers = {
        'Variational Auto Encoder (VAE)': VAE(epochs=20,
                contamination = outlier_fraction, random_state = random_state,
                encoder_neurons = [ndim,max(int(ndim/2),1),max(int(ndim/4),1)],
                decoder_neurons = [max(int(ndim/4),1),max(int(ndim/2),1),ndim],
                verbosity=0)
    }
    for i, (clf_name,clf) in enumerate(classifiers.items()):
        clf.fit(X)
        y_pred = clf.predict(X)
    return y_pred

Using TensorFlow backend.


In [183]:
anom_predictions = fit_VAE_direct(sf_combined_df)



In [184]:
np.shape(anom_predictions)

(9933,)

In [200]:
len_old

9806

In [204]:
print(anom_predictions[len_old:]) 

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [206]:
1 in anom_predictions[len_old-200:]

True

### No new anomalies since the last time I updated the data!

In [185]:
len(sf_combined_df)

9933

In [197]:
len(uf_combined_df)

9933

In [207]:
# Need a new function to scan for new anomalies
def scan_for_new_anomalies(y_pred,n_old):
    """Function that scans for new anomalies
        based on how many anomalies where in the last
        scan.  Simply returns a boolean True if there
        is a new anomaly."""
    return 1 in anom_predictions[n_old:]

In [208]:
scan_for_new_anomalies(anom_predictions,len_old)

False

## Excellent!

### Time to do the stock+tweet+anomaly wrangling

In [210]:
def nearest(items, pivot): # general get nearest value function
    return min(items, key=lambda x: abs(x - pivot))

def nearest_price(items, pivot, df): # nearest price function
    timestamp = min(items, key=lambda x: abs(x - pivot))
    return df.loc[df['DateTime']==timestamp,['Open']].values[0][0]
    
def join_tweets_and_stocks(stock_df,tweet_df):
    # set new column in the tweet data frame to have the stock date
    tweet_df['stock_time']=tweet_df['Time']\
                         .apply(lambda row: nearest(stock_df['DateTime'],row))
    # set new column in the tweet data frame to have the stock price
    tweet_df['stock_price']=tweet_df['Time']\
                         .apply(lambda row: nearest_price(\
                                            stock_df['DateTime'],row,stock_df))
    return tweet_df

def load_stock_data(stock_name):
    stock_df = pd.read_csv('../data/raw/'+stock_name.lower()+'_stock_price.csv')\
                                .drop('Unnamed: 0',axis='columns')
    stock_df['DateTime'] = pd.to_datetime(stock_df['DateTime'],utc=True)
    return stock_df

def join_tweets_stocks_anomalies(tweet_df,anomalies,stock_name):
    stock_df = load_stock_data(stock_name)
    # set new column in the tweet data frame to have the stock date
    tweet_df['stock_time']=tweet_df['Time']\
                         .apply(lambda row: nearest(stock_df['DateTime'],row))
    # set new column in the tweet data frame to have the stock price
    tweet_df['stock_price']=tweet_df['Time']\
                         .apply(lambda row: nearest_price(\
                                            stock_df['DateTime'],row,stock_df))
    # set the tags for which tweets are anomalous
    tweet_df['anomalous'] = anomalies
    return tweet_df
    

In [211]:
check_me_df = join_tweets_stocks_anomalies(uf_combined_df,anom_predictions,'TSLA')

### That function needs to be much, much faster.  Let's build an alternative

In [248]:
def join_tweets_stocks_anomalies_quick(tweet_df,anomalies,stock_name):
    stock_df = load_stock_data(stock_name)
    # send the data we want to arrays
    tweet_array = tweet_df['Time'].values
    time_array = stock_df['DateTime'].values
    stock_time_list = []
    stock_price_list = []
    for i in range(len(tweet_array)):
        bin_now = np.argmin(abs(time_array - tweet_array[i]))
        stock_time_list.append(stock_df.iloc[bin_now]['DateTime'])
        stock_price_list.append(stock_df.iloc[bin_now]['Open'])
    # set new column in the tweet data frame to have the stock date
    tweet_df['stock_time'] = stock_time_list
    # set new column in the tweet data frame to have the stock price
    tweet_df['stock_price'] = stock_price_list
    # set the tags for which tweets are anomalous
    tweet_df['anomalous'] = anomalies
    return tweet_df

In [218]:
uf_combined_df.loc[0]['Time'].date()

datetime.date(2011, 12, 1)

In [234]:
test_array = uf_combined_df['Time'].values

In [235]:
test_array

array(['2011-12-01T09:55:11.000000000', '2011-12-01T10:29:04.000000000',
       '2011-12-03T08:20:28.000000000', ...,
       '2020-06-27T05:22:40.000000000', '2020-06-27T19:15:21.000000000',
       '2020-06-27T19:25:14.000000000'], dtype='datetime64[ns]')

In [236]:
time_array = stock_df['DateTime'].values

In [237]:
time_array

array(['2010-06-29T00:00:00.000000000', '2010-06-30T00:00:00.000000000',
       '2010-07-01T00:00:00.000000000', ...,
       '2020-06-24T00:00:00.000000000', '2020-06-25T00:00:00.000000000',
       '2020-06-26T00:00:00.000000000'], dtype='datetime64[ns]')

In [240]:
np.argmin(abs(time_array - test_array[-100]))

2511

In [239]:
len(time_array)

2517

In [242]:
stock_df.iloc[500]['Open']

34.26

In [243]:
tweet_array = uf_combined_df['Time'].values
time_array = stock_df['DateTime'].values
stock_time_list = []
stock_price_list = []
for i in range(len(tweet_array)):
    bin_now = np.argmin(abs(time_array - tweet_array[i]))
    stock_time_list.append([stock_df.iloc[bin_now]['DateTime']])
    stock_price_list.append([stock_df.iloc[bin_now]['Open']])

In [249]:
check_me_too_df = join_tweets_stocks_anomalies_quick(uf_combined_df,anom_predictions,'TSLA')

In [250]:
check_me_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median,stock_time,stock_price,anomalous
9938,elonmusk,1276716075422367746,waitbutwhy,823,12942,,,it s a tough one that s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00,...,4.374382,-1.784628e-05,-0.4226,1.286,3.136,0.018219,0.2023,2020-06-26 00:00:00+00:00,994.78,0
9939,elonmusk,1276738326804873217,slashdot,137,2281,,,"verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00,...,3.724685,-3.813384e-05,-0.2023,0.8272,3.136,-0.184081,0.0,2020-06-26 00:00:00+00:00,994.78,0
9940,elonmusk,1276747755914842112,CathieDWood,104,2718,,,for sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00,...,3.351796,0.000141548,0.3182,1.1454,2.8293,0.134119,0.3182,2020-06-26 00:00:00+00:00,994.78,0
9941,elonmusk,1276957305842528256,BLKMDL3,325,15635,,,major supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00,...,4.698631,8.606713e-07,0.043,1.5066,2.4909,0.177119,0.3612,2020-06-26 00:00:00+00:00,994.78,0
9942,elonmusk,1276959792876011520,GerberKawasaki,480,6749,,,"physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00,...,2.773055,0.0001947723,0.1155,1.3584,2.4421,0.292619,0.4767,2020-06-26 00:00:00+00:00,994.78,0


In [251]:
check_me_too_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time,...,log10_delta_time,dcompound_dTime,dcompound_dTweet,integral_compound_5,integral_compound_10,delta_compound_mean,delta_compound_median,stock_time,stock_price,anomalous
9938,elonmusk,1276716075422367746,waitbutwhy,823,12942,,,it s a tough one that s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00,...,4.374382,-1.784628e-05,-0.4226,1.286,3.136,0.018219,0.2023,2020-06-26 00:00:00+00:00,994.78,0
9939,elonmusk,1276738326804873217,slashdot,137,2281,,,"verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00,...,3.724685,-3.813384e-05,-0.2023,0.8272,3.136,-0.184081,0.0,2020-06-26 00:00:00+00:00,994.78,0
9940,elonmusk,1276747755914842112,CathieDWood,104,2718,,,for sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00,...,3.351796,0.000141548,0.3182,1.1454,2.8293,0.134119,0.3182,2020-06-26 00:00:00+00:00,994.78,0
9941,elonmusk,1276957305842528256,BLKMDL3,325,15635,,,major supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00,...,4.698631,8.606713e-07,0.043,1.5066,2.4909,0.177119,0.3612,2020-06-26 00:00:00+00:00,994.78,0
9942,elonmusk,1276959792876011520,GerberKawasaki,480,6749,,,"physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00,...,2.773055,0.0001947723,0.1155,1.3584,2.4421,0.292619,0.4767,2020-06-26 00:00:00+00:00,994.78,0
