# This notebook is a test bed to check the functionality of the timeloop package and its efficacy for deployment of the Muskometer backend. 

## I'm going to need a background script that periodically scans for new tweets and periodically updates the stock data as well. In addition, when new tweets are found, they need to be put through the anomaly detection algorithm, and the datasets used by the front end need to be updated. The timeloop package seems to be a good way to do all of this simultaneously within a single script.

In [1]:
import time
import timeloop
import datetime
from collections import defaultdict
import os, sys
import pandas as pd
import GetOldTweets3 as got
from timeloop import Timeloop
from datetime import timedelta

In [2]:
# Example adapted from:
# https://medium.com/greedygame-engineering/an-elegant-way-to-run-periodic-tasks-in-python-61b7c477b679
#
# Three jobs are registered on one Timeloop instance at different intervals
# so the printed timestamps show whether each one fires on its own schedule.
tl = Timeloop()


@tl.job(interval=timedelta(seconds=2))
def sample_job_every_2s():
    """Print the current wall-clock time; registered to run every 2 seconds."""
    print(f"2s job current time : {time.ctime()}")


@tl.job(interval=timedelta(seconds=5))
def sample_job_every_5s():
    """Print the current wall-clock time; registered to run every 5 seconds."""
    print(f"5s job current time : {time.ctime()}")


@tl.job(interval=timedelta(seconds=10))
def sample_job_every_10s():
    """Print the current wall-clock time; registered to run every 10 seconds."""
    print(f"10s job current time : {time.ctime()}")


if __name__ == "__main__":
    # block=True runs the loop in the foreground, so this cell keeps
    # executing until it is interrupted.
    tl.start(block=True)

[2020-06-28 13:36:28,636] [timeloop] [INFO] Starting Timeloop..
[2020-06-28 13:36:28,638] [timeloop] [INFO] Registered job <function sample_job_every_2s at 0x7fc4d7e15310>
[2020-06-28 13:36:28,639] [timeloop] [INFO] Registered job <function sample_job_every_5s at 0x7fc4d88040d0>
[2020-06-28 13:36:28,640] [timeloop] [INFO] Registered job <function sample_job_every_10s at 0x7fc4d88041f0>
[2020-06-28 13:36:28,640] [timeloop] [INFO] Timeloop now started. Jobs will run based on the interval set
2s job current time : Sun Jun 28 13:36:30 2020
2s job current time : Sun Jun 28 13:36:32 2020
5s job current time : Sun Jun 28 13:36:33 2020
2s job current time : Sun Jun 28 13:36:34 2020
2s job current time : Sun Jun 28 13:36:36 2020
10s job current time : Sun Jun 28 13:36:38 2020
5s job current time : Sun Jun 28 13:36:38 2020
2s job current time : Sun Jun 28 13:36:38 2020
2s job current time : Sun Jun 28 13:36:40 2020
2s job current time : Sun Jun 28 13:36:42 2020
5s job current time : Sun Jun 28 1

## Looks like that worked as advertised!

### Let's start building up the functionality of the backend and test it as we go.  First I'll load up the stored tweets from Elon Musk, scan for new tweets, and add them to the record.

In [45]:
def reload_tweet_data(path,username="elonmusk"):
    """Load the stored tweets of ``username`` from ``<path><username>_tweets.csv``.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated as-is with the file name, so it
        must end with a path separator (e.g. ``'../data/raw/'``).
    username : str
        Account whose tweet file should be read.

    Returns
    -------
    pandas.DataFrame
        The stored tweets with 'Time' parsed to datetimes, ordered oldest
        first, with a fresh 0..n-1 index.
    """
    df = pd.read_csv(path + username + '_tweets.csv')
    # A CSV written together with its index comes back with an extra
    # 'Unnamed: 0' column. errors='ignore' keeps files that were saved
    # without an index loadable instead of raising KeyError.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # CSV stores timestamps as text; they must be re-parsed on every load.
    df['Time'] = pd.to_datetime(df['Time'])
    # Earliest tweet first; drop=True discards the old row labels directly.
    return df.sort_values(by='Time', ascending=True).reset_index(drop=True)
def prepend_new_tweets(df_new,df_old):
    """Merge newly scraped tweets into the stored ones, ordered oldest first.

    Despite the name, nothing is literally prepended: both frames are
    concatenated and the result is re-sorted by 'Time', so the position of
    each tweet depends only on its timestamp.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Newly scraped tweets; must contain a 'Time' column.
    df_old : pandas.DataFrame
        Previously stored tweets; must contain a 'Time' column.

    Returns
    -------
    pandas.DataFrame
        A new frame (inputs are not modified) sorted by 'Time' ascending
        with a fresh 0..n-1 index.
    """
    # ignore_index / drop=True discard the old row labels outright, which
    # also works when the inputs carry a named index.
    combined = pd.concat([df_old, df_new], ignore_index=True)
    return combined.sort_values(by='Time', ascending=True).reset_index(drop=True)
def store_tweet_data(df,path,username="elonmusk"):
    """Write ``df`` to ``<path><username>_tweets.csv``.

    ``path`` is used as a plain string prefix, so it has to end with a path
    separator. The frame's index is written along with the data.
    """
    csv_file = path + username + '_tweets.csv'
    df.to_csv(csv_file)
def scan_for_new_tweets(path,username="elonmusk"):
    """Scan for new tweets and add them to the end of the stored data frame.

    Parameters
    ----------
    path : str
        Directory prefix handed to ``reload_tweet_data``.
    username : str
        Account to scan.

    Returns
    -------
    pandas.DataFrame
        The stored tweets alone when nothing new was found; otherwise the
        stored and scraped tweets merged in time order, keeping one row per
        distinct 'Time' value.
    """
    df_old = reload_tweet_data(path, username)  # previously stored tweets
    # Search from the date of the latest stored tweet onwards.
    df_new = scrape_new_tweets(df_old['Time'].max(), username)
    # scrape_new_tweets returns None when it finds nothing; calling len()
    # on that would raise TypeError, so check for None first.
    if df_new is None or len(df_new) == 0:
        return df_old
    df_combined = prepend_new_tweets(df_new, df_old)
    # The search bound is a whole date, so tweets already stored for that
    # day are scraped again; keep a single row per timestamp.
    return df_combined.drop_duplicates(subset=['Time'])
def scrape_new_tweets(t_last_tweet,username = "elonmusk",max_tries=5,retry_wait_s=15*60):
    """Scrape the tweets of ``username`` posted since the date of ``t_last_tweet``.

    Parameters
    ----------
    t_last_tweet : pandas.Timestamp (or datetime.datetime)
        Only its calendar date is used as the lower search bound, so tweets
        from earlier on that same day are returned as well.
    username : str
        Account to scrape.
    max_tries : int
        How many times the query is attempted before giving up.
    retry_wait_s : float
        Seconds to wait between two attempts.

    Returns
    -------
    pandas.DataFrame or None
        One row per tweet, sorted by 'Time' ascending (row labels are not
        reset), or None when the query returned no tweets.

    Raises
    ------
    SystemExit
        With exit status 1 when every attempt failed.
    """
    os.makedirs('tweet_data', exist_ok=True)
    # The query takes a plain date; zero-padded ISO form (YYYY-MM-DD).
    date_str = t_last_tweet.strftime('%Y-%m-%d')
    # NOTE(review): 0 presumably means "no limit" for GetOldTweets3 -- confirm.
    max_tweets = 0
    # Creation of query object
    tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                               .setMaxTweets(max_tweets)\
                                               .setSince(date_str)
    # Fetch with retries. A failed query surfaces here as SystemExit
    # (presumably raised inside the scraper -- confirm), hence the catch.
    tweets = None
    for attempt in range(1, max_tries + 1):
        try:
            tweets = got.manager.TweetManager.getTweets(tweetCriteria)
        except SystemExit:
            # No point in waiting after the final attempt.
            if attempt < max_tries:
                print("Trying again in {:g} minutes.".format(retry_wait_s / 60))
                time.sleep(retry_wait_s)
        else:
            break
    if tweets is None:
        print("Failed after {} tries, quitting!".format(max_tries))
        # Raise explicitly: the interactive `exit` helper is not guaranteed
        # to exist or to stop execution inside a notebook kernel.
        raise SystemExit(1)

    records = [
        {
            "username": username,
            "tweet_id": t.id,
            "reply_to": t.to,
            "date": t.date,
            "retweets": t.retweets,
            "favorites": t.favorites,
            # whitespace-separated string -> list of unique tags
            "hashtags": list(set(t.hashtags.split())),
            "mentions": t.mentions,
            "text": t.text,
            "permalink": t.permalink,
        }
        for t in tweets
    ]
    if not records:  # no new tweets
        return None

    # Make a DataFrame out of the scraped tweets.
    df = pd.DataFrame(records, columns=["username","tweet_id",
                                        "reply_to","date","retweets",
                                        "favorites","hashtags","mentions",
                                        "text","permalink"])
    # Store the timestamp as a datetime column named 'Time' (time of day is
    # kept) and remove the original 'date' column.
    df['Time'] = pd.to_datetime(df['date'])
    df = df.drop(columns=['date'])
    return df.sort_values(by='Time', ascending=True)

In [37]:
# Load the stored tweet history from ../data/raw/elonmusk_tweets.csv (relative to the working directory).
elon_tweets_df = reload_tweet_data('../data/raw/',username="elonmusk")

In [5]:
# Write the frame back to the same CSV location it is loaded from.
store_tweet_data(elon_tweets_df,'../data/raw/',username="elonmusk")

In [40]:
# Check that 'Time' was parsed into a datetime column rather than left as text.
elon_tweets_df.dtypes

username                  object
tweet_id                   int64
reply_to                  object
retweets                   int64
favorites                  int64
hashtags                  object
mentions                  object
text                      object
permalink                 object
Time         datetime64[ns, UTC]
dtype: object

### Okay, let's see if we can scrape the new tweets from June.

In [46]:
# Query everything from the date of the newest stored tweet onwards.
new_tweets_df = scrape_new_tweets(elon_tweets_df['Time'].max(),username = "elonmusk")

In [47]:
# The frame is sorted oldest first, so tail() shows the most recent tweets.
new_tweets_df.tail()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
4,elonmusk,1276716075422367746,waitbutwhy,822,12936,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
3,elonmusk,1276738326804873217,slashdot,137,2280,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
2,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
1,elonmusk,1276957305842528256,BLKMDL3,325,15615,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00
0,elonmusk,1276959792876011520,GerberKawasaki,480,6735,[],,"Physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00


In [48]:
# Expectation: searching from the newest scraped tweet should find nothing
# new, in which case scrape_new_tweets returns None.
test_tweets_df = scrape_new_tweets(new_tweets_df['Time'].max(),username = "elonmusk")

In [49]:
# Inspect what came back; this call fails with AttributeError if the result is None.
test_tweets_df.head()

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
4,elonmusk,1276716075422367746,waitbutwhy,822,12936,[],,It’s a tough one. That’s why we should expand ...,https://twitter.com/elonmusk/status/1276716075...,2020-06-27 03:16:47+00:00
3,elonmusk,1276738326804873217,slashdot,137,2280,[],,"Verified should be far more widespread, simply...",https://twitter.com/elonmusk/status/1276738326...,2020-06-27 04:45:12+00:00
2,elonmusk,1276747755914842112,CathieDWood,104,2718,[],,For sure,https://twitter.com/elonmusk/status/1276747755...,2020-06-27 05:22:40+00:00
1,elonmusk,1276957305842528256,BLKMDL3,325,15617,[],,Major Supercharger increases are underway for ...,https://twitter.com/elonmusk/status/1276957305...,2020-06-27 19:15:21+00:00
0,elonmusk,1276959792876011520,GerberKawasaki,480,6735,[],,"Physics favors electric transport, batteries f...",https://twitter.com/elonmusk/status/1276959792...,2020-06-27 19:25:14+00:00


# Looks like the API only has date as its finest time setting.

### To handle this, we'll have to eliminate duplicates after joining the new tweet data.

In [95]:
def scan_for_new_tweets(path,username="elonmusk"):
    """Scan for new tweets and add them to the end of the stored data frame.

    Parameters
    ----------
    path : str
        Directory prefix handed to ``reload_tweet_data``.
    username : str
        Account to scan.

    Returns
    -------
    pandas.DataFrame
        The stored tweets alone when nothing new was found; otherwise the
        stored and scraped tweets merged in time order, keeping one row per
        distinct 'Time' value.
    """
    df_old = reload_tweet_data(path, username)  # previously stored tweets
    # Search from the date of the latest stored tweet onwards.
    df_new = scrape_new_tweets(df_old['Time'].max(), username)
    # scrape_new_tweets returns None when it finds nothing; calling len()
    # on that would raise TypeError, so check for None first.
    if df_new is None or len(df_new) == 0:
        return df_old
    df_combined = prepend_new_tweets(df_new, df_old)
    # The search bound is a whole date, so tweets already stored for that
    # day are scraped again; keep a single row per timestamp.
    return df_combined.drop_duplicates(subset=['Time'])
        

In [96]:
# End-to-end check: reload the stored tweets, scrape, merge and de-duplicate.
check_me_df = scan_for_new_tweets('../data/raw/',username="elonmusk")

In [93]:
# iloc selects by position, so this slice is unaffected by gaps in the row labels.
check_me_df.iloc[9800:9810] #checks the old data vs. the join to the new data

Unnamed: 0,username,tweet_id,reply_to,retweets,favorites,hashtags,mentions,text,permalink,Time
9809,elonmusk,1267157474886455296,NASASpaceflight,708,14436,[],,Brought home by same person who placed it ther...,https://twitter.com/elonmusk/status/1267157474...,2020-05-31 18:14:19+00:00
9811,elonmusk,1267160409498357764,NASASpaceflight,81,2494,[],,Must be due to relativistic aging,https://twitter.com/elonmusk/status/1267160409...,2020-05-31 18:25:58+00:00
9813,elonmusk,1267180654896254976,SpaceX,22581,250519,[],,Nine years later,https://twitter.com/elonmusk/status/1267180654...,2020-05-31 19:46:25+00:00
9815,elonmusk,1267402337653587968,scale_banana,1598,68779,[],,Where’s the banana!?,https://twitter.com/elonmusk/status/1267402337...,2020-06-01 10:27:19+00:00
9816,elonmusk,1267409179339296768,DjKeyWay,1396,9923,[#JusticeForGeorge],,Definitely not right that the other officers w...,https://twitter.com/elonmusk/status/1267409179...,2020-06-01 10:54:30+00:00
9817,elonmusk,1267415489111785472,mharrisonair,483,11700,[],,Well said,https://twitter.com/elonmusk/status/1267415489...,2020-06-01 11:19:34+00:00
9818,elonmusk,1267531196751323144,PPathole,2342,36211,[],,Starship is the key to making life multiplanet...,https://twitter.com/elonmusk/status/1267531196...,2020-06-01 18:59:21+00:00
9819,elonmusk,1267650659320500226,,32819,570849,[],,Off Twitter for a while,https://twitter.com/elonmusk/status/1267650659...,2020-06-02 02:54:03+00:00
9820,elonmusk,1268595216971206656,SciGuySpace,929,13348,[],,So many war stories over 18 eventful years! Bu...,https://twitter.com/elonmusk/status/1268595216...,2020-06-04 17:27:23+00:00
9821,elonmusk,1268601657501220864,PPathole,355,3540,[],,That’s when all life on Earth will be boiled o...,https://twitter.com/elonmusk/status/1268601657...,2020-06-04 17:52:59+00:00
