Temporary notebook that collects the functions I will reuse later, when this work is made official.

In [1]:
from collections import defaultdict
import os, sys
import time
import pandas as pd
import GetOldTweets3 as got

def scrape_new_tweets(t_last_tweet, username="elonmusk"):
    """Scrape the tweets posted by ``username`` since the day of ``t_last_tweet``.

    Args:
        t_last_tweet: pandas ``Timestamp``; only its calendar date is used,
            as the ``setSince`` lower bound of the query.
        username: Twitter handle whose tweets are scraped.

    Returns:
        A DataFrame with one row per scraped tweet, carrying the raw ``date``
        plus a datetime ``Time`` column and sorted earliest first, or ``None``
        when the query returned no tweets.

    Raises:
        SystemExit: when every scraping attempt failed.
    """
    max_tries = 5
    retry_wait_seconds = 15 * 60
    columns = ["username", "tweet_id", "reply_to", "date", "retweets",
               "favorites", "hashtags", "mentions", "text", "permalink"]

    os.makedirs('tweet_data', exist_ok=True)
    # Zero-padded ISO date (yyyy-mm-dd) for the "since" bound.
    date_str = t_last_tweet.strftime("%Y-%m-%d")

    # Creation of query object.
    # NOTE(review): a max of 0 presumably means "no limit" - confirm against
    # the GetOldTweets3 documentation.
    tweet_criteria = got.manager.TweetCriteria().setUsername(username)\
                                                .setMaxTweets(0)\
                                                .setSince(date_str)

    # The scraper is retried on SystemExit, which is how a failed request
    # apparently surfaces from GetOldTweets3.
    tweets = None
    for attempt in range(1, max_tries + 1):
        try:
            tweets = got.manager.TweetManager.getTweets(tweet_criteria)
        except SystemExit:
            # Only wait when another attempt will actually follow.
            if attempt < max_tries:
                print("Trying again in 15 minutes.")
                time.sleep(retry_wait_seconds)
        else:
            break
    if tweets is None:
        print("Failed after 5 tries, quitting!")
        sys.exit(1)

    rows = [
        {
            "username": username,
            "tweet_id": t.id,
            "reply_to": t.to,
            "date": t.date,
            "retweets": t.retweets,
            "favorites": t.favorites,
            # De-duplicate the hashtags; sorted so the order is reproducible.
            "hashtags": sorted(set(t.hashtags.split())),
            "mentions": t.mentions,
            "text": t.text,
            "permalink": t.permalink,
        }
        for t in tweets
    ]
    if not rows:  # no new tweets
        return None

    # Make a DataFrame out of the scraped tweets.
    df = pd.DataFrame(rows, columns=columns)
    # Add the datetime 'Time' column and order the frame by it; the sort has
    # to happen on the frame, a Series cannot be sorted "by" a column name.
    df['Time'] = pd.to_datetime(df['date'])
    return df.sort_values(by='Time', ascending=True).reset_index(drop=True)
    
def reload_tweet_data(username="elonmusk", path=""):
    """Load the stored tweets of ``username`` from ``<path>/<username>.csv``.

    Args:
        username: Twitter handle; also the stem of the CSV file name.
        path: directory holding the CSV file (with or without a trailing
            separator). Defaults to the current directory; the default is
            required because a non-default parameter may not follow
            ``username``'s default.

    Returns:
        The stored tweets with ``Time`` parsed to datetimes, sorted earliest
        first and re-indexed from 0.
    """
    csv_path = os.path.join(path, username + '.csv')
    # 'Unnamed: 0' is the index column written by DataFrame.to_csv; tolerate
    # files that were saved without it.
    df = pd.read_csv(csv_path).drop(columns=['Unnamed: 0'], errors='ignore')
    df['Time'] = pd.to_datetime(df['Time'])
    # Order by earliest first.
    return df.sort_values(by='Time', ascending=True).reset_index(drop=True)

def prepend_new_tweets(df_new, df_old):
    """Merge newly scraped tweets into the stored ones, ordered by time.

    Args:
        df_new: DataFrame of new tweets; must have a ``Time`` column.
        df_old: DataFrame of previously stored tweets; must have ``Time``.

    Returns:
        A new DataFrame holding the rows of both inputs, sorted by ``Time``
        (earliest first) with a fresh 0..n-1 index.
    """
    combined = pd.concat([df_old, df_new], ignore_index=True)
    # sort_values returns a new frame, so its result has to be kept; a stable
    # sort keeps old rows ahead of new ones when timestamps tie.
    # NOTE(review): rows present in both inputs are not de-duplicated here.
    ordered = combined.sort_values(by='Time', ascending=True, kind='mergesort')
    return ordered.reset_index(drop=True)

def store_tweet_data(df, username="elonmusk", path=""):
    """Write ``df`` to ``<path>/<username>.csv`` (the index is written too).

    Args:
        df: DataFrame of tweets to store.
        username: Twitter handle; used as the stem of the CSV file name.
        path: target directory (with or without a trailing separator).
            Defaults to the current directory; the default is required
            because a non-default parameter may not follow ``username``'s.

    Returns:
        None.
    """
    df.to_csv(os.path.join(path, username + '.csv'))

def scan_for_new_tweets(username="elonmusk", path=""):
    """Reload the stored tweets of ``username`` and append any newer ones.

    Args:
        username: Twitter handle to reload and scrape.
        path: directory of the stored CSV, forwarded to ``reload_tweet_data``.

    Returns:
        The stored tweets unchanged when the scrape found nothing new,
        otherwise the stored and new tweets combined by
        ``prepend_new_tweets``.
    """
    # Get the old tweets (keyword arguments: a positional argument may not
    # follow a keyword one).
    df_old = reload_tweet_data(username=username, path=path)
    # Look for new tweets starting from the latest stored date.
    df_new = scrape_new_tweets(df_old['Time'].max(), username=username)
    # Identity test: comparing a DataFrame with "== None" is element-wise and
    # cannot be used as an if-condition.
    if df_new is None:  # no new tweets
        return df_old
    return prepend_new_tweets(df_new, df_old)


In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
import re
nltk.download('vader_lexicon')  # fetch the VADER lexicon; SentimentIntensityAnalyzer errors without it


def standardize_text(df, text_field):
    """Clean the text column ``text_field`` of ``df`` for sentiment scoring.

    URLs, ``@handles`` and ``RT`` markers are removed, characters outside a
    small allowed set are replaced by spaces, and the text is lower-cased.
    The column is rewritten on ``df`` itself.

    Args:
        df: DataFrame holding the text column.
        text_field: name of the column to clean.

    Returns:
        The rows of ``df`` whose cleaned text is not missing.
    """
    # Applied in order; each pair is (regular expression, replacement).
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"RT", ""),
        (r"[^A-Za-z0-9(),:;!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    ]
    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True is explicit: newer pandas treats the pattern as a
        # literal string by default, which would leave the text untouched.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[text_field] = cleaned.str.lower()
    return df[df[text_field].notna()]

def apply_vader(tweet, category, analyzer=None):
    """Return one VADER polarity score of ``tweet`` as a float.

    Args:
        tweet: text to score.
        category: key of the score to return (the callers in this file use
            'neg', 'neu', 'pos' and 'compound').
        analyzer: object exposing ``polarity_scores(text)``. When omitted, a
            ``SentimentIntensityAnalyzer`` is created on first use and cached
            on this function, so no module-level ``sid`` has to exist.

    Returns:
        The requested entry of ``analyzer.polarity_scores(tweet)``.
    """
    if analyzer is None:
        analyzer = getattr(apply_vader, "_analyzer", None)
        if analyzer is None:
            # Built lazily: constructing the analyzer needs the lexicon.
            analyzer = SentimentIntensityAnalyzer()
            apply_vader._analyzer = analyzer
    return analyzer.polarity_scores(tweet)[category]

def tokenize_hashtags(df):
    """Add a binary ``hashtags_token`` column: 1.0 if the tweet has a hashtag.

    The ``hashtags`` column may hold the string form read back from CSV
    ('[]' when empty) or actual lists; both are compared via their string
    form. Missing values count as "no hashtag".

    Args:
        df: DataFrame with a ``hashtags`` column; modified in place.

    Returns:
        ``df`` with the new ``hashtags_token`` float column.
    """
    hashtags = df['hashtags']
    has_hashtag = hashtags.notna() & (hashtags.astype(str) != '[]')
    # Assign the whole column at once; chained "df[col].loc[mask] = ..." may
    # write to a temporary copy and leave df unchanged.
    df['hashtags_token'] = has_hashtag.astype(float)
    return df

def tokenize_mentions(df):
    """Add a binary ``mentions_token`` column: 1.0 if the tweet has a mention.

    Missing values and blank strings both count as "no mention".

    Args:
        df: DataFrame with a ``mentions`` column; modified in place.

    Returns:
        ``df`` with the new ``mentions_token`` float column.
    """
    mentions = df['mentions']
    has_mention = mentions.notna() & (mentions.astype(str).str.strip() != '')
    # Assign the whole column at once; chained "df[col].loc[mask] = ..." may
    # write to a temporary copy and leave df unchanged.
    df['mentions_token'] = has_mention.astype(float)
    return df

def tokenize_reply_to(df):
    """Add a binary ``reply_to_token`` column: 1.0 if the tweet is a reply.

    Missing values and blank strings both count as "not a reply".

    Args:
        df: DataFrame with a ``reply_to`` column; modified in place.

    Returns:
        ``df`` with the new ``reply_to_token`` float column.
    """
    reply_to = df['reply_to']
    is_reply = reply_to.notna() & (reply_to.astype(str).str.strip() != '')
    # Assign the whole column at once; chained "df[col].loc[mask] = ..." may
    # write to a temporary copy and leave df unchanged.
    df['reply_to_token'] = is_reply.astype(float)
    return df

def convert_hashtag(input_txt):
    """Turn a tweet's hashtags into plain words, split on capital letters.

    Args:
        input_txt: the hashtags either as the string form of a list (for
            example "['#TeslaModel3', '#ai']", or '[]' when empty) or as an
            actual list of hashtag strings.

    Returns:
        The words of all hashtags joined by single spaces, without the '#'
        signs; "" when there are no hashtags or the value is not text.
    """
    if isinstance(input_txt, (list, tuple, set)):
        tags = [str(tag) for tag in input_txt]
    elif isinstance(input_txt, str):
        if input_txt == '[]':  # no hashtag
            return ""
        # Strip the list punctuation, then split the individual entries.
        tags = input_txt.strip("['']").split("', '")
    else:
        # e.g. NaN from a CSV round trip
        return ""

    words = []
    for tag in tags:
        # A word is an optional capital followed by non-capitals, or a lone
        # capital; '#' and whitespace never belong to a word. Matching per
        # tag keeps lowercase hashtags and keeps '#' out of the output.
        words.extend(re.findall(r'[A-Z]?[^A-Z\s#]+|[A-Z]', tag))
    return " ".join(words)

def tweet_word_count(df, text_field="tweet"):
    """Add a ``tweet_length`` column holding the word count of each tweet.

    A word is a maximal run of ``\\w`` characters, the same rule the
    previous ``RegexpTokenizer(r'\\w+')`` implementation applied.

    Args:
        df: DataFrame holding the text column; modified in place.
        text_field: name of the text column (defaults to "tweet").

    Returns:
        ``df`` with the new ``tweet_length`` column.
    """
    # Count the matches directly; no temporary 'tokens' column is left
    # behind on the caller's frame.
    df['tweet_length'] = df[text_field].str.count(r'\w+')
    return df

def integral_history(df, category, length):
    """Return the rolling sum of ``df[category]`` over the last ``length`` rows.

    NOTE(review): the original body selected the column and returned nothing.
    The rolling sum mirrors how the ``Integral_compound_*`` features are built
    in ``construct_features`` - confirm this is the intended definition.

    Args:
        df: DataFrame holding the column.
        category: name of the column to integrate.
        length: the depth back in tweet history (window size in rows).

    Returns:
        A Series aligned with ``df``; shorter windows at the start are summed
        as far as they go (``min_periods=1``).
    """
    return df[category].rolling(min_periods=1, window=length).sum()
    
    
def construct_features(tweets):
    """Construct features from Elon's tweet data.

    Args:
        tweets: DataFrame with at least the columns ``text``, ``hashtags``,
            ``mentions``, ``reply_to`` and a datetime ``Time`` column.
            Presumably sorted by ``Time``; the time deltas are taken between
            consecutive rows. The text cleaning step rewrites ``text`` on the
            passed frame.

    Returns:
        A DataFrame (rows with missing text removed) extended with the token
        flags, the VADER scores of text and hashtags (``text_*``,
        ``hashtags_*``), ``hour``, ``delta_time``, ``log10_delta_time``, and
        the derivative / rolling-sum / offset features of ``text_compound``.
    """
    # Generate the sentiment intensity analyzer instance (errors if the
    # lexicon has not been downloaded). It is used directly below because a
    # local variable is not visible inside other functions.
    sid = SentimentIntensityAnalyzer()
    # Clean the text of the tweets; copy so the new columns are written to an
    # independent frame rather than to a filtered slice.
    tweets = standardize_text(tweets, "text").copy()
    # Tokenize the hashtags
    tweets = tokenize_hashtags(tweets)
    # Tokenize the mentions
    tweets = tokenize_mentions(tweets)
    # Tokenize the reply_to
    tweets = tokenize_reply_to(tweets)
    # Clean the text of the hashtags
    tweets["hashtags"] = tweets["hashtags"].apply(convert_hashtag)

    # Apply vader to the tweets, then to the hashtags. Each text is scored
    # once and all four categories are read from that single result.
    vader_categories = ['neg', 'neu', 'pos', 'compound']
    for column in ("text", "hashtags"):
        scores = [sid.polarity_scores(value) for value in tweets[column]]
        for cat in vader_categories:
            tweets[column + '_' + cat] = [score[cat] for score in scores]

    # Do some temporal processing
    # Hour of the day
    tweets['hour'] = tweets['Time'].dt.hour
    # Time between tweets in whole seconds. The first row has no predecessor
    # and zero gaps would break the division and the logarithm below, so both
    # are set to 6000 seconds.
    gap_seconds = np.floor(tweets['Time'].diff().dt.total_seconds().abs())
    gap_seconds = gap_seconds.fillna(6000.).replace(0., 6000.)
    tweets['delta_time'] = gap_seconds
    tweets['log10_delta_time'] = np.log10(gap_seconds)

    # Make some rate of sentiment change features
    compound_step = tweets['text_compound'].diff().fillna(0.)
    tweets['dcompound_dTime'] = compound_step / tweets['delta_time']  # change per second
    tweets['dcompound_dTweet'] = compound_step  # change per tweet
    # Make some integral sentiment change features
    tweets['Integral_compound_5'] = tweets['text_compound'].rolling(min_periods=1, window=5).sum()
    tweets['Integral_compound_10'] = tweets['text_compound'].rolling(min_periods=1, window=10).sum()
    # Make a difference sentiment features
    tweets['delta_compound_mean'] = tweets['text_compound'] - tweets['text_compound'].mean()
    tweets['delta_compound_median'] = tweets['text_compound'] - tweets['text_compound'].median()
    # All done for now
    return tweets

def strip_down_to_features_and_rescale(df):
    """Drop the non-feature columns and rescale the numeric features.

    Args:
        df: DataFrame as produced by ``construct_features``. It is not
            modified; a reduced copy is rescaled and returned.

    Returns:
        A tuple ``(features, zero_to_one_scale, negone_to_one_scale)``:
        the rescaled feature frame, a Series with the divisor of each
        0..1 feature (indexed by column name), and a list with the divisor
        of each -1..1 feature in the order they are rescaled below.
    """
    # Drop improperly formatted (non-numeric) data; 'date' is included for
    # frames that still carry the raw scrape timestamp, and absent columns
    # are ignored.
    df = df.drop(columns=['username', 'reply_to', 'retweets',
                          'tweet_id', 'favorites', 'hashtags', 'mentions',
                          'text', 'permalink', 'Time', 'date'],
                 errors='ignore')
    # These features are on a 0 to 1 scale
    zero_to_one = ['hour', 'delta_time', 'log10_delta_time']
    # These features are on a -1 to 1 scale; the rolling sums are spelled
    # with a capital 'I', exactly as construct_features names them.
    negone_to_one = ['dcompound_dTime', 'dcompound_dTweet', 'Integral_compound_5',
                     'Integral_compound_10', 'delta_compound_mean', 'delta_compound_median']

    # Shrink the scale for the zero_to_one features; a column whose maximum
    # is 0 is divided by 1 instead, so it stays 0 rather than becoming NaN.
    zero_to_one_scale = df[zero_to_one].max()
    df[zero_to_one] = df[zero_to_one] / zero_to_one_scale.replace(0, 1)

    # Shrink the scale for the -1 to 1 ranges. True zero must be preserved,
    # so the mean is not shifted; this won't fill the entire range -1 to 1.
    negone_to_one_scale = []  # list to hold the rescaling
    for name in negone_to_one:
        # Record the scale BEFORE dividing; measured afterwards it is
        # always 1.
        scale = max(abs(df[name].min()), df[name].max())
        negone_to_one_scale.append(scale)
        if scale != 0:  # an all-zero column is already in range
            df[name] = df[name] / scale
    return df, zero_to_one_scale, negone_to_one_scale

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/JJ/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager
from pyod.models.vae import VAE

def identify_anomalies(X, outlier_fraction=0.01, epochs=20):
    """Flag anomalous tweets with a variational auto-encoder (pyod ``VAE``).

    Args:
        X: 2-D feature matrix, one row per tweet (``X.shape[1]`` features).
        outlier_fraction: expected share of anomalous rows, passed to the
            model as ``contamination`` (default 1%).
        epochs: number of training epochs.

    Returns:
        The model's predictions for ``X`` as returned by ``VAE.predict``.
    """
    ndim = X.shape[1]  # the number of features
    half = max(ndim // 2, 1)
    quarter = max(ndim // 4, 1)
    # Every option is passed by keyword: the first positional parameter of
    # this VAE API is the encoder layout, not the epoch count, and the
    # logging switch is spelled 'verbose'.
    # NOTE(review): the decoder now mirrors the encoder (ending at ndim)
    # instead of a hard-coded width of 20 - confirm this is intended.
    model = VAE(encoder_neurons=[ndim, half, quarter],
                decoder_neurons=[quarter, half, ndim],
                epochs=epochs,
                contamination=outlier_fraction,
                random_state=np.random.RandomState(42),
                verbose=0)
    model.fit(X)  # fits the model
    return model.predict(X)  # model predictions for anomalies

# Don't forget to do this at some point:
# NOTE(review): neither `unscaled_tweet_features_df` nor a module-level
# `y_pred` is defined in the visible cells (`y_pred` only exists inside
# identify_anomalies), so these two statements need both names to be
# created first - e.g. y_pred = identify_anomalies(X).
# They tag every feature row with its anomaly prediction and write the
# tagged frame to the processed-data CSV.
unscaled_tweet_features_df['anomalous'] = y_pred
unscaled_tweet_features_df.to_csv('../data/processed/anomaly_tagged_tweet_features.csv')