In [1]:
import pandas as pd
from glob import glob
import re
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

# Tokenization
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jaromir\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Cleaning steps:

1. Get tweet_id
1. Drop unnecessary columns - permalink, formated date 
1. Extract cashtags
1. Extract emojis
1. 

## Load Data - New

In [3]:
pd.set_option('display.max_colwidth',280)
pd.set_option('display.html.use_mathjax', False)
pd.set_option('display.max_rows', 1000)

In [4]:
src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\04_CleanData'

In [5]:
filenames = glob(src_path+'\*.pkl')

In [6]:
filenames

['c:\\Users\\jaromir\\OneDrive\\UoM\\100_Disertation\\02_SrcData\\04_CleanData\\stock_prices_20200805.pkl',
 'c:\\Users\\jaromir\\OneDrive\\UoM\\100_Disertation\\02_SrcData\\04_CleanData\\tweets_20200826_110327.pkl']

In [7]:
for file in filenames:
    if 'tweets' in file:
        tweets = pd.read_pickle(file)
    else:
        stock_prices = pd.read_pickle(file)

In [8]:
df_text = tweets.loc[:,['text','ticker']]
df_rest = tweets.loc[:,~tweets.columns.isin(['text','ticker'])]

## Load Data - Existing

In [7]:
src_path_ex = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\05_PreProcessed'

# search for available files
filenames = glob(src_path_ex+'\*.pkl')

# load files
# tweets = pd.read_pickle(filenames[0])
#stock_prices = pd.read_pickle(filenames[1])

#df_text = tweets.loc[:,['text','ticker','filtered_text','spacy_lemma','nltk_lemma']]
#df_rest = tweets.loc[:,~tweets.columns.isin(['text','ticker','filtered_text','spacy_lemma','nltk_lemma'])]

# Process Tweets

## Extract cashtags

In [9]:
# change $ticker to #ticker so that it get tokenized together
# make sure it is not greedy 
cashtag_finder = re.compile(r"\$[a-zA-Z]+")

In [10]:
def find_cashtags(document, filter_condition):
    """extracts cashtags from a tweets and return a strign separated by spaces"""
    document = re.findall(filter_condition, document)
    cashtags = ' '.join(document).upper()
    return cashtags

In [11]:
def cashtag_handle(df, sub_regex):
    """finds cashtags and moves them to a separate feature"""   
    df.loc[:,'cashtags'] = df.loc[:,'text'].apply(find_cashtags, args=[sub_regex])
    return df

In [12]:
%%time
df_text = cashtag_handle(df_text,cashtag_finder)

Wall time: 3.32 s


# Extract emojis

In [13]:
import emoji

In [14]:
def extract_emojis(s):
    return ''.join(c for c in s if c in emoji.UNICODE_EMOJI)

In [15]:
%%time
df_text['emoji'] = df_text.loc[:,'text'].apply(extract_emojis)

Wall time: 7.28 s


In [16]:
df_text[['text','emoji']].sample(5)

Unnamed: 0,text,emoji
191464,This is a big deal. $amzn. Amazon’s ad biz-3.9 billion in revenue—an increase of 44% YoY. . Market Realist,
91252,"I’m long $AAPL (largest % holding) $MSFT, $SBUX $T. Have been adding to my positions over the last month. Also added $TMUS $MA and $V. Have been watching $HD and have some hindsight bias about missing it at $201 a few weeks ago.",
380991,Day three of rotation. $MSFT calls likely to heat up tomorrow as Rotatavirus curve flattens. Ridiculous.,
286276,Cut losses on $GILD,
301890,"Johnson & Johnson $JNJ COO Michael E. Sneed Sells 58,128 Shares http://zpr.io/tKJCZ",


# Clean tweets

### Replace characters

In [17]:
# replace a + b 
plus_filter = re.compile(r"\s\+\s")

idx = 300 
print(df_text.loc[idx,'text'],"\n")
print(re.sub(plus_filter, "plus",df_text.loc[idx,'text']))

$AAPL $MSFT $JPM - Apple, Microsoft top Dow 2019 standings; Walgreens wobbles to the bottom https://seekingalpha.com/news/3529009-apple-microsoft-top-dow-2019-standings-walgreens-wobbles-to-bottom?source=tweet 

$AAPL $MSFT $JPM - Apple, Microsoft top Dow 2019 standings; Walgreens wobbles to the bottom https://seekingalpha.com/news/3529009-apple-microsoft-top-dow-2019-standings-walgreens-wobbles-to-bottom?source=tweet


In [18]:
# replace 4+
four_plus_filter = re.compile(r"\d+\+")

idx = 217169
print(df_text.loc[idx,'text'],"\n")
print(re.sub(four_plus_filter, "more than",df_text.loc[idx,'text']))

$AMZN Verizon joins Amazon and Global Optimism in signing The Climate Pledge https://www.otcdynamics.com/amzn-verizon-joins-amazon-and-global-optimism-in-signing-the-climate-pledge/?utm_campaign=twitter&amp;utm_medium=twitter&amp;utm_source=twitter 

$AMZN Verizon joins Amazon and Global Optimism in signing The Climate Pledge https://www.otcdynamics.com/amzn-verizon-joins-amazon-and-global-optimism-in-signing-the-climate-pledge/?utm_campaign=twitter&amp;utm_medium=twitter&amp;utm_source=twitter


In [19]:
# replace  -70%
minus_filter = re.compile(r"\s\-\d+%*")

idx = 63774
print(df_text.loc[idx,'text'],"\n")
print(re.sub(minus_filter, "",df_text.loc[idx,'text']))

Forever #mood $X $SPY $QQQ $AAPL  

Forever #mood $X $SPY $QQQ $AAPL 


In [20]:
# replace  %
percent_filter = re.compile(r"%+")

idx = 19011
print(df_text.loc[idx,'text'],"\n")
print(re.sub(percent_filter, "",df_text.loc[idx,'text']))

$AAPL has cored me somewhat today, my SHORT is underwater, still holding as the premise has NOT changed, out SHORTS on $NFLX, $SAM, $BA, $OLED, $AMD are more than offsetting $AAPL losses by 3-fold today 

$AAPL has cored me somewhat today, my SHORT is underwater, still holding as the premise has NOT changed, out SHORTS on $NFLX, $SAM, $BA, $OLED, $AMD are more than offsetting $AAPL losses by 3-fold today


In [21]:
renamed_strings = {'plus': [plus_filter,'plus'],
                   'four_plus' : [four_plus_filter,'more than'],
                   'minus' : [minus_filter,'minus'],
                   'percent' : [percent_filter,'percentage']}

In [22]:
def rename_strings(document, filter_condition, replacement_string):
    """replace given regEx string with a space"""
    document = re.sub(filter_condition, replacement_string, document)
    return document

In [23]:
def replace_strings_in_documents(df, filters):
    """filter results for all documents"""
    for char_filter in filters.keys():       
        df.loc[:,'filtered_text'] = df.loc[:,'text'].apply(rename_strings, args=filters[char_filter])
    return df

In [24]:
%%time
df_text = replace_strings_in_documents(df_text, renamed_strings)

Wall time: 8.65 s


In [25]:
df_text.loc[:,['text','filtered_text']].sample(5)

Unnamed: 0,text,filtered_text
326946,Lol...but would help my $MCD puts,Lol...but would help my $MCD puts
261569,•💡 Bulls &amp; Bears Make Money Pigs Get Slaughtered. 📈 Long - Focus on Reltve strngth into any morning weakness into 3312 📉 Short - Watch internals at 3345 4short add $spy $aapl $nvda $amzn $uso $bynd $ba $bmy $dis $tsla $wmt $xom $biib $gild $nflx $gold $ccl $mcd $xle $shak,•💡 Bulls &amp; Bears Make Money Pigs Get Slaughtered. 📈 Long - Focus on Reltve strngth into any morning weakness into 3312 📉 Short - Watch internals at 3345 4short add $spy $aapl $nvda $amzn $uso $bynd $ba $bmy $dis $tsla $wmt $xom $biib $gild $nflx $gold $ccl $mcd $xle $shak
100813,"Dow Jones Stocks To Buy And Watch In May 2020; Apple, Microsoft Approach New Buy Points $JNJ $AAPL $INTC $HD $MSFT","Dow Jones Stocks To Buy And Watch In May 2020; Apple, Microsoft Approach New Buy Points $JNJ $AAPL $INTC $HD $MSFT"
224559,Top Stock Trades for Thursday $AMZN $APT $LAKE $RAD,Top Stock Trades for Thursday $AMZN $APT $LAKE $RAD
116582,i’d say this is the chop/value zone coin flip how we move from here but i’d say down is more likely. if we do let’s see how far we go $SPY $AAPL,i’d say this is the chop/value zone coin flip how we move from here but i’d say down is more likely. if we do let’s see how far we go $SPY $AAPL


### Filter noise

In [26]:
# remove emojis
def remove_emojis(text):
    return ''.join(token for token in text if token not in emoji.UNICODE_EMOJI)

In [27]:
%%time
df_text.loc[:,'filtered_text'] = df_text.loc[:,'filtered_text'].apply(remove_emojis)

Wall time: 11.3 s


In [28]:
# remove urls
url_filter = re.compile(r"www.[\w\d]+.\w+|http://\S+|https://\S+")
print(df_text.loc[5,'text'])
print(re.sub(url_filter, "",df_text.loc[5,'text']))

Apple and Microsoft $AAPL $MSFT dominated S&amp;P 500 this past decade:
Apple and Microsoft $AAPL $MSFT dominated S&amp;P 500 this past decade:


In [29]:
# remove hashtags
hashtag_filter = re.compile(r"#\w+")
print(df_text.loc[0,'text'],"\n")
print(re.sub(hashtag_filter, "",df_text.loc[0,'text']))

2019 Jan 2nd (22:30gmt) $JPY flash crash backdrop: - risk aversion sentiment on political risks - material shift in $AUD short to long positioning in prior quarter (build of stops) - $AAPL downgrade amid fears of China slowdown - China #Caixin PMI miss at 48.3 approx 3 year low 

2019 Jan 2nd (22:30gmt) $JPY flash crash backdrop: - risk aversion sentiment on political risks - material shift in $AUD short to long positioning in prior quarter (build of stops) - $AAPL downgrade amid fears of China slowdown - China  PMI miss at 48.3 approx 3 year low


In [30]:
# remove mentions
mentions_filter = re.compile(r"@\w+")
print(df_text.loc[5,'text'],"\n")
print(re.sub(mentions_filter, "",df_text.loc[5,'text']))

Apple and Microsoft $AAPL $MSFT dominated S&amp;P 500 this past decade: 

Apple and Microsoft $AAPL $MSFT dominated S&amp;P 500 this past decade:


In [31]:
# remove cashtags
cashtag_filter = re.compile(r"\$\w+")
print(df_text.loc[0,'text'],"\n")
print(re.sub(cashtag_filter, "",df_text.loc[0,'text']))

2019 Jan 2nd (22:30gmt) $JPY flash crash backdrop: - risk aversion sentiment on political risks - material shift in $AUD short to long positioning in prior quarter (build of stops) - $AAPL downgrade amid fears of China slowdown - China #Caixin PMI miss at 48.3 approx 3 year low 

2019 Jan 2nd (22:30gmt)  flash crash backdrop: - risk aversion sentiment on political risks - material shift in  short to long positioning in prior quarter (build of stops) -  downgrade amid fears of China slowdown - China #Caixin PMI miss at 48.3 approx 3 year low


In [32]:
# amount & date filter
amount_filter = re.compile(r"""\d+[kKmM]+           # 4k, 5M
                              |\d+B                 # 1B
                              |\d+c                 # 1c
                              |\d+bn                # 1bn
                              |\d+BN                # 1BN
                              |\d+Bn                # 1Bn
                              |\d+mil               # 1mil
                              |\d+st                # 1st
                              |\d+nd                # 2nd
                              |\d+rd                # 3rd
                              |\d+th                # 4th
                              |\d+y                 # 25y
                                """,
                                re.VERBOSE)
idx = 83543
print(df_text.loc[idx,'text'],"\n")
print(re.sub(amount_filter, "",df_text.loc[idx,'text']))

Its amazning how much more money $AAPL makes than $AMZN (3-6x) but they roughly have the same market cap. 

Its amazning how much more money $AAPL makes than $AMZN (3-6x) but they roughly have the same market cap.


In [33]:
# remove non-alphanumeric characters
schar_filter = re.compile(r"[0-9%&+?!$,;=:.(…)\"'/{}“-]+")
idx = 29524
print(df_text.loc[idx,'text'],"\n")
print(re.sub(schar_filter, "",df_text.loc[idx,'text']))

A Beginner’s Guide to Stock Investing. On Amazon. Link: https://www.amazon.com/Beginners-Guide-Stock-Investing-Getting-ebook/dp/B07FTXYXKJ/ref=sr_1_1?ie=UTF8&amp;qid=1532939123&amp;sr=8-1&amp;keywords=rocco+capici $FB $GOOGL $JPM $BAC $WFC $AAPL $PM $PG $GE $HON $JNJ $CSCO $MSFT $INTC $TWTR $TSLA $V $DIS $UNH $VZ $KHC $SLB $XOM $ES_F $GC_F $NQ_F $CL_F $ZB_F $TLT $MET $PNC $CTL $LMT $WMT $TGT $DLTR $AMZN 

A Beginner’s Guide to Stock Investing On Amazon Link httpswwwamazoncomBeginnersGuideStockInvestingGettingebookdpBFTXYXKJrefsr__ieUTFampqidampsrampkeywordsroccocapici FB GOOGL JPM BAC WFC AAPL PM PG GE HON JNJ CSCO MSFT INTC TWTR TSLA V DIS UNH VZ KHC SLB XOM ES_F GC_F NQ_F CL_F ZB_F TLT MET PNC CTL LMT WMT TGT DLTR AMZN


In [34]:
filters = [url_filter, hashtag_filter, mentions_filter, cashtag_filter, amount_filter, schar_filter]

In [35]:
def filter_results(document, filter_condition):
    """replace given regEx string with a space"""
    document = re.sub(filter_condition, " ", document)
    return document

In [36]:
def filter_documents(df, filters):
    """filter results for all documents"""
    for char_filter in filters:       
        df.loc[:,'filtered_text'] = df.loc[:,'filtered_text'].apply(filter_results, args=[char_filter])
    return df

In [37]:
%%time
df_text = filter_documents(df_text, filters)

Wall time: 26 s


In [38]:
df_text[['text','filtered_text']].sample(5)

Unnamed: 0,text,filtered_text
185546,"$SPY $NASDAQ $DJIA Don't short yet bears its not time, wait until the $QQQs finish rallying and you see weird shit like $AMZN at 3k and $MSFT at 250. When shit gets really weird is when you pull out the shorts. Pretty clear where they are going to go and hide.",Don t short yet bears its not time wait until the finish rallying and you see weird shit like at and at When shit gets really weird is when you pull out the shorts Pretty clear where they are going to go and hide
182036,"You wonder why #StockMarket is rising despite our struggling #economy? #CoronaCrisis is shifting power to the digital world - the FANG companies: $FB, $AMZN, $NFLX, $GOOG, $MSFT, $AAPL Now look at their weighting in the S&amp;P 500 and you will understand. #stocks #OilCrash #...",You wonder why is rising despite our struggling is shifting power to the digital world the FANG companies Now look at their weighting in the S amp P and you will understand
223274,"Job listings out of Amazon $AMZN are pointing towards the company adding live TV to Amazon Prime to help differentiate the service from competitors like Netflix $NFLX, Disney Plus $DIS, &amp; HBO $T. It's not exactly clear what type of live content $AMZN is going after.",Job listings out of Amazon are pointing towards the company adding live TV to Amazon Prime to help differentiate the service from competitors like Netflix Disney Plus amp HBO It s not exactly clear what type of live content is going after
127017,"Dr. Henry Balogun Announces Release of New Book, ""Enemy of the Human Race"" $AMZN $BNED $AAPL",Dr Henry Balogun Announces Release of New Book Enemy of the Human Race
136498,ACTIVE TRADERS Try one of these FREE trading guides: http://ow.ly/7t5E30qbuCV $NFLX $TSLA $AAPL $SBUX $GS $FB $AMZN $GOOGL $NVDA,ACTIVE TRADERS Try one of these FREE trading guides


# Lemmatize & remove stopwords and keep only alphanumeric lemmas

## Spacy - token + stopwords + lemma

In [39]:
import spacy

In [40]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [41]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [42]:
# Function to preprocess text
stopwords = spacy.lang.en.stop_words.STOP_WORDS
def preprocess_pos(text):
    # Create Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [(token.lemma_ , token.tag_) for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [(lemma[0], get_wordnet_pos(lemma[1])) for lemma in lemmas 
            if lemma[0] not in stopwords and lemma[0] != '-PRON-' and lemma[1] != '_SP']
    
    return a_lemmas

In [43]:
%%time
# Apply preprocess to ted['transcript']
df_text.loc[:,'spacy_lemma_pos'] = df_text.loc[:,'filtered_text'].apply(preprocess_pos)

Wall time: 23min 5s


In [44]:
def extract_lemma(list_of_tags):
    return [lemma[0] for lemma in list_of_tags] 

In [45]:
%%time
# Apply preprocess to ted['transcript']
df_text.loc[:,'spacy_lemma'] = df_text.loc[:,'spacy_lemma_pos'].apply(extract_lemma)

Wall time: 1.54 s


In [46]:
df_text[['text','filtered_text','spacy_lemma_pos','spacy_lemma']].sample(5)

Unnamed: 0,text,filtered_text,spacy_lemma_pos,spacy_lemma
318190,Marriott International shares are trading lower after the company reported worse-than-expected Q1 EPS results. 7:30:05am Related Tickers: $MAR,Marriott International shares are trading lower after the company reported worse than expected Q EPS results am Related Tickers,"[(Marriott, n), (International, n), (share, n), (trade, v), (low, a), (company, n), (report, v), (bad, a), (expect, v), (Q, n), (EPS, n), (result, n), (relate, v), (Tickers, n)]","[Marriott, International, share, trade, low, company, report, bad, expect, Q, EPS, result, relate, Tickers]"
234909,$EGO chart; Snapped it’s downtrend line. #gold/#silver $TSLA $UBER $NIO $MU $NVDA $INTC $LYFT $XLNX $BIDU $AAPL $IRBT $DIS $PTON $AMD $BILL $M $FISV $NFLX $BABA $LK $Z $CHGG $CMG $SNAP $FB $BYND $LYFT $LULU $DT $EA $NEM $ROKU $RVMD $WDAY $PLUG $BE $FCEL $SPCE $ZM $BLDP,chart Snapped it’s downtrend line,"[(chart, n), (snap, v), (’, v), (downtrend, n), (line, n)]","[chart, snap, ’, downtrend, line]"
203408,Let's end the day with another BIGMONEY winner just getting started ... $SQ 79C already hit 3.40 from 1.67 (&gt;2X or 100% so far) 👊🤑 Email: optionsmaster@hotmail.com to subscribe. $AAPL $AMZN $BABA $BIDU $BYND $GOOGL $NFLX $NVDA $SPX $TSLA $SHOP,Let s end the day with another BIGMONEY winner just getting started C already hit from gt X or percentage so far Email optionsmaster com to subscribe,"[(let, v), (s, ), (end, v), (day, n), (BIGMONEY, n), (winner, n), (start, v), (C, n), (hit, v), (gt, ), (X, n), (percentage, n), (far, r), (Email, n), (optionsmaster, n), (com, n), (subscribe, v)]","[let, s, end, day, BIGMONEY, winner, start, C, hit, gt, X, percentage, far, Email, optionsmaster, com, subscribe]"
134738,"$AMZN Market Chatter: http://Amazon.com Files Trademark Applications for 'Amazon Pharmacy' in Canada, UK, Australia 1/21/20, 2:52 PM",Market Chatter Files Trademark Applications for Amazon Pharmacy in Canada UK Australia PM,"[(Market, n), (Chatter, n), (Files, n), (Trademark, n), (Applications, n), (Amazon, n), (Pharmacy, n), (Canada, n), (UK, n), (Australia, n), (pm, n)]","[Market, Chatter, Files, Trademark, Applications, Amazon, Pharmacy, Canada, UK, Australia, pm]"
9638,"Live stream Tonight 9:30pm EST Canada i explain how I lead my group to massive gains In both $AAPL and $TSLA also I talk about $COST and $AMD $GOOGL, all of dec into this jan month it has been back to back wins, come and ask questions also I talk about the $SPY and the market",Live stream Tonight pm EST Canada i explain how I lead my group to massive gains In both and also I talk about and all of dec into this jan month it has been back to back wins come and ask questions also I talk about the and the market,"[(live, a), (stream, n), (tonight, n), (pm, n), (EST, n), (Canada, n), (explain, v), (lead, v), (group, n), (massive, a), (gain, n), (talk, v), (dec, n), (jan, n), (month, n), (win, n), (come, v), (ask, v), (question, n), (talk, v), (market, n)]","[live, stream, tonight, pm, EST, Canada, explain, lead, group, massive, gain, talk, dec, jan, month, win, come, ask, question, talk, market]"


## NLTK token + stopwords + pos + lemma

In [47]:
from nltk.corpus import stopwords

In [48]:
%%time
# Word Tokenization
tokenizer = TweetTokenizer( preserve_case=True, reduce_len=False)
df_text.loc[:,'nltk_lemma'] = df_text.filtered_text.apply(lambda x: tokenizer.tokenize(x))

Wall time: 38.7 s


In [49]:
# Get rid of stop words 
cachedStopWords = stopwords.words("english")
def remove_stopwords(list_of_words):
    """ removes stopwords """
    no_stopwords = [t for t in list_of_words if t not in cachedStopWords]
    return no_stopwords

In [50]:
def do_pos_tag(list_of_words):
    return nltk.pos_tag(list_of_words)

In [51]:
def lemmatize_words(list_of_words):
    # initialize a lemmatizer object
    wordnet_lemmatizer = WordNetLemmatizer()
    # initiate an empty container for lemmas
    lemmatized_words_pos = []
    # remove stopwords
    list_of_words_ns = remove_stopwords(list_of_words)
    # process a document by assigning pos tag
    list_of_words_ns_pos = do_pos_tag(list_of_words_ns)
    # iterate through the list of words flagged with pos and lemmatized accordingly
    for word_tup in list_of_words_ns_pos:
        current_pos = get_wordnet_pos(word_tup[1])
        if current_pos == '': # not all pos tags have impact on lemmatization, all whuch dont have '' tag
            lemmatized_words_pos.append((wordnet_lemmatizer.lemmatize(word_tup[0]), current_pos))
        else:
            lemmatized_words_pos.append((wordnet_lemmatizer.lemmatize(word_tup[0],current_pos), current_pos))
    return lemmatized_words_pos

In [52]:
%%time
df_text.loc[:,'nltk_lemma_pos'] = df_text.loc[:,'nltk_lemma'].apply(lemmatize_words)

Wall time: 12min 34s


In [53]:
%%time
# Apply preprocess to ted['transcript']
df_text.loc[:,'nltk_lemma'] = df_text.loc[:,'nltk_lemma_pos'].apply(extract_lemma)

Wall time: 2.13 s


In [54]:
df_text[['text','nltk_lemma_pos','nltk_lemma']].sample(5)

Unnamed: 0,text,nltk_lemma_pos,nltk_lemma
145314,@jimcramer Action member just wondering what Jimmy Chill thought of the strength in $AMZN shares Friday held up great in a red tape. Sign of things to come?,"[(Action, n), (member, n), (wonder, v), (Jimmy, n), (Chill, n), (think, v), (strength, n), (share, n), (Friday, n), (hold, v), (great, a), (red, a), (tape, n), (Sign, n), (thing, n), (come, v)]","[Action, member, wonder, Jimmy, Chill, think, strength, share, Friday, hold, great, red, tape, Sign, thing, come]"
214230,"""#Amazon's quest to grow its real estate footprint and resolve the enduring last-mile delivery problem may lead it to the most troubled corner of retail real estate — the department store."" An $AMZN-$JCP $JCPNQ deal could reinvent retail real estate https://www.spglobal.com/m...","[(quest, a), (grow, a), (real, a), (estate, n), (footprint, n), (resolve, n), (endure, v), (last, a), (mile, a), (delivery, n), (problem, n), (may, ), (lead, v), (troubled, a), (corner, n), (retail, a), (real, a), (estate, n), (—, n), (department, n), (store, n), (An, ), (dea...","[quest, grow, real, estate, footprint, resolve, endure, last, mile, delivery, problem, may, lead, troubled, corner, retail, real, estate, —, department, store, An, deal, could, reinvent, retail, real, estate]"
157712,$ROKU market up 500 pts but roku is down. so what happens if we have one of those lunchtime fades? watching to see if we have a sudden air pocket. $TSLA $NFLX $UNH $AMZN $GOOGL $SPY,"[(market, n), (pt, n), (roku, v), (happen, v), (one, ), (lunchtime, n), (fade, v), (watch, v), (see, v), (sudden, a), (air, n), (pocket, n)]","[market, pt, roku, happen, one, lunchtime, fade, watch, see, sudden, air, pocket]"
80012,"Join @RobinhoodApp and we'll both get a stock like $AAPL, $F, or $S for free. Make sure to use my link.","[(Join, n), (get, v), (stock, n), (like, ), (free, a), (Make, n), (sure, n), (use, n), (link, n)]","[Join, get, stock, like, free, Make, sure, use, link]"
45997,$SPY $AAPL $MSFT $TLT Cash is trash,"[(Cash, n), (trash, n)]","[Cash, trash]"


## Spell check

In [None]:
to eliminate double counting same words, you can also implement a spell checker 
https://medium.com/@thomasdecaux/build-a-spell-checker-with-word2vec-data-with-python-5438a9343afd

smpl = df_text.loc[:1000,['text','spacy_transcript']]

from textblob import TextBlob
from spellchecker import SpellChecker

def get_ner(text):
    doc = nlp(text)
    ner_list = [(ent.text, ent.label_) for ent in doc.ents]
    return ner_list

def spell_correction(text):
    b = TextBlob(text)
    return str(b.correct())

def spell_checker(text):
    spell = SpellChecker()
    text = text.split()
    un_words = spell.unknown(text)
    new_text = ' '.join([spell.correction(word) if word in un_words else word for word in text])
    return new_text

%%time
smpl['NER'] = smpl.loc[:,'spacy_transcript'].apply(get_ner)

%%time
smpl['spell_checked'] = smpl.loc[:,'spacy_transcript'].apply(spell_correction)

%%time
smpl['spell_checked_spellchecker'] = smpl.loc[:,'spacy_transcript'].apply(spell_checker)

smpl[smpl.spacy_transcript != smpl.spell_checked_spellchecker].sample(10)

## Merge and save

In [55]:
print("df_text shape:", df_text.shape)
print("df_rest shape:", df_rest.shape)

df_text shape: (417476, 9)
df_rest shape: (417476, 12)


In [56]:
df = pd.concat([df_text,df_rest], axis=1 )

In [57]:
print("df shape:", df.shape)

df shape: (417476, 21)


In [58]:
# save file
current_time = str(datetime.now().strftime("%H%M%S"))
df.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\05_PreProcessed\processed_tweets_"+current_time+".pkl")
stock_prices.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\05_PreProcessed\stock_prices_"+current_time+".pkl")