In [145]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.probability import FreqDist

from datetime import datetime
from glob import glob
from nltk.corpus import wordnet

import time
import os

In [3]:
nltk.__version__

'3.5'

# Set custom options

In [2]:
pd.set_option('display.max_colwidth',280)
pd.set_option('display.html.use_mathjax', False)
pd.set_option('display.max_rows', 1000)

In [20]:
src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\05_PreProcessed'

filenames = glob(src_path+'\*.pkl')

tweets = pd.read_pickle(filenames[0])
stock_prices = pd.read_pickle(filenames[1])

In [22]:
tweets.shape

(417476, 21)

In [16]:
src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\97_LabelledTweets'
filenames = glob(src_path+'\*.csv')
labels = pd.read_csv(filenames[0])

In [17]:
labels.head()

Unnamed: 0,Column1,text,id,date,ticker,Column2
0,0,Some top bullish flow we caught today $TSLA $VXX $AAPL $ZM $BA #stocks #optionsflow #options,1271212964503310000,6/12/2020 0:49,AAPL,1.0
1,1,"Join us for Daily Powerful Watchlist, Swing & Day Option Trading Alerts Paypal monthly link in bio, $134.99 DM for biweekly link $74.99 $fb $aapl $amzn $nflx $googl $bidu $roku $spy $amd $nvda $tsla $ba $baba $shop #trading",1230543208410730000,2/20/2020 18:22,AAPL,0.0
2,2,$AMZN $AAPL down tomorrow? Trying to Shock Stocks With Emergency Cuts Usually Falls Short,1234976737114540000,3/3/2020 23:59,AAPL,-1.0
3,3,"Barron's Picks And Pans: Roundtable Picks, Airline And Oil Stocks, And More $LUV $DAL $UAL $CCL $LSB $MRK $AAPL $AMZN $BMY",1238951790864780000,3/14/2020 23:15,AAPL,0.0
4,4,Apple Confirms Redesigned Maps App Has Rolled Out to All Users Across United States http://dlvr.it/RP4HQP $AAPL,1222970881795740000,1/30/2020 20:52,AAPL,0.0


## Clean Lables

In [18]:
# set tweets ID as index
labels.set_index('id', drop=True, inplace=True)

# drop original index
labels.drop('Column1', axis= 1,inplace=True)

# rename columns
labels.columns = ['text','date','ticker','score']

# drop rows with no class
labels.dropna(axis=0,inplace =True)

In [19]:
labels.shape

(566, 4)

In [24]:
labels.groupby('ticker').score.count()

ticker
AAPL    50
AMZN    26
BABA    36
DLT     19
GILD    46
HLT     56
JNJ     41
MAR     56
MCD     56
MSFT    53
QSR     67
UAL     60
Name: score, dtype: int64

# Bag of Words

In [25]:
%%time
tweets.loc[:,'spacy_text'] = tweets.loc[:,'spacy_lemma'].apply(lambda x: ' '.join(x))

Wall time: 1.67 s
Parser   : 104 ms


In [26]:
%%time
tweets.loc[:,'nltk_text'] = tweets.loc[:,'nltk_lemma'].apply(lambda x: ' '.join(x))

Wall time: 709 ms
Compiler : 287 ms


In [27]:
def create_fdist(df, attribute):
    ticker_words = df.loc[:,attribute].str.cat(sep=' ')
    fdist = FreqDist(ticker_words.split(" "))
    return fdist

In [28]:
%%time
fdist_spacy = create_fdist(tweets,'spacy_text')

Wall time: 4.06 s


In [29]:
%%time
fdist_nltk = create_fdist(tweets,'nltk_text')

Wall time: 4.25 s


In [30]:
def most_common_words(fdist):
    length = 0
    output_list = []
    for key, value in fdist:
        if length < 5000:
            if len(key) > 1:
                output_list.append(key.lower())
                length += 1
    return output_list

In [31]:
%%time
spacy_most_common = most_common_words(fdist_spacy.most_common(6000))
nltk_most_common = most_common_words(fdist_nltk.most_common(6000))

Wall time: 163 ms


## ----------------- For improvement --------------------

1. handling imbalanced classes
https://medium.com/vickdata/detecting-hate-speech-in-tweets-natural-language-processing-in-python-for-beginners-4e591952223
  
1. Pipelines for multiple different classifiers
https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf
  
1. Improve lexicon based method by using only word formt he lexicon with the most extreme sentiment value

# 1. Lexicon-based sentiment

In [32]:
path = r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\99_SentiWord"

In [33]:
sentiwords = pd.read_csv(path+"\\SentiWords_1.1.txt", encoding='utf-8', sep='\t', skiprows=range(1,25,1), header=1)

In [34]:
def extract_lemma(text):
    return text.split('#')[0]

def extract_pos(text):
    return text.split('#')[1]

In [35]:
%%time
sentiwords.loc[:,'lemma'] = sentiwords.iloc[:,0].apply(extract_lemma)

Wall time: 152 ms


In [36]:
%%time
sentiwords.loc[:,'pos'] = sentiwords.iloc[:,0].apply(extract_pos)

Wall time: 104 ms


In [37]:
sentiwords = sentiwords.loc[:,['lemma','pos','prior_polarity_score']].reset_index(drop=True)

In [38]:
# adjectives, nouns, verbs and adverbs
sentiwords.pos.unique()

array(['a', 'n', 'r', 'v'], dtype=object)

In [39]:
sentiwords.loc[:,'nltk_common'] = 0
sentiwords.loc[:,'nltk_common'].mask(sentiwords.lemma.isin(nltk_most_common), 1 , inplace= True)

sentiwords.loc[:,'spacy_common'] = 0
sentiwords.loc[:,'spacy_common'].mask(sentiwords.lemma.isin(spacy_most_common), 1 , inplace= True)

In [40]:
sentiwords_nltk = sentiwords[sentiwords.nltk_common == 1]
sentiwords_spacy = sentiwords[sentiwords.spacy_common == 1]

In [43]:
sentiwords_spacy_idx = sentiwords_spacy.set_index(['lemma','pos'])
sentiwords_nltk_idx = sentiwords_nltk.set_index(['lemma','pos'])

In [44]:
def assign_sentiment_value(list_of_tuples, lexicon):
    score = 0
    for tup in list_of_tuples:
        try:
            score = lexicon.loc[(tup[0],tup[1]),'prior_polarity_score']
            score += score
        except:
            pass
    return score

# old, slow version, it is much faster to search through index than through regular attribute
#def assign_sentiment_value(list_of_tuples, lexicon):
#    res = 0
#    for tup in list_of_tuples:
#         idx = lexicon[(lexicon.lemma == tup[0]) & (lexicon.pos == tup[1])].index.values
#         if len(idx) > 0:
#             res = lexicon.loc[idx,'prior_polarity_score'].values[0]
#             res += res
#     return res

In [46]:
%%time
tweets.loc[:,'nltk_lex'] = tweets.loc[:,'nltk_lemma_pos'].apply(assign_sentiment_value, args=[sentiwords_nltk_idx])

Wall time: 13min 51s


In [45]:
%%time
tweets.loc[:,'spacy_lex'] = tweets.loc[:,'spacy_lemma_pos'].apply(assign_sentiment_value, args=[sentiwords_spacy_idx])

Wall time: 13min 31s


In [48]:
tweets.loc[:,['text','nltk_lex','spacy_lex']].sample(10)

Unnamed: 0,text,nltk_lex,spacy_lex
345215,"Stay ahead with Nasdaq 100 news, views & analysis $MSFT $AAPL $GOOG http://www.cityfalcon.com/watchlists?name=Nasdaq%20Tracker&amp;utm_campaign=T_AT",0.0,0.0
32838,$aapl virus proving to be bullish for this company. 🦠 $spy $spx $qqq $dia $iwm,0.0,0.0
50039,Can you get CoronaVirus from iPhone 11 at Apple Store? The screens look so 'high touched'. $AAPL,0.49462,0.9193
174342,FED Powell Economy Ready to BOOM Again! $AMZN $DIA $SPY $QQQ,0.0,0.81486
5720,Jeremy Siegel worries the hot 2020 stock market could collapse like it did in February 2018 https://www.cnbc.com/2020/01/09/jeremy-siegel-worries-the-hot-2020-market-may-fall-like-february-2018.html $SPY $QQQ $DJIA $DIA $GLD $SLV #stockmarket #investing #finance #stocks #gold...,-0.81654,-0.81654
170841,"$SONM Launches New Product That Could Help Stop Spread Of #Covid19. Low-Float #Nasdaq Listed w/ +1,750% Upside! $bynd $tsla $csco $codx $regn $ba $wmt $tdoc $amzn $twtr $fb $googl $nflx $grub $sbux $zm $mgm $penn $vvus $crm $aal $btc $abt $msft $amrn https://financialnews.med...",0.0,0.0
62994,"""Rush Rally 3' Just Got a Big Update Adding a New Classic Cars Expansion IAP, Updated Graphics for All Cars, and More http://dlvr.it/RSvWSJ $AAPL",0.0,0.0
149550,"LMAO That was hilarious, @MikeBloomberg said ""any jackass can knock down a barn door. It takes a good carpenter to build a barn"" $NFLX $MSFT $TSLA $SPY $AAPL #stocks #MAGA $TGT $SHOP $JPM $BA $AMZN $GE $NIO $CHK $NOK $AMD $F $BAC $EEM $SDC $EFA $XLP $XLE $CSCO $QQQ $UBER $XOM $S",0.0,0.0
404159,Beautiful. God I have loved $UAL this week.,0.0,0.0
347239,Thurs (2/27/20) gap down highest % below OR 30-min low and trading range $BLDP $TSLA $PPL $WPX $NLY $MRVL $INTC $MSFT $COP $MET $COG $AAPL $IBM $XOM $USB $SIRI $FITB $MPC $BRK.B $BHC $O $CHNG $CVS $AXP $RDS.B $KEY $CLF $DOW $C,-0.00478,-0.00478


In [52]:
# save file
current_time = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
tweets.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled\SentimentLabels1_"+current_time+".pkl")

# 2 Distant supervision using emojis

In [571]:
#src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled'
#filenames = glob(src_path+'\*.pkl')
#tweets = pd.read_pickle(filenames[0])

In [53]:
import functools
import operator
import re
import emoji

In [54]:
emo_path = r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\98_Emoji_Sentiment"
df_emoji = pd.read_csv(emo_path+"\\Emoji_Sentiment_Data_v1.0.csv", encoding='utf-8')

In [55]:
df_emoji['sent_score'] = (df_emoji.Positive - df_emoji.Negative) / df_emoji.Occurrences

In [56]:
tweets_with_emojis = tweets.loc[tweets.emoji != '',:]

In [57]:
def filter_known_emojis(emoji_str):

    em_split_emoji = emoji.get_emoji_regexp().split(emoji_str)
    em_split_whitespace = [substr.split() for substr in em_split_emoji]
    em_split = functools.reduce(operator.concat, em_split_whitespace)
    
    known_emojis = [emo for emo in em_split if emo in df_emoji['Emoji'].values]
    return known_emojis

In [58]:
tweets_with_emojis['emoji_list'] = tweets_with_emojis.loc[:,'emoji'].apply(filter_known_emojis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [59]:
# remove all lines where emojis are not known
tweets_with_emojis = tweets_with_emojis[tweets_with_emojis['emoji_list'].map(lambda d: len(d)) > 0]

In [60]:
tweets_with_emojis.loc[:,['emoji','emoji_list']]

Unnamed: 0,emoji,emoji_list
30,🎀🏇🏇,"[🎀, 🏇, 🏇]"
39,👉,[👉]
51,🏆🇺🇸,[🏆]
53,⬆,[⬆]
86,🚔🚔🚔📊,"[🚔, 🚔, 🚔]"
...,...,...
417413,✈👨✈👩✈,"[✈, 👨, ✈, 👩, ✈]"
417432,🤦🏻♂,[♂]
417433,🥇🔽▪🥈🔽▪🥉🔽▪➡,"[▪, ▪, ▪, ➡]"
417457,🖊⛽🛢🎰✈✈⬇,"[⛽, 🎰, ✈, ✈, ⬇]"


In [61]:
def get_emoji_sent(emoji_list):
    total_score = 0
    for emo in emoji_list:
        score = df_emoji.loc[df_emoji['Emoji'] == emo,'sent_score'].values
        if len(score) > 0: 
            total_score += score[0]            
    if total_score > 0.25:
        sent_score = 1
    elif total_score >= -0.25 and total_score <= 0.25:
        sent_score = 0
    else:
        sent_score = -1
    return sent_score

In [62]:
tweets_with_emojis.loc[:,'emoji_score'] = tweets_with_emojis.loc[:,'emoji_list'].apply(get_emoji_sent)

In [70]:
tweets_with_emojis.loc[:,['emoji_list','emoji_score']].sample(10)

Unnamed: 0,emoji_list,emoji_score
125300,"[👇, 💪, 👇]",1
220314,[➡],0
140907,"[💚, 💔, 💚, 💚, 💚, 💔, 💔, 💚]",1
318972,"[👌, 👌, 👌, ❤]",1
355191,"[🍞, 🍞, 🍞]",0
326870,[♀],1
119547,[🔮],1
378670,"[😱, 😷, 📉]",1
180758,"[💵, 🏪, 🏥]",1
162572,"[🚂, 💰, 💰, 💰, 💰]",1


In [72]:
tweets_with_emojis.groupby('emoji_score').text.count()

emoji_score
-1      499
 0     7418
 1    29556
Name: text, dtype: int64

### Train Naive Bayes

### explenation of assumptions: 
https://towardsdatascience.com/sentiment-analysis-introduction-to-naive-bayes-algorithm-96831d77ac91

In [73]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time

In [74]:
tweets_with_emojis.loc[:,'spacy_lemma_text'] = tweets_with_emojis.loc[:,'spacy_lemma'].apply(lambda x: ' '.join(x))
tweets_with_emojis.loc[:,'nltk_lemma_text'] = tweets_with_emojis.loc[:,'nltk_lemma'].apply(lambda x: ' '.join(x))

In [75]:
tweets_with_emojis.loc[:,['spacy_lemma_text','nltk_lemma_text','emoji_score']].sample(10)

Unnamed: 0,spacy_lemma_text,nltk_lemma_text,emoji_score
79929,"new video technology monopoly "" strong claim strong addition portfolio check new video find",NEW VIDEO Is ‘ technology monopoly ” strong enough claim strong addition portfolio Check new video find,1
9646,today Update Reminder play contract time common Daily Goal today P l start Balance Account grow growth,Today ’ Update Reminder I play mostly contract time With common Daily Goal Today ’ P L Starting Balance Account Grows Growth,1
106171,BOOM come Join pro daily Powerful Watchlist Swing amp Day Option Trading Alerts monthly link bio,BOOM Come Join The Pro For Daily Powerful Watchlist Swing amp Day Option Trading Alerts Monthly link bio,1
268919,stock board Dow currently thing work treatment gold,As stock across board Dow currently thing work treatment Gold,1
51978,today s Biggest Losers Tesla Inc » ️ percentage Apple Inc » ️ percentage Microsoft Corporation » ️ percentage ranking ️,Today Biggest Losers Tesla Inc » ️ percentage Apple Inc » ️ percentage Microsoft Corporation » ️ percentage Rankings ️,1
387631,COMING SOON Sector Rotation Insights conversation monitor ️ ️ ️ ‍ ️ ️ meantime explore free content,COMING SOON Sector Rotation Insights across conversation monitor ️ ️ ️ ‍ ️ ️ In meantime explore free content,1
411574,"follow PART share tip profit market strategy work decision feel "" hurt y",Follow PART Re share tip profit market top strategy work well Decisions make make feel good ” almost always hurt,1
4053,video * MUST *,VIDEO * MUST SEE *,1
32536,ready leg float curl trading perfectly buckle,ready next leg float Curling back trading perfectly Buckle,1
256334,hr Volume Alert current volume average percentage average price percentage,hr Volume Alert current volume average percentage average Price percentage,1


In [76]:
# Create CountVectorizer object
vectorizer = CountVectorizer(lowercase=True, strip_accents='unicode', ngram_range=(1,1))

# Generate matrix of word vectors
bow_spacy_v = vectorizer.fit_transform(tweets_with_emojis.spacy_lemma_text)

In [77]:
vectorizer_tfidf = TfidfVectorizer(lowercase=True, strip_accents='unicode', ngram_range=(1,1))

# Generate matrix of word vectors
bow_spacy_tfidf = vectorizer_tfidf.fit_transform(tweets_with_emojis.spacy_lemma_text)

In [78]:
bow_matrix = bow_spacy_v
train_X, test_X, train_y, test_y = train_test_split(bow_matrix,
                                                    tweets_with_emojis['emoji_score'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=tweets_with_emojis['emoji_score'])

* average=micro says the function to compute f1 by considering total true positives, false negatives and false positives (no matter of the prediction for each label in the dataset)
* average=macro says the function to compute f1 for each label, and returns the average without considering the proportion for each label in the dataset.
* average=weighted says the function to compute f1 for each label, and returns the average considering the proportion for each label in the dataset.
* average=samples says the function to compute f1 for each instance, and returns the average. Use it for multilabel classification.

In [79]:
MNB = MultinomialNB()

In [80]:
scoring = ['precision_samples', 'recall_samples', 'f1_samples']

In [81]:
# KFold/StratifiedKFold cross validation with 3 folds (the default)
# applying the classifier pipeline to the feature and target data
scores_spacy_v = cross_validate(MNB, train_X, train_y, cv=5, scoring = 'f1_micro')

In [82]:
scores_spacy_v['test_score'].mean()

0.7528522434472579

In [90]:
scores_spacy_tfidf = cross_val_score(MNB, train_X, train_y, cv=5, scoring = 'f1_micro')

In [93]:
scores_spacy_tfidf

array([0.75283522, 0.75100067, 0.75016678, 0.75095913, 0.75929942])

In [94]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_spacy_v['test_score'].mean(), scores_spacy_v['test_score'].std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_spacy_tfidf.mean(), scores_spacy_tfidf.std() * 2))

Accuracy: 0.75 (+/- 0.01)
Accuracy: 0.75 (+/- 0.01)


## GridSearch for best parameter

In [None]:
# https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

In [95]:
train_X, test_X, train_y, test_y = train_test_split(tweets_with_emojis['spacy_lemma_text'],
                                                    tweets_with_emojis['emoji_score'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=tweets_with_emojis['emoji_score'])

In [96]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [97]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 20000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__fit_prior': (True, False)
}

In [744]:
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__fit_prior': (True, False),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 20000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  3.8min finished


done in 231.095s

Best score: 0.770
Best parameters set:
	clf__fit_prior: True
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__ngram_range: (1, 2)


### make prediction with NB

In [98]:
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, max_features = 20000, ngram_range = (1, 2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', MultinomialNB(fit_prior = True)),
])

In [99]:
X = tweets_with_emojis['spacy_lemma_text']
y = tweets_with_emojis['emoji_score']
X_full = tweets.loc[:,'spacy_lemma'].apply(lambda x: ' '.join(x))

In [100]:
pipeline.fit(X, y)

Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.5, max_features=20000,
                                 ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [101]:
tweets.loc[:,'spacy_NB_sentiment'] = pipeline.predict(X_full)

In [102]:
tweets.columns

Index(['text', 'ticker', 'cashtags', 'emoji', 'filtered_text',
       'spacy_lemma_pos', 'spacy_lemma', 'nltk_lemma', 'nltk_lemma_pos',
       'username', 'to', 'retweets', 'favorites', 'replies', 'id', 'author_id',
       'date', 'hashtags', 'mentions', 'urls', 'sentiment_collection_date',
       'spacy_text', 'nltk_text', 'spacy_lex', 'nltk_lex',
       'spacy_NB_sentiment'],
      dtype='object')

In [104]:
tweets.loc[:,['text','emoji','nltk_lex','spacy_lex','spacy_NB_sentiment']].sample(10)

Unnamed: 0,text,emoji,nltk_lex,spacy_lex,spacy_NB_sentiment
224395,$TLRD had 634 stores of 1450 Locations open as of June 5th. $$$$Revenue $BA $TLRD $OAS $IDEX $DAL $CPE $SHIP $NCLH $ENLC $TTI $NFLX $M $RCL $HTZ $PRTY $QEP $OXY $NIO $NKLA $AAPL $LLEX $USO $MRO $MPC $KIRK $NVDA $AMD $MSFT $TSLA $FB $AMZN $INO $INTC $CSCO $CCL $AAL $ADBE,,0.0,0.6484,1
318593,Jim Cramer advises investors not to get too optimistic over Moderna vaccine progress $MRNA $UAL $NCLH $MAR #coronavirus #COVID2019,,0.79062,0.79062,1
225316,$amzn too : 1626 to Cup N' handle target 2795-6,,0.0,0.0,1
249016,2020-05-21 Short sale volume (not short interest) for $JD is 36%. http://shortvolumes.com/?t=JD $XLK 61% $LITB 64% $BABA 49%,,0.0,0.0,1
5848,What time does limit down in $AAPL start today? Asking for a friend....at SNB,,0.95374,0.95374,1
18992,"Really with this logic anyone profiting of gaming and video just as bad. Which includes $msft, $aapl, $goog, $amzn, and $nflx. Throw moral $dis in as well which has embraced sports gambling. Cigs are a carcinogen no matter what while social media can be used productively.",,0.17258,0.17258,0
69828,$SPY $DIA $AAPL $AMD $CCL $AAL Plot twist- get unemployment and buy stocks 😳👀,😳👀,0.19634,0.19634,1
80663,Greg Abel to Share Stage With Warren Buffett at Annual Meeting $BRK.B $AAPL $WFC,,0.0,0.36242,1
118021,"$AAPL, $AMZN, $BRK/B, $HD, $LLY, $MA, $MCD, $MSFT, $NFLX, $NVDA, $PYPL, $TWLO, $UNP, V to Calls_For_Scalp_Very_Risky ($SL 20%).",,0.0,0.0,1
318405,Equities Analysts Offer Predictions for Marriott International Inc’s Q3 2020 Earnings $MAR http://theenterpriseleader.com/?p=3138251,,0.0,0.25304,1


In [105]:
# save file
current_time = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
tweets.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled\SentimentLabels2_"+current_time+".pkl")

# 3. VADER

In [106]:
# Vader sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [107]:
sid = SentimentIntensityAnalyzer()
def get_VADER_polarity(text):
    """applies VADER analysis for polarity score"""
    polarity_score =  sid.polarity_scores(text)
    return polarity_score

In [108]:
def extract_compound_polarity(input_dict, polarity_score = 'compound'):
    """extract compount score from VADER polarity score"""
    cmp_score = input_dict[polarity_score]
    return cmp_score

In [109]:
%%time
tweets.loc[:,'spacy_VADER_polarity'] = tweets.loc[:,'spacy_text'].apply(get_VADER_polarity)

Wall time: 1min 2s


In [110]:
%%time
tweets.loc[:,'VADER_spacy_score'] = tweets.loc[:,'spacy_VADER_polarity'].apply(extract_compound_polarity)

Wall time: 705 ms


In [113]:
tweets.loc[:,['text','emoji','nltk_lex','spacy_lex','spacy_NB_sentiment','VADER_spacy_score']].sample(5)

Unnamed: 0,text,emoji,nltk_lex,spacy_lex,spacy_NB_sentiment,VADER_spacy_score
90918,"Today Top Flow in S&P 500 #SP500, Buy Flow and Sell Flow $NVDA $JPM $AMD $GILD $DHR $IBM $PG $CVX $COST $MU $AAPL $C $BA $GOOG $JNJ $PFE $MSFT $MRK $ABT $MA#stocks #StockMarket #Investment #investing https://apple.co/2XZuTYw",,0.0,0.15326,1,0.2023
113221,Some big players bought $136MM worth of $SPY at 302.15 around 11 am EST. $SPY $AAPL flagging up as well I think $SPY 302.71~300.47 will be the floor. Expecting a big gap up on Sunday enter a small starter $SPX 0615 3150c at 4.2 #daytrading #swingtrading #optiontrading #options,,0.0,0.0,1,0.2263
224993,On the surface it would seem $AMZN could be a buyer. Is the thought that Amazon reaps more rewards by letting $FSLY innovate on its own which in turn makes its AWS offering more compelling?,,0.25304,0.25304,1,0.8591
401385,"The Weekly Market Review | March 6, 2020 | $AAPL $TGT $UAL | https://youtu.be/bxQ4I0yOYh4 | #Stocks #TravelIndustry #Airlines #Investments #WallStreetNews",,0.0,0.0,1,0.0
171810,HQ3 + SpacePort #HQ3 #SpacePort $AMZN,,0.0,0.0,0,0.0


In [114]:
def VADER_sentiment_classifier(vader_score):
    if vader_score > 0.1 :
        sentiment = 1
    elif vader_score < -0.1:
        sentiment = 0
    else:
        sentiment = -1
    return sentiment

In [116]:
%%time
tweets.loc[:,'VADER_spacy_score'] = tweets.loc[:,'VADER_spacy_score'].apply(VADER_sentiment_classifier)

Wall time: 210 ms


In [118]:
tweets.loc[:,['text','emoji','nltk_lex','spacy_lex','spacy_NB_sentiment','VADER_spacy_score']].sample(5)

Unnamed: 0,text,emoji,nltk_lex,spacy_lex,spacy_NB_sentiment,VADER_spacy_score
128180,"Join us for Daily Powerful Watchlist, Swing & Day Option Trading Alerts Paypal monthly link in bio, $99.99 DM for biweekly link $59.99 $fb $aapl $amzn $nflx $googl $bidu $roku $spy $amd $nvda $tsla $ba $baba $shop #trading #OptionsTrading",,0.0,0.0,1,1
130930,"At breakeven now, if $AMZN cracks low of day looking ot cash in around 1883/84, that should be close to 100% on the 3 puts",,0.0,0.0,1,0
248641,"close $BABA IC 190/230 $5wings JUN $157db a $24 win over night, super small win, nearly a scratch, but by bye, don't want to be here @TraderNickyBAT @Tomunderwater @Tony_BATtista @TFMTrades @tastytradar #tastytrades",,0.4917,0.4917,1,1
7421,Weekly Stock Market Review: ✅ Week #3✅Stay Ahead Of The Curve: Top 10 Stock Picks For January 2020: http://messages.responder.co.il/4370841/ $AAPL $NVDA $MU $AMZN $FB $GOOGL $NFLX $SPY $QQQ $DIA $INTC $AMD $F $GOOG $BRK_B $GLD $SBUX $BA $VIX $CC $VAC $WIX #NASDAQ #Apple #trad...,✅✅,0.0,0.25856,1,1
75977,$AAPL #Options Power and #Profile Update #OptionsTrading https://apple.co/2XZuTYw https://twitter.com/minteractapp/status/1252643349699280896?s=21,,0.0,0.0,1,-1


In [120]:
# save file
current_time = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
tweets.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled\SentimentLabels3_"+current_time+".pkl")

# 3. Teach on external labelled dataset

In [121]:
from nltk.corpus import movie_reviews

In [122]:
# http://www.nltk.org/howto/twitter.html
from nltk.corpus import twitter_samples
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [123]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\jaromir\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [124]:
all_words = all_words = nltk.FreqDist(movie_reviews.words()).most_common(5000)

In [125]:
feature_vector = list(all_words)[:4000]

In [126]:
list_of_file_id = movie_reviews.fileids()

In [127]:
movie_reviews_list = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]

In [128]:
def format_movie_reviews(list_of_tuples):
    df = pd.DataFrame(columns=['text','score'])
    
    # text
    text = [tup[0] for tup in list_of_tuples]
    df['text'] = text
    df.loc[:,'text'] = df.loc[:,'text'].apply(lambda x: ' '.join(x))
    
    # scores
    scores = [1 if tup[1]=='pos' else 0 for tup in list_of_tuples]
    df.loc[:,'score'] = scores
        
    return df

In [129]:
movie_reviews_df = format_movie_reviews(movie_reviews_list)

In [130]:
train_X, test_X, train_y, test_y = train_test_split(movie_reviews_df['text'],
                                                    movie_reviews_df['score'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=movie_reviews_df['score'])

In [131]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [132]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 20000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__fit_prior': (True, False)
}

In [822]:
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_X, train_y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__fit_prior': (True, False),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 20000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 17.8min finished


done in 1074.542s

Best score: 0.841
Best parameters set:
	clf__fit_prior: True
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__ngram_range: (1, 2)


### making rpediction with NB learned on movie reviews

In [133]:
pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, max_features = 20000, ngram_range = (1, 2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', MultinomialNB(fit_prior = True)),
])

In [134]:
X = movie_reviews_df['text']
y = movie_reviews_df['score']
X_full = tweets.spacy_text

In [135]:
pipeline.fit(X, y)

Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.5, max_features=20000,
                                 ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [136]:
tweets.loc[:,'spacy_movie_NB_sentiment'] = pipeline.predict(X_full)

In [137]:
tweets.loc[:,['text','emoji','nltk_lex','spacy_lex','spacy_NB_sentiment','spacy_movie_NB_sentiment']].sample(10)

Unnamed: 0,text,emoji,nltk_lex,spacy_lex,spacy_NB_sentiment,spacy_movie_NB_sentiment
379315,$spy $spx $aapl $amzn $qqq $twtr $mrna $msft,,0.0,0.0,1,0
119686,$AAPL hits new high on store closures. What a world.,,0.0,0.0,1,1
193764,David Faber debating against likelihood of $AMZN buyout of $AMC. $AMC up 45% on buyout mentioned in Daily Mail report,,0.0,0.0,1,1
266243,"Also, if you are new to $GILD here's a background article on Finpedia for you https://finpedia.co/bin/Gilead%20Sciences/",,0.0,0.0,1,1
211341,2 Cheap Technology Stocks to Buy Now $MSFT $QCOM Also $AAPL $AMZN $GOOG $FB,,0.0,0.948,1,0
44453,"If you own $AAPL stock, now is the time to look at these three suppliers.",,0.0,0.0,1,0
144133,Why didn’t $AMZN go w $TSLA $TSLAQ? They know it’s a piece of shit?,,-0.54696,-0.54696,1,1
409637,On Watch. $UAL $UAA $GIS $MAR $CAH $ON $BA $TSLA. Let the games begin!!!,,0.8039,0.8039,1,0
78829,Price gainers on Friday - $AZO $SHOP $W $ORLY $CMG $GWW $BYND $QDEL $ZM $TMO $SRPT $LTRPB $DXCM $CHK $MESO $TDOC $SIVB $TSA $HD $VICR $COHR $QURE $HUBS $TSCO $AAPL $RNG $WSO $RH $GBT $RGEN $EBS $MASI $PZZA $LAD $WWE $VEEV $MKTX $NXPI $CRWD $ANET $NVRO,,0.16822,0.16822,1,0
134662,Mozart trading system: https://profectussystems.autotradenow.com/details/85617966 $TQQQ $WOOF $XLP $QLD $IBB $HDV $CCI $GLD $EFX $CNP $IFF $TSLA $AMZN $MSFT $IBM $GOOGL $SPY,,0.2213,0.2213,1,1


In [138]:
# save file
current_time = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
tweets.to_pickle(r"c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled\SentimentLabels4_"+current_time+".pkl")

In [141]:
def do_cool_stuff():
    
    os.chdir(out_path)
    
    do_countdown(5)
    time.sleep(1)
    
    print("working ...")
    time.sleep(2)
    print("The output is ready!")
    time.sleep(1)
    os.system("START /MAX notepad.exe msg.txt") 

# 4. Transfer-learning model

In [None]:
vectorizer.get_feature_names()

# Convert bow_matrix into a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray())

# Map the column names to vocabulary 
bow_df.columns = vectorizer.get_feature_names()

# Print bow_df
print(bow_df)

train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.5, random_state=42, stratify=df['sentiment'])

# Generating ngrams
vectorizer = CountVectorizer(ngram_range=(1,3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

## TF-IDF



# Create CountVectorizer object
vectorizer = TfidfVectorizer(lowercase=False)

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(df_text.text)

# Print the shape of bow_matrix
print(bow_matrix_lem.shape)

# Remember, the value corresponding to the ith row and jth column of a similarity matrix 
# denotes the similarity score for the ith and jth vector.
cosine_sim = linear_kernel(bow_matrix[:10000,:], bow_matrix[:10000,:])

cosine_sim[0,:]

def max_sim(row_idx ,cosine_sim_matrix):
    # finds the index value of the 
    max_sim = np.argsort(cosine_sim_matrix[row_idx,:])[-2]
    
    print(df_text.loc[row_idx,'text'],"\n")
    print(df_text.loc[max_sim,'text'])

df_text.loc[:,'text']



In [None]:
### Spacy - Named entity recognition

def find_persons(text):
  # Create Doc object
  doc = nlp(text)
  
  # Identify the persons
  persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
  
  # Return persons
  return persons

print(find_persons(tc))

df_text.loc[np.argsort(cosine_sim[1,:])[-2],'text']

np.where(cosine_sim[0,:] == max_sim)

np.argsort(cosine_sim[0,:])[-2]

max_sim(3,cosine_sim)

cosine_sim[0,:][np.argsort(cosine_sim[0,:]) == 998][0]

df_text.loc[[0],'text']

df_text.loc[[517],'text']

## Embeddings

import en_core_web_lg
nlp_lg = en_core_web_lg.load()

doc = nlp_lg("I am happy")
for token1 in doc:
    for token2 in doc:
        print(token1.similarity(token2))

In [None]:
vader.loc[:,'date'] = pd.to_datetime(vader.loc[:,'date']).dt.date

vader_ts = vader[(vader.VADER_sentiment != 'N') & (vader.ticker == 'JPM')].groupby(['ticker','date']).mean()

jpm_price = stock_price[stock_price.ticker == 'JPM']
jpm_price.loc[:,'Date'] = pd.to_datetime(jpm_price.Date)
jpm_price.set_index('Date', inplace=True)
jpm_price = jpm_price['Adj Close']
jpm_price.head()

vader_ts.index = vader_ts.index.droplevel()

vader_ts.head(3)

sent_price = vader_ts.merge(jpm_price, left_index=True, right_index=True)

x = sent_price.index
y1 = sent_price.VADER_score
y2 = sent_price['Adj Close']

fig, ax1 = plt.subplots(figsize=(10,6))

ax1.plot(x,y1, c='royalblue', label = 'VADER_score')
ax2 = ax1.twinx()
ax2.plot(x,y2, c ='red', label = 'Stock price')

ax1.set_xticklabels(x,rotation=30)
ax1.set_title('JPM sentiment against stock price in time')
fig.legend(bbox_to_anchor=(0.32, -0.2, 0.5, 0.5))
plt.show()

stock_price