In [65]:
import pandas as pd
from glob import glob
import re
import matplotlib.pyplot as plt
import numpy as np

In [9]:
pd.set_option('display.max_colwidth',None)

In [2]:
src_data = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData'

In [3]:
filenames = glob(src_data+'\#*.csv')
df = pd.DataFrame()

for file in filenames:
    # read a csv from source folder
    data = pd.read_csv(file, encoding = 'utf-8')
    # parese the name of the file to extract the ticker name and save it 
    # as a new attribute
    data['ticker'] = re.search('#[A-Z]+_',file).group(0)[1:-1]
    # concatenate the csv with the rest of the DataFrame
    df = pd.concat([df, data], axis=0, sort=False, ignore_index = True)
    df.drop(labels = 'Unnamed: 0', inplace=True, axis=1)

In [4]:
# Load fiannce data
stock_price = pd.read_csv(src_data+'/stock_price.csv')

In [5]:
# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.tokenize.casual import TweetTokenizer as CasualTweetTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from functools import reduce
from collections import Counter

In [51]:
df_text = df[['text','ticker']]

In [46]:
df_text.head(3)

Unnamed: 0,text,ticker
0,"#PBPitch Kintsugi's embarrassed by her golden prosthetic leg. But when she's the only one who can save her friend, her “brokenness” is recognized as valuable and even beautiful. Facts about kintsugi, the art of mending broken bowls with gold, included. #OWN #ownvoices #DIS",DIS
1,"Buddy plans to pen the jazziest love letter to his mom; but, like him, his letters never met a line they liked. Dropping beats and busting moves are more their forte. Will he get the biggest baddest band in formation and avoid the Mother’s Day blues? #PBPitch #L #BVM #POC #DIS",DIS
2,"After one-year-old ALBIE is checked into the hospital for heart surgery, he goes on an adventure, navigating the seemingly endless halls, getting lost, and making new friends while searching for his mom so they can go home. #PB #DIS #PBPitch",DIS


In [53]:
# select a random sample of 5 tweets for each ticker
smpl = df_text.groupby('ticker').apply(lambda x: x.sample(n=5, random_state = 10))

# get rid of multi-level index
smpl.reset_index(drop=True, inplace=True)

# print the results
smpl

Unnamed: 0,text,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,JPM


In [81]:
# Sentence Tokenization
smpl.loc[:,'Sent_Token'] = smpl.text.apply(lambda x: sent_tokenize(x))
smpl = smpl

In [114]:
smpl.drop(labels=['Sent_Token'], axis=1, inplace=True)

In [97]:
def reorder(df):
    """ switches the last and the penultimate attribute of a DataFrame"""
    df = df.iloc[:,np.append(np.delete(np.arange(-len(df.columns),0),-2),-2)]
    return df

In [98]:
smpl = reorder(smpl)
smpl

Unnamed: 0,text,Sent_Token,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,[The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth],DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,"[#DIS wishes you all a very #HappyHoli., #Holi2019 #HoliCelebration]",DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,[$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis],DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,[Your buy earlier at 137 support $DIS #DIS wow!],DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis","[GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe., But the refuge has secrets, and Rhea must learn them or risk losing her sister for good., #PitMad #YA #SF #dis]",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,[I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima],JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha","[J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London., #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha]",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,[China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY],JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,[First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com],JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,"[How are there people allowed to continue in business., At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE]",JPM


In [103]:
# Word Tokenization
smpl.loc[:,'Word_Token'] = smpl.text.apply(lambda x: TweetTokenizer().tokenize(x))
smpl = reorder(smpl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [110]:
smpl

Unnamed: 0,text,Sent_Token,Word_Token,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,[The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth],"[The, Latest, :, California, gov, fears, virus, spread, amid, protests, $, DIS, #DIS, #Health, #Diseaseoutbreaks, #Publichealth]",DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,"[#DIS wishes you all a very #HappyHoli., #Holi2019 #HoliCelebration]","[#DIS, wishes, you, all, a, very, #HappyHoli, ., #Holi2019, #HoliCelebration]",DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,[$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis],"[$, WYNN, $, MGM, $, OXY, $, BP, $, MAR, $, DIS, $, NKE, $, SBUX, $, BA, $, CLL, #stocks, #stockstobuy, #stockstowatch, #wynn, #mgm, #starbucks, #carnival, #nike, #marriott, #disney, #dis]",DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,[Your buy earlier at 137 support $DIS #DIS wow!],"[Your, buy, earlier, at, 137, support, $, DIS, #DIS, wow, !]",DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis","[GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe., But the refuge has secrets, and Rhea must learn them or risk losing her sister for good., #PitMad #YA #SF #dis]","[GATTACA, x, GUNSLINGER, GIRL, Running, from, a, govt, that, wants, her, sister, dead, ,, 17, y, /, o, sharpshooter, Rhea, is, hunting, for, the, place, that, keeps, defects, like, Addie, safe, ., But, the, refuge, has, secrets, ,, and, Rhea, must, learn, them, or, risk, losing, her, sister, for, good, ., #PitMad, #YA, #SF, #dis]",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,[I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima],"[I, vow, to, one, day, serve, this, country, ..., #CCM, chama, langu, #JPM, tuchape, kaz, ..., mbele, daima]",JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha","[J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London., #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha]","[J, ., P, ., Morgan, Symphony, Orchestra, Autumn, Concert, ,, 25th, October, 2019, ,, St, John's, Waterloo, ,, London, ., #jpm, #jpmorgan, #jpmso, #Symphony, #Orchestra, #orkestra, #senfoni, #music, #müzik, #london, #londra, #stjohnswaterloo, #concert, #event, #ezgigunuc, #violin, #keman, #strings, #photograph, #sonyalpha]",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,[China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY],"[China's, 2019, economic, growth, weakens, amid, trade, war, $, JPM, #JPM, #Governmentspending, #Fiscalpolicy, #Economicpolicy, http://zpr.io/thaAY]",JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,[First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com],"[First-Quarter, 30, +, Day, Credit, Card, Delinquency, Among, Top, 4, U.S.Issuers, on, Upward, Trajectory, http://dld.bz/h2Jtr, #chase, #JPM, #capitalone, #COF, #bankofamerica, #BAC, citibank, #, C, #secondquarter, #ramresearch, #creditcards, http://www.CardData.com]",JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,"[How are there people allowed to continue in business., At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE]","[How, are, there, people, allowed, to, continue, in, business, ., At, minimum, they, should, be, investigated, and, brought, to, justice, #Tsla, #jpm, #JPMorgan, ⁦, @jpmorgan, ⁩, #nasdaq, #NYSE]",JPM


In [111]:
# Turn everything lower case
def lower_list(list_of_word):
    """ turn all strings into lower case"""
    low_list = [word.lower() for word in list_of_word]
    return low_list

In [112]:
smpl.loc[:,'Word_Token'] = smpl.loc[:,'Word_Token'].apply(lower_list)

Unnamed: 0,text,Sent_Token,Word_Token,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,[The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth],"[the, latest, :, california, gov, fears, virus, spread, amid, protests, $, dis, #dis, #health, #diseaseoutbreaks, #publichealth]",DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,"[#DIS wishes you all a very #HappyHoli., #Holi2019 #HoliCelebration]","[#dis, wishes, you, all, a, very, #happyholi, ., #holi2019, #holicelebration]",DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,[$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis],"[$, wynn, $, mgm, $, oxy, $, bp, $, mar, $, dis, $, nke, $, sbux, $, ba, $, cll, #stocks, #stockstobuy, #stockstowatch, #wynn, #mgm, #starbucks, #carnival, #nike, #marriott, #disney, #dis]",DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,[Your buy earlier at 137 support $DIS #DIS wow!],"[your, buy, earlier, at, 137, support, $, dis, #dis, wow, !]",DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis","[GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe., But the refuge has secrets, and Rhea must learn them or risk losing her sister for good., #PitMad #YA #SF #dis]","[gattaca, x, gunslinger, girl, running, from, a, govt, that, wants, her, sister, dead, ,, 17, y, /, o, sharpshooter, rhea, is, hunting, for, the, place, that, keeps, defects, like, addie, safe, ., but, the, refuge, has, secrets, ,, and, rhea, must, learn, them, or, risk, losing, her, sister, for, good, ., #pitmad, #ya, #sf, #dis]",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,[I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima],"[i, vow, to, one, day, serve, this, country, ..., #ccm, chama, langu, #jpm, tuchape, kaz, ..., mbele, daima]",JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha","[J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London., #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha]","[j, ., p, ., morgan, symphony, orchestra, autumn, concert, ,, 25th, october, 2019, ,, st, john's, waterloo, ,, london, ., #jpm, #jpmorgan, #jpmso, #symphony, #orchestra, #orkestra, #senfoni, #music, #müzik, #london, #londra, #stjohnswaterloo, #concert, #event, #ezgigunuc, #violin, #keman, #strings, #photograph, #sonyalpha]",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,[China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY],"[china's, 2019, economic, growth, weakens, amid, trade, war, $, jpm, #jpm, #governmentspending, #fiscalpolicy, #economicpolicy, http://zpr.io/thaay]",JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,[First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com],"[first-quarter, 30, +, day, credit, card, delinquency, among, top, 4, u.s.issuers, on, upward, trajectory, http://dld.bz/h2jtr, #chase, #jpm, #capitalone, #cof, #bankofamerica, #bac, citibank, #, c, #secondquarter, #ramresearch, #creditcards, http://www.carddata.com]",JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,"[How are there people allowed to continue in business., At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE]","[how, are, there, people, allowed, to, continue, in, business, ., at, minimum, they, should, be, investigated, and, brought, to, justice, #tsla, #jpm, #jpmorgan, ⁦, @jpmorgan, ⁩, #nasdaq, #nyse]",JPM


In [115]:
smpl

Unnamed: 0,text,Word_Token,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,"[the, latest, :, california, gov, fears, virus, spread, amid, protests, $, dis, #dis, #health, #diseaseoutbreaks, #publichealth]",DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,"[#dis, wishes, you, all, a, very, #happyholi, ., #holi2019, #holicelebration]",DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,"[$, wynn, $, mgm, $, oxy, $, bp, $, mar, $, dis, $, nke, $, sbux, $, ba, $, cll, #stocks, #stockstobuy, #stockstowatch, #wynn, #mgm, #starbucks, #carnival, #nike, #marriott, #disney, #dis]",DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,"[your, buy, earlier, at, 137, support, $, dis, #dis, wow, !]",DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis","[gattaca, x, gunslinger, girl, running, from, a, govt, that, wants, her, sister, dead, ,, 17, y, /, o, sharpshooter, rhea, is, hunting, for, the, place, that, keeps, defects, like, addie, safe, ., but, the, refuge, has, secrets, ,, and, rhea, must, learn, them, or, risk, losing, her, sister, for, good, ., #pitmad, #ya, #sf, #dis]",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,"[i, vow, to, one, day, serve, this, country, ..., #ccm, chama, langu, #jpm, tuchape, kaz, ..., mbele, daima]",JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha","[j, ., p, ., morgan, symphony, orchestra, autumn, concert, ,, 25th, october, 2019, ,, st, john's, waterloo, ,, london, ., #jpm, #jpmorgan, #jpmso, #symphony, #orchestra, #orkestra, #senfoni, #music, #müzik, #london, #londra, #stjohnswaterloo, #concert, #event, #ezgigunuc, #violin, #keman, #strings, #photograph, #sonyalpha]",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,"[china's, 2019, economic, growth, weakens, amid, trade, war, $, jpm, #jpm, #governmentspending, #fiscalpolicy, #economicpolicy, http://zpr.io/thaay]",JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,"[first-quarter, 30, +, day, credit, card, delinquency, among, top, 4, u.s.issuers, on, upward, trajectory, http://dld.bz/h2jtr, #chase, #jpm, #capitalone, #cof, #bankofamerica, #bac, citibank, #, c, #secondquarter, #ramresearch, #creditcards, http://www.carddata.com]",JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,"[how, are, there, people, allowed, to, continue, in, business, ., at, minimum, they, should, be, investigated, and, brought, to, justice, #tsla, #jpm, #jpmorgan, ⁦, @jpmorgan, ⁩, #nasdaq, #nyse]",JPM


In [116]:
# Get rid of stop words
def remove_stopwords(list_of_word):
    """ removes stopwords """
    no_stopwords = [t for t in list_of_word if t not in stopwords.words('english')]
    return no_stopwords

In [117]:
smpl.loc[:,'Word_Token_NS'] = smpl.loc[:,'Word_Token'].apply(remove_stopwords)
smpl = reorder(smpl)

In [118]:
smpl

Unnamed: 0,text,Word_Token,Word_Token_NS,ticker
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,"[the, latest, :, california, gov, fears, virus, spread, amid, protests, $, dis, #dis, #health, #diseaseoutbreaks, #publichealth]","[latest, :, california, gov, fears, virus, spread, amid, protests, $, dis, #dis, #health, #diseaseoutbreaks, #publichealth]",DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,"[#dis, wishes, you, all, a, very, #happyholi, ., #holi2019, #holicelebration]","[#dis, wishes, #happyholi, ., #holi2019, #holicelebration]",DIS
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,"[$, wynn, $, mgm, $, oxy, $, bp, $, mar, $, dis, $, nke, $, sbux, $, ba, $, cll, #stocks, #stockstobuy, #stockstowatch, #wynn, #mgm, #starbucks, #carnival, #nike, #marriott, #disney, #dis]","[$, wynn, $, mgm, $, oxy, $, bp, $, mar, $, dis, $, nke, $, sbux, $, ba, $, cll, #stocks, #stockstobuy, #stockstowatch, #wynn, #mgm, #starbucks, #carnival, #nike, #marriott, #disney, #dis]",DIS
3,Your buy earlier at 137 support $DIS #DIS wow!,"[your, buy, earlier, at, 137, support, $, dis, #dis, wow, !]","[buy, earlier, 137, support, $, dis, #dis, wow, !]",DIS
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis","[gattaca, x, gunslinger, girl, running, from, a, govt, that, wants, her, sister, dead, ,, 17, y, /, o, sharpshooter, rhea, is, hunting, for, the, place, that, keeps, defects, like, addie, safe, ., but, the, refuge, has, secrets, ,, and, rhea, must, learn, them, or, risk, losing, her, sister, for, good, ., #pitmad, #ya, #sf, #dis]","[gattaca, x, gunslinger, girl, running, govt, wants, sister, dead, ,, 17, /, sharpshooter, rhea, hunting, place, keeps, defects, like, addie, safe, ., refuge, secrets, ,, rhea, must, learn, risk, losing, sister, good, ., #pitmad, #ya, #sf, #dis]",DIS
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,"[i, vow, to, one, day, serve, this, country, ..., #ccm, chama, langu, #jpm, tuchape, kaz, ..., mbele, daima]","[vow, one, day, serve, country, ..., #ccm, chama, langu, #jpm, tuchape, kaz, ..., mbele, daima]",JPM
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha","[j, ., p, ., morgan, symphony, orchestra, autumn, concert, ,, 25th, october, 2019, ,, st, john's, waterloo, ,, london, ., #jpm, #jpmorgan, #jpmso, #symphony, #orchestra, #orkestra, #senfoni, #music, #müzik, #london, #londra, #stjohnswaterloo, #concert, #event, #ezgigunuc, #violin, #keman, #strings, #photograph, #sonyalpha]","[j, ., p, ., morgan, symphony, orchestra, autumn, concert, ,, 25th, october, 2019, ,, st, john's, waterloo, ,, london, ., #jpm, #jpmorgan, #jpmso, #symphony, #orchestra, #orkestra, #senfoni, #music, #müzik, #london, #londra, #stjohnswaterloo, #concert, #event, #ezgigunuc, #violin, #keman, #strings, #photograph, #sonyalpha]",JPM
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,"[china's, 2019, economic, growth, weakens, amid, trade, war, $, jpm, #jpm, #governmentspending, #fiscalpolicy, #economicpolicy, http://zpr.io/thaay]","[china's, 2019, economic, growth, weakens, amid, trade, war, $, jpm, #jpm, #governmentspending, #fiscalpolicy, #economicpolicy, http://zpr.io/thaay]",JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,"[first-quarter, 30, +, day, credit, card, delinquency, among, top, 4, u.s.issuers, on, upward, trajectory, http://dld.bz/h2jtr, #chase, #jpm, #capitalone, #cof, #bankofamerica, #bac, citibank, #, c, #secondquarter, #ramresearch, #creditcards, http://www.carddata.com]","[first-quarter, 30, +, day, credit, card, delinquency, among, top, 4, u.s.issuers, upward, trajectory, http://dld.bz/h2jtr, #chase, #jpm, #capitalone, #cof, #bankofamerica, #bac, citibank, #, c, #secondquarter, #ramresearch, #creditcards, http://www.carddata.com]",JPM
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,"[how, are, there, people, allowed, to, continue, in, business, ., at, minimum, they, should, be, investigated, and, brought, to, justice, #tsla, #jpm, #jpmorgan, ⁦, @jpmorgan, ⁩, #nasdaq, #nyse]","[people, allowed, continue, business, ., minimum, investigated, brought, justice, #tsla, #jpm, #jpmorgan, ⁦, @jpmorgan, ⁩, #nasdaq, #nyse]",JPM


### Remove unwanted strings and words

In [246]:
# remove urls
url_filter = re.compile(r"www.[\w\d]+.\w+|http://\S+|https://\S+")
re.sub(url_filter, "",smpl.loc[8,'text'])

'First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory  #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards '

In [208]:
# remove hashtags
hashtag_filter = re.compile(r"#\w+")
re.sub(hashtag_filter, "",smpl.loc[8,'text'])

'First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr       citibank     http://www.CardData.com'

In [206]:
# remove mentions
mentions_filter = re.compile(r"@\w+")
re.sub(mentions_filter, "",smpl.loc[8,'text'])

'First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com'

In [207]:
# remove special characters
schar_filter = re.compile(r"[+?!,;.(){}-]")
re.sub(schar_filter, "",smpl.loc[8,'text'])

'FirstQuarter 30 Day Credit Card Delinquency Among Top 4 USIssuers on Upward Trajectory http://dldbz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://wwwCardDatacom'

In [176]:
def filter_results(document, filter_condition):
    """replace given regEx string with a space"""
    document = re.sub(filter_condition, "", document)
    return document

In [247]:
filters = [url_filter, hashtag_filter, mentions_filter, schar_filter]

In [182]:
filter_results(smpl.loc[8,'text'],url_filter)

'First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory  #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards '

In [241]:
def filter_documents(df, filters):
    df.loc[:,'filtered_text'] = df.loc[:,'text']
    for char_filter in filters:       
        df.loc[:,'filtered_text'] = df.loc[:,'filtered_text'].apply(filter_results, args=[char_filter])
    return df

In [248]:
smpl = filter_documents(smpl, filters)

In [243]:
smpl = reorder(smpl)

In [249]:
smpl[['text','filtered_text']]

Unnamed: 0,text,filtered_text
0,The Latest: California gov fears virus spread amid protests $DIS #DIS #Health #Diseaseoutbreaks #Publichealth,The Latest: California gov fears virus spread amid protests $DIS
1,#DIS wishes you all a very #HappyHoli. #Holi2019 #HoliCelebration,wishes you all a very
2,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL #stocks #stockstobuy #stockstowatch #wynn #mgm #starbucks #carnival #nike #marriott #disney #dis,$WYNN $MGM $OXY $BP $MAR $DIS $NKE $SBUX $BA $CLL
3,Your buy earlier at 137 support $DIS #DIS wow!,Your buy earlier at 137 support $DIS wow
4,"GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead, 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe. But the refuge has secrets, and Rhea must learn them or risk losing her sister for good. #PitMad #YA #SF #dis",GATTACA x GUNSLINGER GIRL Running from a govt that wants her sister dead 17 y/o sharpshooter Rhea is hunting for the place that keeps defects like Addie safe But the refuge has secrets and Rhea must learn them or risk losing her sister for good
5,I vow to one day serve this country ........#CCM chama langu #JPM tuchape kaz .....mbele daima,I vow to one day serve this country chama langu tuchape kaz mbele daima
6,"J.P. Morgan Symphony Orchestra Autumn Concert, 25th October 2019, St John's Waterloo, London. #jpm #jpmorgan #jpmso #Symphony #Orchestra #orkestra #senfoni #music #müzik #london #londra #stjohnswaterloo #concert #event #ezgigunuc #violin #keman #strings #photograph #sonyalpha",JP Morgan Symphony Orchestra Autumn Concert 25th October 2019 St John's Waterloo London
7,China's 2019 economic growth weakens amid trade war $JPM #JPM #Governmentspending #Fiscalpolicy #Economicpolicy http://zpr.io/thaAY,China's 2019 economic growth weakens amid trade war $JPM
8,First-Quarter 30+ Day Credit Card Delinquency Among Top 4 U.S.Issuers on Upward Trajectory http://dld.bz/h2Jtr #chase #JPM #capitalone #COF #bankofamerica #BAC citibank #C #secondquarter #ramresearch #creditcards http://www.CardData.com,FirstQuarter 30 Day Credit Card Delinquency Among Top 4 USIssuers on Upward Trajectory citibank
9,How are there people allowed to continue in business. At minimum they should be investigated and brought to justice #Tsla #jpm #JPMorgan ⁦@jpmorgan⁩ #nasdaq #NYSE,How are there people allowed to continue in business At minimum they should be investigated and brought to justice ⁦⁩


In [None]:
# Lemmatize all tokens into a new list: lemmatized
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_words(list_of_words):
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in list_of_words]
    return lemmatized

df_text.loc[:,'Word_Token_NS_L'] = df_text.loc[:,'Word_Token_NS'].apply(lemmatize_words)
df_text.loc[:,'Word_casual_NS_L'] = df_text.loc[:,'Word_casual_NS'].apply(lemmatize_words)

# Create the bag-of-words: bow

def concat_lists(x1, x2): return x1 + x2

def create_bow(df_attribute):
    bow_dict = {}
    for ticker in df_text.ticker.unique():
        ticker_words = reduce(concat_lists, df_text[df_text.ticker == ticker][df_attribute])
        bow = Counter(ticker_words)
        bow_dict[ticker] = bow
    return bow_dict
    

bow = create_bow('Word_Token_NS_L')

# Vader sentiment

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

df_text.loc[:,'VADER'] = df_text.loc[:,'text'].apply(sid.polarity_scores)
df_text.loc[:,'VADER_cmp'] = df_text.loc[:,'VADER'].apply(lambda x:x['compound'])


def create_fdist(tick, df_attribute):
    ticker_words = reduce(concat_lists, df_text[df_text.ticker == tick][df_attribute])
    fdist = FreqDist(ticker_words)
    return fdist

# Print the 10 most common tokens
print(bow.most_common(10))

KO_fdist = create_fdist('KO','Word_casual_NS_L')

import matplotlib.pyplot as plt
KO_fdist.plot(30,cumulative=False)
plt.show()