In [148]:
import pandas as pd
import nltk
import re 
from constants import macroWords, sectorWords, tickerList, companies, fillerWords


In [149]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

def lemmaStem(word, lemmatizer, stemmer):
    return stemmer.stem(lemmatizer.lemmatize(word))

def unique(a):
    """ return the list with duplicate elements removed """
    return list(set(a))

def intersect(a, b):
   """ return the intersection of two lists """
   return list(set(a) & set(b))

In [150]:
xl = pd.ExcelFile('Data/LoughranMcDonald_SentimentWordLists_2018.xlsx')
sentiments = []

xl.sheet_names  # see all sheet names
porter = nltk.PorterStemmer() # also lancaster stemmer
wnl = nltk.WordNetLemmatizer()

sentientWords = {}
for sheet_name in xl.sheet_names:
    df = xl.parse(sheet_name, header = None)
    lst = set()
    for val in df.values:
        word = lemmaStem(str(val[0]).lower(), wnl, porter)
        if len(word) >= 3:
            lst.add(word)
    sentientWords[sheet_name.lower()] = list(lst)
    sentiments.append(sheet_name.lower())

In [185]:
for key in sentientWords:
    print(key)
    print(len(sentientWords[key]))
    print()

negative
910

positive
151

uncertainty
137

litigious
452

strongmodal
17

weakmodal
20

constraining
58



In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwordslst = set()
chars = ['{','}','#','%','\(','\)','\[','\]','<','>',',', '!', '.', ';', 
'?', '*', '\\', '\/', '~', '_','|','=','+','^',':','\"','\'','@']
porter = nltk.PorterStemmer() # also lancaster stemmer
wnl = nltk.WordNetLemmatizer()

temp = stopwords.words("english")
for word in temp:
    word = re.sub('[%s]' % ''.join(chars), '', word)
    if 'nt' in word or 'no' in word or len(word) <= 2:
        continue
    stopwordslst.add(word)
    
print('as' in stopwordslst)



In [None]:
text = "ford said it would restate results from 2001 through the second quarter and cautioned that operating results would weaken further in the current quarter. ford also said it was considering raising new funding secured by its automotive assets in order to protect its cash position as it pays the bill to close 16 plants and hasnt cut up to 45,000 jobs. these business results are clearly unacceptable, said chief executive alan mulally, who took over at ford in early september. ford posted a net loss of $3.08 per share for the third quarter compared with a loss of $284 million, or 15 cents per share, a year earlier. that included a battery of charges that totaled $4.6 billion after taxes, or $2.46 per share. fords loss from continuing operations was 62 cents per share, matching analysts average forecast as tracked by reuters estimates. analysts have urged mulally to consider further asset sales to protect fords cash position, and in his first post-earnings conference call with analysts some credited him with a more open approach to the companys turnaround. one of the things thats positive about mulally coming in is that nothing is sacred and he is willing to take a fresh look, said morningstar analyst john novak. said argus research analyst kevin tynan simply shrinking again is not necessarily the answer. ford needs to become a company flexible enough to be profitable at lower production volumes on each line and on each platform. revenue down, cash in focus revenue for the quarter was $36.7 billion, down $4.1 billion. auto sales accounted for $32.6 billion of total revenue. high gasoline prices have caused u.s. consumers to shift away from sport utility vehicles and pickup trucks, a segment that represents over 60 percent of fords current sales. ford ended the quarter with cash of $23.6 billion but said that total would drop near $20 billion by the end of the year as it restructures. fitch ratings, which placed fords debt on watch for a downgrade on monday, said the automaker needed to hold at least $15 billion in consolidated cash in order to avoid raising concern among suppliers and customers. chief financial officer don leclair said ford was exploring various financing strategies, including secured financing involving a substantial portion of its automotive operations. ford is selling its british luxury brand aston martin. leclair said the company is in the process of preparing a short list of bidders but does not expect to close a sale this year. mulally also said ford is open to reviewing its other luxury brands leaving the door open to a potential sale of jaguar, volvo or land rover. i really think its going to hinge on how the businesses are doing and can we make profitable growth businesses out of them with the action we have taken and additional actions that might be required, he said. buyouts on the table ford is offering buyouts to all of its 75,000 unionized workers in a bid to reduce its factory work force by nearly half. the bulk of hourly workers taking buyouts to leave the company in the first and second quarter of 2007. about 4,000 employees at former visteon corp. factories, now grouped under automotive component holdings, have already accepted buyouts, the company said. in the third quarter, ford took pretax charges of $861 million for job cuts related to plant closings in north america, $259 million for job cuts elsewhere, and $437 million to pay out pensions earlier than planned . ford also took pretax charges of $2.2 billion to write down the value of north american assets and $1.6 billion for the impairment of jaguar and land rover assets. ford motor credits net profit fell to $262 million from $577 million a year earlier in part because of higher financing costs driven by the parent companys junk credit rating. the reduced profitability at ford motor credit now shines a more intense light on the weakness of fords fundamental operation the automotive business, tynan said. ford executives repeated that the company has no plans to sell its finance arm, although leclair said the automaker would be open to partnerships in some markets. shares of ford were down 14 cents or 1.8 percent to $7.87 in early afternoon trade on the new york stock exchange. additional reporting by  jui chakravorty  and  kevin krolicki  "

In [None]:
def invertNegations(tokens):
    for i in range(len(tokens)):
        if tokens[i] in ['wasnt', 'hasnt', 'wouldnt', 'cant', 'isnt', 'havent', 'shouldnt', 'couldnt', 'didnt', 'no', 'not']:
            tokens[i+1] = '-'+tokens[i+1]
    return tokens

In [None]:
def calculateSentimentScores(text, sentimentDict, lemmatizer, stemmer, lst=fillerWords):
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in lst]
    tokens = [lemmaStem(t, lemmatizer, stemmer) for t in tokens]
    tokens = invertNegations(tokens)
    scores = {}
    flip = {'positive': 'negative', 'negative': 'positive', 'weakmodal': 'strongmodal', 'strongmodal': 'weakmodal'}
    for key in sentimentDict:
        scores[key] = 0
    total = 0
    for token in tokens:
         for key in sentimentDict:
            if token[0] == '-':
                if token[1:] in sentimentDict[key] and key in flip:
                    scores[flip[key]] += 1
            else:
                if token in sentimentDict[key]:
                    scores[key] += 1       
    return scores 

In [None]:
porter = nltk.PorterStemmer()
wnl = nltk.WordNetLemmatizer()
calculateSentimentScores(text, sentientWords, wnl, porter)

In [None]:
def stringtoDict(string):
    lst = string[1:-1]
    elements = lst.split(",")
    return elements[0]

In [None]:
from collections import defaultdict
newsNumber = defaultdict(int)
df = pd.read_csv('Data/stocks_newsMicro.csv', parse_dates=['date'], index_col=['date'])

temp = []
for index, row in df.iterrows():
    try:
        date_var = index
        main_ticker = stringtoDict(row['tickers'])
        text = str(row['fulltext'])
        porter = nltk.PorterStemmer()
        wnl = nltk.WordNetLemmatizer()
        raw_scores = calculateSentimentScores(text, sentientWords, wnl, porter)
        if raw_scores['positive'] + raw_scores['negative'] != 0:
            polarity = (raw_scores['positive'] - raw_scores['negative'])/(raw_scores['positive'] + raw_scores['negative'])
        else:
            polarity = 0
        if raw_scores['strongmodal'] + raw_scores['weakmodal'] != 0:
            modality = (raw_scores['strongmodal'] - raw_scores['weakmodal'])/(raw_scores['strongmodal'] + raw_scores['weakmodal'])
        else:
            modality = 0
        if abs(modality) + raw_scores['uncertainty'] != 0:
            possibility = (modality - raw_scores['uncertainty'])/ (abs(modality) + raw_scores['uncertainty']) 
        else:
            possibility = 0
        temp.append([date_var, main_ticker, raw_scores, polarity, possibility])
        newsNumber[main_ticker] +=1
    except:
        continue

column_names = ['date', 'primary ticker', 'raw_scores', 'polarity', 'possibility']
df_sentimentScores = pd.DataFrame(temp, columns = column_names)
df_sentimentScores.set_index('date')
print(df_sentimentScores.head())

    
    

In [127]:
df_sentimentScores.head()

Unnamed: 0,primary ticker,raw_scores,polarity,possibility,datetime
0,'wmt',"{'negative': 20, 'positive': 12, 'uncertainty'...",-0.25,-1.0,2006-10-23
1,'f',"{'negative': 22, 'positive': 7, 'uncertainty':...",-0.517241,-1.0,2006-10-23
2,'f',"{'negative': 6, 'positive': 6, 'uncertainty': ...",0.0,-1.0,2006-10-23
3,'f',"{'negative': 2, 'positive': 0, 'uncertainty': ...",-1.0,-1.0,2006-10-23
4,'f',"{'negative': 11, 'positive': 6, 'uncertainty':...",-0.294118,-1.0,2006-10-23


In [128]:
lst = []
for index, row in df_sentimentScores.iterrows():
    myDict = row['raw_scores']
    myDict['datetime'] = row['datetime']
    myDict['primary ticker'] = row['primary ticker']
    lst.append(myDict)
    
    

In [129]:
df_raw_scores = pd.DataFrame(lst)  
df_raw_scores.head()

Unnamed: 0,negative,positive,uncertainty,litigious,strongmodal,weakmodal,constraining,datetime,primary ticker
0,20,12,5,1,2,3,0,2006-10-23,'wmt'
1,22,7,3,1,2,2,2,2006-10-23,'f'
2,6,6,2,1,0,1,1,2006-10-23,'f'
3,2,0,1,0,0,1,0,2006-10-23,'f'
4,11,6,3,0,0,2,0,2006-10-23,'f'


In [154]:
df_new = df_raw_scores.groupby(['primary ticker', 'datetime']).mean()

In [155]:
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,negative,positive,uncertainty,litigious,strongmodal,weakmodal,constraining
primary ticker,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'a',2006-11-28,0.0,0.0,1.0,1.0,0.0,1.0,0.0
'a',2006-11-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'a',2006-12-21,1.0,1.0,2.0,7.0,1.0,1.0,1.0
'a',2007-02-15,8.0,6.0,0.0,0.0,1.0,0.0,0.0
'a',2007-03-29,8.0,7.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
'zts',2012-06-07,6.0,7.0,5.0,2.0,0.0,3.0,0.0
'zts',2013-05-22,1.0,4.0,1.0,1.0,1.0,1.0,0.0
'zts',2013-06-24,2.0,3.0,2.0,0.0,0.0,2.0,0.0
'zts',2013-07-29,24.0,9.0,13.0,2.0,0.0,11.0,3.0


In [115]:
list(temp.columns)

['negative',
 'positive',
 'uncertainty',
 'litigious',
 'strongmodal',
 'weakmodal',
 'constraining']

In [117]:
import sklearn
from sklearn.preprocessing import StandardScaler
temp = df_new.loc['\'wmt\'']
temp[list(temp.columns)] = StandardScaler().fit_transform(temp[list(temp.columns)])
temp.head()

Unnamed: 0_level_0,negative,positive,uncertainty,litigious,strongmodal,weakmodal,constraining
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-10-23,-0.008905,0.633925,0.356062,-0.224327,0.316722,0.467505,-0.542631
2006-10-24,-0.649938,-0.620647,-0.601122,-0.377972,-0.411704,-0.586979,-0.309249
2006-10-26,-0.989309,-0.746104,-0.002882,-0.531618,-0.411704,0.11601,-0.542631
2006-10-27,0.594421,0.383011,-0.002882,-0.224327,-0.411704,-0.411232,-0.542631
2006-10-28,-0.989309,-0.934289,-0.72077,-0.377972,-0.411704,-0.938474,-0.542631


In [97]:
temp = df_new.loc['\'wmt\''].reset_index()
print(temp.shape)
temp[temp['possibility'] > 0].shape

(590, 3)


(24, 3)

In [29]:
compression_opts = dict(method='zip', archive_name='out.csv')  
df_sentimentScores.to_csv('out.zip', index=False, compression=compression_opts)

In [None]:
import operator
sorted_d = dict(sorted(newsNumber.items(), key=operator.itemgetter(1),reverse=True))
print('Dictionary in descending order by value : ',sorted_d)

In [138]:
tickerDict = {v: k for k, v in companies.items()}

In [144]:
toKeep = {}
for key, value in newsNumber.items():
    if value >= 500:
        key = key[1:-1]
        toKeep[key] = tickerDict[key]
        

In [145]:
toKeep

{'wmt': 'wal-mart stores',
 'f': 'ford motor',
 'gm': 'general motors',
 'ba': 'boeing company',
 'wfc': 'wells fargo',
 'ge': 'general electric',
 'd': 'dominion energy',
 'c': 'citigroup inc.',
 'ms': 'morgan stanley',
 'gs': 'goldman sachs group',
 'bac': 'bank of america corp',
 'aapl': 'apple inc.',
 'low': "lowe's cos.",
 'a': 'agilent technologies inc',
 'it': 'gartner inc',
 'jpm': 'jpmorgan chase & co.',
 'v': 'visa inc.'}

In [153]:
df_new.loc['\'wmt\'']

Unnamed: 0_level_0,negative,positive,uncertainty,litigious,strongmodal,weakmodal,constraining
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-10-23,-0.008905,0.633925,0.356062,-0.224327,0.316722,0.467505,-0.542631
2006-10-24,-0.649938,-0.620647,-0.601122,-0.377972,-0.411704,-0.586979,-0.309249
2006-10-26,-0.989309,-0.746104,-0.002882,-0.531618,-0.411704,0.116010,-0.542631
2006-10-27,0.594421,0.383011,-0.002882,-0.224327,-0.411704,-0.411232,-0.542631
2006-10-28,-0.989309,-0.934289,-0.720770,-0.377972,-0.411704,-0.938474,-0.542631
...,...,...,...,...,...,...,...
2013-11-11,0.933791,3.017611,0.715006,1.465775,0.680934,0.643253,1.557811
2013-11-12,-0.310568,-0.746104,0.715006,0.082965,-0.411704,0.643253,-0.542631
2013-11-14,1.009207,0.884839,1.073950,-0.019466,0.316722,1.521990,-0.309249
2013-11-17,-0.423691,-0.746104,-1.079714,-0.377972,0.134615,-0.938474,-0.542631


In [183]:
from os import mkdir
from os.path import isdir
from sklearn import preprocessing
print('{')
for ticker, company in toKeep.items():
    print("\'" + ticker + "\'" + ": " + "\'" + str(company) + "\'" + ", ")
    df = df_new.loc['\'' + ticker + '\'']
    min_max_scaler = preprocessing.MinMaxScaler()
    df[list(temp.columns)] = min_max_scaler.fit_transform(df[list(temp.columns)])
    polarity = df['positive'] - df['negative']
    modality = df['strongmodal'] - df['weakmodal']
    uncertainty = df['uncertainty']
    df_x = pd.DataFrame({'polarity': polarity,'modality': modality, 'uncertainty': uncertainty})
    df_x[list(df_x.columns)] = StandardScaler().fit_transform(df_x[list(df_x.columns)])
    sp =  'news_scores/'+ticker+".csv"
    df_x.to_csv(path_or_buf = sp)
print('}') 

{
'wmt': 'wal-mart stores', 
'f': 'ford motor', 
'gm': 'general motors', 
'ba': 'boeing company', 
'wfc': 'wells fargo', 
'ge': 'general electric', 
'd': 'dominion energy', 
'c': 'citigroup inc.', 
'ms': 'morgan stanley', 
'gs': 'goldman sachs group', 
'bac': 'bank of america corp', 
'aapl': 'apple inc.', 
'low': 'lowe's cos.', 
'a': 'agilent technologies inc', 
'it': 'gartner inc', 
'jpm': 'jpmorgan chase & co.', 
'v': 'visa inc.', 
}


In [161]:
df = df_new.loc['\'' + ticker + '\'']
df[list(temp.columns)] = StandardScaler().fit_transform(df[list(temp.columns)])

In [167]:
print(df.head())
polarity = df['positive'] - df['negative']
polarity

            negative  positive  uncertainty  litigious  strongmodal  \
datetime                                                              
2007-05-17 -1.818678  0.859257    -0.204163  -1.463169     2.147779   
2007-05-18 -1.322460  1.169403     0.141448  -1.381646    -0.715530   
2007-10-03 -1.984084 -1.311766    -1.240996  -1.707736    -0.715530   
2008-03-06 -0.991649  4.891155     0.487059  -1.055556     3.579433   
2008-03-18 -1.984084 -1.311766     0.832670  -1.544691    -0.715530   

            weakmodal  constraining  
datetime                             
2007-05-17   0.433595     -0.918242  
2007-05-18   0.923581     -0.918242  
2007-10-03  -1.036363     -0.918242  
2008-03-06  -0.056391     -0.918242  
2008-03-18  -1.036363     -0.918242  


datetime
2007-05-17    2.677935
2007-05-18    2.491863
2007-10-03    0.672319
2008-03-06    5.882803
2008-03-18    0.672319
                ...   
2013-11-11   -0.526835
2013-11-12   -1.312513
2013-11-15    0.527618
2013-11-18    0.858483
2013-11-19   -0.030617
Length: 401, dtype: float64