# Use the Loughran-McDonald sentiment word lists to generate vocabulary

In [1]:
import numpy as np
import pandas as pd

## Lemmatize

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words, pos='v', lemmatizer=None):
    """
    Lemmatize every word in an iterable.

    Parameters
    ----------
    words : iterable of str
        Words to lemmatize.
    pos : str, optional
        Part-of-speech tag forwarded to ``lemmatizer.lemmatize``.
        Defaults to ``'v'`` (the tag this function has always used).
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos)``. When omitted, a single
        ``WordNetLemmatizer`` is created and reused for all words.

    Returns
    -------
    list
        The lemmatized words, in the same order as the input.
    """
    # Build the lemmatizer once instead of once per word: the instance does not
    # depend on the word being processed.
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    return [lemmatizer.lemmatize(word, pos) for word in words]

In [3]:
sentiments = ['negative', 'positive', 'uncertainty', 'litigious', 'constraining', 'interesting']

# Read the file and lower-case the header names so the columns can be
# addressed with the names listed in `sentiments`.
sentiment_df = pd.read_csv('LoughranMcDonald_MasterDictionary_2018.csv')
sentiment_df.columns = [column.lower() for column in sentiment_df.columns]

# Remove unused information. The explicit .copy() detaches the selection from
# the raw frame so the column assignments below act on an independent frame
# (avoids pandas' SettingWithCopyWarning).
sentiment_df = sentiment_df[sentiments + ['word']].copy()
# Cast the sentiment columns to boolean flags.
sentiment_df[sentiments] = sentiment_df[sentiments].astype(bool)
# Keep only the words flagged for at least one sentiment. `axis` is passed by
# keyword because recent pandas releases no longer accept it positionally.
sentiment_df = sentiment_df[sentiment_df[sentiments].any(axis=1)].copy()

# Apply the same preprocessing to these words as the 10-K words:
# lower-case, lemmatize, then drop words that collapse to the same lemma.
sentiment_df['word'] = lemmatize_words(sentiment_df['word'].str.lower())
sentiment_df = sentiment_df.drop_duplicates('word')

sentiment_df.head()

Unnamed: 0,negative,positive,uncertainty,litigious,constraining,interesting,word
9,True,False,False,False,False,False,abandon
12,True,False,False,False,False,False,abandonment
13,True,False,False,False,False,False,abandonments
51,True,False,False,False,False,False,abdicate
54,True,False,False,False,False,False,abdication


In [4]:
# Use the positive words, negative words and uncertainty words as vocabulary
positive_voc = sentiment_df[sentiment_df['positive']]['word']
negative_voc = sentiment_df[sentiment_df['negative']]['word']
uncertain_voc = sentiment_df[sentiment_df['uncertainty']]['word']

# Stack the three lists in positive -> negative -> uncertainty order, keeping
# the original row labels. pd.concat replaces Series.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
# NOTE(review): a word flagged under more than one of these sentiments is
# selected once per list, so `voc` may contain repeats — confirm whether the
# consumer of the vocabulary needs them removed.
voc = pd.concat([positive_voc, negative_voc, uncertain_voc])

In [6]:
# Persist the combined vocabulary as a CSV file in the working directory
vocabulary_path = 'vocabulary.csv'
voc.to_csv(vocabulary_path)

  
