In [None]:
# Import necessary libraries
import os
import pandas as pd
from datetime import datetime
from collections import Counter
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import yfinance as yf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pysentiment2 as ps
import numpy as np
# Fetch the NLTK resources used by this notebook: tokenizer models for
# word_tokenize, the stop-word lists, and the VADER lexicon.
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt'; requesting both keeps word_tokenize working either way.
nltk.download('punkt_tab')
nltk.download('stopwords')
# The resource id is spelled with an underscore; 'vader lexicon' is not a valid id.
nltk.download('vader_lexicon')

In [None]:
#Define function to tokenize
def process_text(text):
    """Tokenize ``text`` into lowercase alphabetic tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens in document order. Only tokens for which ``str.isalpha()`` is
        true and that are not English stop words are kept.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # the previous version re-read the list for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # isalpha() already rejects empty strings and any token containing digits
    # or punctuation, so no extra stripping/filtering pass is required.
    return [word for word in tokens if word.isalpha() and word not in stop_words]

In [None]:
#Define function to calculate tone
def calculate_tone(document, positive_words, negative_words):
    """Return the net tone of ``document`` for the given sentiment word lists.

    Parameters
    ----------
    document : str
        Raw document text; it is tokenized with ``process_text``.
    positive_words, negative_words : iterable of str
        Lower-case sentiment words to look for.

    Returns
    -------
    float
        ``(positive occurrences - negative occurrences) / number of distinct
        tokens``, or ``0.0`` when the document yields no tokens at all.
    """
    word_freq = Counter(process_text(document))
    if not word_freq:
        # Empty / stop-word-only document: avoid ZeroDivisionError.
        return 0.0
    positive_count = sum(word_freq[word] for word in positive_words if word in word_freq)
    negative_count = sum(word_freq[word] for word in negative_words if word in word_freq)
    # NOTE(review): the numerator counts occurrences while the denominator is
    # the number of *distinct* tokens; kept as-is to preserve existing results,
    # but confirm whether total token count was intended.
    return (positive_count - negative_count) / len(word_freq)

In [None]:
# Function to get the score for a document using pysentiment2
def get_hiv4_score(text):
    """Score ``text`` with the pysentiment2 Harvard IV-4 (HIV4) dictionary.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    Whatever ``HIV4.get_score`` returns for the tokenized text; the rest of
    this notebook reads its ``'Positive'``, ``'Negative'`` and ``'Polarity'``
    entries.
    """
    # Build the analyzer once and reuse it on later calls instead of
    # constructing a new HIV4 object for every document.
    # NOTE(review): assumes HIV4 keeps no per-document state between calls — confirm.
    analyzer = getattr(get_hiv4_score, '_analyzer', None)
    if analyzer is None:
        analyzer = ps.HIV4()
        get_hiv4_score._analyzer = analyzer
    return analyzer.get_score(analyzer.tokenize(text))

In [None]:
# Load the Loughran-McDonald (LM) master dictionary
lm_dictionary_path = "/Loughran-McDonald_MasterDictionary_1993-2021.csv"
lm_dictionary = pd.read_csv(lm_dictionary_path)

# Extract sets of positive and negative words.
# The sentiment columns hold the year a word was added to the category, 0 when
# it was never in it, and a negative year when it was later removed. Using
# `> 0` (rather than `!= 0`) therefore excludes words dropped from the list.
# dropna() guards against entries that pandas parsed as missing values.
positive_words_lm = set(lm_dictionary.loc[lm_dictionary['Positive'] > 0, 'Word'].dropna().str.lower())
negative_words_lm = set(lm_dictionary.loc[lm_dictionary['Negative'] > 0, 'Word'].dropna().str.lower())

In [None]:
# Load FOMC documents and dates (file names are expected to look like YYYYMMDD.txt)
fomc_documents = []
fomc_dates = []
path_to_fomc_docs = "/Text Files"
# os.listdir() returns names in arbitrary order; sorting the YYYYMMDD names
# loads the statements chronologically, so fomc_dates[i] always belongs to
# fomc_documents[i] and both lists are already in date order.
for file_name in sorted(os.listdir(path_to_fomc_docs)):
    # Skip anything that is not a statement file (hidden files, checkpoints, ...)
    if not file_name.endswith('.txt'):
        continue
    date_str = file_name.split('.')[0]
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    fomc_dates.append(date_obj)
    with open(os.path.join(path_to_fomc_docs, file_name), 'r', encoding='ISO-8859-1') as file:
        fomc_documents.append(file.read())

In [None]:
# Put the statements in chronological order. Dates and documents are sorted
# as pairs: sorting fomc_dates on its own would leave fomc_documents in its
# original order and silently attach every later score to the wrong date.
if fomc_dates:
    paired = sorted(zip(fomc_dates, fomc_documents), key=lambda pair: pair[0])
    fomc_dates = [date for date, _ in paired]
    fomc_documents = [doc for _, doc in paired]

# Count every processed token across the whole corpus
overall_word_freq = Counter()
for doc in fomc_documents:
    overall_word_freq.update(process_text(doc))

# Convert to a DataFrame and show the ten most frequent unigrams
df_word_freq = pd.DataFrame(overall_word_freq.items(), columns=['Word', 'Frequency'])
df_word_freq_sorted = df_word_freq.sort_values(by='Frequency', ascending=False).reset_index(drop=True)
print(df_word_freq_sorted.head(10))


In [None]:
# Calculate the LM tone of every statement (same order as fomc_documents)
tones_lm = [calculate_tone(doc, positive_words_lm, negative_words_lm) for doc in fomc_documents]

# Sort by date inside the DataFrame rather than rebinding fomc_dates to a
# separately sorted tuple: fomc_dates must stay index-aligned with
# fomc_documents because later cells pair the two lists again.
df_tones = (
    pd.DataFrame({'Date': fomc_dates, 'Tone': tones_lm})
    .sort_values('Date')
    .reset_index(drop=True)
)
df_tones

In [None]:
# Line chart: LM tone of each FOMC statement against its date
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_tones['Date'], df_tones['Tone'], marker='o', linestyle='-', color='blue')
ax.set_title('Tone of Federal Reserve Policy Statements (2008-2018)')
ax.set_xlabel('Date')
ax.set_ylabel('Tone')
ax.grid(True)
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
fig.tight_layout()
plt.show()

In [None]:
# Tag every statement with its calendar year, then average the tone per year
df_tones['Date'] = pd.to_datetime(df_tones['Date'])
df_tones['Year'] = df_tones['Date'].dt.year
average_tone_per_year = df_tones.groupby('Year', as_index=False)['Tone'].mean()
print("Average Tone per Year:", average_tone_per_year, sep="\n")

In [None]:
# Line chart: yearly mean of the LM tone
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_tone_per_year['Year'],
    average_tone_per_year['Tone'],
    marker='o',
    linestyle='-',
    color='green',
)
ax.set_title('Average Tone of Federal Reserve Policy Statements by Year (2008-2018)')
ax.set_xlabel('Year')
ax.set_ylabel('Average Tone')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Define a function to count positive and negative LM words in a document
def count_lm_words(document, positive_words, negative_words):
    """Count occurrences of positive and negative words in ``document``.

    The document is tokenized with ``process_text``; for each word list the
    occurrence counts of its entries found in the document are summed.

    Returns
    -------
    tuple
        ``(positive_count, negative_count)``.
    """
    frequencies = Counter(process_text(document))

    def _occurrences(lexicon):
        # Total occurrences of the lexicon's entries within this document
        return sum(frequencies[term] for term in lexicon if term in frequencies)

    return _occurrences(positive_words), _occurrences(negative_words)

# Per-document (positive, negative) counts from the LM word lists
lm_counts = [count_lm_words(doc, positive_words_lm, negative_words_lm) for doc in fomc_documents]
lm_positive_counts = [positive for positive, _ in lm_counts]
lm_negative_counts = [negative for _, negative in lm_counts]

# Per-document counts from the Harvard IV-4 dictionary
hiv4_scores = [get_hiv4_score(doc) for doc in fomc_documents]
harvard_positive_counts = [result['Positive'] for result in hiv4_scores]
harvard_negative_counts = [result['Negative'] for result in hiv4_scores]

# Side-by-side table of both dictionaries, one row per document
df_comparison = pd.DataFrame({
    'Document Date': fomc_dates,
    'LM Positive': lm_positive_counts,
    'LM Negative': lm_negative_counts,
    'Harvard Positive': harvard_positive_counts,
    'Harvard Negative': harvard_negative_counts,
})
print(df_comparison.head(81))

In [None]:
# Negative word counts per document: LM vs Harvard
fig, ax = plt.subplots(figsize=(14, 7))
for column, marker, linestyle, color in (
    ('LM Negative', 'o', '-', 'red'),
    ('Harvard Negative', 'x', '--', 'black'),
):
    # Each series is labelled with the name of the column it is drawn from
    ax.plot(
        df_comparison['Document Date'],
        df_comparison[column],
        label=column,
        marker=marker,
        linestyle=linestyle,
        color=color,
    )
ax.set_title('Comparison of Negative Word Counts (LM vs Harvard)')
ax.set_xlabel('Document Date')
ax.set_ylabel('Count of Negative Words')
ax.legend()
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
fig.tight_layout()
plt.show()

In [None]:
# Positive word counts per document: LM vs Harvard
fig, ax = plt.subplots(figsize=(14, 7))
for column, marker, linestyle, color in (
    ('LM Positive', 'o', '-', 'blue'),
    ('Harvard Positive', 'x', '--', 'green'),
):
    # Each series is labelled with the name of the column it is drawn from
    ax.plot(
        df_comparison['Document Date'],
        df_comparison[column],
        label=column,
        marker=marker,
        linestyle=linestyle,
        color=color,
    )
ax.set_title('Comparison of Positive Word Counts (LM vs Harvard)')
ax.set_xlabel('Document Date')
ax.set_ylabel('Count of Positive Words')
ax.legend()
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
fig.tight_layout()
plt.show()

In [None]:
# Load the LM word list.
# The sentiment columns hold the year a word was added to the category, 0 when
# it was never in it, and a negative year when it was later removed. Using
# `> 0` (rather than `!= 0`) therefore excludes words dropped from the list.
# dropna() guards against entries that pandas parsed as missing values.
lm_dictionary = pd.read_csv(lm_dictionary_path)
positive_words_lm = set(lm_dictionary.loc[lm_dictionary['Positive'] > 0, 'Word'].dropna().str.lower())
negative_words_lm = set(lm_dictionary.loc[lm_dictionary['Negative'] > 0, 'Word'].dropna().str.lower())

def process_text(text):
    """Tokenize ``text`` into lowercase alphabetic tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens in document order. Only tokens for which ``str.isalpha()`` is
        true and that are not English stop words are kept.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # the previous version re-read the list for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # isalpha() already rejects any token containing digits or punctuation,
    # so no separate stripping pass is required.
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Rank the LM sentiment words by how often they occur in the corpus.
#
# The corpus-wide token counts already exist in `overall_word_freq`, and the
# tables printed below have always been derived from it. The previous version
# additionally re-read and re-tokenized every file and built the same tables
# from that second count, only to overwrite them immediately; that dead work
# is removed. `all_words_freq` is kept as a copy under its existing name.
all_words_freq = Counter(overall_word_freq)

# Split the counts into positive/negative LM words
positive_word_freq = {word: freq for word, freq in all_words_freq.items() if word in positive_words_lm}
negative_word_freq = {word: freq for word, freq in all_words_freq.items() if word in negative_words_lm}

# Most frequent first
sorted_positive_words = sorted(positive_word_freq.items(), key=lambda item: item[1], reverse=True)
sorted_negative_words = sorted(negative_word_freq.items(), key=lambda item: item[1], reverse=True)

top_positive_words = pd.DataFrame(sorted_positive_words[:10], columns=['Word', 'Frequency'])
top_negative_words = pd.DataFrame(sorted_negative_words[:10], columns=['Word', 'Frequency'])

# Display the dataframes as tables
print("Top 10 Positive Words:")
print(top_positive_words.to_string(index=False))
print("\nTop 10 Negative Words:")
print(top_negative_words.to_string(index=False))

In [None]:

# Score every statement with the Harvard IV-4 dictionary
harvard_scores = [get_hiv4_score(doc) for doc in fomc_documents]

# List each score next to the date at the same position in fomc_dates
for date, score in zip(fomc_dates, harvard_scores):
    print(f"Document Date: {date}, Score: {score}")

In [None]:
# Normalize the dates to plain datetime objects. pd.Timestamp accepts both
# datetime and Timestamp inputs, which avoids the fragile str() -> strptime()
# round trip (it fails whenever the string form carries fractional seconds).
dates = [pd.Timestamp(date).to_pydatetime() for date in fomc_dates]
fomc_scores = [get_hiv4_score(doc) for doc in fomc_documents]
polarity_scores = [score['Polarity'] for score in fomc_scores]

# Sort chronologically and plot
dates, polarity_scores = zip(*sorted(zip(dates, polarity_scores)))

plt.figure(figsize=(14, 7))
# plt.plot handles datetime x-values directly; plt.plot_date is deprecated.
plt.plot(dates, polarity_scores, linestyle='solid')
plt.title('Polarity Scores Over Time', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Polarity Score', fontsize=12)
plt.grid(True)
plt.tight_layout()

plt.show()


In [None]:
all_words_freq = Counter()
harvard_sentiment_word_freq = Counter()

# Single pass over the corpus: each document is tokenized and scored once.
for doc in fomc_documents:
    tokens = process_text(doc)
    all_words_freq.update(tokens)

    polarity = get_hiv4_score(doc)['Polarity']
    if polarity == 0:
        # Neutral documents contribute nothing to any word
        continue
    for token in tokens:
        # Accumulate the *signed* document polarity. The previous code used
        # `-= polarity` for negative documents, which adds the magnitude and
        # made it impossible for any word to end up with a negative score.
        harvard_sentiment_word_freq[token] += polarity

# Sort so the words with the highest scores come first and the lowest last
sorted_sentiment_words = sorted(harvard_sentiment_word_freq.items(), key=lambda item: item[1], reverse=True)
print(sorted_sentiment_words[:10])
print(sorted_sentiment_words[-10:])

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Count every token (longer than one character) across the corpus first ...
token_counts = Counter()
for doc in fomc_documents:
    token_counts.update(token for token in process_text(doc) if len(token) > 1)

# ... then score each *distinct* token once instead of once per occurrence.
# The resulting counts are the same; only the repeated scoring is avoided.
positive_word_count = Counter()
negative_word_count = Counter()
for token, count in token_counts.items():
    compound = sia.polarity_scores(token)['compound']
    if compound > 0.1:
        positive_word_count[token] = count
    elif compound < -0.1:
        negative_word_count[token] = count

top_positive_words = positive_word_count.most_common(10)
top_negative_words = negative_word_count.most_common(10)

# Convert counts to DataFrames
df_top_positive_words = pd.DataFrame(top_positive_words, columns=['Word', 'Count'])
df_top_negative_words = pd.DataFrame(top_negative_words, columns=['Word', 'Count'])

print("Top Positive Words:")
print(df_top_positive_words)
print("\nTop Negative Words:")
print(df_top_negative_words)

In [None]:
# LM
# Fetch S&P 500 data from 2008-2017.
# auto_adjust=False is passed explicitly so the 'Adj Close' column is present
# regardless of the library's default for that option.
sp500 = yf.download('^GSPC', start='2008-01-01', end='2018-01-01', auto_adjust=False)
if isinstance(sp500.columns, pd.MultiIndex):
    # Some yfinance versions return (field, ticker) column pairs even for a
    # single ticker; keep only the field names.
    sp500.columns = sp500.columns.get_level_values(0)

# Daily returns, compounded into one return per calendar month
sp500['Returns'] = sp500['Adj Close'].pct_change()
sp500_monthly_returns = sp500['Returns'].resample('M').agg(lambda x: (x + 1).prod() - 1)
sp500_monthly_returns = sp500_monthly_returns.reset_index()
df_tones['Date'] = pd.to_datetime(df_tones['Date'])
sp500_monthly_returns['Date'] = pd.to_datetime(sp500_monthly_returns['Date'])

# Attach to each statement the monthly return whose month-end date is nearest.
# NOTE(review): 'nearest' can pick the month-end *before* the statement date —
# confirm this is the intended alignment.
merged_data = pd.merge_asof(
    df_tones.sort_values('Date'),
    sp500_monthly_returns.sort_values('Date'),
    on='Date',
    direction='nearest',
).dropna(subset=['Returns'])

# Regression analysis: monthly return on tone (with intercept)
X = sm.add_constant(merged_data['Tone'])
Y = merged_data['Returns']

model = sm.OLS(Y, X).fit()
predictions = model.predict(X)
print(model.summary())

In [None]:
#HARVARD
# Define function to get the score for a document using pysentiment2
def get_hiv4_score(text):
    """Score ``text`` with the pysentiment2 Harvard IV-4 (HIV4) dictionary.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    Whatever ``HIV4.get_score`` returns for the tokenized text; the code
    below reads its ``'Polarity'`` entry.
    """
    # Build the analyzer once and reuse it on later calls instead of
    # constructing a new HIV4 object for every document.
    # NOTE(review): assumes HIV4 keeps no per-document state between calls — confirm.
    analyzer = getattr(get_hiv4_score, '_analyzer', None)
    if analyzer is None:
        analyzer = ps.HIV4()
        get_hiv4_score._analyzer = analyzer
    return analyzer.get_score(analyzer.tokenize(text))

# Load FOMC documents and their dates (file names are expected to look like YYYYMMDD.txt)
fomc_documents = []
fomc_dates = []
path_to_fomc_docs = "/Text Files"
# os.listdir() returns names in arbitrary order; sorting the YYYYMMDD names
# loads the statements chronologically.
for file_name in sorted(os.listdir(path_to_fomc_docs)):
    # Skip anything that is not a statement file (hidden files, checkpoints, ...)
    if not file_name.endswith('.txt'):
        continue
    date_str = file_name.split('.')[0]
    date_obj = pd.to_datetime(date_str, format='%Y%m%d')
    fomc_dates.append(date_obj)
    with open(os.path.join(path_to_fomc_docs, file_name), 'r', encoding='ISO-8859-1') as file:
        fomc_documents.append(file.read())

fomc_scores = [get_hiv4_score(doc) for doc in fomc_documents]
harvard_polarity_scores = [score['Polarity'] for score in fomc_scores]

# Fetch S&P 500 data, calculate returns, format.
# auto_adjust=False is passed explicitly so the 'Adj Close' column is present
# regardless of the library's default for that option.
sp500 = yf.download('^GSPC', start='2008-01-01', end='2018-01-01', auto_adjust=False)
if isinstance(sp500.columns, pd.MultiIndex):
    # Some yfinance versions return (field, ticker) column pairs even for a
    # single ticker; keep only the field names.
    sp500.columns = sp500.columns.get_level_values(0)
sp500['Returns'] = sp500['Adj Close'].pct_change()
# Compound the daily returns into one return per calendar month
sp500_monthly_returns = sp500['Returns'].resample('M').agg(lambda x: (x + 1).prod() - 1)
sp500_monthly_returns = sp500_monthly_returns.reset_index()
sp500_monthly_returns['Date'] = pd.to_datetime(sp500_monthly_returns['Date'])

# Align polarity with S&P 500 data
aligned_data_harvard = pd.DataFrame({
    'Date': fomc_dates,
    'Harvard_Polarity': harvard_polarity_scores
})

# Attach to each statement the monthly return whose month-end date is nearest.
# NOTE(review): 'nearest' can pick the month-end *before* the statement date —
# confirm this is the intended alignment.
merged_data_harvard = pd.merge_asof(
    aligned_data_harvard.sort_values('Date'),
    sp500_monthly_returns.sort_values('Date'),
    on='Date',
    direction='nearest'
).dropna(subset=['Returns'])

# Regression analysis: monthly return on Harvard polarity (with intercept)
X_harvard = sm.add_constant(merged_data_harvard['Harvard_Polarity'])
Y_harvard = merged_data_harvard['Returns']

model_harvard = sm.OLS(Y_harvard, X_harvard).fit()
print("Harvard Dictionary Polarity Regression Summary:")
print(model_harvard.summary())