In [None]:
# Import necessary libraries
import csv
import json
import os
import re
import sys
from collections import defaultdict, Counter

import nltk
import pdfplumber
import spacy
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.corpus import words

In [None]:
# Load the large English spaCy pipeline.
nlp = spacy.load("en_core_web_lg")

# English stop-word list.
# The corpus reader is reached through `nltk.corpus` instead of the imported
# `stopwords` name: this assignment rebinds `stopwords` to a plain list, so
# calling `stopwords.words(...)` would raise AttributeError the second time
# the cell is executed. Going through the package keeps the cell re-runnable.
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words('english')

# Dictionary words that contain a hyphen; handle_hyphenation() leaves any
# token found in this set untouched.
nltk.download('words')
hyphenated_words = set(word for word in words.words() if '-' in word)


In [None]:
# Find and import config file.
# The directory the kernel started in is remembered in `config_path` and put on
# sys.path (presumably so `import config` can resolve a config.py there --
# confirm against the project layout); the working directory is then moved up
# one level.
# The guards make the cell idempotent: without them every re-run would
# chdir("..") again, climbing one directory higher each time, and would append
# a duplicate sys.path entry.
if "config_path" not in globals():
    config_path = os.getcwd()
    os.chdir("..")
if config_path not in sys.path:
    sys.path.append(config_path)
import config

In [None]:
# Variables, parameters, and pathnames needed for this script
# Location of the JSON database, taken from config.database; it is opened and
# parsed further down via load_data().
database_file = config.database

In [None]:
# Read a JSON document from disk and hand back the decoded object
def load_data(file):
    """Parse the UTF-8 encoded JSON file at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON content.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Serialize an object to disk as pretty-printed JSON
def write_data(file, data):
    """Dump ``data`` as JSON (4-space indent) into the UTF-8 file ``file``.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serializable object.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [None]:
# Handle hyphenation in text
def handle_hyphenation(text):
    """Strip hyphens from tokens that are not known hyphenated words.

    The text is processed one whitespace-delimited token at a time. A token
    that contains '-' and is not a member of the module-level
    ``hyphenated_words`` set has all of its hyphens removed; every other token
    and all whitespace (including newlines) is returned unchanged.

    Each token is rewritten in place rather than via ``text.replace`` on the
    whole string, so fixing one token can never alter another one. In
    particular, a standalone "-" no longer wipes the hyphens out of every
    legitimate hyphenated word in the text.

    Tokens are compared verbatim, so case and attached punctuation matter
    (e.g. "well-known," is not the same token as "well-known").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    def _dehyphenate(match):
        token = match.group(0)
        if '-' in token and token not in hyphenated_words:
            return token.replace('-', '')
        return token

    # \S+ matches exactly the runs that str.split() would yield as words.
    return re.sub(r'\S+', _dehyphenate, text)

# Remove stop words from text
def handle_stopwords(text):
    """Blank out every whitespace-delimited token that is a stop word.

    Iterating over the string directly would visit single characters, and
    because the stop-word list contains one-letter entries that would delete
    those letters from the entire text. Tokens are therefore matched as whole
    words instead.

    A removed token is replaced by the empty string; the surrounding
    whitespace is left as is. Matching is exact (case-sensitive, punctuation
    attached to a word is part of the token).

    Args:
        text: The string to filter.

    Returns:
        The string with stop-word tokens removed.
    """
    # Set membership is O(1); `stopwords` itself is a list.
    stopword_set = set(stopwords)

    def _drop_stopword(match):
        token = match.group(0)
        return '' if token in stopword_set else token

    return re.sub(r'\S+', _drop_stopword, text)

In [None]:
# Load the data from the JSON database.
# The cells below iterate over the result and index each entry by the keys
# "path", "date" and "type".
database_data = load_data(database_file)

In [None]:
# Document types to keep; edit this list to select other types
accepted_types = ['Beige Book']

# Build (path, YYYY-MM-DD date, type) tuples for every database entry that has
# a "path" and an accepted type, ordered chronologically by the date string.
# sorted() is stable, so entries sharing a date keep their database order.
files = sorted(
    (
        (entry["path"], entry["date"][:10], entry["type"])
        for entry in database_data
        if "path" in entry and entry["type"] in accepted_types
    ),
    key=lambda record: record[1],
)


In [None]:
# Specify the fraction of files you want to process (0 < fraction <= 1).
percentage_to_process = .05

# Fail early with a clear message: 0 would divide by zero, a value above 1
# would produce a slice step of 0, and a negative value would silently reverse
# the list.
if not 0 < percentage_to_process <= 1:
    raise ValueError(
        "percentage_to_process must be in the interval (0, 1], got {!r}".format(
            percentage_to_process
        )
    )

# Systematic sample: keep every k-th file of the date-sorted list, starting
# with the first. k is truncated to an integer, so the share actually kept can
# be somewhat larger than the requested fraction.
files_to_process = files[::int(1 / percentage_to_process)]

In [None]:
# Inclusive bounds (YYYY-MM-DD) of the period to process
start_year_month_day = '2007-12-01'
end_year_month_day = '2023-12-31'

# Discard every sampled file dated outside the bounds. The dates are compared
# as strings, which orders YYYY-MM-DD values chronologically.
files_to_process = [
    record
    for record in files_to_process
    if not (record[1] < start_year_month_day or record[1] > end_year_month_day)
]


In [None]:
# Accumulator filled by the PDF-processing loop below: one "Date" entry and
# one "Text" entry are appended per processed file.
# NOTE(review): this lives in its own cell, so re-running only the loop cell
# appends to the existing lists instead of starting fresh.
data = {"Date": [], "Text": []}

In [None]:
# Flat list of every sentence-like segment extracted from all processed PDFs
final = []

# Nested counter structure reserved for keyword frequencies; nothing in this
# cell populates it.
keyword_freq_ts = defaultdict(lambda: defaultdict(Counter))

# Process each PDF file
for file, year_month_day, doc_type in files_to_process:
    # Segments of the current file only, so that data["Text"] receives one
    # independent list per document rather than repeated references to the
    # ever-growing global list.
    file_segments = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Pages without extractable text can come back as None or "",
            # depending on the pdfplumber version; skip them.
            text = page.extract_text() or ""
            if not text:
                continue
            text = handle_hyphenation(text)
            # Optional step, currently disabled:
            #text = handle_stopwords(text)
            # The spaCy pipeline is deliberately not run here: no result of it
            # is consumed by this loop and it is costly per page.
            text = text.replace("\n", " ")
            # Naive sentence split on ". "; empty pieces are dropped.
            file_segments.extend(
                segment for segment in text.split('. ') if segment != ''
            )
    # Record date and text together, only after the file was read
    # successfully, so the two lists always stay aligned.
    data["Date"].append(year_month_day)
    data["Text"].append(file_segments)
    final.extend(file_segments)
            
            

In [None]:
# Report how many segments were collected and preview only the first few;
# printing the complete list floods the notebook output.
print(len(final), "segments extracted")
print(final[:5])

In [None]:
# Persist the results as JSON: the flat segment list and the keyword-frequency
# structure, each to its own file.
# NOTE(review): both destinations are absolute, machine-specific paths; they
# should come from configuration so the notebook runs elsewhere.
for output_path, payload in (
    ("/Users/kylenabors/Documents/MS-Thesis Data/Database/Fed Data/fed_data_blocks.json", final),
    ("/Users/kylenabors/Documents/MS-Thesis Data/Database/Fed Data/keyword_freq_ts_blocks.json", keyword_freq_ts),
):
    write_data(output_path, payload)