# Canada's Monetary Policy Report: if text could speak, what would it say?

### This notebook cleans the most recent MPR and saves it to the appropriate folder.
#### Before running this code: download the latest MPR (PDF) from the BoC website, convert it into a text file, and remove charts, tables, footnotes, appendices, and any non-text elements at the beginning or the end of the MPR. Save the result as `mpr_text_raw/mpr-YYYY-MM-DD.txt`.

#### This workbook should "just work". If it does not, or if you find an error, please email the authors.

##### André Binette <abinette@bankofcanada.ca>

##### Dmitri Tchebotarev <dtchebotarev@bankofcanada.ca>

---------------------------------------------------------------------------------------------------------------------------

## Set up the environment

### Import required modules

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
import re
from pathlib import Path

## Define helper functions

In [2]:
# Function to clean MPRs

def clean_mpr_text(org_text):
    """Normalize the raw text of a Monetary Policy Report (MPR).

    The steps are applied in a fixed order; later steps rely on earlier ones
    (for example, the parenthetical filter matches the ASCII apostrophe that
    the quote normalization produces).

    Parameters
    ----------
    org_text : str
        Raw MPR text.

    Returns
    -------
    str
        The cleaned text. Paragraphs end up separated by a line holding a
        single space, and hard-wrapped lines are joined with spaces.
    """
    text = org_text

    # Convert some common Unicode symbols to the corresponding ASCII.
    # https://en.wikipedia.org/wiki/Quotation_mark
    text = re.sub("\u2018|\u2019|\u201A|\u201B|\u00B4|\u2039|\u203A", "'", text)
    text = re.sub("\u201C|\u201D|\u201E|\u201F|\u00AB|\u00BB|\u2E42", "\"", text)
    # https://en.wikipedia.org/wiki/Dash
    # Each dash is its own alternative, so a lone em dash (U+2014) or a lone
    # horizontal bar (U+2015) is replaced as well.
    text = re.sub("\u2010|\u2011|\u2012|\u2013|\u2014|\u2015", "-", text)
    # https://en.wikipedia.org/wiki/Bullet_(typography)
    text = re.sub("\u2022|\u2023|\u2043|\u204C|\u204D", "*", text)

    # Collapse any run of blank / whitespace-only lines into one empty line.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    # Join hard-wrapped lines: a single newline between two non-newline
    # characters becomes a space. Lookarounds keep the neighbouring characters
    # out of the match, so consecutive one-character lines are joined too.
    text = re.sub(r"(?<=[^\n])\n(?=[^\n])", " ", text)
    # Paragraph separators become a line that contains a single space.
    text = re.sub(r"\n\n", "\n \n", text)
    text = re.sub(r"\+/\-", "±", text)
    # A free-standing ">" is rewritten as a dash.
    text = re.sub(r"\s>\s", " - ", text)
    # "U.S." directly followed by one of these terms loses its periods.
    text = re.sub(r"U\.S\.\s*(Federal\s*Reserve|CARS|Residential|Real\s*GDP|Treasuries|Treasury|Midwest|GDP)", "US \\1", text)
    # Every remaining "U.S." becomes "US"; when an uppercase letter follows,
    # one period is kept ("US. X") - presumably because that marks a sentence
    # boundary.
    text = re.sub(r"U\.S\.(\s*[A-Z])?", lambda m: "US" + (("."+m.group(1)) if m.group(1) else ""), text)
    # Replace every whitespace character inside each line by a plain space.
    text = "\n".join( map( lambda par: re.sub(r"\s", " ", par), text.splitlines()) )
    # Remove URLs, optionally wrapped in (...) and/or <...>. A period right
    # after the URL is kept; otherwise the URL is replaced by one space.
    # NOTE(review): for a bare URL (no surrounding brackets) the lazy
    # quantifier can match nothing, so only the leading "http" is removed -
    # confirm whether bare URLs occur in the inputs.
    text = re.sub(r"\s*(\()?(<)?http[^>\)]*?(?(2)>)(?(1)\))\s*(\.)?", 
                  lambda m: "." if m.group(3) else " ", text)

    # Remove parentheticals that hold a single word (or nothing), or that
    # start with "available on the Bank's website", "See", "Chart", "Table",
    # "Box" or "Technical". A following period is kept.
    text = re.sub(r"\s*\((?:\w*|available on the Bank's website.*?|(?:See|Chart|Table|Box|Technical).*?)\)\s*(\.)?", 
                  lambda m: "." if m.group(1) else " ", text)

    return text


# Quick check of the sentences (very short or very long sentences should be looked at)

def check_mpr_text(text, fname=None):
    """Print sentences of a cleaned MPR that look suspicious.

    The text is split into sentences with the notebook-level spaCy pipeline
    `nlp`. A sentence is flagged when it has fewer than 3 or more than 100
    alphabetic tokens, or when a ".", "?" or "!" token appears anywhere
    except in its last two tokens. Each flagged sentence is printed,
    followed by the sentence that comes right after it (for context).

    Parameters
    ----------
    text : str
        Cleaned MPR text.
    fname : optional
        Label printed in front of every reported sentence. When omitted, the
        notebook-level variable `fname` is used if it exists, so callers that
        set that global keep working.

    Returns
    -------
    None
        Results are only printed.
    """
    if fname is None:
        # Backward compatibility with callers that set a global `fname`
        # instead of passing it; never fails if that global is missing.
        fname = globals().get("fname", "<unknown file>")

    found = False
    # Collapse all whitespace, then widen the gap after each period -
    # presumably to help the sentence segmenter; TODO confirm.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\. ", ".  ", text)
    for s in nlp(text).sents:
        if found:
            # The previous sentence was flagged: print this one as context.
            print(fname, ":", s, '\n')
            found = False
        nwords = sum(t.is_alpha for t in s)
        if (nwords < 3) or any(tok.text in (".", "?", "!") for tok in s[:-2]) or (nwords > 100):
            found = True
            print(fname, ":", s)


# Save the resulting MPR

def save_mpr(text, year, month, day):
    """Write a cleaned MPR to ``mpr_text/mpr-YYYY-MM-DD.txt``.

    The path is relative to the current working directory. The ``mpr_text``
    folder is created if it does not exist yet, and an existing file for the
    same date is overwritten.

    Parameters
    ----------
    text : str
        Cleaned MPR text.
    year, month, day : int
        Publication date, used to build the file name.
    """
    fname = "mpr_text/mpr-%4d-%02d-%02d.txt" % (year, month, day)
    # Without this, a fresh checkout without the output folder would fail
    # with FileNotFoundError.
    Path(fname).parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the file is written with the platform default encoding,
    # the same way the raw file is read elsewhere in this notebook.
    with open(fname, "wt") as ff:
        ff.write(text)

        
# Extract the publication date (YYYY-MM-DD) from an MPR file name

def date_from_mpr_fname(fname):
    """Return ``[year, month, day]`` parsed from the first YYYY-MM-DD in `fname`.

    `fname` may be a string or a path-like object; it is converted with
    ``str()`` before searching. Raises ValueError when no date is present.
    """
    found = re.search(r"(\d{4})-(\d{2})-(\d{2})", str(fname))
    if found is not None:
        year, month, day = found.groups()
        return [int(year), int(month), int(day)]
    raise ValueError("No date found in the given filename.")

## Run the function -- need to enter the path of the latest MPR text files

In [3]:
# Folder holding the RAW, UNCLEANED text files (relative to the current working directory)
raw_path = Path.cwd().joinpath("mpr_text_raw")

# Clean, check and save the MPR for each publication date listed below
################################################
new_mpr_files = [
    raw_path.joinpath("mpr-2019-01-09.txt"),
    raw_path.joinpath("mpr-2019-04-24.txt"),
    raw_path.joinpath("mpr-2019-07-10.txt"),
    raw_path.joinpath("mpr-2019-10-30.txt"),
    raw_path.joinpath("mpr-2020-01-22.txt"),
    raw_path.joinpath("mpr-2020-04-15.txt"),
    raw_path.joinpath("mpr-2020-07-15.txt"),
    raw_path.joinpath("mpr-2020-10-28.txt"),
    raw_path.joinpath("mpr-2021-01-20.txt"),
    raw_path.joinpath("mpr-2021-04-21.txt"),
    raw_path.joinpath("mpr-2021-07-14.txt"),
    raw_path.joinpath("mpr-2021-10-27.txt"),
    raw_path.joinpath("mpr-2022-01-26.txt"),
    raw_path.joinpath("mpr-2022-04-13.txt"),
    raw_path.joinpath("mpr-2022-07-13.txt"),
    raw_path.joinpath("mpr-2022-10-26.txt"),
    raw_path.joinpath("mpr-2023-01-25.txt"),
    raw_path.joinpath("mpr-2023-04-12.txt"),
]
################################################

assert isinstance(new_mpr_files, list), "It must be a list, not a string!"
for mpr_filename in new_mpr_files:
    # check_mpr_text() labels its output with the notebook-level `fname`,
    # so keep that variable in sync with the file being processed.
    fname = mpr_filename
    try:
        year, month, day = date_from_mpr_fname(mpr_filename)
    except ValueError:
        # No YYYY-MM-DD in the file name: ask for the date interactively.
        # Any other exception propagates unchanged.
        print("Enter the publication date for", mpr_filename)
        year = int(input("year:"))
        month = int(input("month:"))
        day = int(input("day:"))

    print("%4d-%02d-%02d" % (year, month, day), mpr_filename)

    # Read the raw file and close it before the checking and saving steps.
    with open(mpr_filename, "rt") as ff:
        raw_text = ff.read()

    text = clean_mpr_text(raw_text)
    check_mpr_text(text)
    save_mpr(text, year, month, day)

2019-01-09 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2019-01-09.txt
2019-04-24 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2019-04-24.txt
2019-07-10 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2019-07-10.txt
2019-10-30 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2019-10-30.txt
/Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2019-10-30.txt :  
2020-01-22 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-01-22.txt
2020-04-15 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-04-15.txt
2020-07-15 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-07-15.txt
2020-10-28 /Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-10-28.txt
/Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-10-28.txt :  2
/Users/kelstonchen/Documents/QueensMA/MA_Essay/mpr_text_raw/mpr-2020-10-28.txt : The pandemic is also expected to