In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import time
import re
from langdetect import detect

from collections import Counter

import reprlib

import gc

In [1]:
# Relative paths to the CSV inputs used by this notebook.
# The *_P constants point at the files whose names end in "_preprocessed".
FILENAME = "../data/train_series.csv"
FILENAME_ECB = "../data/ecb_data.csv"
FILENAME_FED = "../data/fed_data.csv"
FILENAME_ECB_P = "../data/ecb_data_preprocessed.csv"
FILENAME_FED_P = "../data/fed_data_preprocessed.csv"

In [3]:
def text_print(text, line_char_lim=150):
    """Print ``text`` wrapped to at most ``line_char_lim`` characters per line.

    The text is split on newline characters. Each piece is printed in
    consecutive slices of ``line_char_lim`` characters and is followed by a
    blank separator (a printed newline character, i.e. two line breaks).

    Args:
        text: String to display.
        line_char_lim: Maximum number of characters per printed line.
            Must be a positive integer.

    Raises:
        ValueError: If ``line_char_lim`` is not positive, because the
            slicing window could then never advance through the text.
    """
    if line_char_lim <= 0:
        raise ValueError("line_char_lim must be a positive integer")
    for subtext in text.split('\n'):
        # range() stops strictly before len(subtext), so a piece whose length
        # is an exact multiple of the limit no longer yields a trailing empty
        # slice (and therefore no spurious empty printed line).
        for start in range(0, len(subtext), line_char_lim):
            # Slicing clamps at the end of the string, no min() needed.
            print(subtext[start:start + line_char_lim])
        print('\n')

In [4]:
# Load the preprocessed FED CSV; its first column is used as the index.
fed = pd.read_csv(FILENAME_FED_P, index_col=0)

In [5]:
# Summary statistics of len() applied to every entry of the "text" column.
fed["text"].apply(len).describe()

count      739.000000
mean     22104.081191
std      10999.279224
min       2011.000000
25%      13922.000000
50%      22575.000000
75%      28200.500000
max      69492.000000
Name: text, dtype: float64

In [6]:
# View texts

In [6]:
def find_footnote(x):
    """Cut ``x`` off at its footnote marker, if one is found.

    The pattern is anchored at the start of ``x`` and is greedy, so the match
    extends to the last case-insensitive occurrence of "footnote" that can be
    reached without crossing a newline. Everything after that occurrence is
    dropped, every "footnote" word inside the kept part is deleted, and the
    result is stripped of surrounding whitespace.

    Args:
        x: Text to clean.

    Returns:
        The truncated, stripped text, or ``x`` unchanged when nothing matches.
    """
    match = re.match(r"(.*)footnote", x, re.IGNORECASE)
    if match is None:
        return x
    kept = match.group()
    return re.sub(re.escape('footnote'), "", kept, flags=re.IGNORECASE).strip()

In [7]:
# Apply find_footnote to every text so the effect of footnote removal
# on the amount of text can be measured.
without_footnote = fed["text"].apply(
    find_footnote
)

In [9]:
# Count missing values among the footnote-stripped texts.
without_footnote.isna().sum()

0

In [10]:
def find_useless_thanks(x):
    """Remove sentences that thank or congratulate someone.

    A "sentence" here is a run of non-period characters ending in a period
    that contains ``"thank "`` or ``" congratulat"`` (case-insensitive). Every
    literal occurrence of each matched sentence is deleted from the text.

    Args:
        x: Text to clean, or ``None``.

    Returns:
        The text with the matched sentences removed, or ``None`` when ``x``
        is ``None``.
    """
    # Guard first: previously a None input skipped the findall and then
    # crashed with UnboundLocalError on the undefined `found`.
    if x is None:
        return x
    # NOTE(review): the spaces inside the alternation are part of the pattern
    # ("thank " needs a trailing space, " congratulat" a leading one), so
    # e.g. "Thanks" is not matched. Kept as is to preserve results; confirm
    # whether that is intended.
    found = re.findall(r"([^.]*?(thank | congratulat)[^.]*\.)", x, re.IGNORECASE)
    res = x
    # Each findall item is (whole sentence, keyword); only the sentence is used.
    for sentence, _keyword in found:
        res = res.replace(sentence, "")
    return res

In [11]:
# Thank-you sentences carry no useful content, so strip them from the
# footnote-stripped texts.
without_thanks = without_footnote.apply(find_useless_thanks)

In [12]:
# Count missing values after the thanks-removal step.
without_thanks.isna().sum()

0

In [13]:
# Baseline: statistics of the number of space-separated pieces per loaded text.
fed["text"].str.split(" ").apply(len).describe()

count      739.000000
mean      3543.518268
std       1730.606814
min        309.000000
25%       2256.500000
50%       3592.000000
75%       4530.500000
max      10676.000000
Name: text, dtype: float64

In [14]:
# Same space-separated piece counts after footnote and thanks removal.
without_thanks.str.split(" ").apply(len).describe()

count      739.000000
mean      3435.964817
std       1693.646100
min        286.000000
25%       2175.500000
50%       3458.000000
75%       4461.000000
max      10676.000000
Name: text, dtype: float64

In [15]:
def remove_video_code(text):
    """Strip embedded video-player boilerplate from the text.

    Two things are removed: the span that starts at "Accessible Keys for
    Video" and runs through the second closing brace after the last
    ``myPlayer.play();``, and a leading "Watch Video" / "View Video" label.

    Args:
        text: Text to clean, or ``None``.

    Returns:
        The cleaned text stripped of surrounding whitespace, or ``None`` when
        ``text`` is ``None``.
    """
    if text is None:
        return text
    # Raw string: the pattern value is unchanged, but the backslash escapes
    # (\. \( \) \}) are no longer invalid string escape sequences.
    res = re.sub(
        r"Accessible Keys for Video.*myPlayer\.play\(\);(.*?)\}(.*?)\}", "", text
    ).strip()
    # The label is only removed at the very start, hence the strip() above.
    res = re.sub(r"^(Watch|View) Video", "", res)
    return res.strip()

In [16]:
def remove_refs_fed(text):
    """Drop the trailing reference material from a text.

    Removes everything from "References" to the end of its line, then a final
    "Return to text ..." span that reaches the end of the string, and strips
    surrounding whitespace.

    Args:
        text: Text to clean, or ``None``.

    Returns:
        The cleaned, stripped text, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return text
    without_references = re.sub(r'References.*', '', text)
    return re.sub(r'Return to text.*$', '', without_references).strip()

In [17]:
def website_remover(text):
    """Delete http/https URLs from the text.

    Args:
        text: Text to clean, or ``None``.

    Returns:
        The text with URLs removed and surrounding whitespace stripped, or
        ``None`` when ``text`` is ``None`` (same convention as the other
        cleaning helpers in this notebook).
    """
    # Guard added for consistency with the sibling cleaners; re.sub would
    # otherwise raise TypeError on None.
    if text is None:
        return text
    # scheme, optional "www" prefix, host characters, a dot plus a 2-6 letter
    # lowercase suffix, then any trailing path/query characters.
    regex = "((http|https)://)(www.)?" \
        + "[a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]" \
        + "{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)"
    res = re.sub(regex, "", text).strip()
    return res

In [27]:
def remove_greetings(text):
    """Strip opening greetings from a text.

    Applied in order: everything from the start of the text through the
    sentence containing "Good morning/afternoon/evening", everything from the
    start through the sentence containing "Ladies and gentlemen", and every
    "Hello" together with the character that follows it.

    Args:
        text: Text to clean, or ``None``.

    Returns:
        The cleaned, stripped text, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return text
    greeting_patterns = (
        r'^(.*?)Good (morning|afternoon|evening)[^.]*\.',
        r'^(.*?)Ladies and (g|G)entlemen[^.]*\.',
        # NOTE(review): the dot is unescaped, so this also removes "Hello,"
        # or "Hello " anywhere in the text - confirm that is intended.
        r'Hello.',
    )
    cleaned = text
    for pattern in greeting_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

In [28]:
def pipeline_fed(x, tolist=False):
    """Run every FED cleaning step on one row.

    Args:
        x: Row-like object providing ``"title"`` and ``"text"`` entries.
        tolist: Accepted but not used by this function.

    Returns:
        The cleaned text when ``x["text"]`` is a string; otherwise the value
        of ``x["title"]``.
    """
    fallback = x["title"]
    text = x["text"]
    # isinstance() already rejects None, so one check covers both conditions.
    if not isinstance(text, str):
        return fallback
    cleaning_steps = (
        find_footnote,
        remove_video_code,
        find_useless_thanks,
        remove_refs_fed,
        remove_greetings,
        website_remover,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned


In [29]:
# Run the full cleaning pipeline on each row (axis=1), giving one value per row.
fed_ = fed.apply(pipeline_fed, axis=1)

In [34]:
# Spot-check the pipeline output at position 279.
fed_.iloc[279]

'I will begin with a brief update on the outlook for the U.S. economy, then discuss recent developments in global commodity markets that are significantly affecting both the U.S. and world economies, and conclude with some thoughts on the prospects for monetary policy.       The Outlook for Growth        U.S. economic growth so far this year looks to have been somewhat slower than expected. Aggregate output increased at only'

In [35]:
# Print the uncleaned text at position 279 for comparison with the pipeline output.
text_print(fed["text"].iloc[279])

         I would like to thank the organizers for inviting me to participate once again in the International Monetary Conference. I will begin with a 
brief update on the outlook for the U.S. economy, then discuss recent developments in global commodity markets that are significantly affecting both t
he U.S. and world economies, and conclude with some thoughts on the prospects for monetary policy.       The Outlook for Growth        U.S. economic g
rowth so far this year looks to have been somewhat slower than expected. Aggregate output increased at only 1.8 percent at an annual rate in the first
 quarter, and supply chain disruptions associated with the earthquake and tsunami in Japan are hampering economic activity this quarter. A number of i
ndicators also suggest some loss of momentum in the labor market in recent weeks. We are, of course, monitoring these developments. That said, with th
e effects of the Japanese disaster on manufacturing output likely to dissipate in coming month

In [26]:
# Select the cleaned texts that still contain the greeting "Ladies and gentlemen".
weird_entries = fed_[fed_.str.contains("Ladies and gentlemen")]
weird_entries

Series([], dtype: object)

In [373]:
# NOTE(review): the execution count (373) is out of sequence, so this output
# presumably comes from an earlier definition of `weird_entries`; if
# `weird_entries` is empty, `.iloc[0]` raises IndexError - confirm on a fresh run.
weird_text = weird_entries.iloc[0]
weird_text

'The Federal Reserve is best known for its role in the national economy and monetary policy. But through the 12 Federal Reserve Banks across the country, it also gets involved in efforts to support local communities and their economies. This work helps to enhance our understanding of the pace of economic recovery and further creates a backdrop for a national dialogue about common problems and their potential solutions.              Over the last several years, every community across the country has felt the effects of the financial crisis. Foreclosed, vacant, and abandoned properties threaten neighborhoods nationwide, and community leaders are working to stabilize those neighborhoods. While the problem touches every community, it doesn\'t look the same in each because it\'s shaped by the circumstances that prevailed in those neighborhoods before the crisis hit.              Neighborhood stabilization efforts are critical, now more than ever, as not all communities will be stabilized wi

In [331]:
# NOTE(review): out-of-sequence execution count (331); the recorded output shows
# video-player code, so `weird_entries` presumably held uncleaned texts when
# this ran - confirm on a fresh run.
weird_entries.iloc[0]

'           Accessible Keys for Video [Space Bar] toggles play/pause; [Right/Left Arrows] seeks the video forwards and back (5 sec ); [Up/Down Arrows] increase/decrease volume; [M] toggles mute on/off; [F] toggles fullscreen on/off (Except IE 11); The [Tab] key may be used in combination with the [Enter/Return] key to navigate and activate control buttons, such as caption on/off.                  videojs(\'frb-video6917\').ready(function() {                 var myPlayer;                 myPlayer = this;                 myPlayer.on(\'loadstart\',function(){                   var videoInfo = "";                   var transcriptLinkLabel;                   if (myPlayer.mediainfo.custom_fields["actualdatetext"]) {                     videoInfo += "<span class=\'col-xs-6\'>" + myPlayer.mediainfo.custom_fields["actualdatetext"] + "</span>";                   }                                if (myPlayer.mediainfo.custom_fields["transcriptlinkurl"]) {                     if (myPlayer.mediainf

In [332]:
# NOTE(review): out-of-sequence execution count (332); requires `weird_entries`
# to hold at least three items - confirm on a fresh run.
weird_entries.iloc[2]

'           Accessible Keys for Video [Space Bar] toggles play/pause; [Right/Left Arrows] seeks the video forwards and back (5 sec ); [Up/Down Arrows] increase/decrease volume; [M] toggles mute on/off; [F] toggles fullscreen on/off (Except IE 11); The [Tab] key may be used in combination with the [Enter/Return] key to navigate and activate control buttons, such as caption on/off.                  videojs(\'frb-video6899\').ready(function() {                 var myPlayer;                 myPlayer = this;                 myPlayer.on(\'loadstart\',function(){                   var videoInfo = "";                   var transcriptLinkLabel;                   if (myPlayer.mediainfo.custom_fields["actualdatetext"]) {                     videoInfo += "<span class=\'col-xs-6\'>" + myPlayer.mediainfo.custom_fields["actualdatetext"] + "</span>";                   }                                if (myPlayer.mediainfo.custom_fields["transcriptlinkurl"]) {                     if (myPlayer.mediainf