## Preparation: We load the necessary libraries

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive. The google.colab module is only
# available inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##  1. Parsing of Speech Text into Individual Sentences

In [None]:
# We load the CSV containing the scraped speeches (last update: January 2023, so all speeches from 2022 are included).

# Set `filepath` to the speeches file to process, for example 'CentralBankRoBERTa/Data/ECB/all_ECB_speeches.csv', 'CentralBankRoBERTa/Data/FED//all_FED_speeches.csv' or 'CentralBankRoBERTa/Data/BIS//all_BIS_speeches.csv'.

filepath = 'CentralBankRoBERTa/Data/FED//all_FED_speeches.csv' # Example only: adjust the directory to fit your system
# NOTE(review): the file is decoded as latin-1. If the CSV is actually UTF-8 encoded,
# typographic characters (e.g. curly apostrophes) are turned into sequences such as "â"
# followed by control characters -- confirm the real encoding of the file.
data = pd.read_csv(filepath, encoding="latin-1")

In [None]:
# We give every speech a doc_id (1-based, in file order).
# The whole column is assigned at once: the chained form data['doc_id'][i] = ...
# raises SettingWithCopyWarning, is not guaranteed to write back into `data`,
# and needs one Python-level assignment per row.
data['doc_id'] = range(1, len(data) + 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['doc_id'][i] = int(i)+1


In [None]:
# Take the speech texts as a Series with pandas' "string" dtype and preview the first rows
text = data['text'].astype("string")
text.head()

0    For release on delivery9:00 A.M., E.S.T.April ...
1    For release on delivery 10:00 a.m. EDT June 4,...
2    For Release on Delivery4:00 P.M., EDTJune 1986...
3    For release on delivery 9:30 a.m. E.D.T.June 1...
4    For release on delivery 12:3t) p.m., E.D.T.Sep...
Name: text, dtype: string

In [None]:
# We split every speech into sentences with NLTK's sent_tokenize.
# Speeches that yield no sentences (empty or missing text) are skipped; the number
# printed for a skipped speech is its 0-based row position in `data`.
# Every parsed sentence gets a number ("Sentence_Rank") that indicates the 0-based
# position at which it occurred in the speech.
parsed_columns = ["Parsed_Text", "Speaker", "Date", "Location", "Doc_ID", "Sentence_Rank"]
parsed_frames = []
for index, speech in enumerate(text):
    # A missing text (pd.NA) cannot be tokenized, so it is treated like an empty speech.
    sentences = [] if pd.isna(speech) else sent_tokenize(speech)
    if not sentences:
        print("we skipped speech #", index)
        continue
    # The scalar metadata values are broadcast to every sentence of the speech.
    parsed_frames.append(pd.DataFrame({
        "Parsed_Text": sentences,
        "Speaker": data['author'].iloc[index],
        "Date": data['date'].iloc[index],
        "Location": data['location'].iloc[index],
        "Doc_ID": data['doc_id'].iloc[index],
        "Sentence_Rank": list(range(len(sentences))),
    }, columns=parsed_columns))

# Concatenate once at the end: growing the result with pd.concat inside the loop
# copies all previously parsed rows on every iteration (quadratic cost), and starting
# from an empty frame without a Sentence_Rank column turned the ranks into floats.
if parsed_frames:
    new_df_parsed = pd.concat(parsed_frames, ignore_index=True, sort=False)
else:
    new_df_parsed = pd.DataFrame(columns=parsed_columns)
new_df_parsed

we skipped speech # 615


Unnamed: 0,Parsed_Text,Speaker,Date,Location,Doc_ID,Sentence_Rank
0,"For release on delivery9:00 A.M., E.S.T.April ...","Johnson, Manuel H.",1986-04-26,Florida,1,0.0
1,"April 26, 1986 Remarks by Manuel H. Johnson, M...","Johnson, Manuel H.",1986-04-26,Florida,1,1.0
2,"It is also an opportunity for graduates, their...","Johnson, Manuel H.",1986-04-26,Florida,1,2.0
3,"If I may, Ifd like to take this opportunity of...","Johnson, Manuel H.",1986-04-26,Florida,1,3.0
4,When the freshman class entered college in the...,"Johnson, Manuel H.",1986-04-26,Florida,1,4.0
...,...,...,...,...,...,...
858390,We have a fabulous lineup today.,Jim Bullard,2022-06-29,,6764,17.0
858391,Our first panel will establish a fact base of ...,Jim Bullard,2022-06-29,,6764,18.0
858392,Weâll close out today with remarks from my S...,Jim Bullard,2022-06-29,,6764,19.0
858393,Weâre also eager to engage all of you in tod...,Jim Bullard,2022-06-29,,6764,20.0


In [None]:
# Work on an independent copy: a plain alias would make the in-place column
# assignments of the cleaning steps modify new_df_parsed as well.
df = new_df_parsed.copy()

##  2. Text Cleaning

In [None]:
# We use Regex to remove special characters, i.e. every character that is neither
# a word character (\w) nor whitespace (\s).
# The vectorized .str accessor replaces the per-row lambda and passes missing
# values through instead of raising a TypeError inside re.sub.
df['Parsed_Text'] = df['Parsed_Text'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# We remove all sentences that are shorter than 20 characters.
# .copy() turns the filtered selection into an independent DataFrame, so the column
# assignments in the following cleaning cells no longer write to a slice of the
# unfiltered frame (the cause of the SettingWithCopyWarning messages).
df = df[df['Parsed_Text'].str.len() >= 20].copy()

In [None]:
# Remove common redundant strings.
# Each entry is (regex pattern, replacement, re flags). The entries are applied in
# this order, which matters: the combined 'U0080U00xx' patterns must run before the
# shorter 'U0080' / 'U009' patterns that would otherwise consume part of them.
redundant_patterns = [
    ('cid173', '', 0),
    # The sentences have not been lowercased yet at this point, so this all-lowercase
    # footer pattern is matched case-insensitively; a case-sensitive match can never
    # hit a footer that contains capital letters.
    ('digitized for fraser httpfraserstlouisfedorg federal reserve bank of st louis', '', re.IGNORECASE),
    # presumably the residue of an escaped "ffi" ligature -- TODO confirm against the raw data
    ('UFB03', 'ffi', 0),
    ('U0080U0099', '', 0),
    ('U0080U0091', '', 0),
    ('U0080U0094', '', 0),
    ('U0080', '', 0),
    ('U0099', '', 0),
    ('U0093', '', 0),
    ('U009', '', 0),
]

cleaned_text = df['Parsed_Text']
for pattern, replacement, flags in redundant_patterns:
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=True, flags=flags)

# assign() builds a new DataFrame instead of writing into a possibly sliced one,
# which avoids SettingWithCopyWarning.
df = df.assign(Parsed_Text=cleaned_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Parsed_Text'] = df['Parsed_Text'].apply(lambda x: re.sub('cid173', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Parsed_Text'] = df['Parsed_Text'].apply(lambda x: re.sub('digitized for fraser httpfraserstlouisfedorg federal reserve bank of st louis', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [None]:
# We make all text lowercase.
# .str.lower() is the vectorized equivalent of map(str.lower) and passes missing
# values through; assign() returns a new DataFrame, so nothing is written into a
# slice and no SettingWithCopyWarning is raised.
df = df.assign(Parsed_Text=df['Parsed_Text'].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Parsed_Text'] = df['Parsed_Text'].map(str.lower)


In [None]:
# We remove all duplicate sentences, keeping the first occurrence of each text
df = df.drop_duplicates(subset='Parsed_Text', keep='first')

In [None]:
# Renumber the rows from 0; the previous row labels are kept as a regular column
df = df.reset_index(drop=False)

In [None]:
# Remove the 'index' column
df = df.drop(columns='index')

In [None]:
# We save the data
savepath = ''  # Enter your savepath here, e.g. 'path/to/parsed_speeches.csv'
if not savepath:
    # An empty placeholder keeps the cell valid Python (a bare "savepath = # ..." is a
    # SyntaxError) and fails with an actionable message until a path is provided.
    raise ValueError("Set `savepath` to the location of the output CSV before running this cell.")
df.to_csv(savepath)
# The pre-processing is finished