In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root folder for every file this notebook reads or writes.
# Keep the trailing "/": later cells build paths by plain string concatenation
# (file_path + "sub/dir/file.csv"), not with pathlib.
file_path = "/Users/kelstonchen/Documents/QueensMA/MA_Essay/Data/"

In [2]:
# Load the merged MPR sentences. reset_index() turns the row number into an
# `index` column, which later cells use as the key when re-labelled rows are
# merged back in.
mpr_full_csv = file_path + "mpr_data_merged/mpr_full_V1.csv"
df = pd.read_csv(mpr_full_csv).reset_index()

## Select data with inflation in the text

In [3]:
# Keep only the rows whose text mentions "inflation" and which I classified as 2.
# The substring match is case-sensitive, so a sentence that only contains the
# capitalised "Inflation" is not selected.
# A boolean mask picks every matching row exactly once (in original row order),
# so no duplicate rows are created, and a missing `text` value simply counts
# as "no match" instead of raising.
mentions_inflation = df['text'].str.contains('inflation', regex=False, na=False)
classified_as_2 = df['class'] == 2

# .copy() so the date conversion below writes to an independent frame, not to df
inflation_only_data = df.loc[mentions_inflation & classified_as_2].copy()

# Change mpr date to datetime type, then format it as year-month,
# which allows us to match with the CPI data
inflation_only_data['mpr'] = pd.to_datetime(inflation_only_data['mpr']).dt.strftime('%Y-%m')

inflation_only_data

Unnamed: 0,index,text,class,mpr
1,1,These judgments include an assessment of the e...,2,1995-05
3,3,"First, in analysing aggregate measures of infl...",2,1995-05
6,6,"Thus, both inflation and price stability are i...",2,1995-05
8,8,In both situations - generalized inflation or ...,2,1995-05
10,10,"Second, given the numerous factors and the lag...",2,1995-05
...,...,...,...,...
19036,19036,Short-term inflation expectations are also pro...,2,2023-04
19037,19037,"As a result, inflation in services prices slow...",2,2023-04
19038,19038,The inflation outlook is subject to considerab...,2,2023-04
19039,19039,Risks to the inflation outlook have evolved si...,2,2023-04


We will merge the CPI data with the inflation text data, matching on year and month, so that we can get a sense of the inflationary environment in the month of each `mpr` date.

In [4]:
### IMPORT CPI DATA ###
cpi_csv = file_path + "re-labelling_data/cpi_common.csv"
cpi = pd.read_csv(cpi_csv)

# Parse the CPI dates, then reduce them to year-month strings
# (the same '%Y-%m' format used for the mpr dates)
cpi['date'] = pd.to_datetime(cpi['date']).dt.strftime('%Y-%m')

In [5]:
# Left join: every inflation sentence is kept and receives the CPI columns of
# its year-month. The `date` join key is not needed once the join is done.
export = (
    inflation_only_data
    .merge(cpi, how='left', left_on='mpr', right_on='date')
    .drop(columns='date')
)
export

Unnamed: 0,index,text,class,mpr,CPI_COMMON
0,1,These judgments include an assessment of the e...,2,1995-05,1.9
1,3,"First, in analysing aggregate measures of infl...",2,1995-05,1.9
2,6,"Thus, both inflation and price stability are i...",2,1995-05,1.9
3,8,In both situations - generalized inflation or ...,2,1995-05,1.9
4,10,"Second, given the numerous factors and the lag...",2,1995-05,1.9
...,...,...,...,...,...
2762,19036,Short-term inflation expectations are also pro...,2,2023-04,5.7
2763,19037,"As a result, inflation in services prices slow...",2,2023-04,5.7
2764,19038,The inflation outlook is subject to considerab...,2,2023-04,5.7
2765,19039,Risks to the inflation outlook have evolved si...,2,2023-04,5.7


In [6]:
### EXPORT DATA for review ###
# export.to_excel(Path.cwd().joinpath("Data", "re-label_inflation.xlsx"))

## Import the re-labelled data

We will merge the re-labelled data back with the original dataset, keeping all changes made to `class`.

In [7]:
# Only the row key (`index`) and the reviewed label (`class`) are read back
# from the re-labelled workbook; exact duplicate rows are dropped.
relabel_xlsx = file_path + "re-labelling_data/re-label_inflation_post.xlsx"
imported = pd.read_excel(relabel_xlsx, usecols=['index', 'class'])
imported = imported.drop_duplicates()
imported

Unnamed: 0,index,class
0,1,2
1,3,2
2,6,2
3,8,2
4,10,2
...,...,...
2762,19036,2
2763,19037,3
2764,19038,2
2765,19039,2


In [8]:
# `imported` holds only the row key (`index`) and the reviewed `class`.
# Joining on BOTH columns brings in no new information, so the reviewed labels
# would be lost and df's original `class` kept. Instead, look the reviewed
# class up by `index` alone and overwrite `class` wherever a review exists.
reviewed_class = (
    imported
    .dropna(subset=['class'])                      # a blank class means "no re-label"
    .drop_duplicates(subset='index', keep='last')  # one label per row; the last entry wins
    .set_index('index')['class']
)
was_reviewed = df['index'].isin(reviewed_class.index)

relabelled_data = df.copy()
relabelled_data.loc[was_reviewed, 'class'] = (
    relabelled_data.loc[was_reviewed, 'index'].map(reviewed_class).to_numpy()
)
# The helper key is not part of the exported dataset
relabelled_data = relabelled_data.drop('index', axis=1)
relabelled_data

Unnamed: 0,text,class,mpr
0,While the decision to focus monetary policy on...,2,1995-05-15
1,These judgments include an assessment of the e...,2,1995-05-15
2,"However, two other points need to be stressed ...",2,1995-05-15
3,"First, in analysing aggregate measures of infl...",2,1995-05-15
4,Inflation exists when generalized increases in...,2,1995-05-15
...,...,...,...
19049,The limited tightening in financial conditions...,1,2023-04-12
19050,"However, if global banking stresses intensify ...",1,2023-04-12
19051,"If this risk materializes, a more severe globa...",1,2023-04-12
19052,The Canadian economy would be affected through...,1,2023-04-12


In [9]:
### EXPORT relabelled data ###
# relabelled_data.to_csv(file_path.joinpath('mpr_full_R.csv'), index=False)

# Updating neutral sentiment labels

A list of neutral-sentiment phrases containing uncertainty words, constraining words, or weak or strong modal words was created for re-labelling.

This was done in the `dictionary_method` notebook.

In [10]:
### IMPORT data with added columns from dictionary method ###
# NOTE: this rebinds `df`; from here on `df` is the frame read from
# mpr_added_dict.csv, not the one loaded at the top of the notebook.
dict_csv = file_path + 'mpr_added_dict.csv'
df = pd.read_csv(dict_csv)

In [11]:
def update_labels(df, new_labels):
    """Overwrite `class` in `df` for every row whose text equals a re-labelled phrase.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `text` and a `class` column. It is modified IN PLACE.
    new_labels : pandas.DataFrame
        Frame with a `text` column (the phrase, matched by exact equality against
        df['text']) and a `class` column (the label to assign). Rows with a
        missing phrase or a missing class are ignored. If a phrase is listed
        more than once, its last listed class is used.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in (not a copy).
    """
    # A blank phrase or blank class carries no re-label; skipping those rows
    # avoids both a TypeError on NaN and writing NaN into `class`.
    labels = new_labels.dropna(subset=['text', 'class'])
    # Sequential assignment means the last entry for a phrase wins, so only
    # that entry needs to be applied.
    labels = labels.drop_duplicates(subset='text', keep='last')

    texts = df['text']
    # One pass per phrase: looping over every sentence as well is unnecessary,
    # because only rows whose text is exactly the phrase are ever written.
    for phrase, new_class in zip(labels['text'], labels['class']):
        df.loc[texts == phrase, 'class'] = new_class
    print("Updated!")
    return df

In [12]:
# Preview the first rows of the frame loaded from mpr_added_dict.csv
df.head()

Unnamed: 0,text,class,mpr,wordcount,NPositiveWords,NNegativeWords,NNeutralWords,NUncertainWords,NStrongWords,NWeakWords,NConstWords,Poswords,Negwords,Neuwords,Unwords,Strongwords,Weakwords,Conwords
0,While the decision to focus monetary policy on...,2,1995-05-15,27,1,0,26,0,0,0,1,stability,,while the decision to focus monetary policy on...,,,,require
1,These judgments include an assessment of the e...,2,1995-05-15,21,1,0,21,0,0,0,0,strength,,these judgments include an assessment of the e...,,,,
2,"However, two other points need to be stressed ...",2,1995-05-15,13,0,1,13,0,0,0,0,,stressed,however two other points need to be stressed b...,,,,
3,"First, in analysing aggregate measures of infl...",2,1995-05-15,25,0,0,25,0,0,0,0,,,first in analysing aggregate measures of infla...,,,,
4,Inflation exists when generalized increases in...,2,1995-05-15,15,0,1,15,0,0,0,0,,persist,inflation exists when generalized increases in...,,,,


In [13]:
only_neutral = df.loc[df['class'] == 2]

### This includes all neutral labelled data with uncertainty, modality and constraining words ###
flag_columns = ['NUncertainWords', 'NStrongWords', 'NWeakWords', 'NConstWords']
# True where at least one of the word counts is positive
has_flagged_word = (only_neutral[flag_columns] > 0).any(axis=1)
# Five most frequent flagged neutral sentences
only_neutral.loc[has_flagged_word, 'text'].value_counts().head(5)

# Export
# (only_neutral.loc[(df['NUncertainWords'] > 0) | (df['NStrongWords'] > 0) | (df['NWeakWords'] > 0) | (df['NConstWords'] > 0)]['text'].value_counts()
#  .to_excel("re-label_phrases__.xlsx"))

The uncertainty surrounding the Bank's inflation projection is illustrated using fan charts.                                                          15
This range is intended to convey a sense of forecast uncertainty.                                                                                     10
The outlook for inflation is subject to several risks emanating from both the external environment and the domestic economy.                          10
Relatedly, if there were a sudden weakening in the Canadian housing sector, it could have sizable spillover effects on other areas of the economy.     9
Overall, the Bank judges that the risks to the inflation outlook in Canada are roughly balanced over the projection period.                            7
Name: text, dtype: int64

In [14]:
## UPDATE DATA ##
relabel_path = file_path + 're-labelling_data/re-label_phrases.xlsx'
new_labels = pd.read_excel(relabel_path, usecols=['text', 'class'])
# update_labels writes into `df` itself and returns that same frame,
# so `df2` is another name for `df`, not a copy
df2 = update_labels(df, new_labels)

# Refresh the neutral-only subset for further queries
only_neutral = df2[df2['class'] == 2]

Updated!


### Updating neutral sentiment with positive words

In [15]:
## NEW QUERY ##
# Same query as before with the addition of positives

# (only_neutral.loc[((df2['NUncertainWords'] > 0) | (df2['NStrongWords'] > 0) | (df2['NWeakWords'] > 0) | (df2['NConstWords'] > 0)) & (df2['NPositiveWords'] > 1)]['text']
#  .value_counts()).to_excel(file_path + "re-labelling_data/re-label_very_pos.xlsx")

In [16]:
## UPDATE DATA ##
relabel_path = file_path + 're-labelling_data/re-label_very_pos.xlsx'
new_labels = pd.read_excel(relabel_path, usecols=['text', 'class'])
# update_labels writes into `df2` itself and returns that same frame,
# so `df3` is another name for `df2`, not a copy
df3 = update_labels(df2, new_labels)

# Refresh the neutral-only subset for further queries
only_neutral = df3[df3['class'] == 2]

Updated!


In [17]:
## NEW QUERY ##
# (only_neutral.loc[((df3['NUncertainWords'] > 0) | (df3['NStrongWords'] > 0) | (df3['NWeakWords'] > 0) | (df3['NConstWords'] > 0)) & (df3['NPositiveWords'] > df3['NNegativeWords'])]['text']
#  .value_counts()).to_excel(file_path + "re-labelling_data/re-label_pos.xlsx")

In [18]:
## UPDATE DATA ##
relabel_path = file_path + 're-labelling_data/re-label_pos.xlsx'
new_labels = pd.read_excel(relabel_path, usecols=['text', 'class'])
# update_labels writes into `df3` itself and returns that same frame,
# so `df4` is another name for `df3`, not a copy
df4 = update_labels(df3, new_labels)

# Refresh the neutral-only subset for further queries
only_neutral = df4[df4['class'] == 2]

Updated!


### Updating neutral sentiment with negative words

In [19]:
## NEW QUERY ##
# (only_neutral.loc[((df3['NUncertainWords'] > 0) | (df3['NStrongWords'] > 0) | (df3['NWeakWords'] > 0) | (df3['NConstWords'] > 0)) & (df3['NPositiveWords']+2 < df3['NNegativeWords'])]['text']
#  .value_counts()).to_excel(file_path + "re-labelling_data/re-label_very_neg.xlsx")

In [20]:
## UPDATE DATA ##
relabel_path = file_path + 're-labelling_data/re-label_very_neg.xlsx'
new_labels = pd.read_excel(relabel_path, usecols=['text', 'class'])
# update_labels writes into `df4` itself and returns that same frame,
# so `df5` is another name for `df4`, not a copy
df5 = update_labels(df4, new_labels)

# Refresh the neutral-only subset for further queries
only_neutral = df5[df5['class'] == 2]

Updated!


In [21]:
## NEW QUERY ##
# (only_neutral.loc[((df3['NUncertainWords'] > 0) | (df3['NStrongWords'] > 0) | (df3['NWeakWords'] > 0) | (df3['NConstWords'] > 0)) & (df3['NPositiveWords']+1 < df3['NNegativeWords'])]['text']
#  .value_counts()).to_excel(file_path + "re-labelling_data/re-label_neg.xlsx")

In [22]:
## UPDATE DATA ##
relabel_path = file_path + 're-labelling_data/re-label_neg.xlsx'
new_labels = pd.read_excel(relabel_path, usecols=['text', 'class'])
# update_labels writes into `df5` itself and returns that same frame,
# so `df6` is another name for `df5`, not a copy
df6 = update_labels(df5, new_labels)

# Refresh the neutral-only subset for further queries
only_neutral = df6[df6['class'] == 2]

Updated!


### Export data

In [26]:
### EXPORT relabelled data ###
# Take an independent snapshot of the fully updated frame under its export name
relabelled_data = df6.copy(deep=True)
# relabelled_data.to_csv(file_path + 'mpr_data_merged/mpr_full_R2D.csv', index=False)