In [7]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Base data directory (note the trailing slash): other cells build file paths
# by concatenating relative names onto this string.
data_file = '/content/drive/MyDrive/capstone-pimco/Part1/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Introduction

In this notebook we identify the directionality words in each sentence. We use two approaches: the first matches sentence words against dictionary words by comparing their stems; the second matches them by embedding similarity.

## Import data

In [2]:
# Topic identifiers, one per line; the order is the processing order used by
# every loop that iterates over `topics`.
topics = [
    'credit',
    'fed_funds_rate',
    'financial_markets',
    'geopolitical_uncertainty',
    'growth',
    'housing',
    'inflation',
    'labor_market',
    'liquidity_measures',
    'quantitative_easing',
]

In [8]:
# Read one feature-contribution CSV per topic into a dict keyed by topic name.
topic_dfs = {}
for topic in topics:
    csv_path = f"{data_file}feature_contribution_train/{topic}.csv"
    # The first CSV column is used as the DataFrame index.
    topic_dfs[topic] = pd.read_csv(csv_path, index_col=0)

Import the dictionary which records the directionality words and their direction (-1 or 1).

In [4]:
# Load the directionality dictionary from the shared data directory. The path
# is built from `data_file` (as every other cell does) instead of repeating the
# absolute Drive path, so relocating the data only requires changing one place.
direction_words = pd.read_excel(data_file + "Directionality_dictionary_alltopic_v2.xlsx")
# Store the direction column as integers.
direction_words['direction'] = direction_words['direction'].astype('int')

In [9]:
# Display the loaded dictionary (columns: sign_word, direction).
direction_words

Unnamed: 0,sign_word,direction
0,healthy,1
1,unhealthy,-1
2,strengthen,1
3,weaken,-1
4,boost,1
...,...,...
150,ease,-1
151,broader,1
152,advance,1
153,recede,-1


In [10]:
# Group the recorded directions under each sign word: word -> list of directions.
# A word that appears on several rows accumulates one entry per row.
direction_dictionary = {}
sign_word_column = direction_words['sign_word'].to_numpy()
direction_column = direction_words['direction'].to_numpy()
for sign_word, direction in zip(sign_word_column, direction_column):
    direction_dictionary.setdefault(sign_word, []).append(direction)

In [11]:
# Print every word that is recorded with more than one distinct direction
# (i.e. both positive and negative). Nothing is removed here.
for word, word_directions in direction_dictionary.items():
    distinct_directions = set(word_directions)
    if len(distinct_directions) > 1:
        print(word)

ease


In [12]:
# Keep only entries whose recorded directions all agree and that consist of a
# single whitespace-separated token (multi-word entries such as bigrams are dropped).
direction_dictionary_all = {}
for word, word_directions in direction_dictionary.items():
    if len(set(word_directions)) != 1:
        continue
    if len(word.split()) != 1:
        continue
    direction_dictionary_all[word] = word_directions[0]

In [13]:
# Tabulate the filtered dictionary: one row per word with its single direction.
direction_dictionary_all_df = pd.DataFrame(
    list(direction_dictionary_all.items()),
    columns=['word', 'direction'],
)
direction_dictionary_all_df

Unnamed: 0,word,direction
0,healthy,1
1,unhealthy,-1
2,strengthen,1
3,weaken,-1
4,boost,1
...,...,...
148,looseness,-1
149,broader,1
150,advance,1
151,recede,-1


In [15]:
# Persist the filtered dictionary next to the other data files.
dictionary_csv_path = data_file + "directionality_dictionary_all_df.csv"
direction_dictionary_all_df.to_csv(dictionary_csv_path)

In [16]:
# Human-readable topic labels (space-separated), in the same order as `topics`.
classes = [
    'credit',
    'fed funds rate',
    'financial markets',
    'geopolitical uncertainty',
    'growth',
    'housing',
    'inflation',
    'labor market',
    'liquidity measures',
    'quantitative easing',
]

In [17]:
from tqdm import tqdm

## Stemming

For every word in the sentence, if its stem is equal to the stem of any word in the dictionary, then this word is one of the directionality words for the sentence and its direction is the direction of corresponding word in the dictionary.

If a negation word (e.g. "not" or "cannot") appears within the two words preceding a directionality word, we flag it in `negative_words` so that the word's direction can be reversed.

In [18]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Shared Porter stemmer; used to stem both the dictionary words and the
# sentence words so they can be compared stem-to-stem.
ps = PorterStemmer()

In [19]:
# Negation markers: two whole words plus the contraction suffix "n't".
negative_words = [
    "cannot",
    "not",
    "n't",
]

In [20]:
from collections import defaultdict

In [21]:
# Collect, for each Porter stem, the directions of every dictionary word that
# reduces to that stem: stem -> list of directions.
direction_lemms_dictionary = defaultdict(list)
for word, direction in direction_dictionary_all.items():
    stem = ps.stem(word)
    direction_lemms_dictionary[stem].append(direction)
# Print any stem whose contributing words disagree on direction.
for stem, stem_directions in direction_lemms_dictionary.items():
    if len(set(stem_directions)) > 1:
        print(stem)

In [22]:
import re

Print out the stem of each word in an example sentence.

In [23]:
# Demo: tokenize one sentence (tokens are runs of two or more word characters)
# and show the Porter stem of every token.
sample_sentence = "The unemployment rate declined to percent in December, about a year sooner than forecasters were expecting when the year began."
sample_tokens = re.findall(r'(?u)\b\w\w+\b', sample_sentence)
[ps.stem(token) for token in sample_tokens]

['the',
 'unemploy',
 'rate',
 'declin',
 'to',
 'percent',
 'in',
 'decemb',
 'about',
 'year',
 'sooner',
 'than',
 'forecast',
 'were',
 'expect',
 'when',
 'the',
 'year',
 'began']

For each sentence, we return 6 lists (because there might be more than one directionality word in one sentence):

1. list of indices of directionality words

2. list of indices of corresponding dictionary words

3. list of directionality words

4. list of corresponding dictionary words

5. list of negation flags (0 or 1), one per directionality word: the number of negative words in front of the directionality word, taken modulo 2. We only consider the two words preceding the directionality word.

6. list of directionalities

In [24]:
# Stem-matching pass: for every topic, tag each sentence with the dictionary
# stems it contains and write the annotated frame to CSV.
direction_lemms = list(direction_lemms_dictionary.keys())
# Stem -> position in `direction_lemms`. Gives constant-time membership tests
# and the same index `direction_lemms.index(stem)` would return (stems are
# unique dictionary keys).
direction_lemm_positions = {lemm: pos for pos, lemm in enumerate(direction_lemms)}
# Tokens are runs of two or more word characters, so the apostrophe of a
# contraction ("don't") is never part of a token: the token is just "don".
token_pattern = re.compile(r'(?u)\b\w\w+\b')
# Matches the "'t" that follows such a token in the raw sentence
# (straight or typographic apostrophe).
contraction_tail = re.compile(r"['\u2019]t\b")
result_columns = ['direction_idx', 'similar_idx', 'direction_words',
                  'similar_words', 'negative_words', 'directionalities']
for topic in topics:
    print(topic)
    topic_df = topic_dfs[topic].copy()
    # Start every result column as a column of empty lists (one list per row).
    for column in result_columns:
        topic_df[column] = np.empty((len(topic_df), 0)).tolist()
    for i in tqdm(topic_df.index):
        sentence = topic_df.loc[i, 'cleaned_text']
        matches = list(token_pattern.finditer(sentence))
        text = [match.group() for match in matches]
        # Per-token negation flag. Because the tokenizer drops "'t", a check
        # on the token alone can never see "n't"; instead look at the raw
        # sentence right after a token ending in "n" (don't, isn't, can't, ...).
        # Whole-word negations are compared case-insensitively.
        is_negation = [
            match.group().lower() in ("cannot", "not")
            or (match.group().lower().endswith("n")
                and contraction_tail.match(sentence, match.end()) is not None)
            for match in matches
        ]
        # stems of sentence words
        word_lemms = [ps.stem(word) for word in text]
        # result[0]: dictionary-stem indices, result[1]: token indices,
        # result[2]: negation parity (0/1) for each match.
        result = [[], [], []]
        for j, word in enumerate(word_lemms):
            # if the stem is equal to the stem of any word in the dictionary
            if word in direction_lemm_positions:
                result[0].append(direction_lemm_positions[word])
                result[1].append(j)
                # parity of the negations among the two preceding tokens
                result[2].append(sum(is_negation[max(0, j - 2):j]) % 2)

        topic_df.at[i, 'similar_idx'] = result[1]
        topic_df.at[i, 'direction_idx'] = result[0]
        topic_df.at[i, 'similar_words'] = [text[token_pos] for token_pos in result[1]]
        topic_df.at[i, 'direction_words'] = [direction_lemms[lemm_pos] for lemm_pos in result[0]]
        topic_df.at[i, 'negative_words'] = result[2]
        # A stem's direction is the first direction recorded for it.
        topic_df.at[i, 'directionalities'] = [direction_lemms_dictionary[direction_lemms[lemm_pos]][0] for lemm_pos in result[0]]

    topic_df.to_csv(data_file+'direction_words_train_stem/directionwords_' + topic + '.csv')

credit


100%|██████████| 100/100 [00:00<00:00, 1061.48it/s]


fed_funds_rate


100%|██████████| 100/100 [00:00<00:00, 1354.94it/s]


financial_markets


100%|██████████| 100/100 [00:00<00:00, 1339.11it/s]


geopolitical_uncertainty


100%|██████████| 100/100 [00:00<00:00, 1553.39it/s]


growth


100%|██████████| 100/100 [00:00<00:00, 1624.93it/s]


housing


100%|██████████| 100/100 [00:00<00:00, 1436.30it/s]


inflation


100%|██████████| 100/100 [00:00<00:00, 1315.29it/s]


labor_market


100%|██████████| 100/100 [00:00<00:00, 1489.91it/s]


liquidity_measures


100%|██████████| 100/100 [00:00<00:00, 1061.51it/s]


quantitative_easing


100%|██████████| 100/100 [00:00<00:00, 1433.44it/s]


## Similarities

In the second approach, we compute spaCy embedding similarities between sentence words and dictionary words, and select the sentence words whose cosine similarity to a dictionary word is larger than a designated threshold.

In [25]:
import spacy

In [26]:
# Downloading the small model containing tensors.
# NOTE(review): only en_core_web_lg is loaded in the cells shown in this
# notebook; confirm whether the small model is still needed.
!python -m spacy download en_core_web_sm

# Downloading over 1 million word vectors.
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.1 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=f17b22

In [27]:
import en_core_web_lg
# Load the large English pipeline; `nlp(word).vector` is used by the
# similarity cells to embed individual words.
nlp = en_core_web_lg.load()

In [28]:
import warnings
# Silence all Python warnings from here on.
# NOTE(review): this is a blanket filter, so it also hides any numerical
# warnings raised while computing similarities — consider narrowing it.
warnings.filterwarnings("ignore")

For each sentence, we return 7 lists (because there might be more than one directionality word in one sentence):

1. list of indices of directionality words

2. list of indices of corresponding dictionary words

3. list of directionality words

4. list of corresponding dictionary words

5. list of similarities (between 0 and 1)

6. list of negation flags (0 or 1), one per directionality word: the number of negative words in front of the directionality word, taken modulo 2. We only consider the two words preceding the directionality word.

7. list of directionalities

### Similarity Threshold = 0.8

In [None]:
# Similarity pass (threshold 0.8): for every topic, pair each sentence word with
# every dictionary entry whose embedding cosine similarity exceeds `threshold`,
# then write the annotated frame to CSV.
threshold = 0.8
# The dictionary embeddings do not depend on the topic, so compute them once.
# NOTE(review): `direction_lemms` holds Porter stems, not the original
# dictionary words; a stem that is not a real word may have no usable vector
# in the model and then never matches — confirm this is intended.
direction_embeddings = np.array([nlp(word).vector for word in direction_lemms])
result_columns = ['direction_idx', 'similar_idx', 'direction_words', 'similar_words',
                  'similarities', 'negative_words', 'directionalities']
for topic in topics:
    print(topic)
    topic_df = topic_dfs[topic].copy()
    # Start every result column as a column of empty lists (one list per row).
    for column in result_columns:
        topic_df[column] = np.empty((len(topic_df), 0)).tolist()
    for i in tqdm(topic_df.index):
        # NOTE(review): this cell tokenizes on whitespace, whereas the
        # threshold-0.6 cell uses a regex tokenizer, so token indices are not
        # comparable between the two outputs — confirm which is intended.
        raw_tokens = topic_df.loc[i, 'cleaned_text'].split()
        text = [re.sub(r'[^\w\s]', '', word) for word in raw_tokens]
        if not text:
            # No tokens: nothing to embed (cdist needs 2-D input); the row
            # keeps its empty lists.
            continue
        # Per-token negation flag, computed from the raw tokens because the
        # punctuation stripping above removes the apostrophe of "n't".
        is_negation = []
        for raw_token in raw_tokens:
            normalized = re.sub(r"[^\w'\u2019]", '', raw_token).lower()
            is_negation.append(
                normalized in ("cannot", "not")
                or normalized.endswith(("n't", "n\u2019t"))
            )
        word_embeddings = np.array([nlp(word).vector for word in text])
        # sim[d, w]: cosine similarity between dictionary entry d and token w
        sim = 1 - cdist(direction_embeddings, word_embeddings, 'cosine')
        # find words with similarities over the threshold
        # result[0]: dictionary indices, result[1]: token indices
        result = list(np.where(sim > threshold))
        # result[2]: parity (0/1) of negations among the two preceding tokens
        result.append([sum(is_negation[max(0, j - 2):j]) % 2 for j in result[1]])
        topic_df.at[i, 'similar_idx'] = list(result[1])
        topic_df.at[i, 'direction_idx'] = list(result[0])
        topic_df.at[i, 'similar_words'] = [text[token_pos] for token_pos in result[1]]
        topic_df.at[i, 'direction_words'] = [direction_lemms[lemm_pos] for lemm_pos in result[0]]
        topic_df.at[i, 'similarities'] = [sim[lemm_pos, token_pos] for lemm_pos, token_pos in zip(result[0], result[1])]
        topic_df.at[i, 'negative_words'] = result[2]
        # A stem's direction is the first direction recorded for it.
        topic_df.at[i, 'directionalities'] = [direction_lemms_dictionary[direction_lemms[lemm_pos]][0] for lemm_pos in result[0]]

    topic_df.to_csv(data_file+'direction_words_train0.8/directionwords_' + topic + '.csv')

credit


100%|██████████| 100/100 [00:21<00:00,  4.64it/s]


fed_funds_rate


100%|██████████| 100/100 [00:21<00:00,  4.62it/s]


financial_markets


100%|██████████| 100/100 [00:16<00:00,  6.20it/s]


geopolitical_uncertainty


100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


growth


100%|██████████| 100/100 [00:13<00:00,  7.68it/s]


housing


100%|██████████| 100/100 [00:16<00:00,  6.08it/s]


inflation


100%|██████████| 100/100 [00:17<00:00,  5.78it/s]


labor_market


100%|██████████| 100/100 [00:17<00:00,  5.86it/s]


liquidity_measures


100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


quantitative_easing


100%|██████████| 100/100 [00:18<00:00,  5.32it/s]


### Similarity Threshold = 0.6

In [None]:
# Similarity pass (threshold 0.6): for every topic, pair each sentence word with
# every dictionary entry whose embedding cosine similarity exceeds `threshold`,
# then write the annotated frame to CSV.
threshold = 0.6
# The dictionary embeddings do not depend on the topic, so compute them once.
# NOTE(review): `direction_lemms` holds Porter stems, not the original
# dictionary words; a stem that is not a real word may have no usable vector
# in the model and then never matches — confirm this is intended.
direction_embeddings = np.array([nlp(word).vector for word in direction_lemms])
# Tokens are runs of two or more word characters, so the apostrophe of a
# contraction ("don't") is never part of a token: the token is just "don".
token_pattern = re.compile(r'(?u)\b\w\w+\b')
# Matches the "'t" that follows such a token in the raw sentence
# (straight or typographic apostrophe).
contraction_tail = re.compile(r"['\u2019]t\b")
result_columns = ['direction_idx', 'similar_idx', 'direction_words', 'similar_words',
                  'similarities', 'negative_words', 'directionalities']
for topic in topics:
    print(topic)
    topic_df = topic_dfs[topic].copy()
    # Start every result column as a column of empty lists (one list per row).
    for column in result_columns:
        topic_df[column] = np.empty((len(topic_df), 0)).tolist()
    for i in tqdm(topic_df.index):
        sentence = topic_df.loc[i, 'cleaned_text']
        matches = list(token_pattern.finditer(sentence))
        text = [match.group() for match in matches]
        if not text:
            # No tokens: nothing to embed (cdist needs 2-D input); the row
            # keeps its empty lists.
            continue
        # Per-token negation flag. Because the tokenizer drops "'t", a check
        # on the token alone can never see "n't"; instead look at the raw
        # sentence right after a token ending in "n" (don't, isn't, can't, ...).
        # Whole-word negations are compared case-insensitively.
        is_negation = [
            match.group().lower() in ("cannot", "not")
            or (match.group().lower().endswith("n")
                and contraction_tail.match(sentence, match.end()) is not None)
            for match in matches
        ]
        word_embeddings = np.array([nlp(word).vector for word in text])
        # sim[d, w]: cosine similarity between dictionary entry d and token w
        sim = 1 - cdist(direction_embeddings, word_embeddings, 'cosine')
        # result[0]: dictionary indices, result[1]: token indices
        result = list(np.where(sim > threshold))
        # result[2]: parity (0/1) of negations among the two preceding tokens
        result.append([sum(is_negation[max(0, j - 2):j]) % 2 for j in result[1]])
        topic_df.at[i, 'similar_idx'] = list(result[1])
        topic_df.at[i, 'direction_idx'] = list(result[0])
        topic_df.at[i, 'similar_words'] = [text[token_pos] for token_pos in result[1]]
        topic_df.at[i, 'direction_words'] = [direction_lemms[lemm_pos] for lemm_pos in result[0]]
        topic_df.at[i, 'similarities'] = [sim[lemm_pos, token_pos] for lemm_pos, token_pos in zip(result[0], result[1])]
        topic_df.at[i, 'negative_words'] = result[2]
        # A stem's direction is the first direction recorded for it.
        topic_df.at[i, 'directionalities'] = [direction_lemms_dictionary[direction_lemms[lemm_pos]][0] for lemm_pos in result[0]]

    topic_df.to_csv(data_file+'direction_words_train0.6/directionwords_' + topic + '.csv')

credit


100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


fed_funds_rate


100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


financial_markets


100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


geopolitical_uncertainty


100%|██████████| 100/100 [00:20<00:00,  5.00it/s]


growth


100%|██████████| 100/100 [00:17<00:00,  5.88it/s]


housing


100%|██████████| 100/100 [00:21<00:00,  4.62it/s]


inflation


100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


labor_market


100%|██████████| 100/100 [00:22<00:00,  4.47it/s]


liquidity_measures


100%|██████████| 100/100 [00:29<00:00,  3.33it/s]


quantitative_easing


100%|██████████| 100/100 [00:25<00:00,  3.95it/s]


In [None]:
# Display the frame left over from the last loop iteration above
# (`topic_df` is reassigned for every topic, so this is the final topic only).
topic_df

Unnamed: 0,uncleaned_text,text,topic,direction,true_topic,feature_contribution,direction_idx,similar_idx,direction_words,similar_words,similarities,negative_words,directionalities
0,"In either case, I think we should consider alt...","In either case, I think we should consider alt...",quantitative easing,1,,"[('holding', 0.04430109550158745), ('longer', ...","[20, 31, 32, 32, 32, 32, 32, 32, 32, 32, 116, ...","[3, 3, 1, 3, 4, 5, 8, 9, 17, 21, 14, 21, 17, 2...","[bad, good, well, well, well, well, well, well...","[think, think, either, think, we, should, such...","[0.6755347826201619, 0.6793030600094677, 0.644...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, ..."
1,The basic idea is that the repo borrower has t...,The basic idea is that the repo borrower has t...,quantitative easing,0,,"[('treasury', 0.03600305203662834), ('committe...","[20, 20, 25, 31, 31, 31, 31, 31, 32, 32, 32, 3...","[15, 16, 32, 2, 4, 11, 15, 16, 4, 15, 16, 28, ...","[bad, bad, profit, good, good, good, good, goo...","[because, it, cash, idea, that, little, becaus...","[0.7196452770154518, 0.6510833586164078, 0.645...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, ..."
2,Other studies draw on changes in the net suppl...,Other studies draw on changes in the net suppl...,quantitative easing,0,,"[('program', 0.04188938712156522), ('treasury'...","[32, 32, 38, 118, 124, 195, 195, 196, 196]","[0, 23, 22, 22, 0, 20, 25, 20, 25]","[well, well, turmoil, downturn, more, up, up, ...","[Other, and, crisis, crisis, Other, before, be...","[0.6628392858358015, 0.6822302916323205, 0.729...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, -1, -1, 1, 1, 1, -1, -1]"
3,Because the Fed already transacts about half i...,Because the Fed already transacts about half i...,quantitative easing,-1,,"[('treasury', 0.03600305203662834), ('committe...","[20, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 3...","[0, 0, 18, 0, 3, 10, 12, 13, 16, 18, 22, 24, 2...","[bad, good, good, well, well, well, well, well...","[Because, Because, would, Because, already, wi...","[0.7196452770154518, 0.6577158458847332, 0.630...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
4,The rationale for those tools was straightforw...,The rationale for those tools was straightforw...,quantitative easing,1,,"[('longer', 0.03476823201837611), ('committee'...","[20, 20, 26, 29, 31, 31, 31, 32, 32, 32, 32, 3...","[18, 47, 18, 48, 18, 47, 48, 3, 18, 33, 45, 47...","[bad, bad, lose, strong, good, good, good, wel...","[they, still, they, well, they, still, well, t...","[0.6185703755683117, 0.6327601415182633, 0.615...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Although much of the outflow went into governm...,Although much of the outflow went into governm...,quantitative easing,1,,"[('backed', 0.04774641031150915), ('agency', 0...","[20, 20, 25, 31, 31, 32, 32, 32, 32, 32, 32, 3...","[1, 28, 8, 1, 29, 0, 1, 12, 14, 16, 19, 24, 26...","[bad, bad, profit, good, good, well, well, wel...","[much, did, money, much, little, Although, muc...","[0.6682475792021424, 0.6074733700961459, 0.635...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
96,Although private markets have used Treasury se...,Although private markets have used Treasury se...,quantitative easing,0,,"[('treasury', 0.03600305203662834), ('committe...","[32, 32, 32, 32, 32, 32, 32, 32, 56, 116, 124,...","[0, 3, 7, 10, 11, 16, 18, 19, 9, 0, 20, 19, 20...","[well, well, well, well, well, well, well, wel...","[Although, have, as, and, highly, are, and, wi...","[0.7537520585439043, 0.7131754785328752, 0.746...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1]"
97,"And if you believed this, you also would proba...","And if you believed this, you also would proba...",quantitative easing,0,,"[('treasury', 0.031805777207213934), ('committ...","[20, 20, 20, 20, 20, 31, 31, 31, 31, 31, 31, 3...","[1, 8, 9, 10, 23, 1, 2, 5, 7, 8, 9, 10, 11, 21...","[bad, bad, bad, bad, bad, good, good, good, go...","[if, probably, not, think, quite, if, you, you...","[0.6435724955527419, 0.6429419447115791, 0.648...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...","[-1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."
98,"To bring down this rate, the Treasury could ex...","To bring down this rate, the Treasury could ex...",quantitative easing,1,,"[('purchase', 0.09206950512433498), ('appropri...","[6, 20, 31, 32, 32, 32, 32, 32, 32, 67, 70, 77...","[21, 7, 7, 1, 3, 7, 13, 15, 28, 2, 21, 12, 18,...","[boost, bad, good, well, well, well, well, wel...","[increase, could, could, bring, this, could, t...","[0.6729925184848673, 0.604679436529893, 0.6054...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1..."
