In [None]:
!pip install treeinterpreter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting treeinterpreter
  Downloading treeinterpreter-0.2.3-py2.py3-none-any.whl (6.0 kB)
Installing collected packages: treeinterpreter
Successfully installed treeinterpreter-0.2.3


In this notebook we use `treeinterpreter` to compute feature contributions, that is, how much each word in a sentence contributes to the topic classifier's prediction. We then extract, for each sentence, the topic words that contribute most to the classifier.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from treeinterpreter import treeinterpreter as ti

import re
import pickle
from nltk.corpus import stopwords

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive: every model and data path read or written below lives
# under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the artefacts pickled by the previous section from the mounted Drive.
# NOTE: `data_file` is a directory prefix (it ends with '/'), not a single file.
data_file = '/content/drive/MyDrive/capstone-pimco/Part1/data/'


def _load_pickle(filename):
    """Return the object unpickled from `data_file + filename`."""
    # pickle can run arbitrary code while loading; only open files you trust.
    with open(data_file + filename, 'rb') as stream:
        return pickle.load(stream)


logi_clf = _load_pickle('logi_clf.pickle')
rf = _load_pickle('rf.pickle')
tfidf_vec = _load_pickle('tfidf_vec.pickle')
tfidf_vec_is_growth = _load_pickle('tfidf_vec_is_growth.pickle')

In [None]:
# Inspect the vocabulary of the loaded TF-IDF vectorizer; these names are what
# the contribution loop further down pairs with each per-feature contribution.
tfidf_vec.get_feature_names_out()

array(['abating', 'abating economic', 'abroad', ..., 'year sell',
       'year still', 'year year'], dtype=object)

In [None]:
# Topic identifiers in snake_case. They are used as dictionary keys and as the
# variable part of the input/output file names.
topics = [
    'credit',
    'fed_funds_rate',
    'financial_markets',
    'geopolitical_uncertainty',
    'growth',
    'housing',
    'inflation',
    'labor_market',
    'liquidity_measures',
    'quantitative_easing',
]

In [None]:
# Folder holding one sampled workbook per topic. It is derived from `data_file`
# (the same root the results are written to) so the data location is defined
# in exactly one place instead of being hard-coded a second time.
sample_dir = data_file + 'manual_direction_eng_completed/'

topic_dfs = {}  # topic -> DataFrame read from that topic's workbook
tfidf_Xs = {}   # topic -> TF-IDF matrix of that frame's 'cleaned_text' column
for topic in topics:
    topic_df = pd.read_excel(
        sample_dir + 'sampled_' + topic + '_direction.xlsx',
        usecols=[1, 2, 3, 4, 5, 6, 7],  # column 0 of the sheet is not loaded
    )
    # Labels are compared as strings against `classes` later on.
    topic_df['topic'] = topic_df['topic'].astype('str')
    topic_dfs[topic] = topic_df
    # transform() only: the vectorizer was fitted before it was pickled.
    tfidf_Xs[topic] = tfidf_vec.transform(topic_df['cleaned_text'])

In [None]:
# Label strings as they appear in the 'topic' column: the same ten topics as
# `topics`, written with spaces instead of underscores. The position of a label
# in this list is used as the class index when reading contributions.
classes = [
    'credit',
    'fed funds rate',
    'financial markets',
    'geopolitical uncertainty',
    'growth',
    'housing',
    'inflation',
    'labor market',
    'liquidity measures',
    'quantitative easing',
]

In [None]:
from tqdm import tqdm

For each sentence, we first sort the words by the magnitude of their contributions in descending order. If the highest contribution is smaller than 0.1, we keep the first four words. Otherwise, we keep every word whose contribution is at least 0.1.

In [None]:
# Selection rule parameters (previously inline magic numbers).
CONTRIBUTION_THRESHOLD = 0.1  # a word is kept when its contribution reaches this value
FALLBACK_TOP_K = 4            # number of words kept when no word reaches the threshold


def _select_topic_words(contribs, feature_names,
                        threshold=CONTRIBUTION_THRESHOLD, top_k=FALLBACK_TOP_K):
    """Pick the words of one sentence that matter most for its class.

    Parameters
    ----------
    contribs : sequence of float
        Contribution of every feature to the class of interest.
    feature_names : sequence of str
        Feature names, aligned position by position with `contribs`.
    threshold : float
        Minimum (signed) contribution for a word to be kept.
    top_k : int
        Number of top-ranked words returned when no word reaches `threshold`.

    Returns
    -------
    list of (word, contribution) tuples, ordered by decreasing |contribution|.
    """
    ranked = sorted(zip(contribs, feature_names), key=lambda pair: -abs(pair[0]))
    strong = [(word, c) for c, word in ranked if c >= threshold]
    if strong:
        return strong
    # No word reaches the threshold: fall back to the top-ranked words. The
    # decision is based on whether *any* contribution reaches the threshold,
    # not on the first ranked entry: the ranking is by magnitude, so the first
    # entry can be a large negative value while later words are >= threshold.
    # Slicing also copes with vocabularies smaller than `top_k`.
    return [(word, c) for c, word in ranked[:top_k]]


# The vocabulary is the same for every sentence, so fetch it once.
feature_names = tfidf_vec.get_feature_names_out()

for topic in topics:
    print(topic)
    topic_df = topic_dfs[topic]
    # Only the third value returned by ti.predict is used; it is indexed below
    # as [row, feature, class].
    _, _, contributions = ti.predict(rf, tfidf_Xs[topic])
    labels = topic_df['topic']
    feature_contrbn = []
    # Iterate by position: rows of `contributions` follow the row order of the
    # transformed matrix, which need not equal the DataFrame's index labels.
    for row in tqdm(range(len(topic_df))):
        # NOTE(review): assumes the order of `classes` matches the class order
        # of `rf` (rf.classes_) - confirm against the training notebook.
        pred_idx = classes.index(labels.iloc[row])
        feature_contrbn.append(
            _select_topic_words(contributions[row, :, pred_idx], feature_names)
        )
    topic_df['feature_contribution'] = feature_contrbn
    topic_df.to_csv(data_file + 'feature_contribution_train/' + topic + '.csv')

credit


100%|██████████| 100/100 [00:00<00:00, 399.45it/s]


fed_funds_rate


100%|██████████| 100/100 [00:00<00:00, 387.37it/s]


financial_markets


100%|██████████| 100/100 [00:00<00:00, 435.13it/s]


geopolitical_uncertainty


100%|██████████| 100/100 [00:00<00:00, 457.99it/s]


growth


100%|██████████| 100/100 [00:00<00:00, 428.86it/s]


housing


100%|██████████| 100/100 [00:00<00:00, 450.99it/s]


inflation


100%|██████████| 100/100 [00:00<00:00, 440.82it/s]


labor_market


100%|██████████| 100/100 [00:00<00:00, 428.47it/s]


liquidity_measures


100%|██████████| 100/100 [00:00<00:00, 537.42it/s]


quantitative_easing


100%|██████████| 100/100 [00:00<00:00, 440.05it/s]
