In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
from openpyxl import Workbook
import xlsxwriter

In [2]:
# Load the two inputs of this notebook:
#   * impact_output       - the per-sequence records analysed in the cells that follow
#   * index_label_mapping - mapping used to enumerate the possible labels of each output
# NOTE(security): pickle.load can execute arbitrary code while deserializing, so these
# files must come from a trusted source (presumably our own inference run - confirm).
with open('../inference/24-02-2024_1200_impact_output.pkl','rb') as file:
    impact_output = pickle.load(file)
with open('../Data/index_label_mapping.pkl','rb') as file:
    index_label_mapping = pickle.load(file)

In [3]:
# Sanity check: how many records were loaded from the impact-output pickle.
len(impact_output)

669785

In [4]:
# For every entry of the label mapping, strip the "Job " prefix from its name and
# keep the list of labels it maps to (the values of the inner index -> label dict).
unique_values = {
    mapping_name.replace("Job ",""): list(index_to_label.values())
    for mapping_name, index_to_label in index_label_mapping.items()
}
unique_values

{'Role': ['GOVERNANCE RISK COMPLIANCE',
  'INFORMATION SECURITY',
  'IT GENERAL',
  'NETWORKING',
  'NON-ICP',
  'SYSTEMS'],
 'Function': ['ENGINEERING',
  'IT',
  'NON-ICP',
  'PROCUREMENT',
  'RISK/LEGAL/COMPLIANCE'],
 'Level': ['C-LEVEL',
  'CONTRIBUTOR',
  'DIRECTOR',
  'EXECUTIVE',
  'MANAGER',
  'UNKNOWN']}

In [5]:
# Peek at the first record to see which fields each record carries.
impact_output[0]

{'Sequence': 'IT DIRECTOR',
 'Role': {'Prediction': 'NETWORKING',
  'Target': 'NETWORKING',
  'Correct?': True,
  'Distinct_Tokens': ['IT', 'DIRECTOR'],
  'Token_Importance': {'IT': 0.408794105052948, 'DIRECTOR': 0.591205894947052},
  'Token_Rank': {'IT': 2, 'DIRECTOR': 1},
  'Token_Marginal_Score_Positive': {'IT': 8.886344909667969,
   'DIRECTOR': 12.851602554321289},
  'Token_Marginal_Score_Raw': {'IT': 8.886344909667969,
   'DIRECTOR': 12.851602554321289}},
 'Function': {'Prediction': 'IT',
  'Target': 'IT',
  'Correct?': True,
  'Distinct_Tokens': ['IT', 'DIRECTOR'],
  'Token_Importance': {'IT': 0.6040630340576172,
   'DIRECTOR': 0.3959369659423828},
  'Token_Rank': {'IT': 1, 'DIRECTOR': 2},
  'Token_Marginal_Score_Positive': {'IT': 11.600852966308594,
   'DIRECTOR': 7.603853225708008},
  'Token_Marginal_Score_Raw': {'IT': 11.600852966308594,
   'DIRECTOR': 7.603853225708008}},
 'Level': {'Prediction': 'DIRECTOR',
  'Target': 'DIRECTOR',
  'Correct?': True,
  'Distinct_Tokens': ['I

In [6]:
# Accumulator for the raw, per-occurrence observations. It is nested as
#   output (one entry per key of unique_values) -> possible category -> token -> metric -> list
# and starts out with an empty token dict under every category.
# For each token we will collect, one value per occurrence:
#   1. token importance
#   2. number of unique tokens in the sequence
#   3. marginal score (positive) - keyword
#   4. marginal score (raw)
#   5. marginal score (negative) - anti-keyword
#   6. token score rank
# Only the raw values are recorded at this stage; the averages are taken in a later pass.
keyword_dict_running = {
    output_name: {category: {} for category in categories}
    for output_name, categories in unique_values.items()
}

In [7]:
# Run through the data: for every sequence and every output, file each distinct token
# under the category the model predicted and append its raw metric values.

# Start from empty token buckets so that re-running this cell does not append a second
# copy of every observation (which would double Token_Occurrences and let rare tokens
# slip past the occurrence filter applied when the tables are exported).
for category_buckets in keyword_dict_running.values():
    for token_bucket in category_buckets.values():
        token_bucket.clear()

# tqdm wraps the list directly - the index that enumerate() produced was never used.
for sequence in tqdm(impact_output,total=len(impact_output)):
    for key, category_buckets in keyword_dict_running.items():
        this_sequence_info = sequence[key]
        this_tokens = this_sequence_info['Distinct_Tokens']
        if not this_tokens:
            # Nothing to record; skipping here also keeps the lookups below from
            # running for a sequence that contributes no tokens.
            continue
        this_unique_tokens_count = len(this_tokens)
        # Per-sequence lookups hoisted out of the token loop.
        token_bucket = category_buckets[this_sequence_info['Prediction']]
        importance_by_token = this_sequence_info['Token_Importance']
        positive_by_token = this_sequence_info['Token_Marginal_Score_Positive']
        raw_by_token = this_sequence_info['Token_Marginal_Score_Raw']
        rank_by_token = this_sequence_info['Token_Rank']
        for token in this_tokens:
            # If not already present under this category, add it
            token_metrics = token_bucket.get(token)
            if token_metrics is None:
                token_metrics = token_bucket[token] = defaultdict(list)
            token_importance = importance_by_token[token]
            raw_score = raw_by_token[token]
            # A NaN importance is counted as zero importance so it does not turn the
            # token's average into NaN. (Only the importance is sanitised this way.)
            if math.isnan(token_importance):
                token_importance = 0
            # Append to lists the abovementioned metrics
            token_metrics['Token_Importance'].append(token_importance)
            token_metrics['Unique_Tokens_Count'].append(this_unique_tokens_count)
            token_metrics['Marginal_Score_Positive'].append(positive_by_token[token])
            token_metrics['Marginal_Score_Raw'].append(raw_score)
            # Only the negative part of the raw score counts towards the anti-keyword metric.
            token_metrics['Marginal_Score_Negative'].append(min(raw_score,0))
            token_metrics['Token_Rank'].append(rank_by_token[token])
            # One entry per occurrence; summed later to get the total count.
            token_metrics['Token_Occurrences'].append(1)

  0%|          | 0/669785 [00:00<?, ?it/s]

100%|██████████| 669785/669785 [00:33<00:00, 19869.29it/s]


In [8]:
# Container for the averaged metrics, nested as output -> possible category -> {}.
# It mirrors the layout of unique_values, with an empty dict under every category.
keyword_dict_average = {
    output_name: {category: {} for category in categories}
    for output_name, categories in unique_values.items()
}

In [9]:
# Now we run through the above dictionary and calculate the averages for each token and underlying list.
# The result per (output, category) is a dict of equally long columns, one row per token.

# Output column -> raw metric whose per-occurrence values are averaged into it.
# The order here is the column order of the resulting tables.
mean_columns = {
    'Avg_Token_Importance': 'Token_Importance',
    'Avg_Unique_Tokens_Count': 'Unique_Tokens_Count',
    'Avg_Marginal_Score_Positive': 'Marginal_Score_Positive',
    'Avg_Marginal_Score_Raw': 'Marginal_Score_Raw',
    'Avg_Marginal_Score_Negative': 'Marginal_Score_Negative',
    'Avg_Token_Rank': 'Token_Rank',
}
column_names = ['Token', *mean_columns, 'Total_Token_Occurrences']

for key, running_by_prediction in keyword_dict_running.items():
    for prediction, running_by_token in running_by_prediction.items():
        # Seed every column up front: a category that was never predicted then still
        # yields a table with all (empty) columns instead of a column-less one, which
        # would break the Total_Token_Occurrences filter applied at export time.
        averages = defaultdict(list, {column: [] for column in column_names})
        keyword_dict_average[key][prediction] = averages
        # Iterate the dict itself rather than enumerate(...) so tqdm knows the length
        # and can draw a real progress bar; desc tells the bars apart.
        for token, token_metrics in tqdm(running_by_token.items(),
                                         desc='{} / {}'.format(key, prediction)):
            averages['Token'].append(token)
            for column, metric in mean_columns.items():
                averages[column].append(np.array(token_metrics[metric]).mean())
            # Occurrences were recorded as a list of ones, so the sum is the count.
            averages['Total_Token_Occurrences'].append(np.array(token_metrics['Token_Occurrences']).sum())

1706it [00:00, 18196.92it/s]
9556it [00:00, 12284.14it/s]
4083it [00:00, 21315.79it/s]
8604it [00:00, 11714.27it/s]
13456it [00:00, 16285.65it/s]
2293it [00:00, 23263.14it/s]
4683it [00:00, 18333.95it/s]
17497it [00:01, 9487.11it/s] 
10285it [00:00, 20702.84it/s]
602it [00:00, 22362.89it/s]
533it [00:00, 26644.63it/s]
5048it [00:00, 16681.92it/s]
11147it [00:00, 14593.11it/s]
7688it [00:00, 14873.74it/s]
7318it [00:00, 16439.17it/s]
8310it [00:00, 13781.54it/s]
1407it [00:00, 29829.38it/s]


In [18]:
# One table will be built per output and per possible value of that output.
# This dictionary houses them: output -> {} (the per-value tables are added next).
keyword_table_dict = {output_name: {} for output_name in unique_values}

In [19]:
# Turn every averaged column dict into a DataFrame, stored under the same
# (output, category) position in keyword_table_dict.
for key, averages_by_output in keyword_dict_average.items():
    tables_for_key = keyword_table_dict[key]
    for output, averaged_columns in averages_by_output.items():
        tables_for_key[output] = pd.DataFrame.from_dict(averaged_columns)

In [22]:
# Write tables to excel - one sheet per (output, category), filtering out tokens with
# fewer than MIN_TOKEN_OCCURRENCES total occurrences and sorting the rest by average
# token importance, most important first.
MIN_TOKEN_OCCURRENCES = 100

# Using the writer as a context manager guarantees the workbook is closed even if
# writing one of the sheets raises, instead of leaving the file handle open.
with pd.ExcelWriter('../inference/impact_output_keyword.xlsx',engine = 'xlsxwriter') as writer_keyword:
    for key, tables_for_key in keyword_table_dict.items():
        for output, this_df in tables_for_key.items():
            # '/' is not allowed in worksheet names, so it is stripped.
            # NOTE(review): Excel also caps sheet names at 31 characters - longer
            # key/category combinations would be rejected by the writer.
            sheet_name = '{}_{}'.format(key.replace('/',''),output.replace('/',''))
            frequent_tokens = this_df[this_df.Total_Token_Occurrences >= MIN_TOKEN_OCCURRENCES]
            frequent_tokens.sort_values(by = 'Avg_Token_Importance',ascending=False).to_excel(
                writer_keyword,sheet_name=sheet_name,index = False)