In [2]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
from openpyxl import Workbook
import xlsxwriter

In [3]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inference output (inspected below) and the index -> label mapping used to
# enumerate the possible labels of each output.
impact_output = _load_pickle('../inference/25-02-2024_1324_antikey_output.pkl')
index_label_mapping = _load_pickle('../Data/index_label_mapping.pkl')

In [4]:
# Number of records in the loaded inference output.
len(impact_output)

669785

In [5]:
# Map each output name (with the "Job " prefix removed) to the list of labels
# found in its index -> label mapping.
unique_values = {
    output_name.replace("Job ",""): list(index_to_label.values())
    for output_name, index_to_label in index_label_mapping.items()
}
unique_values

{'Role': ['GOVERNANCE RISK COMPLIANCE',
  'INFORMATION SECURITY',
  'IT GENERAL',
  'NETWORKING',
  'NON-ICP',
  'SYSTEMS'],
 'Function': ['ENGINEERING',
  'IT',
  'NON-ICP',
  'PROCUREMENT',
  'RISK/LEGAL/COMPLIANCE'],
 'Level': ['C-LEVEL',
  'CONTRIBUTOR',
  'DIRECTOR',
  'EXECUTIVE',
  'MANAGER',
  'UNKNOWN']}

In [6]:
# Show the first record to inspect the per-output structure of the inference output.
impact_output[0]

{'Sequence': 'IT DIRECTOR',
 'Role': [{'Anti-Prediction': 'GOVERNANCE RISK COMPLIANCE',
   'Distinct_Tokens': ['IT', 'DIRECTOR'],
   'Token_Importance': {'IT': 0.13329343497753143,
    'DIRECTOR': 0.8667065501213074},
   'Token_Rank': {'IT': 2, 'DIRECTOR': 1},
   'Token_Marginal_Score_Positive': {'IT': 0.7029886245727539,
    'DIRECTOR': 4.571004390716553},
   'Token_Marginal_Score_Raw': {'IT': 0.7029886245727539,
    'DIRECTOR': 4.571004390716553}},
  {'Anti-Prediction': 'INFORMATION SECURITY',
   'Distinct_Tokens': ['IT', 'DIRECTOR'],
   'Token_Importance': {'IT': 0.0, 'DIRECTOR': 1.0},
   'Token_Rank': {'IT': 2, 'DIRECTOR': 1},
   'Token_Marginal_Score_Positive': {'IT': 0.0, 'DIRECTOR': 4.458810329437256},
   'Token_Marginal_Score_Raw': {'IT': -0.7214062213897705,
    'DIRECTOR': 4.458810329437256}},
  {'Anti-Prediction': 'IT GENERAL',
   'Distinct_Tokens': ['IT', 'DIRECTOR'],
   'Token_Importance': {'IT': 0.33253294229507446,
    'DIRECTOR': 0.6674670577049255},
   'Token_Rank': {'

In [7]:
# Accumulator for the raw observations, laid out as
#     keyword_dict_running[output][label][token][metric] -> list of values
# where output is one of the keys of unique_values and label is one of that output's
# possible values. The token level starts empty and is filled in by the pass over the
# data, which records for every token:
# 1. Token importance
# 2. Number of unique tokens in the sequence
# 3. Marginal score (positive) - keyword
# 4. Marginal score (raw)
# 5. Marginal score (negative) - anti-keyword
# 6. Token score rank
# Every observation is stored first; the averages are taken in a later step.
keyword_dict_running = {
    output_name: {label: {} for label in labels}
    for output_name, labels in unique_values.items()
}

In [8]:
# Walk every record and, for each (output, anti-prediction label, token), append the
# per-sequence metrics to keyword_dict_running[output][label][token][metric].
#
# The accumulator is rebuilt here so that re-running this cell starts from a clean
# slate. Appending onto the lists left by a previous run would double
# Token_Occurrences and change which tokens pass the occurrence filter at export.
keyword_dict_running = {
    output_name: {label: {} for label in labels}
    for output_name, labels in unique_values.items()
}

for sequence in tqdm(impact_output, total=len(impact_output)):
    for key, tokens_by_label in keyword_dict_running.items():
        for anti_prediction in sequence[key]:
            this_anti_prediction = anti_prediction['Anti-Prediction']
            this_tokens = anti_prediction['Distinct_Tokens']
            if not this_tokens:
                # Nothing to record for this entry.
                continue
            this_unique_tokens_count = len(this_tokens)

            # Resolve the invariant lookups once per entry instead of once per metric.
            stats_by_token = tokens_by_label[this_anti_prediction]
            importance_by_token = anti_prediction['Token_Importance']
            positive_by_token = anti_prediction['Token_Marginal_Score_Positive']
            raw_by_token = anti_prediction['Token_Marginal_Score_Raw']
            rank_by_token = anti_prediction['Token_Rank']

            for token in this_tokens:
                # First sighting of this token under this label: create its metric lists.
                token_stats = stats_by_token.get(token)
                if token_stats is None:
                    token_stats = defaultdict(list)
                    stats_by_token[token] = token_stats

                token_importance = importance_by_token[token]
                marginal_score_positive = positive_by_token[token]
                raw_score = raw_by_token[token]
                token_rank = rank_by_token[token]
                # A NaN importance is stored as 0 so it does not turn the token's mean
                # into NaN (presumably NaN arises when no token scored positively —
                # TODO confirm against the inference code).
                if math.isnan(token_importance):
                    token_importance = 0

                token_stats['Token_Importance'].append(token_importance)
                token_stats['Unique_Tokens_Count'].append(this_unique_tokens_count)
                token_stats['Marginal_Score_Positive'].append(marginal_score_positive)
                token_stats['Marginal_Score_Raw'].append(raw_score)
                # Only the negative part of the raw score counts as anti-keyword evidence.
                token_stats['Marginal_Score_Negative'].append(min(raw_score,0))
                token_stats['Token_Rank'].append(token_rank)
                # One entry per observation; summed later to get the occurrence count.
                token_stats['Token_Occurrences'].append(1)

100%|██████████| 669785/669785 [03:04<00:00, 3625.71it/s]


In [9]:
# Container for the averaged metrics: one empty slot per (output, label) pair,
# mirroring the layout of unique_values.
keyword_dict_average = {
    output_name: {label: {} for label in labels}
    for output_name, labels in unique_values.items()
}

In [10]:
# Reduce the per-observation lists in keyword_dict_running to one summary row per token.
# keyword_dict_average[output][label] becomes a dict of column name -> list, with all
# lists index-aligned (entry i of every column describes the i-th token).

# (summary column, source metric) pairs that are reduced with a mean.
averaged_metrics = (
    ('Avg_Token_Importance', 'Token_Importance'),
    ('Avg_Unique_Tokens_Count', 'Unique_Tokens_Count'),
    ('Avg_Marginal_Score_Positive', 'Marginal_Score_Positive'),
    ('Avg_Marginal_Score_Raw', 'Marginal_Score_Raw'),
    ('Avg_Marginal_Score_Negative', 'Marginal_Score_Negative'),
    ('Avg_Token_Rank', 'Token_Rank'),
)
summary_columns = (
    ['Token']
    + [column for column, metric in averaged_metrics]
    + ['Total_Token_Occurrences']
)

for key, tokens_by_label in keyword_dict_running.items():
    for prediction, stats_by_token in tokens_by_label.items():
        # Create every column up front so a label with no tokens still produces a
        # table with the full header instead of a column-less frame that breaks the
        # Total_Token_Occurrences filter at export time.
        summary = {column: [] for column in summary_columns}
        keyword_dict_average[key][prediction] = summary
        # Passing the length lets tqdm show a percentage and ETA per label.
        for token, token_stats in tqdm(stats_by_token.items(), total=len(stats_by_token)):
            summary['Token'].append(token)
            for column, metric in averaged_metrics:
                summary[column].append(np.array(token_stats[metric]).mean())
            # Occurrences were recorded as a list of 1s, so the sum is the count.
            summary['Total_Token_Occurrences'].append(np.array(token_stats['Token_Occurrences']).sum())

0it [00:00, ?it/s]

24531it [00:03, 6765.95it/s] 
20261it [00:02, 9743.35it/s] 
23633it [00:02, 8495.86it/s] 
21160it [00:02, 10190.44it/s]
17162it [00:02, 8007.79it/s] 
24324it [00:02, 9410.82it/s] 
23244it [00:02, 9123.02it/s] 
12830it [00:00, 17121.84it/s]
19460it [00:02, 8536.17it/s] 
24738it [00:02, 8701.63it/s] 
24759it [00:02, 8980.52it/s] 
23014it [00:02, 9317.69it/s] 
18898it [00:01, 9754.22it/s] 
21654it [00:02, 10037.73it/s]
21803it [00:02, 9436.58it/s] 
21216it [00:02, 10019.90it/s]
24581it [00:02, 9223.18it/s] 


In [11]:
# One table per output and per possible value of that output: start with an empty
# slot per output, to be filled with DataFrames keyed by label.
keyword_table_dict = {output_name: {} for output_name in unique_values}

In [12]:
# Turn each (output, label) summary of index-aligned column lists into a DataFrame.
for output_name, summaries_by_label in keyword_dict_average.items():
    tables_for_output = keyword_table_dict[output_name]
    for label, summary in summaries_by_label.items():
        tables_for_output[label] = pd.DataFrame.from_dict(summary)

In [14]:
# Write one worksheet per (output, label) table, keeping only tokens with at least
# MIN_TOKEN_OCCURRENCES total occurrences and sorting by average importance.
MIN_TOKEN_OCCURRENCES = 100

# The context manager closes the writer even if writing one of the sheets raises,
# so a failed export does not leave the workbook handle open.
with pd.ExcelWriter('../inference/impact_output_antikeyword.xlsx',engine = 'xlsxwriter') as writer_keyword:
    for key, tables_by_label in keyword_table_dict.items():
        for output, this_df in tables_by_label.items():
            frequent_tokens = this_df[this_df.Total_Token_Occurrences >= MIN_TOKEN_OCCURRENCES]
            # '/' is stripped because it is not allowed in worksheet names.
            # NOTE(review): Excel also caps sheet names at 31 characters and the longest
            # current name is exactly at that limit — longer labels will need shortening.
            sheet_name = '{}_{}'.format(key.replace('/',''),output.replace('/',''))
            frequent_tokens.sort_values(by = 'Avg_Token_Importance',ascending=False).to_excel(
                writer_keyword,sheet_name=sheet_name,index = False)