In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math

In [2]:
# Load the pickled inference output (`impact_output`) and the label mapping
# (`index_label_mapping`) from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so these
# files must come from a trusted source.
with open('../inference/27-02-2024_1413_impact_output.pkl','rb') as file:
    impact_output = pickle.load(file)
with open('../Data/index_label_mapping.pkl','rb') as file:
    index_label_mapping = pickle.load(file)

In [3]:
# Display how many entries the loaded inference output contains.
len(impact_output)

669785

In [4]:
# For every entry of the label mapping, drop the "Job " prefix from its name and
# keep the list of values of its inner mapping.
unique_values = {
    mapping_name.replace("Job ",""): list(inner_mapping.values())
    for mapping_name, inner_mapping in index_label_mapping.items()
}
unique_values

{'Role': ['DEVELOPMENT',
  'GOVERNANCE RISK COMPLIANCE',
  'INFORMATION SECURITY',
  'IT GENERAL',
  'NETWORKING',
  'NON-ICP',
  'SYSTEMS'],
 'Function': ['ENGINEERING',
  'IT',
  'NON-ICP',
  'PROCUREMENT',
  'RISK/LEGAL/COMPLIANCE'],
 'Level': ['C-LEVEL',
  'CONTRIBUTOR',
  'DIRECTOR',
  'EXECUTIVE',
  'MANAGER',
  'UNKNOWN']}

In [5]:
# Display the first entry to inspect the layout of a single record.
impact_output[0]

{'Sequence': 'IT DIRECTOR',
 'Role': {'Prediction': 'NETWORKING',
  'Target': 'NETWORKING',
  'Correct?': True,
  'Distinct_Tokens': ['IT', 'DIRECTOR'],
  'Token_Importance': {'IT': 0.39951249957084656,
   'DIRECTOR': 0.6004875302314758},
  'Token_Rank': {'IT': 2, 'DIRECTOR': 1},
  'Token_Marginal_Score_Positive': {'IT': 9.991130828857422,
   'DIRECTOR': 15.017175674438477},
  'Token_Marginal_Score_Raw': {'IT': 9.991130828857422,
   'DIRECTOR': 15.017175674438477}},
 'Function': {'Prediction': 'IT',
  'Target': 'IT',
  'Correct?': True,
  'Distinct_Tokens': ['IT', 'DIRECTOR'],
  'Token_Importance': {'IT': 0.7296534180641174,
   'DIRECTOR': 0.2703465223312378},
  'Token_Rank': {'IT': 1, 'DIRECTOR': 2},
  'Token_Marginal_Score_Positive': {'IT': 8.235910415649414,
   'DIRECTOR': 3.0515170097351074},
  'Token_Marginal_Score_Raw': {'IT': 8.235910415649414,
   'DIRECTOR': 3.0515170097351074}},
 'Level': {'Prediction': 'DIRECTOR',
  'Target': 'DIRECTOR',
  'Correct?': True,
  'Distinct_Tokens

In [6]:
# Running compilation dictionary, laid out as
#     output name (key of unique_values) -> category -> {}
# The innermost dictionaries start empty. When the data is scanned they receive one
# entry per token, each holding per-occurrence lists of metrics (importance, marginal
# scores, rank, priority index, ...) that are averaged in a later step.
keyword_dict_running = {
    output_name: {category: {} for category in categories}
    for output_name, categories in unique_values.items()
}

In [7]:
# Scan every sequence and record, for each output (key), predicted category and
# distinct token, one list entry per sequence in which that token appears.
#
# The two divisions below can have a zero denominator. A NaN result is replaced by 0
# explicitly; an infinite result is kept as is. Because those cases are dealt with in
# code, the matching NumPy floating-point warnings are silenced for this loop only.
with np.errstate(divide='ignore', invalid='ignore'):
    for sequence in tqdm(impact_output, total=len(impact_output)):
        for key in keyword_dict_running:
            this_sequence_info = sequence[key]
            this_prediction = this_sequence_info['Prediction']
            this_tokens = this_sequence_info['Distinct_Tokens']
            this_unique_tokens_count = len(this_tokens)
            positive_scores = this_sequence_info['Token_Marginal_Score_Positive']
            total_score_difference = np.array(list(positive_scores.values())).sum()
            average_score_difference = total_score_difference/this_unique_tokens_count
            # All tokens of this sequence are filed under the predicted category
            prediction_tokens = keyword_dict_running[key][this_prediction]
            for token in this_tokens:
                # If not already present for this prediction, add the token
                if token not in prediction_tokens:
                    prediction_tokens[token] = defaultdict(list)
                token_metrics = prediction_tokens[token]

                token_importance = this_sequence_info['Token_Importance'][token]
                marginal_score_positive = positive_scores[token]
                raw_score = this_sequence_info['Token_Marginal_Score_Raw'][token]
                token_rank = this_sequence_info['Token_Rank'][token]

                # Token's positive marginal score relative to the sequence average
                priority_index = marginal_score_positive/average_score_difference
                if np.isnan(priority_index):
                    priority_index = 0
                if math.isnan(token_importance):
                    token_importance = 0

                # Append this occurrence to the per-token metric lists
                token_metrics['Token_Importance'].append(token_importance)
                token_metrics['Unique_Tokens_Count'].append(this_unique_tokens_count)
                token_metrics['Marginal_Score_Positive'].append(marginal_score_positive)
                token_metrics['Marginal_Score_Raw'].append(raw_score)
                token_metrics['Marginal_Score_Negative'].append(min(raw_score,0))
                token_metrics['Token_Rank'].append(token_rank)
                token_metrics['Token_Occurrences'].append(1)
                token_metrics['Priority_Index_>_1'].append((priority_index > 1)*1)
                token_metrics['Priority_Index'].append(priority_index)

  priority_index = marginal_score_positive/average_score_difference
100%|██████████| 669785/669785 [01:12<00:00, 9220.85it/s] 


In [8]:
# Container for the averaged metrics: output name -> category -> {} (filled in later)
keyword_dict_average = {
    output_name: {category: {} for category in categories}
    for output_name, categories in unique_values.items()
}

In [9]:
# Collapse the per-occurrence lists in keyword_dict_running into one summary row per
# token, stored column-wise (one list per column) for each output / predicted category.
SUMMARY_COLUMNS = (
    'Token',
    'Total_Token_Occurrences',
    'Prob_Priority_Index_>_1',
    'StDev_Priority_Index',
    'Avg_Priority_Index',
)
for key, per_prediction in keyword_dict_running.items():
    for prediction, per_token in per_prediction.items():
        # Create every column up front so that a category without any recorded token
        # still yields a table carrying the expected (empty) columns.
        summary = defaultdict(list, {column: [] for column in SUMMARY_COLUMNS})
        for token, token_metrics in tqdm(per_token.items(), total=len(per_token),
                                         desc='{} / {}'.format(key, prediction)):
            priority_index = np.array(token_metrics['Priority_Index'])
            total_occurrences = np.array(token_metrics['Token_Occurrences']).sum()
            pi_g_1 = np.array(token_metrics['Priority_Index_>_1'])
            summary['Token'].append(token)
            summary['Total_Token_Occurrences'].append(total_occurrences)
            # Share of occurrences in which the token's priority index exceeded 1
            summary['Prob_Priority_Index_>_1'].append(pi_g_1.sum()/total_occurrences)
            summary['StDev_Priority_Index'].append(np.std(priority_index))
            summary['Avg_Priority_Index'].append(np.mean(priority_index))
        keyword_dict_average[key][prediction] = summary

0it [00:00, ?it/s]

4943it [00:00, 18787.39it/s]
1700it [00:00, 21784.73it/s]
9566it [00:00, 15315.79it/s]
3952it [00:00, 23619.17it/s]
8623it [00:00, 15831.83it/s]
11093it [00:00, 20965.75it/s]
2301it [00:00, 27122.92it/s]
4691it [00:00, 20858.56it/s]
17506it [00:01, 13352.59it/s]
10275it [00:00, 21475.65it/s]
619it [00:00, 28532.99it/s]
533it [00:00, 30071.75it/s]
5040it [00:00, 19998.88it/s]
11178it [00:00, 17384.56it/s]
7681it [00:00, 18070.75it/s]
7324it [00:00, 19388.06it/s]
8322it [00:00, 17792.35it/s]
1248it [00:00, 27959.94it/s]


In [10]:
# One table per output and per possible value of that output will be built next.
# Start with an empty dictionary per output name to house those tables.
keyword_table_dict = {output_name: {} for output_name in unique_values}

In [11]:
# Turn each column-wise summary into a DataFrame, keyed by output name and category
for key, per_output in keyword_dict_average.items():
    for output, summary_columns in per_output.items():
        keyword_table_dict[key][output] = pd.DataFrame.from_dict(summary_columns)

In [12]:
# Write one sheet per output / category to Excel. Only tokens with at least
# MIN_TOKEN_OCCURRENCES total occurrences and an average priority index above
# MIN_AVG_PRIORITY_INDEX are kept, sorted by descending average priority index.
MIN_TOKEN_OCCURRENCES = 100
MIN_AVG_PRIORITY_INDEX = 1
# The context manager saves and closes the workbook even if writing a sheet fails.
with pd.ExcelWriter('../inference/impact_output_keyword_ALTERNATE.xlsx',engine = 'xlsxwriter') as writer_keyword:
    for key in keyword_table_dict:
        for output in keyword_table_dict[key]:
            this_df = keyword_table_dict[key][output]
            keep_rows = (
                (this_df.Total_Token_Occurrences >= MIN_TOKEN_OCCURRENCES)
                & (this_df.Avg_Priority_Index > MIN_AVG_PRIORITY_INDEX)
            )
            # '/' is not allowed in Excel sheet names, so it is stripped
            sheet_name = '{}_{}'.format(key.replace('/',''),output.replace('/',''))
            this_df[keep_rows].sort_values(by = 'Avg_Priority_Index',ascending=False).to_excel(
                writer_keyword,sheet_name=sheet_name,index = False)