In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from operator import itemgetter
from tqdm import tqdm

In [2]:
# Read the pickled train/validation summary written during model training
summary_path = '../logging/21-02-2024_0846/epoch00_train_val_summary'
with open(summary_path, 'rb') as summary_file:
    val_results = pickle.load(summary_file)

In [3]:
# Inspect the per-output accuracy from the last entry of the loaded summary.
# These accuracies are used below to derive the loss weights: as in model training,
# each weight is based on 1/accuracy, so whichever of the 3 outputs is least accurate
# gets slightly more weight.
# Based on the first edition of the model, run for 10 epochs with the best checkpoint
# (on validation) retained, i.e. checkpoint 8 out of 10.
# NOTE(review): the key read here is 'training_accuracy' and the file loaded above is
# named 'epoch00_...', while these comments refer to validation results - confirm that
# this entry really holds the intended validation accuracy.
last_summary_entry = val_results[-1]
acc_dict = last_summary_entry['training_accuracy']
acc_dict

{'Role': 0.9826903343200684,
 'Function': 0.9983294010162354,
 'Level': 0.9928633570671082}

In [4]:
# Calculate the loss weights to use as inputs for inference.
# Each output's weight is proportional to 1/accuracy, then rescaled so that the
# weights sum to the number of outputs (i.e. the average weight is 1).
# The rescale factor is taken from len(acc_dict) instead of a hard-coded 3 so the
# normalisation stays correct if an output is added or removed.
inverse_accuracy = {key: 1 / accuracy for key, accuracy in acc_dict.items()}
norm_factor = sum(inverse_accuracy.values())
n_outputs = len(acc_dict)
weights_dict = {
    key: inverse / norm_factor * n_outputs
    for key, inverse in inverse_accuracy.items()
}
weights_dict

{'Role': 1.0087123775354698,
 'Function': 0.9929106590511017,
 'Level': 0.9983769634134283}

In [5]:
# The weights above can be used to run inference with the loss option; come back here afterwards.

# Load the output of the initial inference run ...
inference_path = '../inference/21-02-2024_1718_inference.pkl'
with open(inference_path, 'rb') as inference_file:
    inference_results = pickle.load(inference_file)

# ... and the encoder used to turn label indices back into text
encoder_path = '../Data/index_label_mapping.pkl'
with open(encoder_path, 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

In [6]:
# Preview the first rows of the raw (still integer-encoded) inference output
inference_results.head(n=20)

Unnamed: 0,Title,Job Function,Job Role,Job Level,Job Role Predicted,Job Function Predicted,Job Level Predicted,Loss
0,IT DIRECTOR,1,3,2,3,1,2,5e-06
1,ENGLISH DEPARTMENT CHAIR,2,4,2,4,2,2,0.000867
2,"SENIOR DIRECTOR, HOME FRAGRANCE INFORMATION TE...",1,3,2,3,1,2,0.003066
3,"MANAGER, INFORMATION TECHNOLOGY",1,1,4,1,1,4,8e-06
4,2X CERTIFIED SALESFORCE DEVELOPER,2,4,1,4,2,1,0.018354
5,NETWORK OPERATIONS SPECIALIST,1,3,1,3,1,1,7e-06
6,DIRECTOR CYBER POLICY AND PLANNING,1,3,2,3,1,2,0.020832
7,"MANAGER, NETWORK OPERATIONS",1,3,4,3,1,4,8e-06
8,"REGIONAL MANAGER, NETWORK OPERATIONS SUPPORT",1,3,4,4,1,4,0.82042
9,IT SPECIALIST,1,1,1,1,1,1,9e-06


In [7]:
# Decode every integer-coded label column (actual and predicted) into its text label,
# storing the result in a sibling "<column>_Text" column.
# 'Title' and 'Loss' are not label columns. Columns already ending in '_Text' are
# skipped so that re-running this cell does not try to decode already-decoded text.
label_columns = [
    column for column in inference_results.columns
    if column not in ['Title', 'Loss'] and not column.endswith('_Text')
]

decoded_columns = {}
for column in label_columns:
    # A predicted column is decoded with the encoder entry of the column it predicts,
    # e.g. "Job Role Predicted" uses encoder["Job Role"].
    label_lookup = encoder[column.replace(" Predicted", "")]
    codes = inference_results[column].tolist()
    # Wrapping the list itself (not enumerate(...)) lets tqdm know the total length.
    decoded_columns[column + '_Text'] = [
        label_lookup[code] for code in tqdm(codes, desc=column)
    ]

# Reuse the source index: pd.concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would misalign / NaN-pad rows if inference_results has any other index.
add_df = pd.DataFrame(decoded_columns, index=inference_results.index)

# Drop "*_Text" columns left over from a previous run before attaching the new ones,
# so the cell can be executed repeatedly without producing duplicate columns.
inference_results = pd.concat(
    [inference_results.drop(columns=list(decoded_columns), errors='ignore'), add_df],
    axis=1,
)

669785it [00:00, 2972920.36it/s]
669785it [00:00, 3401561.85it/s]
669785it [00:00, 3452198.67it/s]
669785it [00:00, 3421001.43it/s]
669785it [00:00, 3373066.33it/s]
669785it [00:00, 3277693.86it/s]


In [8]:
# Now sort by loss descending to look at the observations that the model gets most wrong
inference_results_sorted = inference_results.sort_values(by='Loss',ascending=False)[['Title','Job Role_Text','Job Function_Text','Job Level_Text',
                                                                                    'Job Role Predicted_Text','Job Function Predicted_Text',
                                                                                    'Job Level Predicted_Text','Loss']]
inference_results_sorted.head(20)

Unnamed: 0,Title,Job Role_Text,Job Function_Text,Job Level_Text,Job Role Predicted_Text,Job Function Predicted_Text,Job Level Predicted_Text,Loss
584419,NETWORK SECURITY ENGINEER,INFORMATION SECURITY,IT,CONTRIBUTOR,NETWORKING,IT,CONTRIBUTOR,13.767385
377261,ÈNG,NON-ICP,NON-ICP,CONTRIBUTOR,NON-ICP,ENGINEERING,CONTRIBUTOR,11.715264
176880,"MANAGER, COMPUTING INFRASTRUCTURE",NON-ICP,IT,MANAGER,NETWORKING,IT,MANAGER,11.547041
58109,"MANAGER, SYSTEMS & SUPPORT",INFORMATION SECURITY,IT,MANAGER,NON-ICP,IT,MANAGER,11.06871
638590,"EXECUTIVE DIRECTOR, HEAD OF SOLUTION ARCHITECT...",IT GENERAL,IT,EXECUTIVE,NETWORKING,IT,EXECUTIVE,10.532409
424083,"DIRECTOR, END-USER COMPUTING OPERATIONS",NON-ICP,IT,DIRECTOR,NETWORKING,IT,DIRECTOR,10.000492
53254,CHIEF FINANCIAL OFFICER,SYSTEMS,NON-ICP,C-LEVEL,NON-ICP,NON-ICP,C-LEVEL,9.964821
133170,CHIEF FINANCIAL OFFICER,SYSTEMS,NON-ICP,C-LEVEL,NON-ICP,NON-ICP,C-LEVEL,9.964821
328736,PRINCIPAL CYBERSECURITY ADMINISTRATOR,SYSTEMS,IT,CONTRIBUTOR,INFORMATION SECURITY,IT,MANAGER,9.948408
622010,NETWORK SPECIALIST 2,NETWORKING,IT,UNKNOWN,NETWORKING,IT,CONTRIBUTOR,9.923821
