In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
import dataframe_image as dfi

In [2]:
# Read the pickled train/validation summary for the selected run and epoch
with open('../logging/26-02-2024_1922/epoch08_train_val_summary', mode='rb') as summary_fh:
    val_results = pickle.load(summary_fh)

In [3]:
# Inspect the per-output accuracies; these are used to establish the loss-function weights.
# As in model training, each weight is based on 1/accuracy, so whichever of the 3 outputs
# is least accurate receives slightly more weight.
# Source run: first edition of the model, run for 20 epochs, with the best checkpoint
# (on validation) retained, i.e. checkpoint 9 out of 10.
# NOTE(review): the values are read from the 'training_accuracy' key of the last entry,
# while the intent described here is validation accuracy — confirm this is the right key.
last_summary = val_results[-1]
acc_dict = last_summary['training_accuracy']
acc_dict

{'Role': 0.976859986782074,
 'Function': 0.9978083968162537,
 'Level': 0.9902929663658142}

In [4]:
# Calculate the loss weights to use as inputs: one weight per output, proportional to
# 1/accuracy, so the least accurate output gets the largest weight.
weights_dict = {}
norm_factor = 0
for key, accuracy in acc_dict.items():
    weights_dict[key] = 1 / accuracy
    norm_factor += weights_dict[key]
# Rescale so the weights sum to the number of outputs (i.e. they average to 1). The output
# count is taken from the dict rather than hard-coded as 3, so the cell stays correct if
# outputs are added or removed.
n_outputs = len(weights_dict)
weights_dict = {key: raw / norm_factor * n_outputs for key, raw in weights_dict.items()}
weights_dict

{'Role': 1.0116539409992857,
 'Function': 0.990414851774975,
 'Level': 0.9979312072257394}

In [5]:
# The weights above can be passed to an inference run with the loss option enabled;
# the cells from here on analyse the output of that run.

# Read the pickled results of the initial inference run
with open('../inference/27-02-2024_1239_inference.pkl', mode='rb') as inference_fh:
    inference_results = pickle.load(inference_fh)

# Read the encoder used to translate the encoded results back into labels
with open('../Data/index_label_mapping.pkl', mode='rb') as encoder_fh:
    encoder = pickle.load(encoder_fh)

In [6]:
# Preview the first 20 rows of the raw (still integer-encoded) inference results
inference_results.head(n=20)

Unnamed: 0,Title,Job Function,Job Role,Job Level,Job Role Predicted,Job Function Predicted,Job Level Predicted,Loss
0,IT DIRECTOR,1,4,2,4,1,2,1.4e-05
1,ENGLISH DEPARTMENT CHAIR,2,5,2,5,2,2,0.002494
2,"SENIOR DIRECTOR, HOME FRAGRANCE INFORMATION TE...",1,4,2,4,1,2,0.153269
3,"MANAGER, INFORMATION TECHNOLOGY",1,2,4,2,1,4,6e-06
4,2X CERTIFIED SALESFORCE DEVELOPER,2,5,1,5,2,1,0.08002
5,NETWORK OPERATIONS SPECIALIST,1,4,1,4,1,1,1.6e-05
6,DIRECTOR CYBER POLICY AND PLANNING,1,4,2,4,1,2,0.410957
7,"MANAGER, NETWORK OPERATIONS",1,4,4,4,1,4,9e-06
8,"REGIONAL MANAGER, NETWORK OPERATIONS SUPPORT",1,4,4,5,1,4,2.414484
9,IT SPECIALIST,1,2,1,2,1,1,2e-05


In [7]:
# Decode every integer-encoded label column (actual and predicted) into a readable
# '<column>_Text' column using the encoder.
# Columns already ending in '_Text' are skipped so the cell can be re-run safely: the
# result is written back to `inference_results`, and on a second run the decoded columns
# would otherwise be looked up in the encoder as if they were encoded columns.
label_columns = [
    column for column in inference_results.columns
    if column not in ['Title', 'Loss'] and not column.endswith('_Text')
]

decoded_columns = {}
for column in label_columns:
    # Actual and predicted columns share one mapping, keyed by the name without " Predicted"
    mapping = encoder[column.replace(" Predicted", "")]
    # Iterating the list directly (no enumerate) lets tqdm know the length and show a total
    decoded_columns[column + '_Text'] = [
        mapping[value] for value in tqdm(inference_results[column].tolist(), desc=column)
    ]

# Reuse the source index so concat(axis=1) lines rows up one-to-one even if the index
# is not a plain 0..n-1 range
add_df = pd.DataFrame(decoded_columns, index=inference_results.index)
inference_results = pd.concat(
    # Drop any decoded columns left over from a previous run before re-attaching them
    [inference_results.drop(columns=list(decoded_columns), errors='ignore'), add_df],
    axis=1,
)

0it [00:00, ?it/s]

669785it [00:00, 2962883.72it/s]
669785it [00:00, 3259697.07it/s]
669785it [00:00, 3379595.39it/s]
669785it [00:00, 3138283.90it/s]
669785it [00:00, 3162697.54it/s]
669785it [00:00, 3268742.95it/s]


In [8]:
# Rank observations from highest to lowest loss to look at the ones the model gets most
# wrong, keeping only the title, the decoded actual/predicted labels and the loss
review_columns = [
    'Title',
    'Job Role_Text', 'Job Function_Text', 'Job Level_Text',
    'Job Role Predicted_Text', 'Job Function Predicted_Text', 'Job Level Predicted_Text',
    'Loss',
]
inference_results_sorted = (
    inference_results
    .sort_values(by='Loss', ascending=False)
    .loc[:, review_columns]
)
inference_results_sorted.head(20)

Unnamed: 0,Title,Job Role_Text,Job Function_Text,Job Level_Text,Job Role Predicted_Text,Job Function Predicted_Text,Job Level Predicted_Text,Loss
377261,ÈNG,NON-ICP,NON-ICP,CONTRIBUTOR,DEVELOPMENT,ENGINEERING,CONTRIBUTOR,17.035435
281934,SR.PM,NON-ICP,NON-ICP,CONTRIBUTOR,DEVELOPMENT,ENGINEERING,MANAGER,14.895061
651377,SR.PM,NON-ICP,NON-ICP,CONTRIBUTOR,DEVELOPMENT,ENGINEERING,MANAGER,14.895061
584419,NETWORK SECURITY ENGINEER,INFORMATION SECURITY,IT,CONTRIBUTOR,NETWORKING,IT,CONTRIBUTOR,12.008307
310351,"IT VICE PRESIDENT, INFORMATION SECURITY",INFORMATION SECURITY,IT,C-LEVEL,INFORMATION SECURITY,IT,EXECUTIVE,11.363048
127468,"SENIOR MANAGER, INFORMATION TECHNOLOGY INFRAST...",INFORMATION SECURITY,IT,UNKNOWN,INFORMATION SECURITY,IT,MANAGER,10.772053
231917,"ASSISTANT VP, NETWORK SECURITY OPERATIONS",INFORMATION SECURITY,IT,EXECUTIVE,NETWORKING,IT,CONTRIBUTOR,10.501913
176880,"MANAGER, COMPUTING INFRASTRUCTURE",NON-ICP,IT,MANAGER,NETWORKING,IT,MANAGER,10.376395
21019,CTO / CSO,SYSTEMS,IT,EXECUTIVE,NETWORKING,IT,C-LEVEL,10.323332
489206,"DIRECTOR, TECHNOLOGY SECURITY",INFORMATION SECURITY,IT,DIRECTOR,NETWORKING,IT,DIRECTOR,10.300467


Overall, I'm fine with the differences above. Yes, the model does appear to be inaccurate in certain circumstances, but these are the cases where the model diverges the most from the actual historical tags, so some errors are acceptable — the model can't be perfect. Indeed, most of the above also have a reasonable explanation for the model's picks.

In [17]:
# Export the 20 highest-loss rows as an image, relabelled with a two-level header:
# the top level groups the label columns into Actual vs Predicted, the second level
# names the individual field
top_inaccuracies = inference_results_sorted.head(20)
header_groups = ['', 'Actual', 'Actual', 'Actual', 'Predicted', 'Predicted', 'Predicted', '']
header_fields = ['Title', 'Role', 'Function', 'Level', 'Role', 'Function', 'Level', 'Loss']
top_inaccuracies.columns = [header_groups, header_fields]
dfi.export(top_inaccuracies, '../Reports/Top_Loss.png')

In [10]:
# Rows predicted with the GOVERNANCE RISK COMPLIANCE role whose predicted function is
# something other than RISK/LEGAL/COMPLIANCE
predicted_grc_role = inference_results['Job Role Predicted_Text'] == 'GOVERNANCE RISK COMPLIANCE'
predicted_other_function = inference_results['Job Function Predicted_Text'] != 'RISK/LEGAL/COMPLIANCE'
review_columns = [
    'Title',
    'Job Role_Text', 'Job Function_Text', 'Job Level_Text',
    'Job Role Predicted_Text', 'Job Function Predicted_Text', 'Job Level Predicted_Text',
    'Loss',
]
inference_results.loc[predicted_grc_role & predicted_other_function, review_columns]

Unnamed: 0,Title,Job Role_Text,Job Function_Text,Job Level_Text,Job Role Predicted_Text,Job Function Predicted_Text,Job Level Predicted_Text,Loss
85,VP OF COMPLIANCE,GOVERNANCE RISK COMPLIANCE,IT,EXECUTIVE,GOVERNANCE RISK COMPLIANCE,IT,EXECUTIVE,0.010169
89,"DIRECTOR, RISK QUANTIFICATION",GOVERNANCE RISK COMPLIANCE,IT,DIRECTOR,GOVERNANCE RISK COMPLIANCE,IT,DIRECTOR,0.053843
114,DATA GOVERNANCE SPECIALIST & VICE PRESIDENT,GOVERNANCE RISK COMPLIANCE,IT,EXECUTIVE,GOVERNANCE RISK COMPLIANCE,IT,EXECUTIVE,0.352693
178,"MANAGER, AUDIT AND COMPLIANCE",GOVERNANCE RISK COMPLIANCE,IT,MANAGER,GOVERNANCE RISK COMPLIANCE,IT,MANAGER,0.002494
318,CYBER SECURITY COMPLIANCE ANALYST,GOVERNANCE RISK COMPLIANCE,IT,CONTRIBUTOR,GOVERNANCE RISK COMPLIANCE,IT,CONTRIBUTOR,0.182990
...,...,...,...,...,...,...,...,...
669438,"DIRECTOR, SECURITY, RISK & COMPLIANCE",GOVERNANCE RISK COMPLIANCE,IT,DIRECTOR,GOVERNANCE RISK COMPLIANCE,IT,DIRECTOR,0.172749
669478,SR. AUDIT MANAGER,GOVERNANCE RISK COMPLIANCE,IT,MANAGER,GOVERNANCE RISK COMPLIANCE,IT,MANAGER,0.000554
669546,MANAGER CUSTOMS COMPLIANCE,GOVERNANCE RISK COMPLIANCE,IT,MANAGER,GOVERNANCE RISK COMPLIANCE,IT,MANAGER,0.013966
669739,COMPLIANCE/RISK/PRIVACY DIRECTOR/MANAGER/CONSU...,GOVERNANCE RISK COMPLIANCE,IT,CONTRIBUTOR,GOVERNANCE RISK COMPLIANCE,IT,CONTRIBUTOR,0.000914


Based on eyeballing a few of the above, the Risk/Legal/Compliance Function should probably also cover every title whose Role is Governance Risk Compliance, i.e. whatever Function is present should be overwritten with Risk/Legal/Compliance when the Role is Governance Risk Compliance.

In [11]:
# Rows where the predicted function is IT but the predicted role is NON-ICP
predicted_it_function = inference_results['Job Function Predicted_Text'] == 'IT'
predicted_non_icp_role = inference_results['Job Role Predicted_Text'] == 'NON-ICP'
review_columns = [
    'Title',
    'Job Role_Text', 'Job Function_Text', 'Job Level_Text',
    'Job Role Predicted_Text', 'Job Function Predicted_Text', 'Job Level Predicted_Text',
    'Loss',
]
inference_results.loc[predicted_it_function & predicted_non_icp_role, review_columns]

Unnamed: 0,Title,Job Role_Text,Job Function_Text,Job Level_Text,Job Role Predicted_Text,Job Function Predicted_Text,Job Level Predicted_Text,Loss
8,"REGIONAL MANAGER, NETWORK OPERATIONS SUPPORT",NETWORKING,IT,MANAGER,NON-ICP,IT,MANAGER,2.414484
79,"MANAGER, DATA CENTER FACILTIES",NON-ICP,IT,MANAGER,NON-ICP,IT,MANAGER,0.000041
81,SENIOR PROJECT MANAGER,NON-ICP,IT,MANAGER,NON-ICP,IT,MANAGER,0.000066
111,PROJECT MANAGER,NON-ICP,IT,MANAGER,NON-ICP,IT,MANAGER,0.000010
130,"TEAM LEAD, COMPUTER SUPPORT TECHNICIAN",NON-ICP,IT,CONTRIBUTOR,NON-ICP,IT,CONTRIBUTOR,0.074014
...,...,...,...,...,...,...,...,...
669651,DATA ANALYST,NON-ICP,IT,CONTRIBUTOR,NON-ICP,IT,CONTRIBUTOR,0.000045
669654,"PROJECT MANAGER, SYSTEMS",NON-ICP,IT,MANAGER,NON-ICP,IT,MANAGER,0.000246
669710,"MANAGER, CONFIGURATION MANAGEMENT",NON-ICP,IT,MANAGER,NON-ICP,IT,MANAGER,0.002323
669722,"CUSTOMER SERVICE DIRECTOR, NETWORK SERVICES",NON-ICP,IT,DIRECTOR,NON-ICP,IT,DIRECTOR,0.023922


In [30]:
# Take any and all results where predicted and actual disagree on at least one of
# Role / Function / Level, and write them to Excel.
role_differs = inference_results_sorted['Job Role_Text'] != inference_results_sorted['Job Role Predicted_Text']
function_differs = inference_results_sorted['Job Function_Text'] != inference_results_sorted['Job Function Predicted_Text']
level_differs = inference_results_sorted['Job Level_Text'] != inference_results_sorted['Job Level Predicted_Text']

# Collapse identical rows into one line with an occurrence count, highest loss first
divergences = (
    inference_results_sorted.loc[role_differs | function_differs | level_differs]
    .groupby(['Title', 'Job Role_Text', 'Job Function_Text', 'Job Level_Text', 'Job Role Predicted_Text',
              'Job Function Predicted_Text', 'Job Level Predicted_Text', 'Loss'])
    .size()
    .reset_index(name='Counts')
    .sort_values(by='Loss', ascending=False)
)

# The context manager saves and closes the workbook even if writing raises,
# instead of relying on a manual writer.close() that an exception would skip
with pd.ExcelWriter('../inference/model_historic_divergence.xlsx', engine='xlsxwriter') as writer:
    divergences.to_excel(writer, sheet_name='Prediction_Divergences', index=False)