In [29]:
import pickle
import pandas as pd
from tqdm import tqdm

In [5]:
# Load the pickled train/validation summary written during the training run.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by our own runs.
summary_path = '../logging/13-02-2024_2126/epoch07_train_val_summary'
with open(summary_path, mode='rb') as summary_file:
    val_results = pickle.load(summary_file)

In [6]:
# Inspect the per-output accuracies stored in the last entry of the loaded summary.
# These accuracies are used to establish the weights for the loss function: as in
# model training, each weight is based on 1/accuracy, which gives slightly more
# weight to whichever of the 3 outputs is less accurate.
# Source: first edition of the model, run for 10 epochs with the best checkpoint
# (on validation) retained, i.e. checkpoint 8 out of 10.
# NOTE(review): this notebook describes these as validation accuracies, but the key
# read below is 'training_accuracy' -- confirm this is the intended key.
last_summary = val_results[-1]
acc_dict = last_summary['training_accuracy']
acc_dict

{'Role': 0.8868475556373596,
 'Function': 0.9789828658103943,
 'Level': 0.9484805464744568}

In [7]:
# Calculate the loss weights to use as inputs.
# Each output's raw weight is 1/accuracy, so less accurate outputs weigh more.
# The raw weights are then rescaled so that they sum to the number of outputs
# (i.e. they average 1.0). The number of outputs is taken from acc_dict rather
# than hard-coded, so the scaling stays correct if outputs are added or removed.
# An accuracy of exactly 0 raises ZeroDivisionError.
inverse_acc = {key: 1 / accuracy for key, accuracy in acc_dict.items()}

# Accumulate sequentially (rather than with sum()) so the floating-point result
# is identical to adding the raw weights one by one in dict order.
norm_factor = 0
for inverse in inverse_acc.values():
    norm_factor += inverse

n_outputs = len(inverse_acc)
weights_dict = {key: inverse / norm_factor * n_outputs for key, inverse in inverse_acc.items()}
weights_dict

{'Role': 1.0560011736018362,
 'Function': 0.9566174161625742,
 'Level': 0.9873814102355893}

In [45]:
# The weights above can be used to run inference with the loss option; the steps
# below pick up from the output of that run.

# Bring in the file from the initial inference, as well as the encoder for the results.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
inference_path = '../inference/15-02-2024_0007_inference.pkl'
encoder_path = '../Data/index_label_mapping.pkl'

with open(inference_path, mode='rb') as inference_file:
    inference_results = pickle.load(inference_file)

with open(encoder_path, mode='rb') as encoder_file:
    encoder = pickle.load(encoder_file)

In [46]:
# Preview the first 20 rows of inference_results.
inference_results.head(20)

Unnamed: 0,Title,Job Role,Job Function,Job Level,Job Role Predicted,Job Function Predicted,Job Level Predicted,Loss
0,Architect,3,1,1,3,1,1,0.001458
1,Ingeniero De Desarrollo,4,0,1,4,0,1,0.005348
2,???????????,4,2,1,4,2,1,1.630244
3,Sr. Offensive Security Manager,2,1,4,2,1,4,0.01988
4,"Project Manager, Infrastructure",3,1,4,3,1,4,0.004265
5,Co-Founder & Chief Technology Officer,1,1,3,1,1,3,0.001713
6,System Administrator & Business Intelligence B...,5,1,1,5,1,1,0.022417
7,Cio,2,1,0,2,1,0,0.596701
8,"Manager, Network Engineering",3,1,4,3,1,4,0.000741
9,"Services Supervisor, Information Technology",2,1,4,2,1,4,0.065887


In [47]:
# Decode every encoded label column into its text label and append the decoded
# values to inference_results as new '<column>_Text' columns.
# - 'Title' and 'Loss' are not label columns and are skipped.
# - A '<name> Predicted' column is decoded with the same encoder entry as '<name>'.
# - Columns that already end in '_Text', or whose '_Text' twin already exists, are
#   skipped so that re-running this cell neither fails nor duplicates columns.
decoded_columns = {}
for column in inference_results:
    text_column = column + '_Text'
    if column in ['Title', 'Loss'] or column.endswith('_Text') or text_column in inference_results:
        continue
    mapping = encoder[column.replace(" Predicted", "")]
    # Wrap the list itself (not enumerate) so tqdm knows the total and can show a
    # real progress bar, labelled with the column being decoded.
    encoded_values = inference_results[column].tolist()
    decoded_columns[text_column] = [mapping[value] for value in tqdm(encoded_values, desc=column)]

# Reuse the existing index so the column-wise concat lines rows up even when the
# index of inference_results is not the default 0..n-1 range.
add_df = pd.DataFrame(decoded_columns, index=inference_results.index)
inference_results = pd.concat([inference_results, add_df], axis=1)

1222412it [00:02, 469736.86it/s]
1222412it [00:02, 480992.78it/s]
1222412it [00:02, 473380.93it/s]
1222412it [00:02, 480666.08it/s]
1222412it [00:02, 475500.25it/s]
1222412it [00:02, 484029.21it/s]


In [50]:
# Rank the observations by loss, highest first, to look at the ones the model gets
# most wrong. Only the title, the decoded true/predicted labels and the loss are kept.
worst_first_columns = [
    'Title',
    'Job Role_Text', 'Job Function_Text', 'Job Level_Text',
    'Job Role Predicted_Text', 'Job Function Predicted_Text', 'Job Level Predicted_Text',
    'Loss',
]
ranked_by_loss = inference_results.sort_values(by='Loss', ascending=False)
inference_results_sorted = ranked_by_loss[worst_first_columns]
inference_results_sorted.head(20)

Unnamed: 0,Title,Job Role_Text,Job Function_Text,Job Level_Text,Job Role Predicted_Text,Job Function Predicted_Text,Job Level Predicted_Text,Loss
272265,Chief Information Security Officer,IT General,IT,Director,Information Security,IT,C-Level,21.508377
1002518,Cto,Networking,IT,C-Level,IT General,IT,Executive,20.61183
254623,DX???????,Systems,IT,Contributor,Non-ICP,Non-ICP,Manager,15.255075
166917,Security Operations Manager,Networking,Non-ICP,Manager,Information Security,IT,Manager,15.049135
713354,Security Operations Manager,Networking,Non-ICP,Manager,Information Security,IT,Manager,15.049135
476769,Issm,Information Security,Engineering,Contributor,Non-ICP,Non-ICP,Manager,14.638029
251121,Security,Governance Risk Compliance,IT,Manager,Information Security,IT,Contributor,13.82131
574676,Security,Governance Risk Compliance,IT,Manager,Information Security,IT,Contributor,13.82131
955758,Dirección General,Non-ICP,Non-ICP,Director,Information Security,IT,C-Level,13.527388
290703,Dirección,Non-ICP,Non-ICP,Director,Information Security,IT,C-Level,13.114651
