In [65]:
# import all the required packages
import sys
import os
# Choose which GPUs this process may use; adjust to whichever GPUs are currently free.
# This assignment must run before PyTorch is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # expose GPUs 0 and 1

In [66]:
import pandas as pd
import numpy as np
import json, csv

import torch
from datasets import load_dataset, load_metric
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification, XLMRobertaForSequenceClassification, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score, hamming_loss
import warnings

In [67]:
# Sanity-check the runtime: interpreter path, CUDA availability, then GPU count.
for env_fact in (sys.executable, torch.cuda.is_available()):
    print(env_fact)
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


2

## Import Test data

In [68]:
# Full task table; the first CSV column becomes the index.
# A later cell fits the label encoder on the 'GWA Title' column of this frame.
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'

data_df = pd.read_csv(de_path, index_col=0)

In [69]:
# Held-out test split; the first CSV column becomes the index.
# The 'Task_de' column is the model input and 'GWA Title' the reference label.
test_path = '/srv/scratch2/jinq/taskontology/task_test.csv'

test_df = pd.read_csv(test_path, index_col=0)

In [70]:
MAX_LENGTH = 64  # maximum tokens per input; longer texts are truncated by the tokenizer


def get_prediction(model, tokenizer, text, top_n: int = 5):
    """Classify ``text`` and return the ``top_n`` most probable labels with scores.

    Parameters
    ----------
    model : callable
        Sequence-classification model. It is called as ``model(**inputs)`` and
        must return an object exposing ``.logits``.
    tokenizer : callable
        Tokenizer matching ``model``; it is called with ``return_tensors="pt"``.
    text : str
        Text to classify. Only row 0 of the logits is used, so if a batch is
        passed in, only its first element is scored.
    top_n : int, default 5
        Number of candidates to return. ``topk`` raises if this exceeds the
        number of classes.

    Returns
    -------
    list
        Flat list ``[id_1, score_1, ..., id_n, score_n]`` ordered by decreasing
        probability. Ids are Python ints; scores are softmax probabilities
        rounded to 4 decimals.
    """
    # Tokenize (pad / truncate to MAX_LENGTH) into PyTorch tensors.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")

    # Keep the inputs on the same device as the model so that a model moved to
    # the GPU does not fail with a device mismatch. Objects without a `.device`
    # attribute are called with the inputs unchanged.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    # Pure inference: disable autograd so no graph is recorded for every row.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class dimension of the first (only) sequence.
    probs = logits[0].softmax(0)

    # Best `top_n` candidates, highest probability first.
    values, indices = probs.topk(top_n, sorted=True)

    results = []
    for val, id_ in zip(values, indices):
        results.extend((id_.item(), round(val.item(), 4)))
    return results

def apply_classify_on_df(model, tokenizer, df, text_col: str = 'Task_de', top_n: int = 5):
    """Add the top-``top_n`` predictions for every row of ``df`` as new columns.

    ``get_prediction`` is called on each value of ``df[text_col]``. Its flat
    ``[id, score, ...]`` result is written to the columns
    ``la1, la1score, ..., la<top_n>, la<top_n>score``.

    Parameters
    ----------
    model, tokenizer
        Passed through unchanged to ``get_prediction``.
    df : pandas.DataFrame
        Frame holding the texts. It is modified in place.
    text_col : str, default 'Task_de'
        Name of the column containing the text to classify.
    top_n : int, default 5
        Number of candidates stored per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the prediction columns added or overwritten.
    """
    columns = [f'la{rank}{suffix}' for rank in range(1, top_n + 1) for suffix in ('', 'score')]

    rows = [get_prediction(model, tokenizer, text, top_n=top_n) for text in df[text_col]]

    # Building one frame from all rows lets pandas infer a dtype per column, so
    # label ids stay integers instead of being upcast to float as happens when
    # every row is wrapped in a mixed int/float Series.
    pred_frame = pd.DataFrame(rows, columns=columns)

    # Assign positionally (plain arrays) so no index alignment is involved.
    for column in columns:
        df[column] = pred_frame[column].to_numpy()
    return df

In [71]:
# example for debugging

# NOTE(review): the printed value is wrapped in "['...']", so 'Task_de' appears to
# hold stringified lists — confirm this is the format the models were trained on.
print(test_df['Task_de'].iloc[0])

# Use the literal checkpoint path here: the `m1_gbert` variable is only defined in
# a later cell, so referring to it at this point fails on a fresh kernel.
model_checkpoint = 'trained_models/gbert_results/checkpoint-2000'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

get_prediction(model, tokenizer, test_df['Task_de'].iloc[0])

['Operating lift, winches, or chain falls to move heavy curtain doors.']


[1, 0.5196, 5, 0.0964, 24, 0.0937, 13, 0.0369, 25, 0.0366]

In [24]:
def compute_macro(true_la, pred_la):
    """Score predictions against reference labels.

    Returns a dict with accuracy, macro-averaged F1 / recall / precision, and
    the Hamming loss, under the keys 'accuracy', 'f1_macro', 'recall_macro',
    'precision_macro' and 'hamming_loss'.
    """
    labels = (true_la, pred_la)
    macro_scorers = (
        ('f1_macro', f1_score),
        ('recall_macro', recall_score),
        ('precision_macro', precision_score),
    )

    scores = {'accuracy': accuracy_score(*labels)}
    for key, scorer in macro_scorers:
        scores[key] = scorer(*labels, average='macro')
    scores['hamming_loss'] = hamming_loss(*labels)
    return scores
    
def compute_weig(true_la, pred_la):
    """Compute support-weighted F1, recall and precision.

    The labels are taken as explicit arguments, mirroring ``compute_macro``.
    Previously the body read ``true_la`` / ``pred_la`` without receiving them,
    which raised ``NameError`` when called.

    Parameters
    ----------
    true_la : array-like
        Reference labels.
    pred_la : array-like
        Predicted labels, aligned with ``true_la``.

    Returns
    -------
    tuple
        ``(f1, recall, precision)``, each computed with ``average='weighted'``.
    """
    f1 = f1_score(true_la, pred_la, average='weighted')
    recall = recall_score(true_la, pred_la, average='weighted')
    precision = precision_score(true_la, pred_la, average='weighted')

    return f1, recall, precision
   

In [8]:
# Fine-tuned checkpoints, one per model variant evaluated below (paths are relative
# to the notebook's working directory).
m1_gbert = 'trained_models/gbert_results/checkpoint-2000'
m2_job = 'trained_models/job_results/checkpoint-2000'
m3_multi = 'trained_models/m3_multibert_results/checkpoint-2000'
m4_multi_job = 'trained_models/m4_multijob_results/checkpoint-2000'

# encoding labels
# The encoder is fitted on the 'GWA Title' column of the full dataset; model_pred
# uses it to turn predicted class ids back into titles.
# NOTE(review): assumes this fit reproduces the id <-> title mapping used when the
# checkpoints were trained — confirm.
y_encoded = LabelEncoder().fit(data_df['GWA Title'])

# a new dataframe to store all the predictions by models
# It starts with only the reference 'GWA Title' column of test_df; model_pred adds
# one '<name>_pred' column per model.
GWA_predictions = pd.DataFrame(test_df, columns=['GWA Title'])

In [60]:
def model_pred(model_checkpoint, name):
    """Run one fine-tuned checkpoint over the test set and store its predictions.

    Parameters
    ----------
    model_checkpoint : str
        Path or identifier handed to ``from_pretrained`` for both the tokenizer
        and the sequence-classification model.
    name : str
        Label for this run. Predictions are stored in the global
        ``GWA_predictions`` under ``<name>_pred`` and written to
        ``pred_test/<name>.csv``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_df`` extended with the top-5 label/score columns and
        a ``gwa_class`` column holding the decoded top-1 title.
    """
    # Load the tokenizer and model belonging to this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

    # Work on a copy: the classifier writes its columns in place, so passing
    # `test_df` itself would make every returned frame alias the same object and
    # be silently overwritten by the next model's run.
    pred_df = apply_classify_on_df(model, tokenizer, test_df.copy())

    # Decode the top-1 class id of every row into its GWA title.
    pred_gwa_titles = y_encoded.inverse_transform(pred_df['la1'].astype(int)).tolist()

    pred_df['gwa_class'] = pred_gwa_titles
    GWA_predictions[name + '_pred'] = pred_gwa_titles

    # Save this model's predictions; create the output folder on first use.
    out_dir = 'pred_test'
    os.makedirs(out_dir, exist_ok=True)
    outfile = out_dir + '/' + name + '.csv'
    pred_df.to_csv(outfile, header=True)

    return pred_df

## Predicting with models

In [61]:
# Model1: GBERT
# Runs inference on the test set and writes pred_test/gbert_test.csv.
m1_pred = model_pred(m1_gbert, 'gbert_test')
# Show the reference title next to the predicted title.
m1_pred[['GWA Title', 'gwa_class']]

Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Assisting and Caring for Others
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Handling and Moving Objects
23539,"Identifying Objects, Actions, and Events",Controlling Machines and Processes
23540,Controlling Machines and Processes,Controlling Machines and Processes


In [62]:
# Model2: JOB
# Runs inference on the test set and writes pred_test/job_test.csv.
model_pred(m2_job, 'job_test')

Unnamed: 0,O*NET-SOC Code,Title,Task ID,Task,DWA ID,DWA Title,Date,Domain Source,Task_de,GWA ID,...,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,gwa_class
18834,49-9011.00,Mechanical Door Repairers,8403,"Operate lifts, winches, or chain falls to move...",4.A.3.a.1.I10.D06,Move large objects using heavy equipment.,03/2014,Analyst,"['Operating lift, winches, or chain falls to m...",4.A.3.a.1,...,0.8520,28.0,0.0215,24.0,0.0180,13.0,0.0177,21.0,0.0170,Controlling Machines and Processes
18835,49-9011.00,Mechanical Door Repairers,8404,"Order replacement springs, sections, or slats.",4.A.4.c.3.I05.D01,"Order materials, supplies, or equipment.",03/2014,Analyst,"['Order replacement springs, sections, or slat...",4.A.4.c.3,...,0.6824,23.0,0.1966,26.0,0.0357,18.0,0.0121,30.0,0.0092,Monitoring and Controlling Resources
18836,49-9011.00,Mechanical Door Repairers,8405,Bore or cut holes in flooring as required for ...,4.A.3.a.2.I30.D03,"Drill holes in parts, equipment, or materials.",03/2014,Analyst,['Bore or cut holes in flooring as required fo...,4.A.3.a.2,...,0.8314,5.0,0.0795,24.0,0.0431,28.0,0.0056,1.0,0.0039,Handling and Moving Objects
18837,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I04.D14,"Assemble electrical components, subsystems, or...",03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.7785,5.0,0.0761,33.0,0.0743,28.0,0.0081,9.0,0.0062,Handling and Moving Objects
18838,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I34.D07,Connect electrical components or equipment.,03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.7785,5.0,0.0761,33.0,0.0743,28.0,0.0081,9.0,0.0062,Handling and Moving Objects
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23537,53-7121.00,"Tank Car, Truck, and Ship Loaders",12806,"Test vessels for leaks, damage, and defects, a...",4.A.3.b.4.I08.D06,Maintain material moving equipment in good wor...,03/2014,Analyst,"['Prüfen von Leaks-, Schaden- und Defektschiff...",4.A.3.b.4,...,0.4675,28.0,0.2713,20.0,0.0470,26.0,0.0276,13.0,0.0238,"Inspecting Equipment, Structures, or Materials"
23538,53-7121.00,"Tank Car, Truck, and Ship Loaders",12807,Unload cars containing liquids by connecting h...,4.A.3.a.2.I34.D01,Connect hoses to equipment or machinery.,03/2014,Analyst,"['Sie laden Autos ein, die Flüssigkeiten entha...",4.A.3.a.2,...,0.6319,13.0,0.1585,5.0,0.0750,28.0,0.0282,21.0,0.0239,Performing General Physical Activities
23539,53-7121.00,"Tank Car, Truck, and Ship Loaders",12808,Copy and attach load specifications to loaded ...,4.A.1.b.1.I01.D03,Mark materials or objects for identification.,03/2014,Analyst,['Copy and attach load specifications to loade...,4.A.1.b.1,...,0.4228,26.0,0.1932,20.0,0.1819,36.0,0.0418,5.0,0.0211,Performing Administrative Activities
23540,53-7121.00,"Tank Car, Truck, and Ship Loaders",12809,Start pumps and adjust valves or cables to reg...,4.A.3.a.3.I02.D03,Control pumps or pumping equipment.,03/2014,Analyst,['Sie starten Pumpen und passen Ware oder Kabe...,4.A.3.a.3,...,0.6826,13.0,0.1007,15.0,0.0328,24.0,0.0324,28.0,0.0302,Controlling Machines and Processes


In [63]:
# Model3: Multilingual
# Runs inference on the test set and writes pred_test/multi_test.csv.
model_pred(m3_multi, 'multi_test')

Unnamed: 0,O*NET-SOC Code,Title,Task ID,Task,DWA ID,DWA Title,Date,Domain Source,Task_de,GWA ID,...,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,gwa_class
18834,49-9011.00,Mechanical Door Repairers,8403,"Operate lifts, winches, or chain falls to move...",4.A.3.a.1.I10.D06,Move large objects using heavy equipment.,03/2014,Analyst,"['Operating lift, winches, or chain falls to m...",4.A.3.a.1,...,0.6158,13.0,0.0885,5.0,0.0864,1.0,0.0402,28.0,0.0381,Performing General Physical Activities
18835,49-9011.00,Mechanical Door Repairers,8404,"Order replacement springs, sections, or slats.",4.A.4.c.3.I05.D01,"Order materials, supplies, or equipment.",03/2014,Analyst,"['Order replacement springs, sections, or slat...",4.A.4.c.3,...,0.3934,18.0,0.2303,13.0,0.0536,12.0,0.0382,30.0,0.0311,Monitoring and Controlling Resources
18836,49-9011.00,Mechanical Door Repairers,8405,Bore or cut holes in flooring as required for ...,4.A.3.a.2.I30.D03,"Drill holes in parts, equipment, or materials.",03/2014,Analyst,['Bore or cut holes in flooring as required fo...,4.A.3.a.2,...,0.5110,24.0,0.3823,28.0,0.0239,5.0,0.0212,1.0,0.0062,Handling and Moving Objects
18837,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I04.D14,"Assemble electrical components, subsystems, or...",03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.4797,24.0,0.2291,5.0,0.1292,28.0,0.0791,1.0,0.0102,Handling and Moving Objects
18838,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I34.D07,Connect electrical components or equipment.,03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.4797,24.0,0.2291,5.0,0.1292,28.0,0.0791,1.0,0.0102,Handling and Moving Objects
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23537,53-7121.00,"Tank Car, Truck, and Ship Loaders",12806,"Test vessels for leaks, damage, and defects, a...",4.A.3.b.4.I08.D06,Maintain material moving equipment in good wor...,03/2014,Analyst,"['Prüfen von Leaks-, Schaden- und Defektschiff...",4.A.3.b.4,...,0.5889,19.0,0.1271,10.0,0.0512,0.0,0.0171,11.0,0.0153,"Inspecting Equipment, Structures, or Materials"
23538,53-7121.00,"Tank Car, Truck, and Ship Loaders",12807,Unload cars containing liquids by connecting h...,4.A.3.a.2.I34.D01,Connect hoses to equipment or machinery.,03/2014,Analyst,"['Sie laden Autos ein, die Flüssigkeiten entha...",4.A.3.a.2,...,0.4662,13.0,0.1757,5.0,0.1112,28.0,0.0526,21.0,0.0374,Performing General Physical Activities
23539,53-7121.00,"Tank Car, Truck, and Ship Loaders",12808,Copy and attach load specifications to loaded ...,4.A.1.b.1.I01.D03,Mark materials or objects for identification.,03/2014,Analyst,['Copy and attach load specifications to loade...,4.A.1.b.1,...,0.2441,28.0,0.1638,23.0,0.0979,13.0,0.0890,26.0,0.0763,Monitoring and Controlling Resources
23540,53-7121.00,"Tank Car, Truck, and Ship Loaders",12809,Start pumps and adjust valves or cables to reg...,4.A.3.a.3.I02.D03,Control pumps or pumping equipment.,03/2014,Analyst,['Sie starten Pumpen und passen Ware oder Kabe...,4.A.3.a.3,...,0.4821,5.0,0.2948,28.0,0.0629,24.0,0.0595,21.0,0.0111,Handling and Moving Objects


In [64]:
# Model4: Multilingual job
# Runs inference on the test set and writes pred_test/multi_job_test.csv.
model_pred(m4_multi_job, 'multi_job_test')

Unnamed: 0,O*NET-SOC Code,Title,Task ID,Task,DWA ID,DWA Title,Date,Domain Source,Task_de,GWA ID,...,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score,gwa_class
18834,49-9011.00,Mechanical Door Repairers,8403,"Operate lifts, winches, or chain falls to move...",4.A.3.a.1.I10.D06,Move large objects using heavy equipment.,03/2014,Analyst,"['Operating lift, winches, or chain falls to m...",4.A.3.a.1,...,0.3671,24.0,0.2925,13.0,0.0756,21.0,0.0538,1.0,0.0390,Controlling Machines and Processes
18835,49-9011.00,Mechanical Door Repairers,8404,"Order replacement springs, sections, or slats.",4.A.4.c.3.I05.D01,"Order materials, supplies, or equipment.",03/2014,Analyst,"['Order replacement springs, sections, or slat...",4.A.4.c.3,...,0.8873,23.0,0.0263,18.0,0.0245,13.0,0.0108,7.0,0.0078,Monitoring and Controlling Resources
18836,49-9011.00,Mechanical Door Repairers,8405,Bore or cut holes in flooring as required for ...,4.A.3.a.2.I30.D03,"Drill holes in parts, equipment, or materials.",03/2014,Analyst,['Bore or cut holes in flooring as required fo...,4.A.3.a.2,...,0.6688,24.0,0.2180,5.0,0.0436,28.0,0.0117,21.0,0.0093,Handling and Moving Objects
18837,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I04.D14,"Assemble electrical components, subsystems, or...",03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.5319,24.0,0.2489,5.0,0.1156,28.0,0.0219,21.0,0.0133,Handling and Moving Objects
18838,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I34.D07,Connect electrical components or equipment.,03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,0.5319,24.0,0.2489,5.0,0.1156,28.0,0.0219,21.0,0.0133,Handling and Moving Objects
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23537,53-7121.00,"Tank Car, Truck, and Ship Loaders",12806,"Test vessels for leaks, damage, and defects, a...",4.A.3.b.4.I08.D06,Maintain material moving equipment in good wor...,03/2014,Analyst,"['Prüfen von Leaks-, Schaden- und Defektschiff...",4.A.3.b.4,...,0.7574,13.0,0.0627,24.0,0.0341,15.0,0.0330,5.0,0.0213,Repairing and Maintaining Mechanical Equipment
23538,53-7121.00,"Tank Car, Truck, and Ship Loaders",12807,Unload cars containing liquids by connecting h...,4.A.3.a.2.I34.D01,Connect hoses to equipment or machinery.,03/2014,Analyst,"['Sie laden Autos ein, die Flüssigkeiten entha...",4.A.3.a.2,...,0.6737,13.0,0.1451,5.0,0.0661,21.0,0.0250,28.0,0.0218,Performing General Physical Activities
23539,53-7121.00,"Tank Car, Truck, and Ship Loaders",12808,Copy and attach load specifications to loaded ...,4.A.1.b.1.I01.D03,Mark materials or objects for identification.,03/2014,Analyst,['Copy and attach load specifications to loade...,4.A.1.b.1,...,0.9023,18.0,0.0105,9.0,0.0102,5.0,0.0093,33.0,0.0068,Handling and Moving Objects
23540,53-7121.00,"Tank Car, Truck, and Ship Loaders",12809,Start pumps and adjust valves or cables to reg...,4.A.3.a.3.I02.D03,Control pumps or pumping equipment.,03/2014,Analyst,['Sie starten Pumpen und passen Ware oder Kabe...,4.A.3.a.3,...,0.3356,24.0,0.2820,5.0,0.2299,28.0,0.0412,21.0,0.0215,Handling and Moving Objects


### Overall prediction

Original and predicted GWA classes by all models. 

In [40]:
# Rows x columns of the combined table (reference column plus one column per model),
# followed by a preview of its first rows.
print(GWA_predictions.shape)
GWA_predictions.head()

(4708, 5)


Unnamed: 0,GWA Title,m1_pred,m2_pred,m3_pred,m4_pred
18834,Performing General Physical Activities,Assisting and Caring for Others,Controlling Machines and Processes,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects


In [None]:
# save to file
# NOTE(review): the file name is spelled 'all_predictioins.csv'. The cell that
# loads the predictions reads the same spelling, so rename both together or not at all.
outfile = 'pred_test/all_predictioins.csv'
GWA_predictions.to_csv(outfile, header=True)


## Computing metrics

In [72]:
# load all the predicted results back from file (first CSV column is the index)
pred_path= 'pred_test/all_predictioins.csv'
pred_df = pd.read_csv(pred_path, index_col=0)

In [73]:
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')

# Column 0 holds the reference labels; every remaining column is one model's predictions.
gold_labels = pred_df.iloc[:, 0]
for position, column_name in enumerate(pred_df.columns[1:], start=1):
    results = compute_macro(gold_labels, pred_df.iloc[:, position])
    print('Results for ', column_name, results)

Results for  m1_pred {'accuracy': 0.5290994052676296, 'f1_macro': 0.37797274884904, 'recall_macro': 0.4400122881572895, 'precision_macro': 0.3827088228480856, 'hamming_loss': 0.47090059473237045}
Results for  m2_pred {'accuracy': 0.5138062871707731, 'f1_macro': 0.3718534449101467, 'recall_macro': 0.437294272612504, 'precision_macro': 0.3901104602488269, 'hamming_loss': 0.48619371282922685}
Results for  m3_pred {'accuracy': 0.5070093457943925, 'f1_macro': 0.34167679573472814, 'recall_macro': 0.39892455303591784, 'precision_macro': 0.34292711239468954, 'hamming_loss': 0.4929906542056075}
Results for  m4_pred {'accuracy': 0.5339847068819031, 'f1_macro': 0.3851650661612, 'recall_macro': 0.4374095697649561, 'precision_macro': 0.3799185673510285, 'hamming_loss': 0.46601529311809686}
