In [1]:
# import all the required packages
import sys
import os
# Select which GPUs this process may use; adjust the list to the GPUs currently available.
# This assignment has to run before PyTorch is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [65]:
import pandas as pd
import numpy as np
import json, csv

import torch
from datasets import load_dataset, load_metric
from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score, hamming_loss
import warnings

In [3]:
# Report the interpreter in use and whether CUDA is usable, one value per line
print(sys.executable, torch.cuda.is_available(), sep="\n")
# Number of GPUs visible to this process (displayed as the cell result)
torch.cuda.device_count()

/home/user/jinqli/anaconda3/envs/new_env/bin/python
True


2

## Import Test data

In [10]:
# Task -> GWA/IWA/DWA mapping table; its 'GWA Title' column is used further down to fit the label encoder.
# The first CSV column is taken as the row index.
de_path = '/srv/scratch2/jinq/taskontology/task_to_GWA_IWA_DWA_DE.csv'
data_df = pd.read_csv(filepath_or_buffer=de_path, index_col=0)

In [11]:
# Held-out test split that every model below is evaluated on.
# The first CSV column is taken as the row index.
test_path = '/srv/scratch2/jinq/taskontology/task_test.csv'
test_df = pd.read_csv(filepath_or_buffer=test_path, index_col=0)

In [12]:
MAX_LENGTH = 64  # inputs are truncated to this many tokens


def get_prediction(text, top_n: int = 5):
    """Return the ``top_n`` best class ids for ``text`` together with their logits.

    The function uses the notebook-level ``tokenizer`` and ``model`` that are
    bound at call time, so re-assigning those names switches the classifier.

    Parameters
    ----------
    text
        Input handed directly to ``tokenizer``. If a batch is passed, only the
        first sequence is reported (results are read from row 0 of the logits).
    top_n : int, default 5
        Number of candidate classes to return.

    Returns
    -------
    list
        Flat list ``[id_1, score_1, ..., id_n, score_n]`` ordered from the
        highest to the lowest logit. Scores are raw logits (no softmax is
        applied) rounded to 4 decimals.
    """
    # Tokenize into PyTorch tensors, truncating to MAX_LENGTH tokens.
    # NOTE(review): the tensors are not moved to any device here, so ``model``
    # must live on the same device as the tokenizer output — confirm.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")

    # Inference only: disable autograd so no computation graph is built or kept
    # alive for each of the (thousands of) rows this is applied to.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest logits along the class dimension, already sorted descending.
    values, indices = logits.topk(top_n, dim=1, sorted=True)

    results = []
    for val, id_ in zip(values[0], indices[0]):
        results.extend((id_.item(), round(val.item(), 4)))
    return results

def apply_classify_on_df(df, text_col: str = 'Task_de', top_n: int = 5):
    """Classify every row of ``df`` and return a new frame with the predictions attached.

    For each row, ``get_prediction`` is called on the value in ``text_col`` and
    its flat ``[id, score, ...]`` result is spread over the columns
    ``la1, la1score, ..., la<top_n>, la<top_n>score``.

    The input frame is NOT modified: predictions are written to a copy. This
    matters because the same test frame is classified with several models in
    turn; writing in place would make every returned frame an alias of the
    input and let later models overwrite earlier results.

    Returning a ``pd.Series`` from the applied function is what lets one call
    fill several columns at once, see
    https://queirozf.com/entries/pandas-dataframes-apply-examples

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``text_col``.
    text_col : str, default 'Task_de'
        Name of the column holding the text to classify.
    top_n : int, default 5
        Number of candidates requested from ``get_prediction``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``2 * top_n`` prediction columns added (or replaced
        if they already exist).
    """
    pred_cols = [
        name
        for rank in range(1, top_n + 1)
        for name in (f'la{rank}', f'la{rank}score')
    ]
    out = df.copy()
    out[pred_cols] = out.apply(
        lambda row: pd.Series(get_prediction(row.loc[text_col], top_n)),
        axis=1,
    )
    return out

In [56]:
def compute_macro(true_la, pred_la):
    """Score predictions against gold labels.

    Returns a dict with the keys 'accuracy', 'f1_macro', 'recall_macro',
    'precision_macro' and 'hamming_loss'; F1, recall and precision use
    macro averaging.
    """
    scores = {'accuracy': accuracy_score(true_la, pred_la)}

    # The three macro-averaged metrics share the same call signature
    macro_metrics = (
        ('f1_macro', f1_score),
        ('recall_macro', recall_score),
        ('precision_macro', precision_score),
    )
    for key, metric_fn in macro_metrics:
        scores[key] = metric_fn(true_la, pred_la, average='macro')

    scores['hamming_loss'] = hamming_loss(true_la, pred_la)
    return scores
    
def compute_weig(true_la, pred_la):
    """Compute weighted-average F1, recall and precision.

    The labels are taken as explicit arguments (mirroring ``compute_macro``)
    instead of being read from whatever ``true_la`` / ``pred_la`` happen to
    exist in the kernel, which raised ``NameError`` on a fresh run.

    Parameters
    ----------
    true_la
        Gold labels.
    pred_la
        Predicted labels, aligned with ``true_la``.

    Returns
    -------
    tuple
        ``(f1, recall, precision)``, each computed with ``average='weighted'``.
    """
    f1 = f1_score(true_la, pred_la, average='weighted')
    recall = recall_score(true_la, pred_la, average='weighted')
    precision = precision_score(true_la, pred_la, average='weighted')

    return f1, recall, precision
   

In [14]:
# Checkpoint directories of the four fine-tuned classifiers compared below
m1_gbert = 'trained_models/gbert_results/checkpoint-2000'
m2_job = 'trained_models/job_results/checkpoint-2000'
m3_multi = 'trained_models/m3_multibert_results/checkpoint-2000'
m4_multi_job = 'trained_models/m4_multijob_results/checkpoint-2000'

# Label encoder fitted on the GWA titles; used later to turn predicted class ids back into titles.
# NOTE(review): this presumes the ids match the encoding used when the models were trained — confirm.
y_encoded = LabelEncoder()
y_encoded.fit(data_df['GWA Title'])


### m1

In [16]:
# m1: load tokenizer and sequence-classification model from the gbert checkpoint.
# `tokenizer` and `model` are the notebook-level names read by get_prediction.
model_checkpoint = m1_gbert
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    BertForSequenceClassification.from_pretrained(model_checkpoint),
)


In [17]:
# Classify the test set with m1 and preview the first rows with the added la*/la*score columns
m1pred = apply_classify_on_df(test_df)
m1pred.head(n=5)

Unnamed: 0,O*NET-SOC Code,Title,Task ID,Task,DWA ID,DWA Title,Date,Domain Source,Task_de,GWA ID,...,la1,la1score,la2,la2score,la3,la3score,la4,la4score,la5,la5score
18834,49-9011.00,Mechanical Door Repairers,8403,"Operate lifts, winches, or chain falls to move...",4.A.3.a.1.I10.D06,Move large objects using heavy equipment.,03/2014,Analyst,"['Operating lift, winches, or chain falls to m...",4.A.3.a.1,...,1.0,4.3617,5.0,2.6767,24.0,2.6488,13.0,1.7155,25.0,1.7081
18835,49-9011.00,Mechanical Door Repairers,8404,"Order replacement springs, sections, or slats.",4.A.4.c.3.I05.D01,"Order materials, supplies, or equipment.",03/2014,Analyst,"['Order replacement springs, sections, or slat...",4.A.4.c.3,...,20.0,6.5179,18.0,3.3795,23.0,1.8726,12.0,1.5479,1.0,1.351
18836,49-9011.00,Mechanical Door Repairers,8405,Bore or cut holes in flooring as required for ...,4.A.3.a.2.I30.D03,"Drill holes in parts, equipment, or materials.",03/2014,Analyst,['Bore or cut holes in flooring as required fo...,4.A.3.a.2,...,13.0,7.0796,24.0,3.0522,5.0,2.7906,28.0,1.2449,1.0,1.1777
18837,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I04.D14,"Assemble electrical components, subsystems, or...",03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,13.0,5.6544,5.0,4.2529,28.0,2.2759,24.0,1.7753,36.0,1.4265
18838,49-9011.00,Mechanical Door Repairers,8406,Set in and secure floor treadles for door-acti...,4.A.3.a.2.I34.D07,Connect electrical components or equipment.,03/2014,Analyst,['Set in and secure floor treadals for door-ac...,4.A.3.a.2,...,13.0,5.6544,5.0,4.2529,28.0,2.2759,24.0,1.7753,36.0,1.4265


In [24]:

# Decode the best-scoring class id of each row into its GWA title
m1_gwa_titles = y_encoded.inverse_transform(
    m1pred['la1'].astype(int)
).tolist()
m1pred['gwa_class'] = m1_gwa_titles

# Write the m1 (gbert) predictions to disk
outfile = 'pred_test/gbert_test.csv'
m1pred.to_csv(path_or_buf=outfile, header=True)

# Gold title next to the predicted title
m1pred.loc[:, ['GWA Title', 'gwa_class']]

Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Assisting and Caring for Others
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Handling and Moving Objects
23539,"Identifying Objects, Actions, and Events",Controlling Machines and Processes
23540,Controlling Machines and Processes,Controlling Machines and Processes


In [25]:
# Score m1 on the test set.
# NOTE(review): the saved output below is a 3-tuple, while compute_macro as defined in this
# notebook returns a dict — presumably left over from an earlier version; re-run to refresh.
compute_macro(true_la=m1pred['GWA Title'], pred_la=m1pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.37797274884904, 0.4400122881572895, 0.3827088228480856)

In [29]:
# Collect the predictions of all models in one frame: the gold title first,
# then one column per model (m1 here, the others are appended further down).
GWA_predictions = pd.DataFrame(
    m1pred, columns=['GWA Title', 'gwa_class']
).rename(columns={'gwa_class': 'm1_pred'})
GWA_predictions

Unnamed: 0,GWA Title,m1_pred
18834,Performing General Physical Activities,Assisting and Caring for Others
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Handling and Moving Objects
23539,"Identifying Objects, Actions, and Events",Controlling Machines and Processes
23540,Controlling Machines and Processes,Controlling Machines and Processes



### m2


In [31]:
# m2: swap the notebook-level tokenizer/model to the job checkpoint
model_checkpoint = m2_job
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    BertForSequenceClassification.from_pretrained(model_checkpoint),
)

# Classify the test set with m2
m2pred_df = apply_classify_on_df(test_df)

In [32]:
# Decode the best-scoring class id of each row into its GWA title
m2_gwa_titles = y_encoded.inverse_transform(m2pred_df['la1'].astype(int)).tolist()
# Sanity check: one decoded title per test row.
# (Previously printed `gwa_titles`, a name never defined in this notebook.)
print(len(m2_gwa_titles))
m2pred_df['gwa_class'] = m2_gwa_titles
GWA_predictions['m2_pred'] = m2_gwa_titles

# save m2 Job model's predicted result
outfile = 'pred_test/job_test.csv'
m2pred_df.to_csv(outfile, header=True)

# Gold title next to the predicted title
m2pred_df[['GWA Title', 'gwa_class']]

4708


Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Performing Administrative Activities
23540,Controlling Machines and Processes,Controlling Machines and Processes


In [33]:
# Score m2 on the test set.
# NOTE(review): the saved output below is a 3-tuple, while compute_macro as defined in this
# notebook returns a dict — presumably a stale output; re-run to refresh.
compute_macro(true_la=m2pred_df['GWA Title'], pred_la=m2pred_df['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.3718534449101467, 0.437294272612504, 0.3901104602488269)

### m3

In [34]:
# m3: swap the notebook-level tokenizer/model to the multilingual checkpoint
model_checkpoint = m3_multi
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    BertForSequenceClassification.from_pretrained(model_checkpoint),
)

# Classify the test set, then decode the best-scoring class id into its GWA title
m3pred = apply_classify_on_df(test_df)
m3_gwa_titles = y_encoded.inverse_transform(
    m3pred['la1'].astype(int)
).tolist()
m3pred['gwa_class'] = m3_gwa_titles
GWA_predictions['m3_pred'] = m3_gwa_titles

# Write the m3 (multilingual) predictions to disk
outfile = 'pred_test/multi_test.csv'
m3pred.to_csv(path_or_buf=outfile, header=True)

# Gold title next to the predicted title
m3pred.loc[:, ['GWA Title', 'gwa_class']]

Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Performing General Physical Activities
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,"Inspecting Equipment, Structures, or Materials"
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Monitoring and Controlling Resources
23540,Controlling Machines and Processes,Handling and Moving Objects


In [35]:
# Score m3 on the test set.
# NOTE(review): the saved output below is a 3-tuple, while compute_macro as defined in this
# notebook returns a dict — presumably a stale output; re-run to refresh.
compute_macro(true_la=m3pred['GWA Title'], pred_la=m3pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.34167679573472814, 0.39892455303591784, 0.34292711239468954)

### m4

In [38]:
# m4: swap the notebook-level tokenizer/model to the multilingual job checkpoint
# (this one is loaded as an XLM-RoBERTa classifier, unlike m1-m3)
model_checkpoint = m4_multi_job
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    XLMRobertaForSequenceClassification.from_pretrained(model_checkpoint),
)

# Classify the test set, then decode the best-scoring class id into its GWA title
m4pred = apply_classify_on_df(test_df)
m4_gwa_titles = y_encoded.inverse_transform(
    m4pred['la1'].astype(int)
).tolist()
m4pred['gwa_class'] = m4_gwa_titles
GWA_predictions['m4_pred'] = m4_gwa_titles

# Write the m4 (multilingual_job) predictions to disk
outfile = 'pred_test/multi_job_test.csv'
m4pred.to_csv(path_or_buf=outfile, header=True)

# Gold title next to the predicted title
m4pred.loc[:, ['GWA Title', 'gwa_class']]

Unnamed: 0,GWA Title,gwa_class
18834,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects
...,...,...
23537,Repairing and Maintaining Mechanical Equipment,Repairing and Maintaining Mechanical Equipment
23538,Handling and Moving Objects,Performing General Physical Activities
23539,"Identifying Objects, Actions, and Events",Handling and Moving Objects
23540,Controlling Machines and Processes,Handling and Moving Objects


In [39]:
# Score m4 on the test set.
# NOTE(review): the saved output below is a 3-tuple, while compute_macro as defined in this
# notebook returns a dict — presumably a stale output; re-run to refresh.
compute_macro(true_la=m4pred['GWA Title'], pred_la=m4pred['gwa_class'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.3851650661612, 0.4374095697649561, 0.3799185673510285)

## save to file

In [40]:
# Combined table: the gold title plus one prediction column per model
print(GWA_predictions.shape)
GWA_predictions.head(n=5)

(4708, 5)


Unnamed: 0,GWA Title,m1_pred,m2_pred,m3_pred,m4_pred
18834,Performing General Physical Activities,Assisting and Caring for Others,Controlling Machines and Processes,Performing General Physical Activities,Controlling Machines and Processes
18835,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources,Monitoring and Controlling Resources
18836,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects
18837,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects
18838,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects,Handling and Moving Objects


In [41]:
# Persist the combined table (gold title plus one prediction column per model) as CSV with a header row.
# NOTE(review): the file name is spelled 'all_predictioins.csv' (sic) — check whether anything
# downstream reads this exact path before correcting the spelling.
outfile = 'pred_test/all_predictioins.csv'
GWA_predictions.to_csv(outfile, header=True)


In [66]:
# Score every prediction column against the gold labels held in the first column.
# Warnings are silenced only inside this block (the earlier scoring cells emitted
# `_warn_prf` warnings) instead of being switched off for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    gold_labels = GWA_predictions.iloc[:, 0]
    for pred_col in GWA_predictions.columns[1:]:
        results = compute_macro(gold_labels, GWA_predictions[pred_col])
        print('Results for ', pred_col, results)

Results for  m1_pred {'accuracy': 0.5290994052676296, 'f1_macro': 0.37797274884904, 'recall_macro': 0.4400122881572895, 'precision_macro': 0.3827088228480856, 'hamming_loss': 0.47090059473237045}
Results for  m2_pred {'accuracy': 0.5138062871707731, 'f1_macro': 0.3718534449101467, 'recall_macro': 0.437294272612504, 'precision_macro': 0.3901104602488269, 'hamming_loss': 0.48619371282922685}
Results for  m3_pred {'accuracy': 0.5070093457943925, 'f1_macro': 0.34167679573472814, 'recall_macro': 0.39892455303591784, 'precision_macro': 0.34292711239468954, 'hamming_loss': 0.4929906542056075}
Results for  m4_pred {'accuracy': 0.5339847068819031, 'f1_macro': 0.3851650661612, 'recall_macro': 0.4374095697649561, 'precision_macro': 0.3799185673510285, 'hamming_loss': 0.46601529311809686}
