In [1]:
#Loading the test data
import pandas as pd
test = pd.read_csv("data/test.csv", index_col='Index')

In [2]:
test.head()

Unnamed: 0_level_0,LossDescription,ResultingInjuryDesc,PartInjuredDesc,Cause - Hierarchy 1,Body Part - Hierarchy 1
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,,,Thumb,,
17,,,Lower Arm,,
20,,,Abdomen,,
47,EE was getting out of the truck and twisted he...,,,,
48,Worker was making a delivery and her right han...,,,,


In [3]:
#Function to merge columns
def merge_cols(row):
    txt = ""
    for i in row:
        if isinstance(i, str):
            txt = txt+', '+i
    return txt[2:]

In [4]:
#Function to preprocess the data and saves it as a csv file
def preprocess(test):
    test['description'] = test.iloc[:, :4].apply(lambda x : merge_cols(x), axis=1)
    test = test.drop(columns=['LossDescription','ResultingInjuryDesc','PartInjuredDesc'])
    for columns in test.columns:
        test.loc[:, columns] = test.loc[:, columns].apply(lambda x: x.lower() if isinstance(x, str) else x)
    test.iloc[:, 2].to_csv("./data/preprocess_test.csv")
    

In [5]:
#Calling the function
preprocess(test)
test[['description']].head()

Unnamed: 0_level_0,description
Index,Unnamed: 1_level_1
5,Thumb
17,Lower Arm
20,Abdomen
47,EE was getting out of the truck and twisted he...
48,Worker was making a delivery and her right han...


In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
#Loading the data
test = load_dataset("csv", data_files="data/preprocess_test.csv")


#Loading the tokenizer for the bert-base-uncased model
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#Function to Tokenize the data
def tokenize_function(example):
    return tokenizer(example["description"], truncation=True)

#Tokenizing the data
tokenized_test = test.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/435 [00:00<?, ? examples/s]

In [7]:
#Preprocessing the tokenized data
tokenized_test = tokenized_test.remove_columns(["description", "Index"])
tokenized_test.set_format("torch")
tokenized_test["train"].column_names

['input_ids', 'token_type_ids', 'attention_mask']

In [8]:
from torch.utils.data import DataLoader
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#Loading the test set into a dataloader of single batch
dataloader = DataLoader(
    tokenized_test["train"], batch_size=tokenized_test["train"].num_rows, collate_fn=data_collator, shuffle=False
)

In [9]:
from transformers import AutoModelForSequenceClassification
#Loading the saved models for the classification task
cause_model = AutoModelForSequenceClassification.from_pretrained("cause_model").to(device)
bodypart_model = AutoModelForSequenceClassification.from_pretrained("bodypart_model").to(device)

In [10]:
#Prediction function that returns the prediction
def prediction(model, data_loader):
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)  
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  
    return predictions

In [11]:
#Running Inference to get predictions
cause_preds = prediction(cause_model, dataloader)
bodypart_preds = prediction(bodypart_model, dataloader)


In [12]:
print(cause_preds.shape, bodypart_preds.shape)

torch.Size([435]) torch.Size([435])


In [13]:
#Labels
causes = ['burn or scald - heat or cold exposures - contact with',
 'caught in, under or between',
 'cut, puncture, scrape injured by',
 'fall, slip or trip injury',
 'fall, slip, or trip injury',
 'includes freezing',
 'misc',
 'motor vehicle',
 'rubbed or abraded by',
 'strain or injury by',
 'striking against or stepping on',
 'struck or injured by']
bodyparts = ['head',
 'lower extremities',
 'misc',
 'multiple body parts',
 'neck',
 'trunk',
 'upper extremities']

In [14]:
#Loading the test data
test = pd.read_csv("./data/test.csv")

In [15]:
test.head()

Unnamed: 0,LossDescription,ResultingInjuryDesc,PartInjuredDesc,Cause - Hierarchy 1,Body Part - Hierarchy 1,Index
0,,,Thumb,,,5
1,,,Lower Arm,,,17
2,,,Abdomen,,,20
3,EE was getting out of the truck and twisted he...,,,,,47
4,Worker was making a delivery and her right han...,,,,,48


In [16]:
#Replacing the nan values with the predicted values
test.loc[:, 'Cause - Hierarchy 1'] = pd.Series([causes[i] for i in cause_preds])
test.loc[:, 'Body Part - Hierarchy 1'] = pd.Series([bodyparts[i] for i in bodypart_preds])

 'caught in, under or between' 'includes freezing'
 'caught in, under or between' 'strain or injury by' 'misc'
 'rubbed or abraded by' 'includes freezing' 'caught in, under or between'
 'caught in, under or between' 'fall, slip or trip injury'
 'caught in, under or between' 'misc' 'rubbed or abraded by'
 'caught in, under or between' 'caught in, under or between' 'misc'
 'caught in, under or between' 'fall, slip or trip injury'
 'caught in, under or between' 'fall, slip or trip injury'
 'caught in, under or between' 'caught in, under or between'
 'caught in, under or between' 'caught in, under or between'
 'includes freezing' 'caught in, under or between'
 'fall, slip or trip injury' 'caught in, under or between'
 'fall, slip or trip injury' 'fall, slip or trip injury'
 'caught in, under or between' 'struck or injured by'
 'fall, slip or trip injury' 'struck or injured by' 'struck or injured by'
 'motor vehicle' 'struck or injured by' 'struck or injured by'
 'strain or injury by' 'misc

In [17]:
test.head()

Unnamed: 0,LossDescription,ResultingInjuryDesc,PartInjuredDesc,Cause - Hierarchy 1,Body Part - Hierarchy 1,Index
0,,,Thumb,"caught in, under or between",upper extremities,5
1,,,Lower Arm,"caught in, under or between",upper extremities,17
2,,,Abdomen,misc,trunk,20
3,EE was getting out of the truck and twisted he...,,,"caught in, under or between",lower extremities,47
4,Worker was making a delivery and her right han...,,,includes freezing,upper extremities,48


In [18]:
test.to_csv("test.csv")