In [4]:
import pandas as pd
import numpy as np
import math
import random
import nltk
import re
from nltk.corpus import stopwords
import string

import matplotlib.pyplot as plt
from operator import itemgetter

## Token Classification

### Load dataset

In [11]:
# Choose the pre-processed corpus to load. Exactly one `df_name` assignment is
# active; the commented-out lines are the other datasets used in the study.
#df_name = 'umn_df_for_CRFner_40_0725.parquet'  # UMN-40 in study
df_name = 'medal_df_max500v2_for_ner_1005_0726.parquet' # MeDAL-RTE in study
#df_name = 'medal_df_for_ner_1005_0727.parquet' # MeDAL in study
#df_name = 'umn_df_for_ner_203_0727.parquet' # UMN in study
#df_name = 'medal_df_max500v2_for_CRFner_40_0803.parquet' # MeDAL-RTE-40 in study

# The path is relative, so the file is resolved against the kernel's working directory.
ner_df = pd.read_parquet(df_name)

# Last expression of the cell: renders the frame as the cell output.
ner_df

Unnamed: 0,TEXT_clean_nostp,updated_noStp_LOCATION,LABEL_final,ABV_final,NER_labels,NER_labels_words
0,"[reduced, coenzyme, qcytochrome, c, reductase,...","[12, 87, 98, 108]","[vesicles, vesicles, alone, energy]","[LDV, LDV, CT, SE]","[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
1,"[EP, techniques, employed, examine, nature, re...","[0, 28]","[electrophysiological, minutes]","[EP, T2]","[4, 1005, 1005, 1005, 1005, 1005, 1005, 1005, ...","[electrophysiological, NA_word, NA_word, NA_wo..."
2,"[excretion, enzyme, gammaglutamyltranspeptidas...",[39],[necrosis],[CN],"[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
3,"[peptidases, activities, compared, human, leuc...","[14, 19]","[active, alkaline]","[AS, ALP]","[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
4,"[activity, three, known, conducting, systems, ...",[43],[pulse],[PP],"[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
...,...,...,...,...,...,...
73191,"[tremendous, effort, accessing, liquidphase, s...","[16, 37]","[relaxation, twodimensional]","[EC50, 2DE]","[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
73192,"[glass, transition, binodals, asymmetric, bina...","[9, 38, 68, 74]","[approach, agreement, effective, effective]","[RPA, PA, ERP, ERP]","[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."
73193,"[mechanical, properties, thermally, excited, 2...","[4, 23]","[twodimensional, study]","[2DE, T0]","[1005, 1005, 1005, 1005, 397, 1005, 1005, 1005...","[NA_word, NA_word, NA_word, NA_word, twodimens..."
73194,"[approaches, mapping, time, series, networks, ...",[102],[nodes],[NO],"[1005, 1005, 1005, 1005, 1005, 1005, 1005, 100...","[NA_word, NA_word, NA_word, NA_word, NA_word, ..."


### Split dataset

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


import datasets

# Only these columns are needed for token classification.
ner_columns = ['TEXT_clean_nostp', 'updated_noStp_LOCATION', 'NER_labels']

# First cut: 80% train / 20% holdout. Second cut: the holdout is divided
# 70% test / 30% validation. Both cuts use random_state=1.
train_dataset, holdout_frame = train_test_split(
    ner_df[ner_columns], test_size=0.2, random_state=1
)
test_dataset, val_dataset = train_test_split(
    holdout_frame, test_size=0.3, random_state=1
)

# Rebind each name from a pandas frame to a Dataset backed by an Arrow table.
train_dataset, val_dataset, test_dataset = (
    Dataset(pa.Table.from_pandas(frame))
    for frame in (train_dataset, val_dataset, test_dataset)
)

final_dataset = datasets.DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

final_dataset

### Set BERT model

In [None]:
# Configuration for the token-classification run.
# `task` and `batch_size` are set here but not referenced by any cell visible in
# this notebook.
task = "ner" 
# Checkpoint whose tokenizer is loaded below; exactly one assignment is active,
# the commented-out lines are the alternative checkpoints.
#model_checkpoint = "dmis-lab/biobert-v1.1" #biobert
#model_checkpoint = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12' #bluebert
#model_checkpoint = 'NLP4H/ms_bert'
#model_checkpoint = 'distilbert-base-uncased'
#model_checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
model_checkpoint = 'allenai/scibert_scivocab_uncased'
batch_size = 8
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import transformers
# tokenize_and_align_labels calls `.word_ids()` on the tokenizer output, so stop
# here if the loaded tokenizer is not a PreTrainedTokenizerFast.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

### Tokenizing and aligning data
Since tokenization can break a word into several subword tokens, the word-level labels must be aligned with the tokens so that every subword of a word carries that word's label.


In [None]:
## code from: https://huggingface.co/docs/transformers/tasks/token_classification

def tokenize_and_align_labels(examples):
    """Tokenize one pre-split example and copy its word labels onto the tokens.

    Reads ``examples["TEXT_clean_nostp"]`` (a list of words, passed to the
    tokenizer with ``is_split_into_words=True``) and ``examples['NER_labels']``
    (one label per word). Every token produced from a word receives that
    word's label, including the continuation pieces of a split word. Tokens
    with no source word (word id ``None``, i.e. special tokens) receive -100
    so that they are ignored by the loss function.

    Returns the tokenizer output with two extra entries: ``"word_ids"`` (the
    word index of each token) and ``"labels"`` (the aligned label list).
    """
    encoding = tokenizer(
        examples["TEXT_clean_nostp"], is_split_into_words=True,
    )
    word_labels = examples['NER_labels']
    token_word_ids = encoding.word_ids()

    # First pieces and continuation pieces are treated the same way: each one
    # inherits the label of the word it came from.
    aligned_labels = [
        -100 if word_id is None else word_labels[word_id]
        for word_id in token_word_ids
    ]

    encoding["word_ids"] = token_word_ids
    encoding["labels"] = aligned_labels
    return encoding

In [None]:
# Tokenize and label-align every split. `map` is called without `batched=True`,
# which matches tokenize_and_align_labels: it reads a single word list and a
# single label list per call.
tokenized_datasets = final_dataset.map(tokenize_and_align_labels)

### Save the tokenized dataset to disk for model training (training was run on Google Colab for GPU access)

In [None]:
#tokenized_datasets.save_to_disk('umn_40_tokenized_dataset_SciBERT_addedtokens_0823_t1v1.json')

## Text classification

In [None]:
# Choose the corpus for the text-classification variant. Exactly one `df_name`
# assignment is active; the commented-out lines are the other study datasets.
#df_name = 'umn_df_for_CRFner_40_0725.parquet'  # UMN-40 in study
df_name = 'medal_df_max500v2_for_ner_1005_0726.parquet' # MeDAL-RTE in study
#df_name = 'medal_df_for_ner_1005_0727.parquet' # MeDAL in study
#df_name = 'umn_df_for_ner_203_0727.parquet' # UMN in study
#df_name = 'medal_df_max500v2_for_CRFner_40_0803.parquet' # MeDAL-RTE-40 in study
# Loaded under its own name so the frame read for token classification is not overwritten.
text_class_df = pd.read_parquet(df_name)

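### Split the abstracts before building the text-classification samples, using the same split sizes and random seed as the token-classification section, so the two approaches can be compared on the same partitions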

In [None]:

# Split at abstract level: 80% train, then the 20% holdout is divided
# 70% test / 30% validation (random_state=1 for both cuts).
train_dataset, holdout_frame = train_test_split(
    text_class_df, test_size=0.2, random_state=1
)
test_dataset, val_dataset = train_test_split(
    holdout_frame, test_size=0.3, random_state=1
)

In [None]:
# Flatten the per-abstract label lists and keep each distinct label once.
# preprocess_function uses a label's position in this array as its class id.
unique_labs = text_class_df['LABEL_final'].explode().unique()
#np.save('unique_labs_umn_40_txtclass_0824.npy', unique_labs)

In [None]:
def split_for_text_classification(df, window=8):
    """Turn each abstract into one context-window sample per abbreviation.

    For every position listed in a row's ``updated_noStp_LOCATION`` a sample is
    emitted that holds the tokens from ``window`` positions before the location
    to ``window`` positions after it (the slice is clipped at the start and end
    of the token sequence), together with the entries of ``ABV_final`` and
    ``LABEL_final`` found at the same list position as the location.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns ``TEXT_clean_nostp``,
        ``updated_noStp_LOCATION``, ``ABV_final`` and ``LABEL_final``.
    window : int, default 8
        Number of tokens kept on each side of the abbreviation. The default
        reproduces the original hard-coded 17-token window.

    Returns
    -------
    pandas.DataFrame
        Columns ``ABSTRACT_ID``, ``TEXT``, ``ABV``, ``LABEL`` with a fresh
        0..n-1 index. ``ABSTRACT_ID`` is the row's position within ``df``
        (its enumeration order), not its index label. An input without rows
        yields an empty frame with the same four columns.

    Raises
    ------
    ValueError
        If ``window`` is negative.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    output_columns = ['ABSTRACT_ID', 'TEXT', 'ABV', 'LABEL']

    # Collect plain records and build the frame once at the end.
    # DataFrame.append copied the whole frame on every call (quadratic in the
    # number of samples) and no longer exists in pandas >= 2.0.
    records = []
    for abstract_id, (_, row) in enumerate(df.iterrows()):
        tokens = row['TEXT_clean_nostp']
        for idx, loc_idx in enumerate(row['updated_noStp_LOCATION']):
            # max(0, ...) prevents a negative start from wrapping around to
            # the end of the sequence when the abbreviation is near the start.
            start = max(0, loc_idx - window)
            stop = loc_idx + window + 1
            records.append({
                'ABSTRACT_ID': abstract_id,
                'TEXT': tokens[start:stop],
                'ABV': row['ABV_final'][idx],
                'LABEL': row['LABEL_final'][idx],
            })

    # Passing `columns` keeps the schema stable even when no record was produced.
    return pd.DataFrame(records, columns=output_columns)


In [None]:
# Expand each abstract-level split into one row per abbreviation occurrence.
# NOTE: each name is rebound to the expanded frame (columns ABSTRACT_ID, TEXT,
# ABV, LABEL), which no longer has the source columns that
# split_for_text_classification reads. Re-running this cell without first
# re-running the split cell will therefore fail.
train_dataset = split_for_text_classification(train_dataset)
print('done')
test_dataset = split_for_text_classification(test_dataset)
print('done')
val_dataset = split_for_text_classification(val_dataset)

In [None]:
import datasets

# Rebind each name from a pandas frame to a Dataset backed by an Arrow table.
train_dataset, val_dataset, test_dataset = (
    Dataset(pa.Table.from_pandas(frame))
    for frame in (train_dataset, val_dataset, test_dataset)
)

# Replaces the `final_dataset` built in the token-classification section.
final_dataset = datasets.DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

final_dataset

In [None]:
# Tokenizer configuration for the text-classification run (same active
# checkpoint as the token-classification section). Exactly one
# `model_checkpoint` assignment is active; the commented-out lines are the
# alternative checkpoints.
#model_checkpoint = "dmis-lab/biobert-v1.1" #biobert
#model_checkpoint = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12' #bluebert
#model_checkpoint = 'NLP4H/ms_bert'
#model_checkpoint = 'distilbert-base-uncased'
#model_checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
model_checkpoint = 'allenai/scibert_scivocab_uncased'
# `batch_size` is set here but not referenced by any cell visible in this notebook.
batch_size = 8
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import transformers
# Stop here if the loaded tokenizer is not a PreTrainedTokenizerFast.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

### No need to align since this is just text classification

In [None]:
def preprocess_function(examples):
    """Tokenize one context window and attach its integer class id.

    ``examples["TEXT"]`` is a list of words and is passed to the tokenizer with
    ``is_split_into_words=True``. The class id stored under ``'label'`` is the
    position of ``examples['LABEL']`` in the module-level ``unique_labs``;
    a label that is not present there raises ``ValueError``.
    """
    encoding = tokenizer(examples["TEXT"], is_split_into_words=True)
    known_labels = list(unique_labs)
    encoding['label'] = known_labels.index(examples['LABEL'])
    return encoding

In [None]:
# Tokenize every split. `map` is called without `batched=True`, which matches
# preprocess_function: it looks up a single LABEL value per call.
# This rebinds `tokenized_datasets` from the token-classification section.
tokenized_datasets = final_dataset.map(preprocess_function)