# LERCause: Causal Sentence Identification with LER (Nuclear Safety Reports)

This code runs pre-trained language models (BERT, BioBERT, SciBERT) on LER data for sentence classification and prediction.  

Authors:   
1. Jinmo Kim: School of Information Sciences, University of Illinois Urbana-Champaign   
2. Jenna Kim: School of Information Sciences, University of Illinois Urbana-Champaign      

Cite this paper:   

Kim, J., Kim, J., Lee, A., Kim, J., Diesner, J. (2024). LERCause: Deep learning approaches for causal sentence identification from nuclear safety reports. Plos One.


# 1. Setup

## 1-1. Install package 

Install the transformers package from Hugging Face, which provides a PyTorch interface for working with BERT-based models.

In [None]:
# Use this command to check if packages are installed

#!pip list

In [None]:
# Install transformers ver: 4.30.0

#!pip install transformers==4.30.0

In [None]:
# Install PyTorch if this notebook is NOT running on AWS Sagemaker
# For AWS Sagemaker, make sure to run this notebook on 'conda_pytorch_p310' kernel. 
# You can change the kernel type (Go to Kernel menu -> Change kernel)

# !pip install torch==1.5.0

## 1-2. Load libraries

In [None]:
import timeit
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import transformers
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import gc

In [None]:
# Install imbalanced-learn library for sampling if needed

#!pip install imbalanced-learn

In [None]:
# Set up plots and parameters

#%matplotlib inline
#%config InlineBackend.figure_format='retina'

#sns.set(style='darkgrid', palette='muted', font_scale=1.5)
#COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
#sns.set_palette(sns.color_palette(COLORS_PALETTE))

#rcParams["figure.figsize"] = (12, 6)

#RANDOM_SEED=42
#np.random.seed(RANDOM_SEED)
#torch.manual_seed(RANDOM_SEED)

## 1-3. Check GPU settings

In [None]:
# Pick the processing device once: use CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one
    gpu_count = torch.cuda.device_count()
    print('There are {:d} GPU(s) available.'.format(gpu_count))
    print('We will use the GPU: ', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# check GPU memory
!nvidia-smi

# clear the occupied cuda memory for efficient use

# import gc
#gc.collect()
#torch.cuda.empty_cache()

# Kill a running process if more GPU memory is needed
# !sudo kill -9 334

# 2. Functions

In [None]:
def load_data(filename, record):
    """
    Read in input file and load data

    filename: csv file containing the columns 'PMID', 'USENID', 'SENT', 'CLASS'
    record: writable text file that receives a copy of the processing output

    return one dataframe with the columns 'pmid', 'usenid', 'sentence', 'label'
    from which rows containing null values have been removed
    """

    def report(*args):
        ''' Print to the console and mirror the same text to the record file '''
        print(*args)
        print(*args, file=record)

    ''' Read in data from input file '''
    df = pd.read_csv(filename, encoding='utf-8')

    ''' Display no of rows and columns '''
    report("No of Rows: {}".format(df.shape[0]))
    report("No of Columns: {}".format(df.shape[1]))

    ''' Select data needed for processing & rename columns '''
    # rename() returns a new frame; renaming in place on a column slice
    # triggers pandas' chained-assignment warning.
    df = df[['PMID', 'USENID', 'SENT', 'CLASS']].rename(
        columns={"PMID": "pmid", "USENID": "usenid", "SENT": "sentence", "CLASS": "label"})

    ''' Remove null values '''
    df = df.dropna()

    report("No of rows (After removing null): {}".format(df.shape[0]))
    report("No of columns: {}".format(df.shape[1]))

    ''' Check the first few instances '''
    report("\n<Data View: First Few Instances>")
    report("\n", df.head())

    ''' Display no of labels and rows '''
    report('\nClass Counts(label, row): Total')
    report(df.label.value_counts())

    return df

In [None]:
def token_distribution(df):
    """
       Report and plot how many tokens each sentence produces

       df: a dataframe with 'pmid', 'usenid' and 'sentence' columns

       Uses the module-level `tokenizer`, which must exist before the call.
    """

    ''' Tokenize each sentence once and keep (pmid, usenid, token count) '''
    counted = [
        (pmid, usenid, len(tokenizer.encode(txt, padding=True, truncation=True, max_length=512)))
        for pmid, usenid, txt in zip(df.pmid, df.usenid, df.sentence)
    ]
    token_lens = [n_tokens for _, _, n_tokens in counted]

    ''' Sentences with more than 150 tokens are listed as extreme cases '''
    long_tokens = [entry for entry in counted if entry[2] > 150]

    print("\n************* Token Distribution: train data *************")
    print("long sentences: ")
    print(long_tokens)

    ''' Plot the distribution '''
    longest = max(token_lens)
    print("Min token:", min(token_lens))
    print("Max token:", longest)

    sns.displot(token_lens)
    plt.xlim([0, longest + 10])
    plt.xlabel("Token Count")

In [None]:
def sample_data(X_train, 
                y_train, 
                record, 
                sampling=0, 
                sample_method='over'):  
    """
       Sampling input train data

       X_train: dataframe of X train data
       y_train: dataframe (or series) of y train data
       record: text file including a processing output
       sampling: indicator of sampling function is on or off
       sample_method: method of sampling ('over' for oversampling or
                      'under' for undersampling)

       return the (possibly resampled) X and y train data

       raise ValueError when sampling is on and sample_method is neither
       'over' nor 'under'
    """

    banner = '\n************** Data Sampling **************'

    ''' Sampling is off: hand the data back untouched '''
    if not sampling:
        print(banner)
        print('\nNo Sampling Performed\n')
        print(banner, file=record)
        print('\nNo Sampling Performed\n', file=record)
        return X_train, y_train

    ''' Reject an unknown method up front instead of failing later '''
    if sample_method not in ('over', 'under'):
        raise ValueError(
            "sample_method must be 'over' or 'under', got {!r}".format(sample_method))

    ''' Select a sampling method '''
    # imbalanced-learn is imported lazily so that it is only required when
    # sampling is actually requested.
    if sample_method == 'over':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=42)
        summary = '\nOversampled Data (class, Rows):\n{}'
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42)
        summary = '\nUndersampled Data (class, Rows):\n{}'

    X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)

    print(banner)
    print(summary.format(y_train_sam.value_counts()))
    print(banner, file=record)
    print(summary.format(y_train_sam.value_counts()), file=record)

    return X_train_sam, y_train_sam

In [None]:
class LabelDataset(Dataset):   
    """
       Dataset that tokenizes one sentence per item

       reviews: indexable collection of sentences
       targets: indexable collection of class labels
       tokenizer: tokenizer exposing encode_plus()
       max_len: length each encoded sentence is padded / truncated to
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        ''' Collapse every run of whitespace into a single space '''
        text = " ".join(str(self.reviews[item]).split())
        label = self.targets[item]

        ''' Encode the sentence to fixed-length tensors '''
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt')

        ''' Flatten each encoded tensor to one dimension '''
        sample = {'text': text}
        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)

        return sample

In [None]:
def create_data_loader(df, 
                       tokenizer, 
                       max_len, 
                       batch_size,
                       shuffle=False,
                       num_workers=1):
    """
       Create a data loader

       df: dataframe with 'sentence' and 'label' columns
       tokenizer: tokenizer
       max_len: maximum input length
       batch_size: size of batch
       shuffle: reshuffle the data at every epoch when True; the default
                (False) keeps the rows in dataframe order, which is what
                evaluation and prediction rely on
       num_workers: number of worker processes used for loading (default 1)

       return data loader
    """

    ''' Wrap the sentences and labels in a dataset '''
    ds = LabelDataset(
        reviews = df.sentence.to_numpy(),
        targets = df.label.to_numpy(),
        tokenizer = tokenizer,
        max_len = max_len)

    return DataLoader(ds,
                      batch_size = batch_size,
                      shuffle = shuffle,
                      num_workers = num_workers)

In [None]:
class LabelClassifier(nn.Module):
    """
       Pre-trained encoder followed by dropout and a linear output layer

       n_classes: number of output classes
       pretrained_model: name or path passed to AutoModel.from_pretrained()
    """

    def __init__(self, n_classes, pretrained_model):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        ''' Feed the encoder's pooled output through dropout and the linear layer '''
        pooled = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids).pooler_output

        return self.linear(self.dropout(pooled))

In [None]:
def train_model(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples,
    outfile):
    """
       Run one training pass over the data loader

       model: a model for training
       data_loader: iterable of batches with the keys "input_ids",
                    "attention_mask", "token_type_ids" and "targets"
       loss_fn: loss function
       optimizer: optimizer
       device: processing device
       scheduler: learning-rate scheduler, stepped once per batch
       n_examples: number of samples (denominator of the accuracy)
       outfile: file containing a summary of processing output

       return accuracy (correct predictions / n_examples), mean batch loss
    """

    model = model.train()

    batch_losses = []
    n_correct = 0
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    for batch in data_loader:
        ''' Move the batch to the processing device '''
        inputs = {key: batch[key].to(device, dtype=torch.long) for key in input_keys}
        labels = batch["targets"].to(device)

        logits = model(**inputs)

        ''' Predicted class = index of the largest output '''
        predicted = torch.max(logits, dim=1)[1]
        loss = loss_fn(logits, labels)

        n_correct = n_correct + torch.sum(predicted == labels)
        batch_losses.append(loss.item())

        ''' Backpropagate, clip gradients to norm 1.0, then update '''
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    summary = "Correct Prediction (Train): {} out of {}".format(n_correct.int(), n_examples)
    print(summary, file=outfile)
    print(summary)

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(
    model,
    data_loader,
    loss_fn,
    device,
    n_examples,
    outfile):
    """
      Evaluate a model performance without updating its weights

      model: trained model
      data_loader: iterable of batches with the keys "input_ids",
                   "attention_mask", "token_type_ids" and "targets"
      loss_fn: loss function
      device: processing device
      n_examples: number of samples (denominator of the accuracy)
      outfile: file containing a summary of processing output

      return accuracy (correct predictions / n_examples), mean batch loss
    """

    model = model.eval()

    batch_losses = []
    n_correct = 0
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    ''' Gradients are not needed for evaluation '''
    with torch.no_grad():
        for batch in data_loader:
            inputs = {key: batch[key].to(device, dtype=torch.long) for key in input_keys}
            labels = batch["targets"].to(device)

            logits = model(**inputs)

            predicted = torch.max(logits, dim=1)[1]
            loss = loss_fn(logits, labels)

            n_correct = n_correct + torch.sum(predicted == labels)
            batch_losses.append(loss.item())

    summary = "Correct Prediction (Eval): {} out of {}".format(n_correct.int(), n_examples)
    print(summary, file=outfile)
    print(summary)

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def plot_train_history(history):
    """
       Plot accuracy of training & validation per epoch

       history: a dictionary containing a summary of training and validation
                scores, with the keys "epoch", "train_acc" and "val_acc"
    """

    ''' Convert accuracies to plain floats '''
    # train_model / eval_model return the accuracy as a torch tensor, which
    # may live on the GPU; matplotlib cannot plot such tensors directly.
    train_acc = [float(score) for score in history["train_acc"]]
    val_acc = [float(score) for score in history["val_acc"]]

    plt.plot(train_acc, 'b-o', label="train accuracy")
    plt.plot(val_acc, 'r-o', label="validation accuracy")

    plt.title("Training History")
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend()
    plt.xticks(history["epoch"])
    plt.yticks(np.arange(0,1.2,step=0.05))
    plt.ylim([0,1.05])

In [None]:
def training_loop(epochs,
                  modelname,
                  model,
                  train_data_loader,
                  val_data_loader,
                  loss_fn,
                  optimizer,
                  device,
                  scheduler,
                  n_train,
                  n_val,
                  model_file,
                  record):
    """
      Controller for a training process

      epochs: number of epoch
      modelname: name of selected model
      model: model for training
      train_data_loader: data loader for training
      val_data_loader: data loader for validation
      loss_fn: loss function
      optimizer: optimizer
      device: processing device
      scheduler: scheduler
      n_train: number of train samples
      n_val: number of validation samples
      model_file: file to save trained model (nothing is saved when empty)
      record: file containing a summary of processing output

      return a dictionary of per-epoch lists with the keys "epoch",
      "train_acc", "train_loss", "val_acc" and "val_loss"
    """

    print("\n********** " + modelname + " **********")
    print("\n********** " + modelname + " **********", file=record) 

    history = defaultdict(list)
    best_accuracy = 0

    ''' Train and evaluation'''
    for epoch in range(epochs):
        print("\nEpoch {} / {}".format(str(epoch + 1), str(epochs)))
        print("-" * 60)
        print("\nEpoch {} / {}".format(str(epoch + 1), str(epochs)), file=record)
        print("-" * 60, file=record)

        train_acc, train_loss = train_model(model,
                                            train_data_loader,
                                            loss_fn,
                                            optimizer,
                                            device,
                                            scheduler,
                                            n_train,
                                            outfile=record)

        print("Train Loss: {}, Accuracy: {}\n".format(train_loss, train_acc))
        print("Train Loss: {}, Accuracy: {}\n".format(train_loss, train_acc), file=record)

        val_acc, val_loss = eval_model(model,
                                       val_data_loader,
                                       loss_fn,
                                       device,
                                       n_val,
                                       outfile=record)

        print("Validation Loss: {}, Accuracy: {}".format(val_loss, val_acc))
        print("Validation Loss: {}, Accuracy: {}".format(val_loss, val_acc), file=record)

        ''' Record the scores of this epoch (epoch index is 0-based) '''
        history["epoch"].append(epoch)
        history["train_acc"].append(train_acc)
        history["train_loss"].append(train_loss)
        history["val_acc"].append(val_acc)
        history["val_loss"].append(val_loss)

        ''' Save the state of the best model (highest validation accuracy so far) '''
        if val_acc > best_accuracy:
            if model_file:
                torch.save(model.state_dict(), model_file)
            best_accuracy = val_acc

    ''' Plot training & validation accuracy '''
    #plot_train_history(history)

    ''' Hand the collected scores back so the caller can inspect or plot them '''
    return history

In [None]:
def get_predictions(model, 
                    data_loader, 
                    device):
    """
      Predict label class

      model: trained model
      data_loader: iterable of batches with the keys "text", "input_ids",
                   "attention_mask", "token_type_ids" and "targets"
      device: processing device

      return the texts (list), predicted classes (cpu tensor), class
      probabilities (numpy array) and true labels (cpu tensor)
    """

    model = model.eval()

    review_texts = []
    pred_batches, prob_batches, target_batches = [], [], []
    input_keys = ("input_ids", "attention_mask", "token_type_ids")

    with torch.no_grad():

        ''' Get output from the model with test data'''
        for batch in data_loader:
            inputs = {key: batch[key].to(device, dtype=torch.long) for key in input_keys}
            labels = batch["targets"].to(device)

            logits = model(**inputs)

            review_texts.extend(batch["text"])
            pred_batches.append(torch.max(logits, dim=1)[1])

            ''' Normalize the raw output to get probability for each class '''
            prob_batches.append(F.softmax(logits, dim=1))
            target_batches.append(labels)

    ''' Join the batches and move output to cpu for calculation '''
    predictions = torch.cat(pred_batches).cpu()
    prediction_probs = torch.cat(prob_batches).cpu().detach().numpy()
    real_values = torch.cat(target_batches).cpu()

    return review_texts, predictions, prediction_probs, real_values

In [None]:
def evaluate_model(y_test, 
                   y_pred, 
                   record, 
                   eval_model=0):
    """
      Report a confusion matrix and a classification report

      y_test: original y test data
      y_pred: predicted y values
      record: text file containing a processing output
      eval_model: indicator if this function is on or off
    """

    ''' Nothing to do when evaluation is switched off '''
    if not eval_model:
        return

    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)

    ''' Write the same text to the console (None) and to the record file '''
    for stream in (None, record):
        print('\nConfusion Matrix:\n', file=stream)
        print(matrix, file=stream)

    for stream in (None, record):
        print('\nClassification Report:\n', file=stream)
        print(report, file=stream)

In [None]:
def predict_proba(df_test, y_text, y_test, y_pred, y_pred_probs, proba_file, proba_on=0):

    """
       Write the prediction and the probability of each class to a csv file

       df_test: original test data (provides the 'pmid' and 'usenid' columns)
       y_text: text data sentence
       y_test: original y test data
       y_pred: predicted y values
       y_pred_probs: probability scores of prediction (column 0 and column 1)
       proba_file: output file of probability scores
       proba_on: decide if the probability output is expected

    """

    ''' No output requested '''
    if not proba_on:
        return

    ''' One output column per entry, written in this order '''
    columns = {
        'pmid': df_test["pmid"],
        'usenid': df_test["usenid"],
        'sentence': y_text,
        'prob_0': y_pred_probs[:, 0],
        'prob_1': y_pred_probs[:, 1],
        'pred': y_pred,
        'act': y_test}

    pd.DataFrame(columns).to_csv(proba_file, encoding='utf-8', header=True, index=False)

In [None]:
def split_data(input_file, 
               result_file):
    """
       Load data from input file and split it into train, validation and test sets

       input_file: file containing input data 
       result_file: name of output file of evaluation (opened in append mode)

       return a tuple of three dataframes: (train, validation, test);
       20% of the data is held out and then halved into validation and test,
       both splits stratified on the label
    """

    ''' Open result file for records '''
    # The with-block guarantees the file is closed on return and on error;
    # a close() placed after the return statement would never run.
    with open(result_file, "a") as f:

        ''' Load data '''
        print("\n************** Loading Data ************\n")
        print("\n************** Loading Data ************\n", file=f)

        df = load_data(input_file, record=f)

        ''' Train and test split '''
        print("\n************** Spliting Data **************\n")
        print("\n************** Spliting Data **************\n", file=f)

        df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.label)
        df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test.label)

        ''' Report the size and class counts of each split '''
        splits = (("Train", df_train), ("Val", df_val), ("Test", df_test))

        for stream in (None, f):
            for name, part in splits:
                print("{} Data: {}".format(name, part.shape), file=stream)

        for stream in (None, f):
            for name, part in splits:
                print('\nClass Counts(label, row): {}'.format(name), file=stream)
                print(part.label.value_counts(), file=stream)

        for stream in (None, f):
            print("\nTest Data", file=stream)
            print(df_test.head(), file=stream)

        return (df_train, df_val, df_test)

In [None]:
def model_train(df_train, 
                df_val,
                datasize_change,
                sample_balance,
                balance_sampling_on,                                   
                balance_sampling_type,
                sample_ratio,
                ratio,
                sample_on, 
                sample_type, 
                tokenizer,
                max_len,
                batch_size,
                modelname,
                n_class,
                device,
                pretrained_model,
                learning_rate,
                epochs,
                model_file,
                result_file):  
    """
       Function for data processing and model fitting

       df_train: dataframe containing train data (the label is the last column)
       df_val: dataframe containing validation data
       datasize_change: data size change on or off
       sample_balance: balance of sample on or off
       balance_sampling_on: sampling on or off when balance is 1
       balance_sampling_type: sample type to choose if balance_sampling_on is 1
       sample_ratio: size change based on ratio on or off
       ratio: proportion of data size
       sample_on: sampling on or off
       sample_type: sample type to choose if sample_on is 1
       tokenizer: tokenizer
       max_len: maximum length of tokens
       batch_size: size of batch 
       modelname: name of selected model
       n_class: number of label class
       device: processing device
       pretrained_model: path to model for training
       learning_rate: learning rate
       epochs: number of epoch
       model_file: file to save model
       result_file: open text file for records
    """

    ''' Result file for records (already open) '''
    f = result_file

    def log(*args):
        ''' Print to the console and mirror the same text to the record file '''
        print(*args)
        print(*args, file=f)

    ''' Data size change '''    
    if datasize_change:

        ''' Sample data with balance (1:1) '''
        if sample_balance:

            log("\n************** Data Balancing: Label Class (1:1) *************\n")

            X_train = df_train.iloc[:, :-1]
            y_train = df_train.iloc[:, -1]

            X_train_samp, y_train_samp = sample_data(X_train, 
                                                     y_train, 
                                                     record=f, 
                                                     sampling=balance_sampling_on,
                                                     sample_method=balance_sampling_type)

            df_train = pd.concat([X_train_samp, y_train_samp], axis=1)

            log('\nClass Counts(label, row): After balancing')
            log(df_train.label.value_counts())

            log("\n<Balanced Train Data>")
            log(df_train.head())

        ''' Sample data based on size ratio '''     
        if sample_ratio:

            X_train = df_train.iloc[:, :-1]
            y_train = df_train.iloc[:, -1]

            ''' A ratio of 1 keeps all rows; otherwise take a stratified subset '''
            if ratio != 1:
                X_train, _, y_train, _ = train_test_split(X_train, 
                                                          y_train, 
                                                          train_size=ratio, 
                                                          random_state=42, 
                                                          stratify=y_train)

            ''' Combine x_train and y_train data '''
            df_train = pd.concat([X_train, y_train], axis=1)

            log("\n************** Data Size Change: Ratio *************\n")
            log("Data Ratio: {}".format(ratio))

            log('\nClass Counts(label, row): After sampling')
            log(df_train.label.value_counts())

            log("\n<Train Data Based on Ratio>")
            log(df_train.head())

        ''' Reset index '''
        df_train = df_train.reset_index(drop=True)

        log("\n************** Processing Data **************")
        log("\nTrain Data: {}".format(df_train.shape))

        log('\nClass Counts(label, row): Train')
        log(df_train.label.value_counts())

        log("\n<Train Data>")
        log(df_train.head())

    ''' Sampling '''
    if sample_on:
        X_train = df_train.iloc[:, :-1]
        y_train = df_train.iloc[:, -1]

        # This step is controlled by sample_on / sample_type; the
        # balance_sampling_* options belong to the balancing step above.
        X_train_samp, y_train_samp = sample_data(X_train, 
                                                 y_train, 
                                                 record=f, 
                                                 sampling=sample_on,
                                                 sample_method=sample_type)

        df_train = pd.concat([X_train_samp, y_train_samp], axis=1)

        log("\nSampled Data: First Few Instances")
        log(df_train.head())

    ''' Transform data '''
    train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size)
    val_data_loader = create_data_loader(df_val, tokenizer, max_len, batch_size)

    ''' Fitting a model '''
    log("\n************** Training Model: '" + modelname + "' **************")

    n_train = len(df_train)
    n_val = len(df_val)

    ''' Create a classifier instance and move it to the processing device '''
    model = LabelClassifier(n_class, pretrained_model)
    model = model.to(device)

    ''' Optimizer, scheduler, loss function '''
    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)

    # The scheduler is stepped once per batch, so the schedule spans
    # (batches per epoch) * epochs steps.
    total_steps = len(train_data_loader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, 
                                                num_training_steps = total_steps)

    loss_fn = nn.CrossEntropyLoss().to(device)

    ''' Loop training with epochs '''
    training_loop(epochs, modelname, model, train_data_loader, val_data_loader,
                loss_fn, optimizer, device, scheduler, n_train, n_val, model_file, record=f)

    ''' training_loop only saves when a model file is given '''
    if model_file:
        log("\n\nTrained model: '" + model_file + "' saved in the local directory")

In [None]:
def model_inference(df_test,
                    pretrained_model,
                    model_file,
                    n_class,
                    device,
                    tokenizer,
                    max_len,
                    batch_size,
                    eval_on,
                    proba_on,
                    proba_file,
                    result_file):
    """
       Function for prediction and evaluation

       df_test: dataframe containing test data 
       pretrained_model: path to the pre-trained model the classifier is built on
       model_file: file containing trained model weights (state dict)
       n_class: number of label class
       device: processing device
       tokenizer: tokenizer
       max_len: maximum length of tokens
       batch_size: size of batch
       eval_on: model evaluation on or off
       proba_on: probability on or off
       proba_file: output file to save probability
       result_file: open text file for records
    """

    ''' Result file for records (already open) '''
    f = result_file

    ''' Load trained model '''
    model = LabelClassifier(n_class, pretrained_model)

    # map_location lets weights saved on a GPU be loaded on whichever
    # device is in use now (e.g. a CPU-only machine).
    model.load_state_dict(torch.load(model_file, map_location=device))
    model = model.to(device)

    print("\nA trained model from '" + model_file + "' loaded")
    print("\nA trained model from '" + model_file + "' loaded", file=f)

    ''' Get Predictions '''
    print("\n************** Getting Predictions **************", file=f)
    print("\n************** Getting Predictions **************")

    ''' Transform test data '''
    print("\nTest Data: First Few Instances")
    print(df_test.head())
    print("\nTest Data: First Few Instances", file=f)
    print(df_test.head(), file=f)

    test_data_loader = create_data_loader(df_test, tokenizer, max_len, batch_size)

    ''' Predict class '''
    y_text, y_pred, y_pred_probs, y_test = get_predictions(model, 
                                                           test_data_loader, 
                                                           device)

    ''' Evaluate model performance (the report is produced only when eval_on is set) '''
    print("\n************** Evaluating Performance **************", file=f)
    print("\n************** Evaluating Performance **************")
    evaluate_model(y_test, y_pred, record=f, eval_model=eval_on)

    ''' Generate output with probability scores (written only when proba_on is set) '''
    predict_proba(df_test, 
                  y_text, 
                  y_test, 
                  y_pred, 
                  y_pred_probs, 
                  proba_file=proba_file, 
                  proba_on=proba_on)

    if proba_on:
        print("\nOutput file:'" + proba_file + "' Created", file=f)
        print("\nOutput file:'" + proba_file + "' Created")
    

In [None]:
def main(df_train,
         df_val,
         df_test,
         mode,
         datasize_change,
         sample_balance,
         balance_sampling_on,
         balance_sampling_type,
         sample_ratio,
         ratio,
         sample_on,
         sample_type,
         tokenizer,
         max_len,
         batch_size,
         modelname,
         n_class,
         device,
         pretrained_model,
         learning_rate,
         epochs,
         model_file,
         eval_on,
         proba_on,
         proba_file,
         result_file):
    """
       Run one training or inference pass and record a summary of it.

       The summary file is opened in append mode and handed to the selected
       step as an open file object. When the step returns, the elapsed time
       is printed to the console and appended to the summary file.

       df_train, df_val: forwarded to model_train (used when mode is "train")
       df_test: forwarded to model_inference (used when mode is "test")
       mode: "train" calls model_train; "test" calls model_inference;
             any other value runs neither step and only the timing and
             summary lines are written
       datasize_change, sample_balance, balance_sampling_on,
       balance_sampling_type, sample_ratio, ratio, sample_on, sample_type,
       modelname, learning_rate, epochs: forwarded unchanged to model_train
       tokenizer, max_len, batch_size, n_class, device, pretrained_model,
       model_file: forwarded unchanged to whichever step runs
       eval_on: model evaluation on or off (forwarded to model_inference)
       proba_on: probability on or off (forwarded to model_inference)
       proba_file: output file to save probability (forwarded to model_inference)
       result_file: path of the summary file; opened here in append mode
    """

    # The context manager guarantees the summary file is flushed and closed
    # even when training or inference raises part-way through.
    with open(result_file, "a") as record:

        # Start the timer for the whole run
        proc_start_time = timeit.default_timer()

        # Select a mode for training or testing
        if mode == "train":

            model_train(df_train,
                        df_val,
                        datasize_change,
                        sample_balance,
                        balance_sampling_on,
                        balance_sampling_type,
                        sample_ratio,
                        ratio,
                        sample_on,
                        sample_type,
                        tokenizer,
                        max_len,
                        batch_size,
                        modelname,
                        n_class,
                        device,
                        pretrained_model,
                        learning_rate,
                        epochs,
                        model_file,
                        result_file=record)

        elif mode == "test":

            model_inference(df_test,
                            pretrained_model,
                            model_file,
                            n_class,
                            device,
                            tokenizer,
                            max_len,
                            batch_size,
                            eval_on,
                            proba_on,
                            proba_file,
                            result_file=record)

        # Report the processing time on the console and in the summary file
        proc_elapsed = timeit.default_timer() - proc_start_time

        print("\n************** Processing Time **************")
        print("\n{}: {} sec\n".format(mode, round(proc_elapsed,2)))
        print("\n************** Processing Time **************", file=record)
        print("\n{}: {} sec\n".format(mode, round(proc_elapsed,2)), file=record)

        print("\nSummary file:'" + result_file + "' Created\n")
        print("\nSummary file:'" + result_file + "' Created\n", file=record)

# 3. Run Code

In [None]:
if __name__ == "__main__":
    # Driver cell: set the run configuration below, then either split the raw
    # data ("data-split") or launch one or more main() runs ("train"/"test").

    ###############################################
    ##########  1. Set Parameter Values  ##########
    ###############################################

    ########  1-1. Input file name  ########
    input_filename = "LER_rawdata.csv"


    ########  1-2. Which mode to run?  ########
    mode_name = "data-split"                                    # 3 options: "data-split", "train", "test"
                                                                # Use "data-split" before "train" or/and "test"

    # Fail fast on a mistyped mode: without this check an unknown value falls
    # into the train/test branch below and either hits an unrelated NameError
    # or makes main() run neither step.
    valid_modes = ("data-split", "train", "test")
    if mode_name not in valid_modes:
        raise ValueError("Unknown mode_name '{}'; expected one of {}".format(mode_name, valid_modes))


    ########  1-3. Data size change?  ########
    ## 1-3-1. Change on/off?
    datachange_on = 0                                           # 0 for no change; 1 for change of data size

    ## 1-3-2. class balance (1:1)?
    balance_on = 0                                              # 0 for no balance; 1 for class balance (1:1)
    balance_sample_on = 0                                       # 0 for no sampling; 1 for sampling
    balance_sample_type = 'under'                               # 'over'(oversampling); 'under'(undersampling)
    balance_str = 'balance' + str(balance_on) + '_'

    ## 1-3-3. data increase?
    ratio_on = 0                                                # 0 for no ratio; 1 for ratio for data size
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  # a list containing ratio numbers


    ########  1-4. Sampling applied?  ########
    sampling_on = 0                                             # 0 for no sampling; 1 for sampling
    sampling_type = 'over'                                      # 'over'(oversampling)/'under'(undersampling)


    ########  1-5. Which BERT-based model to use?  ########
    pretrained_modelname = 'bert-base-cased'
    #pretrained_modelname = 'dmis-lab/biobert-base-cased-v1.1'
    #pretrained_modelname = 'allenai/scibert_scivocab_cased'

    # The tokenizer is only loaded for the modes that pass it to main()
    if mode_name == "train" or mode_name == "test":
        tokenizer = AutoTokenizer.from_pretrained(pretrained_modelname)

    # Drop any "<organisation>/" prefix so the name can be used in file names
    modelname = pretrained_modelname.split("/")[-1]


    ########  1-6. Binary or multi classification?  ########
    num_class = 2                                              # number of label class


    ########  1-7. Check token distribution for MAX_LEN value: uncomment if needed  ########
    #print("\n************** Token Distribution **************")
    #df_token = load_data(input_filename, record=None)
    #token_distribution(df_token)


    ########  1-8. Hyperparameters for BERT?  ########
    MAX_LEN = 150
    BATCH_SIZE = 16
    EPOCHS = 4
    LEARNING_RATE = 2e-5


    ########  1-9. Evaluation & probability file  ########
    eval_on = 1                                               # 0 for no; 1 for yes (evaluation scores)
    proba_on = 1                                              # 0 for no; 1 for yes (probability & prediction output)


    ###############################################
    ##########   2. Run Main Function   ###########
    ###############################################

    if mode_name == "data-split":
        eval_file = "summary_bert_" + mode_name + ".txt"
        df_train, df_val, df_test = split_data(input_filename, eval_file)

    else:
        # NOTE: df_train / df_val / df_test are only assigned by the
        # "data-split" branch above, and `device` is not assigned anywhere in
        # this block, so all four must already exist in the session.

        # Build one (ratio, file tag) pair per run. The tag is the part of the
        # output file names that identifies the run configuration.
        if datachange_on:
            if sampling_on:
                run_specs = [(r, balance_str + str(r) + "_" + sampling_type + "_" + modelname)
                             for r in ratio_list]
            else:
                # balance_str already ends with "_", so these names contain a
                # double underscore; kept so file names match earlier runs.
                run_specs = [(r, balance_str + "_" + str(r) + "_" + modelname)
                             for r in ratio_list]
        elif sampling_on:
            run_specs = [(1, sampling_type + "_" + modelname)]
        else:
            run_specs = [(1, modelname)]

        for ratio, file_tag in run_specs:
            model_file = "model_bert_" + file_tag + ".bin"
            proba_file = "result_bert_" + file_tag + ".csv"
            eval_file = "summary_bert_" + mode_name + "_" + file_tag + ".txt"

            main(df_train,
                 df_val,
                 df_test,
                 mode=mode_name,
                 datasize_change=datachange_on,
                 sample_balance=balance_on,
                 balance_sampling_on=balance_sample_on,
                 balance_sampling_type=balance_sample_type,
                 sample_ratio=ratio_on,
                 ratio=ratio,
                 sample_on=sampling_on,
                 sample_type=sampling_type,
                 tokenizer=tokenizer,
                 max_len=MAX_LEN,
                 batch_size=BATCH_SIZE,
                 modelname=modelname,
                 n_class=num_class,
                 device=device,
                 pretrained_model=pretrained_modelname,
                 learning_rate=LEARNING_RATE,
                 epochs=EPOCHS,
                 model_file=model_file,
                 eval_on=eval_on,
                 proba_on=proba_on,
                 proba_file=proba_file,
                 result_file=eval_file)


    print("\n\n************** Processing Complete **************\n")
    