# LERCause: Causal Sentence Identification with LER (Nuclear Safety Reports)

This code runs traditional machine learning models on LER data for sentence classification and prediction. 

Author:   
1. Jinmo Kim: School of Information Sciences, University of Illinois Urbana-Champaign   
2. Jenna Kim: School of Information Sciences, University of Illinois Urbana-Champaign      

Cite this paper:   

Kim, J., Kim, J., Lee, A., Kim, J., Diesner, J. (2024). LERCause: Deep learning approaches for causal sentence identification from nuclear safety reports. Plos One.


# 1. Set up

In [None]:
import timeit
import pandas as pd
import pickle

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
## Install imbalanced-learn library for sampling if not already installed
 
#!pip install imbalanced-learn==0.11.0

#!pip install scikit-learn==1.0.2

# 2. Functions

In [None]:
def load_data(filename, record):
    """
    Read the input csv file and separate features from the label

    filename: path of the csv file; it must contain the columns
              PMID, USENID, SENT and CLASS
    record: writable text stream that receives a copy of everything
            printed on screen

    return X (dataframe with columns pmid, usenid, sentence)
           and y (series named label)
    """

    # Read the raw table
    raw = pd.read_csv(filename, encoding='utf-8')

    # Report the size of the raw table, first on screen (file=None) then in the record
    n_rows, n_cols = raw.shape
    for stream in (None, record):
        print("No of Rows: {}".format(n_rows), file=stream)
        print("No of Columns: {}".format(n_cols), file=stream)

    # Keep only the columns needed downstream, under their new names
    column_map = {"PMID": "pmid",
                  "USENID": "usenid",
                  "SENT": "sentence",
                  "CLASS": "label"}
    data = raw[list(column_map)].rename(columns=column_map)

    # Show the first few instances
    preview = data.head()
    for stream in (None, record):
        print("\n<Data View: First Few Instances>", file=stream)
        print("\n", preview, file=stream)

    # Show how many rows each label has
    label_counts = data["label"].value_counts()
    for stream in (None, record):
        print('\nClass Counts(label, row): Total', file=stream)
        print(label_counts, file=stream)

    # The label is the last column; everything before it is X
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1]

    return features, target

In [None]:
def sample_data(X_train, 
                y_train, 
                record, 
                sampling=0, 
                sample_method='over'):  
    """
       Optionally resample the training data with random over- or undersampling

       X_train: dataframe of X train data
       y_train: y train data (must support value_counts() after resampling)
       record: writable text stream that receives a copy of the printed output
       sampling: falsy to return the data untouched, truthy to resample
       sample_method: 'over' (RandomOverSampler) or 'under' (RandomUnderSampler);
                      only consulted when sampling is truthy

       return the (possibly resampled) X and y

       raise ValueError if sampling is requested with an unknown sample_method
    """

    ''' No sampling requested: hand the input back unchanged '''
    if not sampling:
        print('\n************** Data Sampling **************')
        print('\nNo Sampling Performed\n')
        print('\n************** Data Sampling **************', file=record)
        print('\nNo Sampling Performed\n', file=record)

        return X_train, y_train

    ''' Reject unknown methods up front instead of failing later on an unassigned variable '''
    if sample_method not in ('over', 'under'):
        raise ValueError(
            "sample_method must be 'over' or 'under', got {!r}".format(sample_method))

    ''' Select a sampling method '''
    # imbalanced-learn is imported lazily so that the no-sampling path
    # works even when the optional package is not installed.
    if sample_method == 'over':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=42)
        title = 'Oversampled'
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42)
        title = 'Undersampled'

    X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)

    summary = '\n{} Data (class, Rows):\n{}'.format(title, y_train_sam.value_counts())
    print('\n************** Data Sampling **************')
    print(summary)
    print('\n************** Data Sampling **************', file=record)
    print(summary, file=record)

    return X_train_sam, y_train_sam

In [None]:
def preprocess_data(X_data_raw):
    """
       Preprocess text with lowercase conversion, punctuation removal,
       tokenization, stopword removal and stemming

       X_data_raw: X data in dataframe; the text is read from its LAST column

       return a series of cleaned strings (stemmed tokens joined by one space),
       with the same index as X_data_raw
    """

    ''' Make sure that data type is a string '''
    X_data = X_data_raw.iloc[:, -1].astype(str)


    ''' Convert all characters to lowercase '''
    X_data = X_data.str.lower()


    ''' Remove punctuation '''
    # regex=True is spelled out: the pandas default for str.replace changed to
    # a literal match in 2.0, which would leave all punctuation in place.
    # The raw string avoids the invalid "\w" / "\s" escape sequences.
    X_data = X_data.str.replace(r'[^\w\s]', '', regex=True)


    ''' Tokenization '''
    X_data = X_data.apply(nltk.word_tokenize)


    ''' Remove stopwords, then stem the remaining tokens '''
    # A set gives constant-time membership tests for every token.
    stopword_set = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    X_data = X_data.apply(
        lambda tokens: [stemmer.stem(tok) for tok in tokens if tok not in stopword_set])


    ''' Integrate a list of elements into a string '''
    X_data = X_data.apply(' '.join)


    return X_data

In [None]:
def select_classifier(model):
    """
      Build a fresh classifier for the requested short name

      model: one of 'DT' (decision tree), 'SVM' (linear support vector machine),
             'NB' (multinomial naive bayes), 'LR' (logistic regression),
             'RF' (random forest), 'GB' (gradient boosting)

      return an unfitted classifier instance

      raise ValueError for any other name
    """

    # (name, factory) pairs; a factory is only called for the matching name
    factories = (
        ('DT', lambda: DecisionTreeClassifier(max_depth=2)),
        ('SVM', lambda: SVC(kernel='linear', probability=True)),
        ('NB', lambda: MultinomialNB()),
        ('LR', lambda: LogisticRegression()),
        ('RF', lambda: RandomForestClassifier(max_depth=2, random_state=0)),
        ('GB', lambda: GradientBoostingClassifier()),
    )

    for name, build in factories:
        if model == name:
            return build()

    raise ValueError("Must be a valid classifier name!")

In [None]:
def evaluate_model(y_test, 
                   y_pred, 
                   record, 
                   eval_model=0):
    """
      Print a confusion matrix and a classification report

      y_test: original y test data
      y_pred: predicted y values
      record: writable text stream that receives a copy of the printed output
      eval_model: falsy to skip the evaluation entirely, truthy to run it

      Nothing is returned; the results are only printed.
    """

    # Evaluation switched off: nothing to do
    if not eval_model:
        return

    # Confusion matrix, shown on screen (file=None) and then in the record
    matrix = confusion_matrix(y_test, y_pred)
    for stream in (None, record):
        print('\nConfusion Matrix:\n', file=stream)
        print(matrix, file=stream)

    # Per-class precision / recall / f1 with four decimals
    report = classification_report(y_test, y_pred, digits=4)
    for stream in (None, record):
        print('\nClassification Report:\n', file=stream)
        print(report, file=stream)

In [None]:
def predict_proba(model, 
                  X_test_proc, 
                  X_test, 
                  y_test, 
                  y_pred, 
                  proba_file, 
                  proba_on=0):
    """
       Write per-class probability scores, predictions and true labels to csv

       model: trained model exposing predict_proba() and classes_
       X_test_proc: X test data preprocessed (fed to the model)
       X_test: original X test data (copied into the output)
       y_test: original y test data (written as column 'act')
       y_pred: predicted y values (written as column 'pred')
       proba_file: output csv file
       proba_on: falsy to skip, truthy to produce the file

       Nothing is returned.
    """

    # Output switched off: nothing to do
    if not proba_on:
        return

    # One probability column per class, named after the class label
    class_scores = pd.DataFrame(data=model.predict_proba(X_test_proc),
                                columns=model.classes_)

    # Indexes are reset so that rows line up by position, not by original index
    inputs = X_test.reset_index(drop=True)
    table = pd.concat([inputs, class_scores], axis=1)

    # Predicted class followed by actual class
    table['pred'] = pd.Series(y_pred)
    table['act'] = y_test.reset_index(drop=True)

    # Save output
    table.to_csv(proba_file, encoding='utf-8', index=False, header=True)

In [None]:
def split_data(input_file, 
               result_file):
    """
       Load the input file and split it into stratified train / val / test parts

       input_file: file containing input data (passed to load_data)
       result_file: name of the text file the processing log is appended to

       The data is split 80/20 and the 20% part is halved into validation
       and test sets (random_state=42, stratified on the label).

       return (X_train, y_train, X_test, y_test)

       NOTE: the validation split is only reported in the log; it is not
       part of the return value.
    """

    ''' Open result file for records '''
    # The context manager closes the file even if loading or splitting fails.
    with open(result_file, "a", encoding="utf-8") as f:

        def log(*args):
            # Print on screen and mirror the same text into the record file
            print(*args)
            print(*args, file=f)


        ''' Load data '''
        log("\n************** Loading Data ************\n")

        X, y = load_data(input_file, record=f)

        log("\n<First Sentence>\n{}".format(X.sentence.iloc[0]))


        ''' Train and test split '''
        log("\n************** Spliting Data **************\n")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(
            X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

        log("Train Data: {}".format(X_train.shape))
        log("Val Data: {}".format(X_val.shape))
        log("Test Data: {}".format(X_test.shape))

        log('\nClass Counts(label, row): Train')
        log(y_train.value_counts())
        log('\nClass Counts(label, row): Val')
        log(y_val.value_counts())
        log('\nClass Counts(label, row): Test')
        log(y_test.value_counts())

        log("\n<X_test Data>")
        log(X_test.head())


    return (X_train, y_train, X_test, y_test)

In [None]:
def model_train(X_train, 
                y_train,
                datasize_change,
                sample_balance,
                balance_sampling_on,                                   
                balance_sampling_type,
                sample_ratio,
                ratio,
                sample_on, 
                sample_type, 
                model_method,
                model_file,
                result_file):   
    """
       Function for data processing and model fitting

       X_train: dataframe containing X train data; the text to learn from
                is taken from its last column (see preprocess_data)
       y_train: y train data
       datasize_change: data size change on or off
       sample_balance: balance of sample on or off (only used when
                       datasize_change is on)
       balance_sampling_on: sampling on or off when sample_balance is on
       balance_sampling_type: sample type to choose if balance_sampling_on is 1
       sample_ratio: ratio-based subsampling on or off (only used when
                     datasize_change is on)
       ratio: proportion of the training data to keep; 1 keeps everything
       sample_on: sampling on or off
       sample_type: sample type to choose if sample_on is 1
       model_method: name of classifier to be applied for model fitting
       model_file: path the fitted pipeline is pickled to
       result_file: already opened, writable text stream that receives a copy
                    of everything printed (it is used as print(..., file=...))

       Nothing is returned; the fitted TF-IDF + classifier pipeline is
       saved to model_file.
    """

    ''' Result stream for records '''
    f = result_file

    def log(*args):
        # Print on screen and mirror the same text into the record stream
        print(*args)
        print(*args, file=f)


    ''' Data size change '''
    if datasize_change:

        ''' Sample data with balance (1:1) '''
        if sample_balance:

            log("\n************** Data Balancing: Label Class (1:1) *************\n")

            X_train, y_train = sample_data(X_train, 
                                           y_train, 
                                           record=f, 
                                           sampling=balance_sampling_on, 
                                           sample_method=balance_sampling_type)

            log('\nClass Counts(label, row): After balancing')
            log(y_train.value_counts())

            log("\n<Balanced Train Data>")
            log(X_train.head())


        ''' Sample data based on size ratio '''    
        if sample_ratio:

            # A ratio of exactly 1 means "keep everything", so no split is needed
            if ratio != 1:
                X_train, _, y_train, _ = train_test_split(X_train, 
                                                          y_train, 
                                                          train_size=ratio, 
                                                          random_state=42, 
                                                          stratify=y_train)

            log("\n************** Data Size Change: Ratio *************\n")
            log("Data Ratio: {}".format(ratio))

            log('\nClass Counts(label, row): After sampling')
            log(y_train.value_counts())

            log("\n<Train Data Based on Ratio>")
            log(X_train.head())


        ''' Reset index '''
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)

        log("\n************** Processing Data **************")
        log("\nTrain Data: {}".format(X_train.shape))

        log('\nClass Counts(label, row): Train')
        log(y_train.value_counts())

        log("\n<X_train Data>")
        log(X_train.head())


    ''' Sampling '''
    if sample_on:

        X_train, y_train = sample_data(X_train, 
                                       y_train, 
                                       record=f, 
                                       sampling=sample_on, 
                                       sample_method=sample_type)

        log("\nSampled Data: First Few Instances")
        log(X_train.head(3))


    ''' Preprocessing ''' 
    X_train_proc = preprocess_data(X_train)

    log("\n<After preprocessing training data>\n")
    log(X_train_proc)


    ''' Fitting a model '''
    log("\n************** Training Model: " + model_method + " **************")


    ''' Instantiate a transformer and classifier '''
    transformer = TfidfVectorizer(use_idf=True)
    classifier = select_classifier(model_method)


    ''' Define pipeline to combine transformer and classifier '''
    pipe = Pipeline([("transformer", transformer), ("classifier", classifier)])


    ''' Fit the pipeline on entire training data '''
    estimator = pipe.fit(X_train_proc, y_train)

    log("\nA model by '" + model_method + "' created\n")


    ''' Saving fitted model '''
    # The context manager flushes and closes the file before the model is reported as saved.
    with open(model_file, 'wb') as model_out:
        pickle.dump(estimator, model_out)

    log("Trained model saved in the local directory\n")

In [None]:
def model_inference(X_test,
                    y_test,  
                    model_method,
                    model_file,
                    eval_on, 
                    proba_file,
                    proba_on,
                    result_file):   
    """
       Function for prediction and evaluation

       X_test: dataframe containing X test data; the text to classify is
               taken from its last column (see preprocess_data)
       y_test: y test data
       model_method: name of the classifier (accepted for symmetry with
                     model_train; not used inside this function)
       model_file: path of the pickled, trained model
       eval_on: model evaluation on or off
       proba_file: name of output file of probability
       proba_on: probability on or off
       result_file: already opened, writable text stream that receives a copy
                    of everything printed (it is used as print(..., file=...))

       Nothing is returned.
    """

    ''' Result stream for records '''
    f = result_file

    def log(*args):
        # Print on screen and mirror the same text into the record stream
        print(*args)
        print(*args, file=f)


    ''' Load trained model ''' 
    # NOTE(review): unpickling executes code stored in the file, so only load
    # model files produced by a trusted run of model_train.
    with open(model_file, 'rb') as model_in:
        model = pickle.load(model_in)

    log("\nA trained model from '" + model_file + "' loaded")


    ''' Predict output '''
    log("\n************** Getting Predictions **************")

    X_test_proc = preprocess_data(X_test)

    log("\n<After preprocessing test data>")
    log(X_test_proc)

    y_pred = model.predict(X_test_proc)


    ''' Evaluate model performance '''
    # The banner is printed even when eval_on is off; evaluate_model then prints nothing.
    log("\n************** Evaluating Performance **************")

    evaluate_model(y_test, 
                   y_pred, 
                   record=f, 
                   eval_model=eval_on)


    ''' Generate output with probability scores '''
    predict_proba(model, 
                  X_test_proc, 
                  X_test, 
                  y_test, 
                  y_pred, 
                  proba_file=proba_file, 
                  proba_on=proba_on)

    if proba_on:
        log("\nOutput file:'" + proba_file + "' Created")
    

In [None]:
def main(X_train, 
         y_train, 
         X_test, 
         y_test,
         mode,
         datasize_change,
         sample_balance,
         balance_sampling_on,                                   
         balance_sampling_type,
         sample_ratio,
         ratio,
         sample_on, 
         sample_type, 
         model_method,
         model_file,
         eval_on, 
         proba_file,
         proba_on,
         result_file):
    """
       Run one training or testing pass and record its processing time

       X_train, y_train: training data (used when mode is "train")
       X_test, y_test: test data (used when mode is "test")
       mode: "train" calls model_train, "test" calls model_inference;
             any other value runs neither and only the timing is reported
       datasize_change, sample_balance, balance_sampling_on,
       balance_sampling_type, sample_ratio, ratio, sample_on, sample_type:
             forwarded unchanged to model_train
       model_method: name of classifier
       model_file: path of the pickled model (written by train, read by test)
       eval_on, proba_file, proba_on: forwarded unchanged to model_inference
       result_file: name of the summary text file; it is opened in append
                    mode and receives a copy of everything printed
    """

    ''' Open result file for records '''
    # The context manager closes the summary file even when training or
    # inference raises, so the partial log is flushed to disk.
    with open(result_file, "a", encoding="utf-8") as record:

        ''' Check the processing time '''
        proc_start_time = timeit.default_timer()


        ''' Select a mode for training or testing'''
        if mode == "train":

            model_train(X_train, 
                        y_train, 
                        datasize_change, 
                        sample_balance, 
                        balance_sampling_on, 
                        balance_sampling_type, 
                        sample_ratio, 
                        ratio, 
                        sample_on, 
                        sample_type, 
                        model_method, 
                        model_file, 
                        result_file=record)

        elif mode == "test":

            model_inference(X_test, 
                            y_test, 
                            model_method, 
                            model_file, 
                            eval_on, 
                            proba_file, 
                            proba_on, 
                            result_file=record)


        ''' Check the processing time '''
        proc_elapsed = timeit.default_timer() - proc_start_time

        print("\n************** Processing Time **************")
        print("\n{}: {} sec\n".format(mode, round(proc_elapsed, 2)))
        print("\n************** Processing Time **************", file=record)
        print("\n{}: {} sec\n".format(mode, round(proc_elapsed, 2)), file=record)

        print("\nSummary file:'" + result_file + "' Created")
        print("\nSummary file:'" + result_file + "' Created", file=record)

# 3. Run Code 

In [None]:
def build_output_names(mode_name, model_type, sampling_type=None, size_tag=None):
    """
       Build the model, probability and summary file names for one run

       mode_name: "train" or "test"; only appears in the summary file name
       model_type: short classifier name, e.g. 'LR'
       sampling_type: 'over' / 'under' when sampling is applied, else None
       size_tag: tag describing the data-size setting (balance flag and ratio,
                 e.g. 'balance0_0.1') when data size change is on, else None

       return (model_file, proba_file, eval_file)

       All parts are joined with a single underscore, so train and test runs
       with the same settings share the same model file name.
    """
    parts = [part for part in (size_tag, sampling_type, model_type) if part]
    tag = "_".join(parts)

    model_file = "model_ml_" + tag + ".sav"
    proba_file = "result_ml_" + tag + ".csv"
    eval_file = "summary_ml_" + mode_name + "_" + tag + ".txt"

    return model_file, proba_file, eval_file


if __name__ == "__main__":
    
    ###############################################
    ##########  1. Set Parameter Values  ##########
    ###############################################

    ########  1-1. Input file name  ########
    input_filename = "LER_rawdata.csv" 
    
    
    ########  1-2. Which mode to run?  ########
    mode_name = "data-split"                                    # 3 options: "data-split", "train", "test"
                                                                # Use "data-split" before "train" or/and "test"

    ########  1-3. Data size change?  ########
    ## 1-3-1. Change on/off?
    datachange_on = 0                                           # 0 for no change; 1 for change of data size
    
    ## 1-3-2. class balance (1:1)?
    balance_on = 0                                              # 0 for no balance; 1 for class balance (1:1)
    balance_sample_on = 0                                       # 0 for no sampling; 1 for sampling
    balance_sample_type = 'under'                               # 'over'(oversampling); 'under'(undersampling)
    balance_str = 'balance' + str(balance_on) + '_'
    
    ## 1-3-3. data increase?
    ratio_on = 0                                                # 0 for no ratio; 1 for ratio for data size
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  # a list containing ratio numbers

    
    ########  1-4. Sampling applied?  ########
    sampling_on = 0                                             # 0 for no sampling; 1 for sampling
    sampling_type = 'over'                                      # 'over'(oversampling)/'under'(undersampling)
    
    
    ########  1-5. Which model to use?  ########
    model_type = 'LR'                                           # 'LR'(Logistic Regression);'DT'(Decision Tree);
                                                                # 'SVM'(Support Vector Machine);'NB'(Naive Bayes);
                                                                # 'RF'(Random Forest);'GB'(Gradient Boosting)
    
    ########  1-6. Evaluation & probability file  ######## 
    eval_on = 1                                                 # 0 for no; 1 for yes (evaluation scores)
    proba_on = 1                                                # 0 for no; 1 for yes (probability & prediction output)
    
    
    ###############################################
    ##########   2. Run Main Function   ###########
    ###############################################
    
    if mode_name == "data-split":
        result_file = "summary_ml_" + mode_name + ".txt" 
        X_train, y_train, X_test, y_test = split_data(input_filename, result_file)
        
    else:
        # "train" and "test" reuse the splits created by an earlier
        # "data-split" run in the same session; fail with a clear message
        # instead of a bare NameError when they are missing.
        missing = [name for name in ("X_train", "y_train", "X_test", "y_test")
                   if name not in globals()]
        if missing:
            raise RuntimeError(
                "Run this cell with mode_name = \"data-split\" first; missing: "
                + ", ".join(missing))

        # With data size change on, one run per ratio; otherwise a single run on all data
        ratios = ratio_list if datachange_on else [1]

        for ratio in ratios:
            model_file, proba_file, eval_file = build_output_names(
                mode_name,
                model_type,
                sampling_type=sampling_type if sampling_on else None,
                size_tag=balance_str + str(ratio) if datachange_on else None)

            main(X_train, y_train, X_test, y_test, mode=mode_name,
                 datasize_change=datachange_on, sample_balance=balance_on,
                 balance_sampling_on=balance_sample_on,
                 balance_sampling_type=balance_sample_type,
                 sample_ratio=ratio_on, ratio=ratio, sample_on=sampling_on, 
                 sample_type=sampling_type, model_method=model_type, 
                 model_file=model_file, eval_on=eval_on, proba_file=proba_file,
                 proba_on=proba_on, result_file=eval_file)

                
    print("\n\n************** Processing Completed **************\n")
    