# LERCause: Causal Sentence Identification with LER (Nuclear Safety Reports)  
This code applies a heuristic method (keyword matching) to LER data for sentence classification and prediction.   

Author:   
1. Jinmo Kim: School of Information Sciences, University of Illinois Urbana-Champaign   
2. Jenna Kim: School of Information Sciences, University of Illinois Urbana-Champaign      
    
Cite this paper:  

Kim, J., Kim, J., Lee, A., Kim, J., Diesner, J. (2024). LERCause: Deep learning approaches for causal sentence identification from nuclear safety reports. Plos One. 




# 1. Set up

In [None]:
import timeit
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
## Install imbalanced-learn library for sampling if not already installed
 
#!pip install imbalanced-learn==0.11.0

# 2. Functions

In [None]:
def load_data(filename, record):
    
    """
       Read in input file and load data
    
       filename: csv file (anything accepted by pandas.read_csv); it must
                 contain the columns PMID, USENID, SENT and CLASS
       record: writable text file that receives a copy of the processing output
    
       return one dataframe with the columns pmid, usenid, sentence and label,
       with rows containing null values removed
    """
    
    def report(*args):
        ''' Print the same message to the console and to the record file '''
        print(*args)
        print(*args, file=record)
    
    
    ''' Read in data from input file '''
    df = pd.read_csv(filename, encoding='utf-8')
    
    
    ''' Display no of rows and columns '''
    report("No of Rows: {}".format(df.shape[0]))
    report("No of Columns: {}".format(df.shape[1]))
    
    
    ''' Select data needed for processing & rename columns '''
    # rename() returns a new frame here; renaming a column subset in place
    # can trigger pandas' chained-assignment (SettingWithCopy) warning.
    df = df[['PMID', 'USENID', 'SENT', 'CLASS']].rename(
        columns={"PMID": "pmid", "USENID": "usenid", "SENT": "sentence", "CLASS": "label"})
    
    
    ''' Remove null values ''' 
    df = df.dropna()
    
    report("No of rows (After removing null): {}".format(df.shape[0]))
    report("No of columns: {}".format(df.shape[1]))
    

    ''' Check the first few instances '''  
    report("\n<Data View: First Few Instances>")
    report("\n", df.head())
    
    
    ''' Display no of labels and rows ''' 
    report('\nClass Counts(label, row): Total')
    report(df.label.value_counts())
    
     
    return df

In [None]:
def sample_data(X_train, 
                y_train, 
                record, 
                sampling=0, 
                sample_method='over'):  
    """
       Sampling input train data
       
       X_train: dataframe of X train data
       y_train: dataframe (series) of y train data
       record: writable text file that receives a copy of the processing output
       sampling: indicator of whether sampling is on (truthy) or off (falsy)
       sample_method: method of sampling; 'over' (random oversampling) or
                      'under' (random undersampling). Only used when sampling is on
       
       return two objects (X, y): the resampled data when sampling is on,
       otherwise the inputs unchanged
       
       raise ValueError when sampling is on and sample_method is neither
       'over' nor 'under'
    """
    
    ''' No sampling: hand the inputs back untouched '''
    if not sampling:
        X_train_sam, y_train_sam = X_train, y_train 
        
        print('\n************** Data Sampling **************')
        print('\nNo Sampling Performed\n')
        print('\n************** Data Sampling **************', file=record)
        print('\nNo Sampling Performed\n', file=record)
        
        return X_train_sam, y_train_sam
    
    
    ''' Validate the method before doing any work '''
    # Without this check an unknown method fell through both branches and
    # the return statement failed with an UnboundLocalError.
    if sample_method not in ('over', 'under'):
        raise ValueError(
            "sample_method must be 'over' or 'under', got {!r}".format(sample_method))
    
    
    ''' Select a sampling method '''
    # imbalanced-learn is imported only here so that the no-sampling path
    # does not require the optional package to be installed.
    if sample_method == 'over':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=42)
        title = 'Oversampled'
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42)
        title = 'Undersampled'
    
    X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)
    
    summary = '\n{} Data (class, Rows):\n{}'.format(title, y_train_sam.value_counts())
    
    print('\n************** Data Sampling **************')
    print(summary)
    print('\n************** Data Sampling **************', file=record)
    print(summary, file=record)
    
    return X_train_sam, y_train_sam

In [None]:
def find_exact_match(string, keywords):
    """
       Search exact match of terms in a text
    
       Each keyword is matched literally (regex metacharacters are escaped),
       case-insensitively, and only where it is delimited by word boundaries.
    
       string: text string
       keywords: a list of terms used as keyword

       return a list of matched terms as they appear in the text, grouped
       in keyword order (one entry per occurrence)
    """
  
    items = []
    
    for keyword in keywords:
        # re.escape keeps characters such as '.' or '+' in a keyword from
        # being interpreted as regex syntax.
        term = r'\b' + re.escape(keyword) + r'\b'
        items.extend(re.findall(term, string, flags=re.IGNORECASE))

    return items

In [None]:
def convert_match_to_label(df_data, keywords):
    """
       Identify strings that match keywords in texts 
       and convert to label if an instance includes any matched term
    
       The input dataframe is not modified; all work is done on a copy.
    
       df_data: input dataframe with the columns pmid, usenid, sentence, label
       keywords: a list of terms used as keyword

       return: dataframe with the columns pmid, usenid, sentence, match
               (list of matched terms), pred (1 if any term matched, else 0)
               and act (the original label)
    """
    
    ''' Work on a copy so the caller's dataframe gets no extra columns '''
    df_out = df_data[["pmid", "usenid", "sentence", "label"]].copy()

    
    ''' Remove punctuation from texts '''
    cleaned = df_out["sentence"].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    
    ''' Remove unnecessary spaces '''
    cleaned = cleaned.apply(lambda x: " ".join(x.split()))

    
    ''' Detect keyword terms in each text '''
    df_out["match"] = cleaned.apply(lambda x: find_exact_match(x, keywords))
  

    ''' Label each match '''
    df_out["pred"] = df_out["match"].apply(lambda x: 1 if x else 0)
    
    
    ''' Select data & rename columns '''
    # rename() without inplace avoids pandas' chained-assignment warning
    # that an in-place rename on a column subset can raise.
    df_out = df_out[["pmid", "usenid", "sentence", "match", "pred", "label"]]
    df_out = df_out.rename(columns={"label": "act"})

    
    return df_out

In [None]:
def evaluate_model(y_test, 
                   y_pred, 
                   record, 
                   eval_model=0):
    """
      Report how well the predictions agree with the actual labels
      
      y_test: actual y values of the test data
      y_pred: predicted y values
      record: writable text file that receives a copy of the output
      eval_model: switch; nothing is computed or printed when it is falsy
      
      Prints a confusion matrix followed by a classification report
      (4 digits) to the console and to the record file. Returns nothing.
    """
    
    ''' Evaluation switched off: nothing to do '''
    if not eval_model:
        return
    
    ''' Each section: a heading and a function that builds its body '''
    sections = (
        ('\nConfusion Matrix:\n', 
         lambda: confusion_matrix(y_test, y_pred)),
        ('\nClassification Report:\n', 
         lambda: classification_report(y_test, y_pred, digits=4)),
    )
    
    for heading, build in sections:
        
        ''' Console first, then the record file '''
        print(heading)
        body = build()
        print(body)
        print(heading, file=record)
        print(body, file=record)

In [None]:
def main(input_file, 
         sample_on, 
         sample_type, 
         keywords,   
         eval_on, 
         match_file,
         result_file):
    
    """
       Main function for processing data, keyword matching, and evaluation
       
       Loads the input file, splits it into train/validation/test sets
       (80%/10%/10%, stratified by label), optionally resamples the train
       set, applies keyword matching to the test set, saves the matched
       output as csv, and optionally evaluates the predictions.
       
       input_file: input file
       sample_on: indicator of sampling on or off
       sample_type: type of sampling method
       keywords: a list of terms used for keyword matching
       eval_on: indicator of model evaluation on or off
       match_file: name of csv file to save output
       result_file: name of text file to save evaluation (opened in append mode)
    """
    
    ''' Open result file for records '''
    # The context manager closes the file even if a step below raises.
    # utf-8 matches the encoding used for the input and output csv files.
    with open(result_file, "a", encoding="utf-8") as f:
        
        def log(*args):
            ''' Print the same message to the console and to the result file '''
            print(*args)
            print(*args, file=f)
        
        
        ''' Check the processing time '''
        proc_start_time = timeit.default_timer()
        
        
        ''' Load data '''
        log("\n************** Loading Data **************\n")
        
        df = load_data(input_file, record=f)       
    
        
        ''' Train and test split '''
        log("\n************** Spliting Data **************\n")
        
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.label)
        df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test.label)
        
        log("Train Data: {}".format(df_train.shape))
        log("Val Data: {}".format(df_val.shape))
        log("Test Data: {}".format(df_test.shape))
        
        log('\nClass Counts(label, row): Train')
        log(df_train.label.value_counts())
        log('\nClass Counts(label, row): Val')
        log(df_val.label.value_counts())
        log('\nClass Counts(label, row): Test')
        log(df_test.label.value_counts())
        
        log("\nTest Data: First Few Instances")
        log(df_test.head())
        
        
        ''' Reset index '''
        df_train = df_train.reset_index(drop=True)
        df_val = df_val.reset_index(drop=True)
        df_test = df_test.reset_index(drop=True)
        
        
        log("\n************** Processing Data **************")
        
        log("\nTest Data: {}".format(df_test.shape))
        
        log('\nClass Counts(label, row): Test')
        log(df_test.label.value_counts())
        
        log("\nTest Data: First Few Instances")
        log(df_test.head())
        
        
        ''' Sampling '''
        if sample_on:
            X_train = df_train.iloc[:, :-1]
            y_train = df_train.iloc[:, -1]
        
            # record is a required argument of sample_data; omitting it
            # made every run with sampling switched on fail with a TypeError.
            X_train_samp, y_train_samp = sample_data(X_train, 
                                                     y_train, 
                                                     record=f, 
                                                     sampling=sample_on, 
                                                     sample_method=sample_type)
            df_train = pd.concat([X_train_samp, y_train_samp], axis=1)
            
            log("\nSampled Data: First Few Instances")
            log(df_train.head())
            
        
        ''' Heuristic Method: keyword matching '''
        log("\n************** Heuristic Method: Keyword Match **************")
        
        df_matched = convert_match_to_label(df_test, keywords)   
        
        log("\nOutput Data: {}".format(df_matched.shape))
        
        log("\nOutput Data: First Few Instances")
        log(df_matched.head())
         
    
        ''' Save output '''
        df_matched.to_csv(match_file, 
                          encoding='utf-8', 
                          index=False, 
                          header=True)
        
        log("\nOutput file:'" + match_file + "' Created")
        
    
        ''' Evaluate performance '''
        log("\n************** Evaluating performance **************")
    
        y_test = df_matched["act"]
        y_pred = df_matched["pred"]
    
        evaluate_model(y_test, 
                       y_pred, 
                       record=f, 
                       eval_model=eval_on)
        
        log("\nSummary file:'" + result_file + "' Created")
        
        
        ''' Check the processing time '''
        proc_elapsed = timeit.default_timer() - proc_start_time
        
        log("\nTotal Processing Time: {} sec\n".format(round(proc_elapsed,2)))

# 3. Run Code 


In [None]:
%%time

if __name__ == "__main__":

    # ------------------------------------------------------------
    # 1. Parameter values
    # ------------------------------------------------------------

    # 1-1. Input file name
    input_filename = "LER_rawdata.csv"

    # 1-2. Sampling: sampling_on is 0 (no sampling) or 1 (sampling);
    #      sampling_type is read only when sampling_on=1 and is either
    #      'over' (oversampling) or 'under' (undersampling)
    sampling_on = 0
    sampling_type = 'under'

    # 1-3. Evaluation: 0 for no; 1 for yes
    #      (confusion matrix / classification report)
    eval_on = 1

    # 1-4. Terms used for keyword matching
    keyword_list = [
        'result in',
        'caused',
        'due to',
        'be caused by',
        'result from',
        'as a result of',
        'cause of',
        'causes of',
        'in response to',
        'because',
        'because of',
        'attributed to',
        'lead to',
    ]

    # 1-5. Output files: matched sentences (csv) and run summary (text)
    output_file = "result_heuristic.csv"
    eval_file = "summary_heuristic.txt"

    # ------------------------------------------------------------
    # 2. Run main function
    # ------------------------------------------------------------
    main(
        input_file=input_filename,
        sample_on=sampling_on,
        sample_type=sampling_type,
        keywords=keyword_list,
        eval_on=eval_on,
        match_file=output_file,
        result_file=eval_file,
    )

    print("\n\n************** Processing Completed **************\n")