# LERCause: Causal Sentence Identification with LER (Nuclear Safety Reports)     

This code is to run a CNN (Convolutional Neural Networks) model on LER data for sentence classification and prediction.    
      
Authors:   
1. Jinmo Kim: School of Information Sciences, University of Illinois Urbana-Champaign   
2. Jenna Kim: School of Information Sciences, University of Illinois Urbana-Champaign      

Cite this paper:   

Kim, J., Kim, J., Lee, A., Kim, J., Diesner, J. (2024). LERCause: Deep learning approaches for causal sentence identification from nuclear safety reports. Plos One.


# 1. Set up

## 1-1. Load libraries

In [None]:
import timeit
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras
import gc

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
## Install imbalanced-learn library for sampling if not already installed
 
#!pip install imbalanced-learn==0.11.0

## 1-2. Check GPU settings

Make sure to use the "conda_tensorflow2_p310" kernel to run this code in AWS SageMaker. If it is not selected, you can set it from the menu: Kernel -> Change kernel.

In [None]:
# Report the installed framework versions.
# Versions used when this notebook was written: Tensorflow 2.12.0, Keras 2.12.0

for framework_label, framework_module in (("Tensorflow version: ", tf),
                                          ("Keras version: ", keras)):
    print(framework_label, framework_module.__version__)

In [None]:
# Ask TensorFlow which GPU devices it can see and report them.

gpu_devices = tf.config.list_physical_devices('GPU')

print("Num GPUs Available: ", len(gpu_devices))
print("GPU device: ", gpu_devices)

TensorFlow supports running computations on a variety of types of devices, including CPU and GPU. They are represented with string identifiers.    

For example:  
"/device:CPU:0" : CPU of your machine  
"/physical_device:GPU:0": GPU visible to TensorFlow.  

TensorFlow code, with Keras included, can run on a GPU by default without requiring explicit code configuration. If both CPU and GPU are available, TensorFlow will run the GPU-capable code unless otherwise specified.

In [None]:
# check GPU memory & utilization
!nvidia-smi

# If want to clear occupied memory
#import gc
#gc.collect()

# check GPU memory usage again
#!nvidia-smi

## 2. Functions

In [None]:
def load_data(filename, record):
    
    """
    Read in input file and load data
    
    filename: csv file containing (at least) the columns
              'PMID', 'USENID', 'SENT' and 'CLASS'
    record: writable text file object that receives a copy of the console output
    
    return X (dataframe with the columns pmid, usenid, sentence) and
           y (series holding the label column)
    """
    
    def report(*args):
        # Every message goes to the console first and then to the record file.
        print(*args)
        print(*args, file=record)
    
    
    # Read in data from input file
    df = pd.read_csv(filename, encoding='utf-8')
    
    
    # Display no of rows and columns
    report("No of Rows: {}".format(df.shape[0]))
    report("No of Columns: {}".format(df.shape[1]))
    
    
    # Select data needed for processing & rename columns.
    # rename() returns a new frame here instead of mutating the column
    # selection in place, which is what triggered SettingWithCopyWarning.
    df = df[['PMID', 'USENID', 'SENT', 'CLASS']].rename(
        columns={"PMID": "pmid", "USENID": "usenid", "SENT": "sentence", "CLASS": "label"})
    
    
    # Remove null values & trim string data.
    # The explicit copy makes the frame independent before a column is assigned.
    df = df.dropna().copy()
    df["sentence"] = df["sentence"].apply(lambda x: x.strip())
    
    report("No of rows (After removing null): {}".format(df.shape[0]))
    report("No of columns: {}".format(df.shape[1]))
    
    
    # Check the first few instances
    report("\n<Data View: First Few Instances>")
    report("\n", df.head())
    
    
    # Display no of labels and rows
    report('\nClass Counts(label, row): Total')
    report(df.label.value_counts())
    

    # Split data into X (all columns but the last) and y (the last column, label)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
     
     
    return X, y

In [None]:
def sample_data(X_train, 
                y_train, 
                record, 
                sampling=0, 
                sample_method='over'):  
    """
       Sampling input train data
       
       X_train: dataframe of X train data
       y_train: series of y train data
       record: writable text file object that receives a copy of the console output
       sampling: indicator of sampling function is on or off
       sample_method: method of sampling ('over' for oversampling or
                      'under' for undersampling)
       
       return the (possibly resampled) X and y train data
       
       raise ValueError when sampling is on and sample_method is neither
             'over' nor 'under'
    """
    
    def report(*args):
        # Every message goes to the console first and then to the record file.
        print(*args)
        print(*args, file=record)
    
    
    # Sampling switched off: hand the data back untouched.
    if not sampling:
        report('\n************** Data Sampling **************')
        report('\nNo Sampling Performed\n')
        
        return X_train, y_train
    
    
    # Select a sampling method.
    # imbalanced-learn is imported only when a sampler is really needed, so the
    # function also works without the package while sampling is off.
    if sample_method == 'over':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=42)
        label = 'Oversampled'
        
    elif sample_method == 'under':
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42)
        label = 'Undersampled'
        
    else:
        # Previously this case fell through to the return statement and
        # failed there with an unbound local variable.
        raise ValueError(
            "sample_method must be 'over' or 'under', got {!r}".format(sample_method))
    
    X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)
    
    report('\n************** Data Sampling **************')
    report('\n{} Data (class, Rows):\n{}'.format(label, y_train_sam.value_counts()))
    
    return X_train_sam, y_train_sam

In [None]:
def token_distribution(df):
    """
       Display a distribution of tokens
       
       df: a dataframe with a 'sentence' column of strings
       
       Prints the token counts of sentences longer than 150 tokens, the
       min / max / average token count, and draws a histogram of the counts.
       Nothing is plotted when df holds no sentences.
    """
    
    # Split each sentence on whitespace and keep its token count
    token_lens = [len(text.split()) for text in df.sentence.to_list()]
    
    # Token counts of sentences with extreme length
    long_tokens = [count for count in token_lens if count > 150]

    print("\n************* Token Distribution: train data *************")
    print("long sentences: ")
    print(long_tokens)
    
    
    # min() / max() raise ValueError on an empty list, so stop here when
    # there is nothing to summarize.
    if not token_lens:
        print("No sentences found: token statistics and plot skipped")
        return
    
    
    # Plot the distribution
    print("Min token:", min(token_lens))
    print("Max token:", max(token_lens))
    print("Avg token:", round(np.mean(token_lens)))

    sns.displot(token_lens)
    plt.xlim([0,max(token_lens)+10])
    plt.xlabel("Token Count")

In [None]:
def plot_history(history):
    
    """
       Draw training and validation curves side by side
       
       history: object returned by model fitting; its `history` attribute is a
                dictionary with the keys 'binary_accuracy',
                'val_binary_accuracy', 'loss' and 'val_loss'
       
       The left panel shows accuracy, the right panel shows loss.
    """
    
    plt.style.use('ggplot')
    
    # Pull the four score series out of the history dictionary
    train_acc = history.history['binary_accuracy']
    valid_acc = history.history['val_binary_accuracy']
    train_loss = history.history['loss']
    valid_loss = history.history['val_loss']
    
    # Epochs are numbered from 1 on the x axis
    epoch_axis = range(1, len(train_acc) + 1)
    
    # One entry per panel: (train series, validation series,
    #                       train legend, validation legend, title)
    panels = (
        (train_acc, valid_acc,
         'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss,
         'Training loss', 'Validation loss',
         'Training and validation loss'),
    )
    
    plt.figure(figsize=(12,5))
    
    for position, (train_curve, valid_curve, train_name, valid_name, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epoch_axis, train_curve, 'b', label=train_name)
        plt.plot(epoch_axis, valid_curve, 'r', label=valid_name)
        plt.title(title)
        plt.legend()
    
    plt.show()

In [None]:
def create_cnn_model(maxlen, 
                     vocab_size, 
                     record,
                     embedding_dim=100):
    """
       Instantiate a cnn model
       
       maxlen: maximum input length (length of the padded token sequences)
       vocab_size: size of vocabulary
       record: writable text file object that receives a copy of the model summary
       embedding_dim: size of the embedding vectors (default 100)
       
       return a compiled model
    """
  
    # Define the model
    model = Sequential()

    
    # Add an embedding layer
    model.add(layers.Embedding(input_dim=vocab_size, 
                               output_dim=embedding_dim, 
                               input_length=maxlen))

    
    # Add a first convolutional layer with 512 filters and kernel size 2
    model.add(layers.Conv1D(512, 2, activation='relu'))
  

    # Pooling layer
    model.add(layers.MaxPooling1D())

    
    # Add a second convolutional layer with 512 filters and kernel size 3
    model.add(layers.Conv1D(512, 3, activation='relu'))

    
    # Add a second pooling layer
    model.add(layers.MaxPooling1D())
  
    
    # Flattening
    model.add(layers.Flatten())
    
    
    # Add dropout to prevent overfitting
    model.add(layers.Dropout(0.5))
  

    # Full connection: a single sigmoid unit for binary classification
    model.add(layers.Dense(units=1, activation='sigmoid'))
  

    # Compile the model
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy',
                           tf.keras.metrics.Precision(name='precision'),
                           tf.keras.metrics.Recall(name='recall')])
  
    # Summarize the model.
    # summary() prints by itself and returns None, so it is called directly
    # rather than wrapped in print(), which used to emit a stray "None".
    print("\n\n************* Model Summary *************\n")
    model.summary()
    print("\n")
    print("\n\n************* Model Summary *************\n", file=record)
    model.summary(print_fn=lambda x: record.write(x + '\n'))
    

    return model

In [None]:
def evaluate_model(y_test, 
                   y_pred, 
                   record, 
                   eval_model=0):
    """
      Report a confusion matrix and a classification report
      
      y_test: original y test data
      y_pred: predicted y values
      record: writable text file object that receives a copy of the console output
      eval_model: switch; nothing is computed or printed when it is falsy
    """
    
    # Evaluation switched off: nothing to do
    if not eval_model:
        return
    
    # Each entry pairs a section title with the call that produces its body
    sections = (
        ('\nConfusion Matrix:\n',
         lambda: confusion_matrix(y_test, y_pred)),
        ('\nClassification Report:\n',
         lambda: classification_report(y_test, y_pred, digits=4)),
    )
    
    for title, build in sections:
        # Console first, then the same section into the record file
        print(title)
        body = build()
        print(body)
        print(title, file=record)
        print(body, file=record)

In [None]:
def predict_proba(predictions, 
                  X_test, 
                  y_test, 
                  y_pred, 
                  proba_file, 
                  proba_on=0):
    
    """
       Write the test rows together with their scores to a csv file
       
       predictions: probability scores for class 1, one per test row
       X_test: original X test data
       y_test: original y test data
       y_pred: predicted y values
       proba_file: output file of probability scores
       proba_on: switch; no file is written when it is falsy
       
       The output holds the X_test columns followed by 'prob_1' (score),
       'pred' (predicted class) and 'act' (actual class).
    """
    
    # Output not requested: nothing to do
    if not proba_on:
        return
    
    # Score column
    scores = pd.DataFrame(data=predictions, columns=["prob_1"])
    
    # Test rows re-indexed from 0 so they line up with the scores
    features = X_test.reset_index(drop=True)
    table = pd.concat([features, scores], axis=1, ignore_index=False)
    
    # Predicted class
    table['pred'] = pd.Series(y_pred)
    
    # Actual class, re-indexed from 0 as well
    actual = y_test.reset_index(drop=True)
    table['act'] = actual
    
    # Save output
    table.to_csv(proba_file, encoding='utf-8', index=False, header=True)

In [None]:
def split_data(input_file, result_file):
    
    """
       Load the input file and split it into train / validation / test sets
       
       input_file: file containing input data 
       result_file: name of the output file; the console output is appended to it
       
       return (X_train, y_train, X_val, y_val, X_test, y_test)
       
       The data is split 80 / 10 / 10 with stratification on the label.
    """
    
    # Open result file for records.
    # The context manager closes the file on return; the old close() call sat
    # after the return statement and was never reached.
    with open(result_file, "a") as f:
        
        def say(*args):
            # Every message goes to the console first and then to the result file.
            print(*args)
            print(*args, file=f)
        
        
        # Load data
        say("\n************** Loading Data ************\n")
        
        X, y = load_data(input_file, record=f)
        
        # Positional access: row label 0 no longer exists when the first row
        # of the input was dropped as null.
        first_sentence = X.sentence.iloc[0]
        say("\n<First Sentence>\n{}".format(first_sentence))
        
        
        # Train and test split: 80% train, then the remaining 20% is halved
        # into validation and test
        say("\n************** Spliting Data **************\n")
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
        
        say("Train Data: {}".format(X_train.shape))
        say("Val Data: {}".format(X_val.shape))
        say("Test Data: {}".format(X_test.shape))
        
        say('\nClass Counts(label, row): Train')
        say(y_train.value_counts())
        say('\nClass Counts(label, row): Val')
        say(y_val.value_counts())
        say('\nClass Counts(label, row): Test')
        say(y_test.value_counts())

        say("\n<X_train Data>")
        say(X_train.head())
        
        say("\n<X_val Data>")
        say(X_val.head())
        
        say("\n<X_test Data>")
        say(X_test.head())
        

        return (X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
def model_train(X_train, 
                y_train,
                X_val, 
                y_val,
                datasize_change,
                sample_balance,
                balance_sampling_on,                                   
                balance_sampling_type,
                sample_ratio,
                ratio,
                sample_on, 
                sample_type, 
                tokernizer_file,
                max_len,
                batch_size,
                epochs,
                model_file,
                result_file):   
    """
       Function for data processing and model fitting
       
       X_train: dataframe containing X train data (its last column is used as the sentence text)
       y_train: series containing y train data
       X_val: dataframe containing X validation data (its last column is used as the sentence text)
       y_val: series containing y validation data
       datasize_change: data size change on or off
       sample_balance: balance of sample on or off
       balance_sampling_on: sampling on or off when balance is 1
       balance_sampling_type: sample type to choose if balance_sampling_on is 1
       sample_ratio: data size reduction by ratio on or off
       ratio: proportion of train data to keep (1 keeps everything)
       sample_on: sampling on or off
       sample_type: sample type to choose if sample_on is 1
       tokernizer_file: file to save tokenizer (the misspelled name is kept
                        so that existing keyword callers keep working)
       max_len: maximum length of tokens
       batch_size: size of batch 
       epochs: number of epoch
       model_file: file in which the best model is saved during training
       result_file: open, writable text file object that receives a copy of
                    the console output
    """
    
    # Result file for records (already opened by the caller)
    f = result_file
    
    # The body used to read `tokenizer_file`, a name that does not exist in
    # this function because the parameter is spelled `tokernizer_file`. That
    # raised NameError, or silently picked up a notebook global of that name.
    tokenizer_file = tokernizer_file
    
    def say(*args):
        # Every message goes to the console first and then to the result file.
        print(*args)
        print(*args, file=f)
    
    
    # Data size change
    if datasize_change:
        
        # Sample data with balance (1:1)
        if sample_balance:
            
            say("\n************** Data Balancing: Label Class (1:1) *************\n")
            
            X_train, y_train = sample_data(X_train, 
                                           y_train, 
                                           record=f, 
                                           sampling=balance_sampling_on, 
                                           sample_method=balance_sampling_type)
                      
            say('\nClass Counts(label, row): After balancing')
            say(y_train.value_counts())
            
            say("\n<Balanced Train Data>")
            say(X_train.head())
           
        
        # Sample data based on size ratio
        if sample_ratio:
            # A ratio of 1 keeps the whole train set, so only smaller ratios
            # need a stratified split.
            if ratio != 1:
                X_train, _, y_train, _ = train_test_split(X_train, 
                                                          y_train, 
                                                          train_size=ratio, 
                                                          random_state=42, 
                                                          stratify=y_train)
                
            say("\n************** Data Size Change: Ratio *************\n")
            say("Data Ratio: {}".format(ratio))
     
            say('\nClass Counts(label, row): After sampling')
            say(y_train.value_counts())
            
            say("\n<Train Data Based on Ratio>")
            say(X_train.head())
        
    
        # Reset index
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        X_val = X_val.reset_index(drop=True)
        y_val = y_val.reset_index(drop=True)
        
        say("\n************** Processing Data **************")
        say("\nTrain Data: {}".format(X_train.shape))
        say("Val Data: {}".format(X_val.shape))

        say('\nClass Counts(label, row): Train')
        say(y_train.value_counts())
        say('\nClass Counts(label, row): Val')
        say(y_val.value_counts())
        
        say("\n<X_train Data>\n")
        say(X_train.head())
        say("\n<X_val Data>")
        say(X_val.head())
        
    
    # Sampling
    if sample_on:
        X_train, y_train = sample_data(X_train, 
                                       y_train, 
                                       record=f, 
                                       sampling=sample_on, 
                                       sample_method=sample_type)
        
        say("\nSampled Data: First Few Instances")
        say(X_train.head(3))
        

    # Transform data
    say("\n************** Transforming Text into Vectors **************")
    
    # The sentence text is the last column of the X dataframes
    sentences_train = X_train.iloc[:, -1]
    sentences_val = X_val.iloc[:, -1]

    say("\nsentences_train: ", sentences_train.shape)
    say(sentences_train.head())
    say("\nsentences_val: ", sentences_val.shape)
    say(sentences_val.head())
    
    
    # Tokenization: the vocabulary is built from the train sentences only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences_train)
    
    # +1 because the padding value 0 is not part of word_index
    vocab_size = len(tokenizer.word_index) + 1
    
    say("\nvocab size: ", vocab_size)
    
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_val = tokenizer.texts_to_sequences(sentences_val)
    
    say("\nFirst Instance: Train\n")
    say(sentences_train.iloc[0])
    
    say("\nFirst Instance: Val\n")
    say(sentences_val.iloc[0])
    
    
    # Save the tokenizer.
    # The handle is not called `tf`, so the TensorFlow module alias stays usable.
    with open(tokenizer_file, 'wb') as tokenizer_handle:
        pickle.dump(tokenizer, tokenizer_handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    say("\nTokenizer file '" + tokenizer_file + "' saved in the local directory\n")
    
    
    # Pad texts to a pre-defined max length
    X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
    X_val = pad_sequences(X_val, padding='post', maxlen=max_len)
    
    say("\n<X_train vector: first instance>\n")
    say(X_train[0, :])

    
    # Model Fitting
    say("\n************** Training Model: CNN **************")

    model = create_cnn_model(max_len, 
                             vocab_size, 
                             record=f)
    
    
    # Create a checkpoint to save the best model while training
    model_checkpoint = ModelCheckpoint(model_file, 
                                       verbose=1, 
                                       monitor="val_binary_accuracy",   
                                       save_best_only=True, 
                                       save_weights_only=False,
                                       mode="auto", 
                                       save_freq="epoch")  
    
    
    # Train model with train and validation data
    history = model.fit(X_train, 
                        y_train, 
                        epochs=epochs,
                        verbose=1,
                        validation_data=(X_val, y_val),
                        batch_size=batch_size,
                        callbacks=[model_checkpoint])
    
    say("\nTrained model '" + model_file + "' saved in the local directory\n")
    
    
    # Display a plot of loss & accuracy
    print("\n")
    plot_history(history)

In [None]:
def model_inference(X_test,
                    y_test,
                    tokenizer_file,
                    model_file,
                    max_len,
                    eval_on, 
                    proba_file,
                    proba_on,
                    result_file):   
    """
       Function for prediction and evaluation
       
       X_test: dataframe containing X test data (its last column is used as the sentence text)
       y_test: series containing y test data
       tokenizer_file: file containing saved tokenizer
       model_file: file containing trained model
       max_len: maximum length of tokens
       eval_on: model evaluation on or off
       proba_file: name of output file of probability
       proba_on: probability on or off
       result_file: open, writable text file object that receives a copy of
                    the console output
       
    """
      
    # Result file for records (already opened by the caller)
    f = result_file
    
    def say(*args):
        # Every message goes to the console first and then to the result file.
        print(*args)
        print(*args, file=f)
    
    
    # Load tokenizer and transform test data
    say("\nTest Data: First Few Instances")
    say(X_test.head())
        
    # NOTE: pickle.load can execute arbitrary code; only load tokenizer files
    # that were produced by a trusted training run.
    # The handle is not called `tf`, so the TensorFlow module alias stays usable.
    with open(tokenizer_file, 'rb') as tokenizer_handle:
        tokenizer = pickle.load(tokenizer_handle)
    
    say("\nA tokenizer from '" + tokenizer_file + "' loaded")
    
    # The sentence text is the last column of the X dataframe
    sentences_test = X_test.iloc[:, -1]
    X_test_trans = tokenizer.texts_to_sequences(sentences_test)
    
    say("\nFirst Instance: Test\n")
    say(sentences_test.iloc[0])
    
    
    # Pad texts to a pre-defined max length
    X_test_trans = pad_sequences(X_test_trans, 
                                 padding='post', 
                                 maxlen=max_len)
    
    say("\n<X_test vector: first instance>\n")
    say(X_test_trans[0, :])
    
    
    # Load trained model with its weights and optimizer
    cnn_model = load_model(model_file)
    
    say("\nA trained model from '" + model_file + "' loaded")
    
    
    # Show the architecture of loaded model.
    # summary() prints by itself and returns None, so it is called directly
    # rather than wrapped in print(), which used to emit a stray "None".
    print("\n\n************* Summary of Loaded Model *************\n")
    cnn_model.summary()
    print("\n\n************* Summary of Loaded Model *************\n", file=f)
    cnn_model.summary(print_fn=lambda x: f.write(x + '\n'))

    
    # Prediction: scores above 0.5 are assigned to class 1
    say("\n************** Getting Predictions **************")

    predictions = cnn_model.predict(X_test_trans)
    y_pred = (predictions > 0.5).astype("int32").flatten()  

    
    # Evaluating model performance
    say("\n************** Evaluating Performance **************")
    
    evaluate_model(y_test, 
                   y_pred, 
                   record=f, 
                   eval_model=eval_on)

    
    # Generating output file with probability score
    predict_proba(predictions, 
                  X_test, 
                  y_test, 
                  y_pred, 
                  proba_file=proba_file, 
                  proba_on=proba_on)
    
    if proba_on:
        say("\nOutput file:'" + proba_file + "' Created")

In [None]:
def main(X_train, 
         y_train, 
         X_val, 
         y_val,
         X_test, 
         y_test,
         mode,
         datasize_change,
         sample_balance,
         balance_sampling_on,                                   
         balance_sampling_type,
         sample_ratio,
         ratio,
         sample_on, 
         sample_type,
         tokenizer_file,
         max_len,
         batch_size,
         epochs,
         model_file,
         eval_on, 
         proba_file,
         proba_on,
         result_file):
    """Run one training or inference pass and log it to ``result_file``.

    The summary file is opened in append mode and its handle is handed to
    ``model_train`` (``mode == "train"``) or ``model_inference``
    (``mode == "test"``) as their ``result_file`` argument.  Any other
    ``mode`` runs neither step; only the timing footer is written.

    Args:
        X_train, y_train, X_val, y_val: forwarded to ``model_train``.
        X_test, y_test: forwarded to ``model_inference``.
        mode: ``"train"`` or ``"test"``; also used as the label of the
            processing-time line.
        datasize_change, sample_balance, balance_sampling_on,
        balance_sampling_type, sample_ratio, ratio, sample_on,
        sample_type, batch_size, epochs: forwarded unchanged to
            ``model_train``.
        tokenizer_file, max_len, model_file: forwarded to whichever step
            runs.
        eval_on, proba_file, proba_on: forwarded to ``model_inference``.
        result_file: path of the summary text file to append to.

    Returns:
        None.
    """
    # The context manager closes (and flushes) the summary file even when
    # training or inference raises, so partial records are not lost and the
    # handle is never leaked.
    with open(result_file, "a") as record:

        # Start the wall-clock timer for the whole run.
        proc_start_time = timeit.default_timer()

        # Dispatch on the requested mode.
        if mode == "train":

            model_train(X_train, 
                        y_train, 
                        X_val, 
                        y_val, 
                        datasize_change, 
                        sample_balance, 
                        balance_sampling_on, 
                        balance_sampling_type, 
                        sample_ratio, 
                        ratio, 
                        sample_on, 
                        sample_type, 
                        tokenizer_file, 
                        max_len, 
                        batch_size, 
                        epochs, 
                        model_file, 
                        result_file=record)

        elif mode == "test":

            model_inference(X_test, 
                            y_test, 
                            tokenizer_file, 
                            model_file, 
                            max_len, 
                            eval_on, 
                            proba_file, 
                            proba_on, 
                            result_file=record)

        # Report elapsed time on the console and in the summary file.
        proc_elapsed = timeit.default_timer() - proc_start_time
        timing_line = "\n{}: {} sec\n".format(mode, round(proc_elapsed, 2))

        print("\n************** Processing Time **************")
        print(timing_line)
        print("\n************** Processing Time **************", file=record)
        print(timing_line, file=record)

        print("\nSummary file:'" + result_file + "' Created")
        print("\nSummary file:'" + result_file + "' Created", file=record)

# 3. Run Code

In [None]:
if __name__ == "__main__":

    # =============================================
    # 1. Parameter values
    # =============================================

    # ---- 1-1. Input file name ----
    input_filename = "LER_rawdata.csv"

    # ---- 1-2. Run mode ----
    # One of "data-split", "train", "test".
    # Run "data-split" before "train" and/or "test": the train/test branch
    # below reads the X_*/y_* variables that the data-split branch assigns.
    mode_name = "data-split"

    # ---- 1-3. Data size change ----
    # 1-3-1. Master switch: 0 = no change, 1 = change the data size
    datachange_on = 0

    # 1-3-2. Class balance (1:1)
    balance_on = 0                    # 0 = no balance, 1 = class balance (1:1)
    balance_sample_on = 0             # 0 = no sampling, 1 = sampling
    balance_sample_type = 'under'     # 'over' (oversampling) / 'under' (undersampling)
    balance_str = f"balance{balance_on}_"

    # 1-3-3. Data increase
    ratio_on = 0                      # 0 = no ratio, 1 = ratio for data size
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # ---- 1-4. Sampling ----
    sampling_on = 0                   # 0 = no sampling, 1 = sampling
    sampling_type = 'over'            # 'over' (oversampling) / 'under' (undersampling)

    # ---- 1-5. Token distribution check (uncomment to help pick MAX_LEN) ----
    #print("\n************** Token Distribution **************")
    #X, y = load_data(input_filename, record=None)
    #token_distribution(X)

    # ---- 1-6. Hyperparameters for the CNN model ----
    MAX_LEN = 150
    BATCH_SIZE = 16
    EPOCHS = 4

    # ---- 1-7. Evaluation & probability file ----
    eval_on = 1                       # 0 = no, 1 = yes (evaluation scores)
    proba_on = 1                      # 0 = no, 1 = yes (probability & prediction output)

    # =============================================
    # 2. Run main function
    # =============================================

    if mode_name == "data-split":

        result_file = "summary_cnn_" + mode_name + ".txt"
        (X_train, y_train,
         X_val, y_val,
         X_test, y_test) = split_data(input_filename, result_file)

    else:
        # Suffix appended to every output name when sampling is enabled.
        sampling_tag = ("_" + sampling_type) if sampling_on else ""

        def _output_names(tag):
            """Return (tokenizer, model, probability, summary) file names for a run tag."""
            return ("tokenizer_cnn" + tag + ".pickle",
                    "model_cnn" + tag + ".keras",
                    "result_cnn" + tag + ".csv",
                    "summary_cnn_" + mode_name + tag + ".txt")

        # Keyword arguments that are the same for every call to main().
        shared_options = dict(mode=mode_name,
                              datasize_change=datachange_on,
                              sample_balance=balance_on,
                              balance_sampling_on=balance_sample_on,
                              balance_sampling_type=balance_sample_type,
                              sample_ratio=ratio_on,
                              sample_on=sampling_on,
                              sample_type=sampling_type,
                              max_len=MAX_LEN,
                              batch_size=BATCH_SIZE,
                              epochs=EPOCHS,
                              eval_on=eval_on,
                              proba_on=proba_on)

        if datachange_on:
            # One run per ratio; each run gets its own set of output files.
            for ratio in ratio_list:
                tokenizer_file, model_file, proba_file, eval_file = _output_names(
                    "_" + balance_str + str(ratio) + sampling_tag)

                main(X_train, y_train, X_val, y_val, X_test, y_test,
                     ratio=ratio,
                     tokenizer_file=tokenizer_file,
                     model_file=model_file,
                     proba_file=proba_file,
                     result_file=eval_file,
                     **shared_options)
        else:
            # Single run with a fixed ratio of 1.
            tokenizer_file, model_file, proba_file, eval_file = _output_names(
                sampling_tag)

            main(X_train, y_train, X_val, y_val, X_test, y_test,
                 ratio=1,
                 tokenizer_file=tokenizer_file,
                 model_file=model_file,
                 proba_file=proba_file,
                 result_file=eval_file,
                 **shared_options)

    print("\n\n************** Processing Completed **************\n")