## Suicidal and Self-Injurious Incidents Prediction

##### This script runs the Transformer Encoder model on the notes data with data augmentation 

In [1]:
from platform import python_version
print(python_version())

3.9.7


### Load Data

In [2]:
# pip install tensorflow

In [3]:
# pip install nlpaug

In [4]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')

# For handeling dataframes
import pandas as pd
import numpy as np
import gzip
import re                                  # For regular expression operations
import string                              # For string operations
import math
import time
import random
import gc

# For text preprocessing
import nltk                                # Natural Language Toolkit
from nltk.corpus import stopwords          # For stop words that come with NLTK


# For building neural netwrok models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers, losses
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, Bidirectional
from tensorflow.keras.optimizers import SGD, Adam

# For model evaluation
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from sklearn.metrics import roc_auc_score


# For text agumenters
import os
os.environ["MODEL_DIR"] = '../model'
import nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

nltk.download('averaged_perceptron_tagger')  # 
nltk.download('wordnet')                     # For Synonym augmenter
nltk.download('punkt')                       # For BackTranslation augmenter

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hongxia lu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\hongxia
[nltk_data]     lu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\hongxia
[nltk_data]     lu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
pd.set_option('display.max_colwidth', None)

In [6]:
# Load notes data for positive cases
# From R script "str_notes_4k_Alex"
data = pd.read_csv("Z:/rscripts/users/interns/hlu/data/cleaned_model_str_notes_Alex.csv")
data.shape

(87975, 18)

In [7]:
data.columns

Index(['Unnamed: 0', 'InmateID', 'BookingNumber', 'DateOfBirth', 'Sex', 'Race',
       'MaritalStatus', 'BookingDate', 'LastUpdateDateTime', 'ReleaseDate',
       'soap_sub', 'soap_obj', 'soap_ass', 'soap_plan', 'quick_notes', 'event',
       'age', 'AB109'],
      dtype='object')

In [8]:
sum(data['event'])

249

In [9]:
data['asses_plan'] = data['soap_ass'] + data['soap_plan']

In [10]:
data.drop(['Unnamed: 0', 'InmateID','DateOfBirth', 'BookingDate', 'LastUpdateDateTime', 'ReleaseDate','soap_ass', 'soap_plan'], axis=1, inplace=True)

In [11]:
abbreviations = pd.read_csv("Z:/rscripts/users/interns/hlu/Abbreviations2.csv")

In [12]:
df_dict = dict(zip(abbreviations.abbreviations, abbreviations.complete))

In [13]:
del abbreviations

In [14]:
def replace_words(text):
    text = "".join([x.lower() for x in text]) # Convert to lower case
    text = re.sub("(___|\+|nan)", " ", text).strip() 
    text = " ".join([df_dict.get(wrd, wrd) for wrd in text.split()])
    return text

In [16]:
data['soap_sub'] = data['soap_sub'].map(str).apply(replace_words)
data['soap_obj'] = data['soap_obj'].map(str).apply(replace_words)
data['asses_plan'] = data['asses_plan'].map(str).apply(replace_words)
data['quick_notes'] = data['quick_notes'].map(str).apply(replace_words)

### NLP preprocessing

### Descriptive Statistics Before Cleaning

In [18]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning for soap_sub")
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in data["soap_sub"]], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in data["soap_sub"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in data["soap_sub"]])))

Number of words before cleaning for soap_sub
Quantiles:  [0, 18, 56, 163, 22025]
Mean:  210
Standard deviation:  599


In [19]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning for soap_obj")
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in data["soap_obj"]], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in data["soap_obj"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in data["soap_obj"]])))

Number of words before cleaning for soap_obj
Quantiles:  [0, 16, 55, 148, 23987]
Mean:  174
Standard deviation:  504


In [20]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning for assessment and plan")
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in data["asses_plan"]], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in data["asses_plan"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in data["asses_plan"]])))

Number of words before cleaning for assessment and plan
Quantiles:  [0, 33, 57, 146, 33419]
Mean:  186
Standard deviation:  512


In [21]:
# Descriptive statistics of the number of words after cleaning
print("Number of words before cleaning for quick notes")
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in data["quick_notes"]], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in data["quick_notes"]])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in data["quick_notes"]])))

Number of words before cleaning for quick notes
Quantiles:  [1, 52, 116, 209, 100117]
Mean:  243
Standard deviation:  866


### Cleaning Notes

In [22]:
# Download the stopwords from NLTK
nltk.download('stopwords')

# Import the standard English stop words list from NLTK
stopwords_english = stopwords.words('english') 

[nltk_data] Downloading package stopwords to C:\Users\hongxia
[nltk_data]     lu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Clean text
def preprocess(text):
    text = "".join([x for x in text if x not in string.punctuation]) # Remmove punctuations 
    text = ' '.join(['' if (x in stopwords_english) else x for x in text.split()]) # Remove stopwords
    text = re.sub("(\W|\d+|\n)", " ", text).strip() # remove spaces, digits and line breaks
    return(text)

In [24]:
data.columns

Index(['BookingNumber', 'Sex', 'Race', 'MaritalStatus', 'soap_sub', 'soap_obj',
       'quick_notes', 'event', 'age', 'AB109', 'asses_plan'],
      dtype='object')

In [25]:
soap_sub= data["soap_sub"].apply(preprocess)
soap_obj = data["soap_obj"].apply(preprocess)
quick_notes = data["quick_notes"].apply(preprocess)
asses_plan = data["asses_plan"].apply(preprocess)

### Descriptive Statistics After Cleaning

In [26]:
# clean_sub = data["soap_sub"].apply(preprocess)

In [27]:
# Descriptive statistics of the number of words after cleaning
print("Number of words after cleaning")
# print("Quantiles: ", np.round(np.quantile([len(x.split()) for x in clean_sub], q = [0, 0.25, 0.5, 0.75, 1])))
print("Quantiles: ", np.round(np.quantile([len(x.split()) for x in soap_sub], q = [0, 0.25, 0.5, 0.75, 1])))
print("Mean: ", round(np.mean([len(x.split()) for x in soap_sub])))
print("Median: ", round(np.median([len(x.split()) for x in soap_sub])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in soap_sub])))

Number of words after cleaning
Quantiles:  [    0.    14.    37.   102. 12406.]
Mean:  131
Median:  37
Standard deviation:  366


In [28]:
# clean_obj = data["soap_obj"].apply(preprocess)

In [29]:
# Descriptive statistics of the number of words after cleaning
print("Number of words after cleaning")
# print("Quantiles: ", [int(e) for e in (np.round(np.quantile([len(x.split()) for x in clean_obj], q = [0, 0.25, 0.5, 0.75, 1])))])
print("Quantiles: ", [int(e) for e in (np.round(np.quantile([len(x.split()) for x in soap_obj], q = [0, 0.25, 0.5, 0.75, 1])))])
print("Mean: ", round(np.mean([len(x.split()) for x in soap_obj])))
print("Median: ", round(np.median([len(x.split()) for x in soap_obj])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in soap_obj])))

Number of words after cleaning
Quantiles:  [0, 11, 44, 110, 16828]
Mean:  129
Median:  44
Standard deviation:  370


In [30]:
# del clean_sub, clean_obj

In [31]:
# clean_asplan = data["asses_plan"].apply(preprocess)

In [32]:
# Descriptive statistics of the number of words after cleaning
print("Number of words after cleaning")
# print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in clean_asplan], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in asses_plan], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in asses_plan])))
print("Median: ", round(np.median([len(x.split()) for x in asses_plan])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in asses_plan])))

Number of words after cleaning
Quantiles:  [0, 19, 41, 101, 21795]
Mean:  129
Median:  41
Standard deviation:  349


In [33]:
# clean_quick = data["quick_notes"].apply(preprocess)

In [34]:
# Descriptive statistics of the number of words after cleaning
print("Number of words after cleaning")
# print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in clean_quick], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Quantiles: ", [int(e) for e in (np.quantile([len(x.split()) for x in quick_notes], q = [0, 0.25, 0.5, 0.75, 1]))])
print("Mean: ", round(np.mean([len(x.split()) for x in quick_notes])))
print("Median: ", round(np.median([len(x.split()) for x in quick_notes])))
print("Standard deviation: ", round(np.std([len(x.split()) for x in quick_notes])))

Number of words after cleaning
Quantiles:  [1, 36, 82, 143, 66513]
Mean:  162
Median:  82
Standard deviation:  564


### Modeling

In [36]:
def evaluate(model, X, y):
    y_pred = model.predict(X)
    pred = (y_pred > 0.5).astype("int32")
#     acc = np.sum(y == pred)/len(pred)
    
    tn, fp, fn, tp = confusion_matrix(y, pred).ravel()

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    specificity = tn/(tn + fp)
    f1 = (2*precision*recall)/(precision + recall)
    acc = (tp+tn)/(tn+fp+fn+tp)
    
    auc_roc = round(roc_auc_score(y, y_pred),4)
    pre, rec, thresholds = precision_recall_curve(y, y_pred)
    auc_pr = round(auc(rec, pre),4)
    return(auc_roc, auc_pr, acc, precision, recall, specificity, f1, tn, fp, fn, tp)

In [37]:
prevalence = np.sum(data['event'])/data.shape[0]

In [38]:
prevalence

0.0028303495311167944

### Define the Transformer Encoder

In [39]:
from keras.layers.merge import concatenate

In [40]:
# Create the Transformer model
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_length, vocab_size, embedding_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.pos_emb = layers.Embedding(input_dim=max_length, output_dim=embedding_dim)

    def call(self, x):
#         maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=max_length, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
    
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.3):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embedding_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    

# Four inputs (all texts)
# Three inputs (flatten before concatenating)
def transformer_model_4_flatten(vocab_size, embedding_dim,max_length,dropout_rate):   
    inputs = layers.Input(shape=(max_length,))
    embedding_layer = TokenAndPositionEmbedding(max_length,vocab_size, embedding_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout_rate)(x)
    flat_1 = layers.Flatten()(x)
    
    inputs_2 = layers.Input(shape=(max_length,))
    embedding_layer_2 = TokenAndPositionEmbedding(max_length,vocab_size, embedding_dim)
    x_2 = embedding_layer_2(inputs_2)
    transformer_block_2 = TransformerBlock(embedding_dim, num_heads, ff_dim)
    x_2 = transformer_block_2(x_2)
    x_2 = layers.GlobalAveragePooling1D()(x_2)
    x_2 = layers.Dropout(dropout_rate)(x_2)
    flat_2 = layers.Flatten()(x_2)
    
    
    inputs_3 = layers.Input(shape=(max_length,))
    embedding_layer_3 = TokenAndPositionEmbedding(max_length,vocab_size, embedding_dim)
    x_3 = embedding_layer_3(inputs_3)
    transformer_block_3 = TransformerBlock(embedding_dim, num_heads, ff_dim)
    x_3 = transformer_block_3(x_3)
    x_3 = layers.GlobalAveragePooling1D()(x_3)
    x_3 = layers.Dropout(dropout_rate)(x_3)
    flat_3 = layers.Flatten()(x_3)
    
    inputs_4 = layers.Input(shape=(max_length,))
    embedding_layer_4 = TokenAndPositionEmbedding(max_length,vocab_size, embedding_dim)
    x_4 = embedding_layer_4(inputs_4)
    transformer_block_4 = TransformerBlock(embedding_dim, num_heads, ff_dim)
    x_4 = transformer_block_4(x_4)
    x_4 = layers.GlobalAveragePooling1D()(x_4)
    x_4 = layers.Dropout(dropout_rate)(x_4)
    flat_4 = layers.Flatten()(x_4)
    
    merge = concatenate([flat_1, flat_2, flat_3, flat_4])
    
    hidden = Dense(128, activation='relu')(merge)
    
    outputs = layers.Dense(1, activation="sigmoid")(hidden)

    model = tf.keras.Model(inputs=[inputs, inputs_2, inputs_3, inputs_4], outputs=outputs)
    opt = tf.keras.optimizers.Adam(lr=0.001)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['AUC'])
    
    return(model)

## Four Inputs (all texts)

### Undersample to 20% and then augment to 50%

In [41]:
# For TFIDF augmenter
def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"): 
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

In [48]:
# pip install torchvision
# pip install transformers

# Arguments for character augmenter 
aug_p=0.2 

# Define the augmenters
augs = []
augs.append(nac.KeyboardAug(aug_char_p = aug_p, 
                         include_special_char=False,
                         include_upper_case=False, include_numeric=False)) # Substitute characters by keyboard distance
augs.append(nac.RandomCharAug(action="insert", 
                         aug_char_p = aug_p, 
                         include_upper_case=False, include_numeric=False)) # Insert characters randomly
augs.append(nac.RandomCharAug(action="substitute",
                         aug_char_p = aug_p, 
                         include_upper_case=False, include_numeric=False)) # Substitute characters randomly
augs.append(nac.RandomCharAug(action="swap", 
                         aug_char_p = aug_p, 
                         include_upper_case=False, include_numeric=False)) # Swap characters randomly
augs.append(nac.RandomCharAug(action="delete", 
                         aug_char_p = aug_p, 
                         include_upper_case=False, include_numeric=False)) # Delete characters randomly
# Word Augmenter
augs.append(naw.RandomWordAug(action="swap", aug_p=aug_p)) # Swap words randomly
augs.append(naw.RandomWordAug(action='delete', aug_p=aug_p)) # Delete words randomly
# augs.append(naw.RandomWordAug(action='crop', aug_p=aug_p)) # Delete a set of continuous words randomly
augs.append(naw.SynonymAug(aug_src='wordnet', aug_p=aug_p)) # Substitute words by WordNet's synonym
augs.append(naw.AntonymAug(aug_p=aug_p)) # Substitute word by antonym
augs.append(naw.SplitAug(aug_p=aug_p)) # Split words to two tokens randomly
augs.append(naw.SpellingAug(aug_p=aug_p)) # Substitute words by simulated spelling errors



In [41]:
pd_train_idx_pct = pd.read_csv('other3/pd_under_idx_10_pct_0.3.csv')
pd_test_idx_10 = pd.read_csv('other/pd_test_idx_10.csv')

pd_train_idx_pct.drop('Unnamed: 0', axis=1, inplace = True)
pd_test_idx_10.drop('Unnamed: 0', axis=1, inplace=True)

In [47]:
# Undersample to 20% and then augment to 50% to choose the best augmenter
epochs = 20
batch_size = 32
max_length = 162 # Largest mean length of the four types of notes

dropout_rate = 0.4
embedding_dim = 200
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

time_1 = time.time()

X = data['soap_obj']
X_2 = data['soap_sub']
X_3 = data['asses_plan']
X_4 = data['quick_notes']
y = data['event']


j = 0 # to keep track of the iteration number
time_start = time.time()


with open('structured_textpred/text_augmentation.csv','a') as fd:
    fd.write(f'Understample to 20% then augment to 50% on 14 augmenters to choose the best augmenter\n')
    
# Run the model 10 times with a different split each time  

for ii in range(10):
    time_s = time.time()
    
    j += 1
    iteration = "iter" + str(j)
    print(iteration, '....')
    
    # Train and test data
    train_index = pd_train_idx_pct.iloc[:, ii].values # Undersampled training sets
    test_index = pd_test_idx_10.iloc[:, ii].values # Test sets remain the same
                    
    x_train_1, x_test_1 = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    x_train_2, x_test_2 = X_2[train_index], X_2[test_index]
    x_train_3, x_test_3 = X_3[train_index], X_3[test_index]
    x_train_4, x_test_4 = X_4[train_index], X_4[test_index]
    
    
    # Select the samples to be augmented 
    x_aug_0 = x_train_1[y_train==1]
    y_aug_0 = y_train[y_train==1]

    x_aug_0 = np.array(x_aug_0)
    y_aug_0 = np.array(y_aug_0)
    
    x_aug2_0 = x_train_2[y_train==1]
    y_aug2_0 = y_train[y_train==1]

    x_aug2_0 = np.array(x_aug2_0)
    y_aug2_0 = np.array(y_aug2_0)
    
    x_aug3_0 = x_train_3[y_train==1]
    y_aug3_0 = y_train[y_train==1]

    x_aug3_0 = np.array(x_aug3_0)
    y_aug3_0 = np.array(y_aug3_0)
    
    x_aug4_0 = x_train_4[y_train==1]
    y_aug4_0 = y_train[y_train==1]

    x_aug4_0 = np.array(x_aug4_0)
    y_aug4_0 = np.array(y_aug4_0)

    # Randomly augment a predetermined number of the selected cases
    length = len(x_aug_0) # Total number of texts availble for augmentation
    p = sum(y_train)/len(y_train)
    num = math.ceil(abs(p - 0.5)*len(train_index))*2 # Number of samples needed to balance the classes in the training set

    if num <= length:
        # If the number of texts to be augmented is larger than the number of samples needed
        samples = np.random.choice(range(length), size=num, replace=False) # Generate random indices without replacement
    else:
        samples = np.random.choice(range(length), size=num, replace=True) # Generate random indices with replacement
        
    for jj in range(len(augs)):

        # Augment the training set             
        DA_start = time.time()

        new_text = []
        new_y = []

        new_text2 = []
        new_y2 = []

        new_text3 = []
        new_y3 = []

        new_text4 = []
        new_y4 = []
        
        aug = augs[jj]
        

        for kk in samples: # Go through each sample to be augmented   
            text = x_aug_0[kk]
            label_aug = y_aug_0[kk]
            augmented_text = aug.augment(text)

            new_text.append(augmented_text)
            new_y.append(label_aug) # Assign the label of the original text to the augmented text

            text2 = x_aug2_0[kk]
            label_aug2 = y_aug2_0[kk]
            augmented_text2 = aug.augment(text2)

            new_text2.append(augmented_text2)
            new_y2.append(label_aug2) # Assign the label of the original text to the augmented text

            text3 = x_aug3_0[kk]
            label_aug3 = y_aug3_0[kk]
            augmented_text3 = aug.augment(text3)

            new_text3.append(augmented_text3)
            new_y3.append(label_aug3) # Assign the label of the original text to the augmented text

            text4 = x_aug4_0[kk]
            label_aug4 = y_aug4_0[kk]
            augmented_text4 = aug.augment(text4)

            new_text4.append(augmented_text4)
            new_y4.append(label_aug4) # Assign the label of the original text to the augmented text

        # Append the augmented texts and their corresponding labels to the original data
        x_aug = np.append(x_train_1, new_text)
        y_0 = np.append(y_train, new_y)
        y_aug = y_0.T

        x_aug2 = np.append(x_train_2, new_text2)
        y_02 = np.append(y_train, new_y2)
        y_aug2 = y_02.T

        x_aug3 = np.append(x_train_3, new_text3)
        y_03 = np.append(y_train, new_y3)
        y_aug3 = y_03.T

        x_aug4 = np.append(x_train_4, new_text4)
        y_04 = np.append(y_train, new_y4)
        y_aug4 = y_04.T

        # Shuffle the order of the augmented dataset
        augmented = pd.DataFrame((np.vstack((x_aug, y_aug))).T)
        augmented = augmented.sample(frac=1).reset_index(drop=True)

        augmented2 = pd.DataFrame((np.vstack((x_aug2, y_aug2))).T)
        augmented2 = augmented2.sample(frac=1).reset_index(drop=True)

        augmented3 = pd.DataFrame((np.vstack((x_aug3, y_aug3))).T)
        augmented3 = augmented3.sample(frac=1).reset_index(drop=True)

        augmented4 = pd.DataFrame((np.vstack((x_aug4, y_aug4))).T)
        augmented4 = augmented4.sample(frac=1).reset_index(drop=True)

        DA_time = time.time() - DA_start

        with open('structured_textpred/text_augmentation.csv','a') as fd:
            fd.write(f'{iteration},DA Running Time:,{DA_time}\n')

        # Convert the dataframe to numpy arrays
        x_train1 = np.array(augmented.iloc[:,0])
        y_train1 = np.array(augmented.iloc[:,1])
        y_train1 = y_train1.astype('float32')

        x_train2 = np.array(augmented2.iloc[:,0])
        y_train2 = np.array(augmented2.iloc[:,1])
        y_train2 = y_train2.astype('float32')

        x_train3 = np.array(augmented3.iloc[:,0])
        y_train3 = np.array(augmented3.iloc[:,1])
        y_train3 = y_train3.astype('float32')

        x_train4 = np.array(augmented4.iloc[:,0])
        y_train4 = np.array(augmented4.iloc[:,1])
        y_train4 = y_train4.astype('float32')

        # Run CNN model
        model_time_start = time.time()

        # Preprocess the text
        x_train1 = np.array(pd.Series(x_train1).apply(preprocess))
        x_test1 = np.array(pd.Series(x_test_1).apply(preprocess))

        x_train2 = np.array(pd.Series(x_train2).apply(preprocess))
        x_test2 = np.array(pd.Series(x_test_2).apply(preprocess))

        x_train3 = np.array(pd.Series(x_train3).apply(preprocess))
        x_test3 = np.array(pd.Series(x_test_3).apply(preprocess))

        x_train4 = np.array(pd.Series(x_train4).apply(preprocess))
        x_test4 = np.array(pd.Series(x_test_4).apply(preprocess))

        # Tokenize the text   
        tokenizer = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
        tokenizer.fit_on_texts(x_train1)
        x_train1 = tokenizer.texts_to_sequences(x_train1)
        x_test1 = tokenizer.texts_to_sequences(x_test1)
        vocab_size = len(tokenizer.word_index) + 1 # plus the reserved index 0
        word_index = tokenizer.word_index

        tokenizer2 = Tokenizer(num_words=5000)
        tokenizer2.fit_on_texts(x_train2)
        x_train2 = tokenizer2.texts_to_sequences(x_train2)
        x_test2 = tokenizer2.texts_to_sequences(x_test2)


        tokenizer3 = Tokenizer(num_words=5000)
        tokenizer3.fit_on_texts(x_train3)
        x_train3 = tokenizer3.texts_to_sequences(x_train3)
        x_test3 = tokenizer3.texts_to_sequences(x_test3)

        tokenizer4 = Tokenizer(num_words=5000)
        tokenizer4.fit_on_texts(x_train4)
        x_train4 = tokenizer4.texts_to_sequences(x_train4)
        x_test4 = tokenizer4.texts_to_sequences(x_test4)

        # Pad the sequences with 0's
        x_train1 = pad_sequences(x_train1, padding='post', maxlen=max_length) 
        x_test1 = pad_sequences(x_test1, padding='post', maxlen=max_length)

        x_train2 = pad_sequences(x_train2, padding='post', maxlen=max_length) 
        x_test2 = pad_sequences(x_test2, padding='post', maxlen=max_length)


        x_train3 = pad_sequences(x_train3, padding='post', maxlen=max_length) 
        x_test3 = pad_sequences(x_test3, padding='post', maxlen=max_length)

        x_train4 = pad_sequences(x_train4, padding='post', maxlen=max_length) 
        x_test4 = pad_sequences(x_test4, padding='post', maxlen=max_length)

        # Fit the Transformer model
        mymodel = transformer_model_4_flatten(vocab_size, embedding_dim,max_length,dropout_rate)
        mymodel.fit([x_train1,x_train2,x_train3,x_train4], y_train1, epochs=epochs, batch_size=batch_size)

        # Collect and log evaluation metrics
        auc_roc, auc_pr, acc, precision, recall, specificity, f1, tn, fp, fn, tp = evaluate(mymodel, [x_test1, x_test2, x_test3, x_test4],y_test)
        model_time = time.time() - model_time_start

        with open('structured_textpred/text_augmentation.csv','a') as fd:
            fd.write(f'{iteration},aug_{jj},{auc_roc},{auc_pr},{acc},{precision},{recall},{specificity},{f1},{model_time},{tn},{fp},{fn},{tp},{num}\n')

        del mymodel, tokenizer, tokenizer2, tokenizer3, tokenizer4
        del text, text2, text3, text4
        del new_text, new_text2, new_text3, new_text4
        del new_y, new_y2, new_y3, new_y4
        del augmented, augmented2, augmented3, augmented4
        del augmented_text, augmented_text2, augmented_text3, augmented_text4
        del x_aug, x_aug2, x_aug3, x_aug4
        del y_0, y_02, y_03, y_04
        del x_train1, x_train2, x_train3, x_train4
        del x_test1, x_test2, x_test3, x_test4
        gc.collect()
    
    del x_train_1,x_train_2,x_train_3,x_train_4,x_test_1,x_test_2,x_test_3,x_test_4
    del x_aug_0, x_aug2_0, x_aug3_0, x_aug4_0
    del y_aug_0, y_aug2_0, y_aug3_0, y_aug4_0

    gc.collect()

    time_e = time.time() - time_s
    with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'1 iteration 14 DA,{time_e}\n')

running_time = time.time() - time_start
with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'10 iteration training time,{running_time}\n')

iter1 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter2 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch

Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoc

Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter5 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 1

Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter6 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/2

Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 1

Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter7 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/2

Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter8 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch

Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12

Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/

Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoc

Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/

## Augmenter 3

In [51]:
# Undersample and then augment to 50% to choose the best augmenter
epochs = 20
batch_size = 32
max_length = 160 # Largest mean length of the four types of notes

dropout_rate = 0.4
embedding_dim = 200
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

time_1 = time.time()

X = data['soap_obj']
X_2 = data['soap_sub']
X_3 = data['asses_plan']
X_4 = data['quick_notes']
y = data['event']

# del data

j = 0 # to keep track of the iteration number
time_start = time.time()


with open('structured_textpred/text_augmentation.csv','a') as fd:
    fd.write(f'Understample to 30% then augment to 50% using augmneter_3\n')
    
# Run the model 10 times with a different split each time  

for ii in range(10):
    time_s = time.time()
    
    j += 1
    iteration = "iter" + str(j)
    print(iteration, '....')
    
    # Train and test data
    train_index = pd_train_idx_pct.iloc[:, ii].values # Undersampled training sets
    test_index = pd_test_idx_10.iloc[:, ii].values # Test sets remain the same
                    
    x_train_1, x_test_1 = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    x_train_2, x_test_2 = X_2[train_index], X_2[test_index]
    x_train_3, x_test_3 = X_3[train_index], X_3[test_index]
    x_train_4, x_test_4 = X_4[train_index], X_4[test_index]
    
    
    # Select the samples to be augmented 
    x_aug_0 = x_train_1[y_train==1]
    y_aug_0 = y_train[y_train==1]

    x_aug_0 = np.array(x_aug_0)
    y_aug_0 = np.array(y_aug_0)
    
    x_aug2_0 = x_train_2[y_train==1]
    y_aug2_0 = y_train[y_train==1]

    x_aug2_0 = np.array(x_aug2_0)
    y_aug2_0 = np.array(y_aug2_0)
    
    x_aug3_0 = x_train_3[y_train==1]
    y_aug3_0 = y_train[y_train==1]

    x_aug3_0 = np.array(x_aug3_0)
    y_aug3_0 = np.array(y_aug3_0)
    
    x_aug4_0 = x_train_4[y_train==1]
    y_aug4_0 = y_train[y_train==1]

    x_aug4_0 = np.array(x_aug4_0)
    y_aug4_0 = np.array(y_aug4_0)

    # Randomly augment a predetermined number of the selected cases
    length = len(x_aug_0) # Total number of texts availble for augmentation
    p = sum(y_train)/len(y_train)
    num = math.ceil(abs(p - 0.5)*len(train_index))*2 # Number of samples needed to balance the classes in the training set

    if num <= length:
        # If the number of texts to be augmented is larger than the number of samples needed
        samples = np.random.choice(range(length), size=num, replace=False) # Generate random indices without replacement
    else:
        samples = np.random.choice(range(length), size=num, replace=True) # Generate random indices with replacement
        


    # Augment the training set             
    DA_start = time.time()

    new_text = []
    new_y = []

    new_text2 = []
    new_y2 = []

    new_text3 = []
    new_y3 = []

    new_text4 = []
    new_y4 = []

    aug = augs[3]


    for kk in samples: # Go through each sample to be augmented   
        text = x_aug_0[kk]
        label_aug = y_aug_0[kk]
        augmented_text = aug.augment(text)

        new_text.append(augmented_text)
        new_y.append(label_aug) # Assign the label of the original text to the augmented text

        text2 = x_aug2_0[kk]
        label_aug2 = y_aug2_0[kk]
        augmented_text2 = aug.augment(text2)

        new_text2.append(augmented_text2)
        new_y2.append(label_aug2) # Assign the label of the original text to the augmented text

        text3 = x_aug3_0[kk]
        label_aug3 = y_aug3_0[kk]
        augmented_text3 = aug.augment(text3)

        new_text3.append(augmented_text3)
        new_y3.append(label_aug3) # Assign the label of the original text to the augmented text

        text4 = x_aug4_0[kk]
        label_aug4 = y_aug4_0[kk]
        augmented_text4 = aug.augment(text4)

        new_text4.append(augmented_text4)
        new_y4.append(label_aug4) # Assign the label of the original text to the augmented text

    # Append the augmented texts and their corresponding labels to the original data
    x_aug = np.append(x_train_1, new_text)
    y_0 = np.append(y_train, new_y)
    y_aug = y_0.T

    x_aug2 = np.append(x_train_2, new_text2)
    y_02 = np.append(y_train, new_y2)
    y_aug2 = y_02.T

    x_aug3 = np.append(x_train_3, new_text3)
    y_03 = np.append(y_train, new_y3)
    y_aug3 = y_03.T

    x_aug4 = np.append(x_train_4, new_text4)
    y_04 = np.append(y_train, new_y4)
    y_aug4 = y_04.T

    # Shuffle the order of the augmented dataset
    augmented = pd.DataFrame((np.vstack((x_aug, y_aug))).T)
    augmented = augmented.sample(frac=1).reset_index(drop=True)

    augmented2 = pd.DataFrame((np.vstack((x_aug2, y_aug2))).T)
    augmented2 = augmented2.sample(frac=1).reset_index(drop=True)

    augmented3 = pd.DataFrame((np.vstack((x_aug3, y_aug3))).T)
    augmented3 = augmented3.sample(frac=1).reset_index(drop=True)

    augmented4 = pd.DataFrame((np.vstack((x_aug4, y_aug4))).T)
    augmented4 = augmented4.sample(frac=1).reset_index(drop=True)

    DA_time = time.time() - DA_start

    with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'{iteration},DA Running Time:,{DA_time}\n')

    # Convert the dataframe to numpy arrays
    x_train1 = np.array(augmented.iloc[:,0])
    y_train1 = np.array(augmented.iloc[:,1])
    y_train1 = y_train1.astype('float32')

    x_train2 = np.array(augmented2.iloc[:,0])
    y_train2 = np.array(augmented2.iloc[:,1])
    y_train2 = y_train2.astype('float32')

    x_train3 = np.array(augmented3.iloc[:,0])
    y_train3 = np.array(augmented3.iloc[:,1])
    y_train3 = y_train3.astype('float32')

    x_train4 = np.array(augmented4.iloc[:,0])
    y_train4 = np.array(augmented4.iloc[:,1])
    y_train4 = y_train4.astype('float32')

    # Run CNN model
    model_time_start = time.time()

    # Preprocess the text
    x_train1 = np.array(pd.Series(x_train1).apply(preprocess))
    x_test1 = np.array(pd.Series(x_test_1).apply(preprocess))

    x_train2 = np.array(pd.Series(x_train2).apply(preprocess))
    x_test2 = np.array(pd.Series(x_test_2).apply(preprocess))

    x_train3 = np.array(pd.Series(x_train3).apply(preprocess))
    x_test3 = np.array(pd.Series(x_test_3).apply(preprocess))

    x_train4 = np.array(pd.Series(x_train4).apply(preprocess))
    x_test4 = np.array(pd.Series(x_test_4).apply(preprocess))

    # Tokenize the text   
    tokenizer = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
    tokenizer.fit_on_texts(x_train1)
    x_train1 = tokenizer.texts_to_sequences(x_train1)
    x_test1 = tokenizer.texts_to_sequences(x_test1)
    vocab_size = len(tokenizer.word_index) + 1 # plus the reserved index 0
    word_index = tokenizer.word_index

    tokenizer2 = Tokenizer(num_words=5000)
    tokenizer2.fit_on_texts(x_train2)
    x_train2 = tokenizer2.texts_to_sequences(x_train2)
    x_test2 = tokenizer2.texts_to_sequences(x_test2)


    tokenizer3 = Tokenizer(num_words=5000)
    tokenizer3.fit_on_texts(x_train3)
    x_train3 = tokenizer3.texts_to_sequences(x_train3)
    x_test3 = tokenizer3.texts_to_sequences(x_test3)

    tokenizer4 = Tokenizer(num_words=5000)
    tokenizer4.fit_on_texts(x_train4)
    x_train4 = tokenizer4.texts_to_sequences(x_train4)
    x_test4 = tokenizer4.texts_to_sequences(x_test4)

    # Pad the sequences with 0's
    x_train1 = pad_sequences(x_train1, padding='post', maxlen=max_length) 
    x_test1 = pad_sequences(x_test1, padding='post', maxlen=max_length)

    x_train2 = pad_sequences(x_train2, padding='post', maxlen=max_length) 
    x_test2 = pad_sequences(x_test2, padding='post', maxlen=max_length)


    x_train3 = pad_sequences(x_train3, padding='post', maxlen=max_length) 
    x_test3 = pad_sequences(x_test3, padding='post', maxlen=max_length)

    x_train4 = pad_sequences(x_train4, padding='post', maxlen=max_length) 
    x_test4 = pad_sequences(x_test4, padding='post', maxlen=max_length)

    # Fit the Transformer model
    mymodel = transformer_model_4_flatten(vocab_size, embedding_dim,max_length,dropout_rate)
    mymodel.fit([x_train1,x_train2,x_train3,x_train4], y_train1, epochs=epochs, batch_size=batch_size)

    # Collect and log evaluation metrics
    auc_roc, auc_pr, acc, precision, recall, specificity, f1, tn, fp, fn, tp = evaluate(mymodel, [x_test1, x_test2, x_test3, x_test4],y_test)
    model_time = time.time() - model_time_start

    with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'{iteration},aug_{jj},{auc_roc},{auc_pr},{acc},{precision},{recall},{specificity},{f1},{model_time},{tn},{fp},{fn},{tp},{num}\n')

    del mymodel, tokenizer, tokenizer2, tokenizer3, tokenizer4
    del text, text2, text3, text4
    del new_text, new_text2, new_text3, new_text4
    del new_y, new_y2, new_y3, new_y4
    del augmented, augmented2, augmented3, augmented4
    del augmented_text, augmented_text2, augmented_text3, augmented_text4
    del x_aug, x_aug2, x_aug3, x_aug4
    del y_0, y_02, y_03, y_04
    del x_train1, x_train2, x_train3, x_train4
    del x_test1, x_test2, x_test3, x_test4
    
    
    del x_train_1,x_train_2,x_train_3,x_train_4,x_test_1,x_test_2,x_test_3,x_test_4
    del x_aug_0, x_aug2_0, x_aug3_0, x_aug4_0
    del y_aug_0, y_aug2_0, y_aug3_0, y_aug4_0

    gc.collect()

    time_e = time.time() - time_s
    with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'1 iteration 14 DA,{time_e}\n')

running_time = time.time() - time_start
with open('structured_textpred/text_augmentation.csv','a') as fd:
        fd.write(f'10 iteration training time,{running_time}\n')

iter1 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter2 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter3 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter4 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter5 ....
Epoch 1/20
Epoch 2/20

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
iter10 ....
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Four Inputs - scaled

In [42]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [124]:
# For 10% prevalence, underssample the negative cases
pct = 0.5
num_samples = round(249/pct - 249) # Number of samples needed from the negatives

neg = data[data['event'] == 0]
pos = data[data['event'] == 1]
neg = neg.reset_index()

samples = np.random.choice(range(neg.shape[0]), size=num_samples, replace=False)
neg = neg.iloc[samples,]
data2 = pd.concat([neg, pos], axis=0, ignore_index = True)
data2 = data2.reset_index()

In [125]:
# No augmentation
epochs = 20
batch_size = 32
max_length = 199 # Longest Mean length of the 4 notes
dropout_rate = 0.3
embedding_dim = 200
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

time_1 = time.time()

X = data2['soap_obj']
X_2 = data2['soap_sub']
X_3 = data2['ass_plan']
X_4 = data2['quick_notes']
y = data2['event']

X_5 = pd.get_dummies(data2[['Sex','Race','MaritalStatus','age','AB109']])


j = 0 # to keep track of the iteration number
time_start = time.time()

with open('other/soap_self_harm.csv','a') as fd:
    fd.write(f'No augmentation_4 heads_maxlen_199_replace_words_4-inputs_Scaled_Structured_lr=0.0001_prevalence_50%\n')
    
# Run the model 10 times with a different split each time  
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)##################################
       
for train_index, test_index in sss.split(X, y):
    time_s = time.time()

    j += 1
    iteration = "iter" + str(j)
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    x_train2, x_test2 = X_2[train_index], X_2[test_index]
    x_train3, x_test3 = X_3[train_index], X_3[test_index]
    x_train4, x_test4 = X_4[train_index], X_4[test_index]
    
    # Standardize the structured data
    x_train_5, x_test_5 = X_5.iloc[train_index,], X_5.iloc[test_index,]
    std_scaler = preprocessing.StandardScaler()
    std_scaler.fit(x_train_5)
    x_train_5_scaled = std_scaler.transform(x_train_5)
    x_test_5_scaled = std_scaler.transform(x_test_5)


    # Run CNN model
    model_time_start = time.time()

    # Preprocess the text
    vfunc = np.vectorize(preprocess)
    x_train_1 = vfunc(x_train)
    x_test_1 = vfunc(x_test)
    
    x_train_2 = vfunc(x_train2)
    x_test_2 = vfunc(x_test2)
    
    x_train_3 = vfunc(x_train3)
    x_test_3 = vfunc(x_test3)
    
    x_train_4 = vfunc(x_train4)
    x_test_4 = vfunc(x_test4)

    # Tokenize the text   
    tokenizer = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
    tokenizer.fit_on_texts(x_train_1)
    x_train_1 = tokenizer.texts_to_sequences(x_train_1)
    x_test_1 = tokenizer.texts_to_sequences(x_test_1)
    vocab_size = len(tokenizer.word_index) + 1 # plus the reserved index 0
    word_index = tokenizer.word_index
    
    tokenizer_2 = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
    tokenizer_2.fit_on_texts(x_train_2)
    x_train_2 = tokenizer_2.texts_to_sequences(x_train_2)
    x_test_2 = tokenizer_2.texts_to_sequences(x_test_2)
    
    tokenizer_3 = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
    tokenizer_3.fit_on_texts(x_train_3)
    x_train_3 = tokenizer_3.texts_to_sequences(x_train_3)
    x_test_3 = tokenizer_3.texts_to_sequences(x_test_3)
    
    tokenizer_4 = Tokenizer(num_words=5000) # get the frequency of all tokens and use the 5000 most common ones
    tokenizer_4.fit_on_texts(x_train_4)
    x_train_4 = tokenizer_4.texts_to_sequences(x_train_4)
    x_test_4 = tokenizer_4.texts_to_sequences(x_test_4)

    # Pad the sequences with 0's
    x_train_1 = pad_sequences(x_train_1, padding='post', maxlen=max_length) 
    x_test_1 = pad_sequences(x_test_1, padding='post', maxlen=max_length)
    
    x_train_2 = pad_sequences(x_train_2, padding='post', maxlen=max_length) 
    x_test_2 = pad_sequences(x_test_2, padding='post', maxlen=max_length)
    
    x_train_3 = pad_sequences(x_train_3, padding='post', maxlen=max_length) 
    x_test_3 = pad_sequences(x_test_3, padding='post', maxlen=max_length)
    
    x_train_4 = pad_sequences(x_train_4, padding='post', maxlen=max_length) 
    x_test_4 = pad_sequences(x_test_4, padding='post', maxlen=max_length)

    # Fit the Transformer model
    mymodel = transformer_model_5_structured(vocab_size, embedding_dim,max_length,dropout_rate)
    mymodel.fit([x_train_1,x_train_2,x_train_3,x_train_4,x_train_5_scaled], y_train, epochs=epochs, batch_size=batch_size)

    # Collect and log evaluation metrics
    auc_roc, auc_pr, acc, precision, recall, specificity, f1, tn, fp, fn, tp = evaluate(mymodel, [x_test_1,x_test_2,x_test_3,x_test_4,x_test_5_scaled], y_test)
    model_time = time.time() - model_time_start

    with open('other/soap_self_harm.csv','a') as fd:
        fd.write(f'{iteration},{auc_roc},{auc_pr},{acc},{precision},{recall},{specificity},{f1},{tn}, {fp}, {fn}, {tp}\n')

    del mymodel, tokenizer, tokenizer_2, tokenizer_3, tokenizer_4 
    gc.collect()
    
    time_e = time.time() - time_s
    with open('other/soap_self_harm.csv','a') as fd:
        fd.write(f'1 iteration 18 DA,{time_e}\n')

running_time = time.time() - time_start
with open('other/soap_self_harm.csv','a') as fd:
        fd.write(f'10 iteration training time,{running_time}\n')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
