## 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import string
import re
import statistics

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.preprocessing import MinMaxScaler

from scipy.sparse import csr_matrix
import pickle

In [2]:
pd.set_option('display.max_rows', 1000); pd.set_option('display.max_columns', 1000); pd.set_option('display.width', 1000)

## 2. Functions

In [3]:
# Performs a series of replacements
def clean_notes(text):
    # Remove '\n'
    text = text.replace("\n", " ")
    #Remove >>>, username, date & time
    text = re.sub(r">>> ([a-z0-9_\.-]+) : \d{2}/\d{2}/\d{4} \d{2}:\d{2} ", "", text)
    #reference code ("Ref JR: " + 6 digits)
    text = re.sub("Ref JR: \d{6} ", "", text)
    return text

In [4]:
# Remove punctuation
def remove_punctuation(text):
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    text = text.translate(translator)
    text = re.sub(r'\s+'," ", text)
    return text

In [5]:
# Define Stopwords
nltk.download('stopwords')
stopwords = stopwords.words("english")

wo_stop_words = ['location', 'loc', 'description', 'fault', 'action', 'work', 'start', 'end']

for i in wo_stop_words:
    if i not in stopwords:
        stopwords.append(i)

[nltk_data] Downloading package stopwords to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# remove stopwords, lemmatize, returns only alphabet characters

nltk.download('punkt') # Tokenizer model
nltk.download('wordnet') # Lexical database
# nltk.download('punkt_tab')

def remove_stopwords_lemmatize(sentence):
    # Tokenize
    tokens = word_tokenize(sentence)
    tokens = [word.lower() for word in tokens]

    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Only contains alphabet characters
    tokens = " ".join([word for word in tokens if word.isalpha()])

    return tokens

[nltk_data] Downloading package punkt to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Filter out the noise from a text by removing words that appear less than 3
def filter_noise(text):
    text = ' '.join([word for word in word_tokenize(text) if word_counts[word] > 3])
    return text

## 3. Datasets and Preprocessing

### a. Training Dataset

Training Data: Jan 2020 to Jun 2023

In [8]:
# Load training datasets
tmp_data_1 = pd.read_csv("0_NEL_MMS_WO (2020-2022)_withnotes.csv")
tmp_data_2 = pd.read_csv("0_NEL_MMS_WO (Jan2023 - Jul2023).csv")

In [9]:
# Ensure both dataset has the same columns
tmp_data_2 = tmp_data_2[tmp_data_1.columns]
# Merge dataset
tmp_data = pd.concat([tmp_data_1, tmp_data_2])

In [10]:
# Filter CM Job
train_df = tmp_data[tmp_data["JOB_TYPE"] == "CM"].copy(deep = True)
print(f'CM Work Order DataFrame: {train_df.shape[0]} rows')

# Extract notes & symptom, drop those that are na
train_df = train_df[["NOTES", "DESCRIPTION_SYMPTOM"]]
train_df = train_df.dropna(how="any")

# Rest axis & rename columns
train_df = train_df.reset_index(drop=True)
train_df = train_df.rename(columns = {'DESCRIPTION_SYMPTOM': 'TARGET'})
print(f'Symptom Work Order DataFrame: {train_df.shape[0]} rows')

CM Work Order DataFrame: 66838 rows
Symptom Work Order DataFrame: 27189 rows


In [11]:
train_df.head()

Unnamed: 0,NOTES,TARGET
0,>>> nepsas26 : 25/01/2020 08:57\nFault: Intrus...,Patron sensor alignment out/Patron Sensor bloc...
1,>>> nesctc02 : 30/01/2020 09:57\nFault: No com...,No Comms
2,>>> nealms01 : 29/01/2020 07:21\nFault : CSC R...,CSC R/W/C-Through Reader
3,>>> nealsn01 : 04/01/2020 10:53\nFault : BNA ...,Other Hardware faults/Other hardware problems
4,>>> nepsas26 : 06/01/2020 14:00\nFault: NBX un...,Magazine 1/Change Notebox


### b. Testing Dataset

Testing Data: Jul 2023 - Dec 2023

In [12]:
# Load testing dataset, ensure both dataset has the same columns as training datasets
tmp_data_3 = pd.read_csv('0_NEL_MMS_WO (Jul2023 - Dec2023).csv')
tmp_data_3 = tmp_data_3[tmp_data.columns]

In [13]:
# Filter CM Job
test_df = tmp_data_3[tmp_data_3["JOB_TYPE"] == "CM"].copy(deep = True)
print(f'CM Work Order DataFrame: {test_df.shape[0]} rows')

# Extract notes & symptom, drop those that are N.A.
test_df = test_df[["NOTES", "DESCRIPTION_SYMPTOM"]]
test_df = test_df.dropna(how="any")

# Rest axis & Rename columns
test_df = test_df.reset_index(drop=True)
test_df = test_df.rename(columns = {'DESCRIPTION_SYMPTOM': 'TARGET'})
print(f'Symptom Work Order DataFrame: {test_df.shape[0]} rows')

CM Work Order DataFrame: 8985 rows
Symptom Work Order DataFrame: 5407 rows


In [14]:
test_df.head()

Unnamed: 0,NOTES,TARGET
0,>>> nealsn01 : 26/07/2022 09:05\nFault : Gate ...,Hang
1,>>> nepsas26 : 28/07/2022 10:36\nFault: Gate s...,ECU Replacement
2,>>> nepsas26 : 28/07/2022 11:07\nFault: Gate s...,ECU Replacement
3,>>> dtacto01 : 19/07/2022 08:49\nFault : Intru...,Patron sensor alignment out/Patron Sensor bloc...
4,>>> dtsgto04 : 06/07/2022 13:11\nFault : Patro...,Patron Sensor Alignment Out/Gate Intrusion


## 4. NLP

### a. Training Dataset

In [15]:
# Apply fucntion
train_df['NOTES'] = train_df['NOTES'].apply(clean_notes)
train_df['NOTES_PROCESSED'] = train_df['NOTES'].apply(remove_punctuation)
train_df['NOTES_PROCESSED'] = train_df['NOTES_PROCESSED'].apply(remove_stopwords_lemmatize)

In [16]:
# Counts the frequency of each word
notes_tok = [word_tokenize(word) for word in train_df['NOTES_PROCESSED']]

word_vocab = []
for line in notes_tok:
    for word in line:
        word_vocab.append(word)

word_counts = Counter(word_vocab)
word_counts

Counter({'intrusion': 414,
         'alarm': 5953,
         'corrective': 4189,
         'site': 5748,
         'service': 8143,
         'checked': 7538,
         'found': 10706,
         'sensor': 2294,
         'intermittent': 351,
         'adjusted': 791,
         'position': 1056,
         'performed': 2876,
         'patron': 155,
         'management': 76,
         'test': 9552,
         'ok': 16587,
         'tested': 5914,
         'put': 4568,
         'gate': 3723,
         'back': 8536,
         'comm': 1121,
         'machine': 2111,
         'degraded': 380,
         'mode': 1635,
         'unplug': 397,
         'plug': 454,
         'lan': 352,
         'cable': 1303,
         'check': 11031,
         'status': 9429,
         'shown': 736,
         'csc': 125,
         'rw': 47,
         'error': 3461,
         'afc': 1113,
         'remote': 1649,
         'reader': 158,
         'secured': 494,
         'connection': 833,
         'reboot': 1479,
         'initalized

In [17]:
# Filter words that appear less than 3
train_df['NOTES_PROCESSED'] = train_df['NOTES_PROCESSED'].apply(filter_noise)

# Remove any that contain only 1 word or are empty 
train_df['LENGTH'] = train_df['NOTES_PROCESSED'].apply(lambda x: len(x.split()))
train_df = train_df[train_df['LENGTH']>1]
train_df = train_df.reset_index(drop=True)
train_df = train_df.drop('LENGTH', axis=1)

# Reshuttle cols
train_df=train_df[['NOTES','NOTES_PROCESSED','TARGET']]

In [18]:
# Convert to lower case, find unique categories
train_df['TARGET'] = train_df['TARGET'].apply(lambda x: x.lower())
sorted(train_df['TARGET'].unique())

[' i/o gp a card failure',
 '(cnt/lti) lift intercom voice communication not working',
 '(do not use) led display com link error',
 '(do not use) led display unable to show priority message',
 '(do not use) pas pmf 906a analogue monitor card fault alarm',
 '(do not use) pas pnc 903a aes/ebu output module fault alarm',
 '(do not use) unknown symptom',
 '1 plc down',
 '12" monitor',
 '18 " monitor/lcd monitor replacement',
 '22" lcd monitor replacement ',
 '24vdc absence',
 'abnormal sound generated from equipment',
 'abnormal wear',
 'adjustment of the swing gate ',
 'air flow not  strong',
 'air leaking',
 'air pressure too strong',
 'air-con cover/ducting/grille dis-lodged ( mis-aligned )',
 'alarm / event message not update / not received',
 'alarm led blinking',
 'alarm ups battery/alarm module/alarm battery',
 'alarm ups module/replacement of alarm battery',
 'alarms / event acknowledge problem -- cannot acknowledge / auto acknowledge',
 'alarms / event messages sent to wrong gws p

In [19]:
# Append other non-indicative class and remove them
tmp_list = ['others', 'miscellaneous', 'fault', 'other symptom not in the list', 'no symptom', 'wrong code selected', 
          'no fault found', 'other fault', 'all other faults not listed', 'other hardware fault', 'other hardware faults', 'faulty',
          'other software faults', 'na-other hardware faults', 'other link failure','other pabx faults', 'other psm printer problems', 
          'other hardware faults/other hardware problem','other hardware faults/other hardware problems', 'other link failure',
          'other software faults/auto rebooting', 'other software faults ', 'no defects found']

train_df = train_df[~train_df["TARGET"].isin(tmp_list)]
train_df = train_df[~train_df["TARGET"].str.contains('do not use', case=False, regex=True)]

In [20]:
# Keep class that where >= 10
value_counts = train_df['TARGET'].value_counts()
keep = value_counts[value_counts >= 10].index
train_df = train_df[train_df['TARGET'].isin(keep)]
train_df = train_df.reset_index(drop=True)

In [21]:
train_df.head()

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET
0,Fault: Intrusion alarm. Corrective action: On...,intrusion alarm corrective site service checke...,patron sensor alignment out/patron sensor bloc...
1,Fault: No comm Corrective action: On site mach...,comm corrective site machine degraded mode unp...,no comms
2,Fault : CSC RW error Corrective action AFC ga...,csc rw error corrective afc gate service site ...,csc r/w/c-through reader
3,Fault: NBX unable to initialise. Corrective a...,nbx unable initialise corrective site service ...,magazine 1/change notebox
4,Fault : Intermtent cannot accept notes Correct...,accept note corrective ttk service site cleanu...,self recovery/ recovered by station staff


### b. Testing Dataset

In [22]:
# Apply fucntion
test_df['NOTES'] = test_df['NOTES'].apply(clean_notes)
test_df['NOTES_PROCESSED'] = test_df['NOTES'].apply(remove_punctuation)
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(remove_stopwords_lemmatize)
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(filter_noise)

In [23]:
# Remove any that contain only 1 word or are empty 
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(lambda x:''.join(x))
test_df['LENGTH'] = test_df['NOTES_PROCESSED'].apply(lambda x: len(x.split()))
test_df = test_df[test_df['LENGTH']>1]
test_df = test_df.reset_index(drop=True)

# Reshuttle cols
test_df = test_df.drop('LENGTH', axis=1)
test_df = test_df[['NOTES','NOTES_PROCESSED','TARGET']] #reshuffle cols

In [24]:
# Convert to lower case, remove non indicative class
test_df['TARGET'] = test_df['TARGET'].apply(lambda x: x.lower())
test_df = test_df[~test_df["TARGET"].isin(tmp_list)]
test_df = test_df[~test_df["TARGET"].str.contains('do not use', case=False, regex=True)]

In [25]:
# Keep class that where >= 10
value_counts = test_df['TARGET'].value_counts()
keep = value_counts[value_counts >= 10].index
test_df = test_df[test_df['TARGET'].isin(keep)]
test_df = test_df.reset_index(drop=True)

In [26]:
test_df.head()

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET
0,"Fault: Gate show ""X"" on screen. Corrective ac...",gate show x screen corrective site service sc ...,ecu replacement
1,"Fault: Gate showing ""X"" on the screen. Correc...",gate showing x screen corrective site oos sc s...,ecu replacement
2,Fault : Intrusion alarm Corrective action : AF...,intrusion alarm corrective afc gate service si...,patron sensor alignment out/patron sensor bloc...
3,Fault : Patron sensor error Corrective action ...,patron sensor error corrective afc gate servic...,patron sensor alignment out/gate intrusion
4,Fault : Intrusion error intermittent Correctiv...,intrusion error intermittent corrective gate s...,patron sensor alignment out/patron sensor bloc...


## 5. Train Test Split

In [27]:
x_train = train_df[['NOTES_PROCESSED']]
y_train = train_df['TARGET']

In [28]:
x_test = test_df[['NOTES_PROCESSED']]
y_test = test_df['TARGET']

## 6. Wide & Deep Learning Model

### a. Deep (LSTM) Preprocessing

In [29]:
# Extract NOTES_PROCESSED from x_train, convert into integers
x_train_notes_processed = x_train['NOTES_PROCESSED']
x_tokenizer = Tokenizer(oov_token='<oov>')
x_tokenizer.fit_on_texts(x_train_notes_processed)
x_train_sequences = x_tokenizer.texts_to_sequences(x_train_notes_processed)
x_train_encoded = pad_sequences(x_train_sequences, padding='post', maxlen=425)

In [30]:
# Get the dict, Find length of dict
word_index = x_tokenizer.word_index
vocab_size = len(word_index) +1 # Padding is 0 so need to +1

In [31]:
# Encoding for y_train
unique_ytrain_labels = list(set(y_train))
ytrainlabel_to_index = dict([
    (unique_ytrain_labels[x], x) for x in range(len(unique_ytrain_labels))])
y_train_encoded = [ytrainlabel_to_index[i] for i in y_train]
y_train_encoded = np.array(y_train_encoded) # Convert to np for LSTM

In [32]:
n_class = len(ytrainlabel_to_index) #len of dict
n_class

296

In [33]:
# Extract NOTES_PROCESSED from x_test, convert into integers
x_test_notes_processed = x_test['NOTES_PROCESSED']
x_test_sequences = x_tokenizer.texts_to_sequences(x_test_notes_processed)
x_test_encoded = pad_sequences(x_test_sequences, padding='post', maxlen=425)

In [34]:
# Label encoding for y_test
y_test_encoded = [ytrainlabel_to_index.get(i, n_class) for i in y_test]
y_test_encoded = np.array(y_test_encoded) # Convert to numpy for neural network
test_indices = [i for i in range(len(y_test_encoded)) if y_test_encoded[i] < n_class]

In [35]:
X_train = x_train_encoded
Y_train = y_train_encoded
X_valid = x_test_encoded[test_indices]
Y_valid = y_test_encoded[test_indices]

### b. Wide (TF-IDF Vectorizer) PreProcessing

In [36]:
# Load previosuly saved vectorizer from Python Script: Prediction of Category Labels using NLP and Machine Learning Models)
with open('tfidf.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

# Transform the training data using the pre-loaded TF-IDF vectorizer
x_train_TFIDF = vectorizer.transform(x_train_notes_processed)

# Transform the test data using the same TF-IDF vectorizer
x_test_revised = x_test_notes_processed[test_indices]
x_test_TFIDF = vectorizer.transform(x_test_revised) 


## 7. Wide & Deep Learning Model (TF-IDF Vectorizer + LSTM)

In [37]:
X_train.shape, type(X_train)

((23325, 425), numpy.ndarray)

In [38]:
x_train_TFIDF.shape, type(x_train_TFIDF)

((23325, 4744), scipy.sparse._csr.csr_matrix)

In [39]:
x_test_TFIDF.shape, type(x_test_TFIDF)

((3921, 4744), scipy.sparse._csr.csr_matrix)

In [40]:
X_valid.shape, type(X_valid)

((3921, 425), numpy.ndarray)

In [41]:
len(ytrainlabel_to_index)

296

In [42]:
# Concat LSTM + TFIDF
x_train_TFIDF_array = x_train_TFIDF.toarray() # convert sparse matrix to array
x_test_TFIDF_array = x_test_TFIDF.toarray()
x_train_combined = np.concatenate((X_train,x_train_TFIDF_array), axis =1)
x_test_combined = np.concatenate((X_valid,x_test_TFIDF_array), axis =1)

In [43]:
# Normalise
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train_combined)
x_test_scaled = scaler.transform(x_test_combined)

In [44]:
def build_model():
    x_input = tf.keras.Input(shape=(5169,), dtype="float32", name="x_input")
    x_sequence = tf.cast(x_input[:,:425],dtype='int32')
    x_lstm_embed = tf.keras.layers.Embedding(vocab_size, 32)(x_sequence)
    x_lstm  = tf.keras.layers.LSTM(
        32, return_sequences=False)(x_lstm_embed)
    x_tfidf = tf.keras.layers.Dense(32)(x_input[:,425:]) 
    x_widedeep = tf.concat([x_lstm,x_tfidf], axis=-1)
    x_logits = tf.keras.layers.Dense(296)(x_widedeep)
    x_probs  = tf.nn.softmax(x_logits, axis=-1)
    return tf.keras.Model(inputs=x_input, outputs=x_probs)

In [45]:
X_train = x_train_scaled
Y_train = y_train_encoded
X_valid = x_test_scaled
Y_valid = Y_valid

lstm_model = build_model()
lstm_optim = tf.keras.optimizers.Adam()
 
lstm_model.compile(
    optimizer=lstm_optim,
    metrics="accuracy",
    loss='sparse_categorical_crossentropy')
lstm_model.fit(
    X_train, Y_train, epochs=10,
    shuffle=True, batch_size=128, validation_data=((X_valid, Y_valid)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f017ce7a30>

Hyperparmeter Tunning of LSTM:

Layers: 32, 64, 128

Batch Size: 32, 64, 128

Epochs: 10, 15

Comparing accuracy and val_accuracy: Layers 32, batch size 128 and epochs 15 provides the best accuracy of 79.99% and val_accuracy of 86.94%.

In [46]:
# Predictions
predictions_lstm = lstm_model.predict(X_valid)
pred_status_lstm = tf.argmax(predictions_lstm, axis=1)
pred_status_lstm = np.array(pred_status_lstm) # Convert to array



In [47]:
# Decoding
index_to_ytrainlabel = {index: value for value, index in ytrainlabel_to_index.items()}
predicted_lstm = [index_to_ytrainlabel[index] for index in pred_status_lstm]
actual = [index_to_ytrainlabel[index] for index in Y_valid] 

In [48]:
# Get results
x_test_revised = x_test_revised.reset_index(drop=True)
x_test_revised_df = pd.DataFrame(x_test_revised)
actual_df = pd.DataFrame(actual)
predicted_lstm_df = pd.DataFrame(predicted_lstm)
results_lstm = pd.concat([x_test_revised_df,actual_df,predicted_lstm_df], axis=1)
results_lstm.columns = ['NOTES_PROCESSED', 'actual', 'predicted'] # Rename cols

In [49]:
results_lstm.sample(5)

Unnamed: 0,NOTES_PROCESSED,actual,predicted
689,intermittent oos corrective gate site checked ...,patron sensor alignment out/gate intrusion,patron sensor alignment out/patron sensor bloc...
204,occ lcd display blank restart turn blue screen...,rms/mss application not responding,rms/mss application not responding
2653,crane handle unit aisle found pallet crane for...,equipment fault with alarms,equipment fault with alarms
3053,car underseat underframe smoke detection faile...,train/vehicle/equipment fault with alarms ( re...,train/vehicle/equipment fault with alarms ( re...
819,date time bgk station staff reported camera di...,camera no video,camera no video


In [50]:
results_lstm[results_lstm['actual'] != results_lstm['predicted']].shape[0]

512

In [51]:
results_lstm[results_lstm['actual'] != results_lstm['predicted']].sample(5)

Unnamed: 0,NOTES_PROCESSED,actual,predicted
589,hbfn car battery charger unknown leading self ...,train/vehicle/equipment fault without alarm/fa...,train/vehicle/equipment fault with alarms ( re...
2136,time cqy stn report blank vnc pd unsucessful s...,default message,com link error
862,time ned dtc report depot camera blank medium ...,blank screen,rms/mss application not responding
1003,nel fm check found screw stuck cause u l lh co...,noisy,tripped
1144,date time l fire extinguisher summary status r...,train/vehicle/equipment fault without alarm/fa...,train/vehicle/equipment fault with alarms ( re...


## 8. Wide & Deep Learning Model (TF-IDF Vectorizer + SimpleRNN)

In [None]:
def build_rnn_model():
    x_input = tf.keras.Input(shape=(5169,), dtype="float32", name="x_input")
    x_sequence = tf.cast(x_input[:,:425],dtype='int32')
    x_rnn_embed = tf.keras.layers.Embedding(vocab_size, 32)(x_sequence)
    x_rnn  = tf.keras.layers.SimpleRNN(
        32, return_sequences=False)(x_rnn_embed)
    x_tfidf = tf.keras.layers.Dense(32)(x_input[:,425:])
    x_widedeep = tf.concat([x_rnn,x_tfidf], axis=-1)
    x_logits = tf.keras.layers.Dense(296)(x_widedeep)
    x_probs  = tf.nn.softmax(x_logits, axis=-1)
    return tf.keras.Model(inputs=x_input, outputs=x_probs)

In [53]:
X_train = x_train_scaled
Y_train = y_train_encoded
X_valid = x_test_scaled
Y_valid = Y_valid
 
rnn_model = build_rnn_model()
rnn_optim = tf.keras.optimizers.Adam()
 
rnn_model.compile(
    optimizer=rnn_optim,
    metrics="accuracy",
    loss='sparse_categorical_crossentropy')
rnn_model.fit(
    X_train, Y_train, epochs=10,
    shuffle=True, batch_size=128, validation_data=((X_valid, Y_valid)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f018703d30>

Hyperparmeter Tunning of SimpleRNN:

Layers: 32, 64, 128

Batch Size: 32, 64, 128

Epochs: 10, 15

Comparing accuracy and val_accuracy: Layers 32, batch size 128 and epochs 15 provides the best accuracy of 79.85% and val_accuracy of 85.72%.

In [54]:
# Predictions
predictions_rnn = rnn_model.predict(X_valid)
pred_status_rnn = tf.argmax(predictions_rnn, axis=1)
pred_status_rnn = np.array(pred_status_rnn) #convert to array



In [55]:
# Decoding
index_to_ytrainlabel = {index: value for value, index in ytrainlabel_to_index.items()} 
predicted_rnn = [index_to_ytrainlabel[index] for index in pred_status_rnn]
actual = [index_to_ytrainlabel[index] for index in Y_valid] 

In [56]:
# Get results
x_test_revised_df = pd.DataFrame(x_test_revised)
actual_df = pd.DataFrame(actual)
predicted_rnn_df = pd.DataFrame(predicted_rnn)
results_rnn = pd.concat([x_test_revised_df,actual_df,predicted_rnn_df], axis=1)
results_rnn.columns = ['NOTES_PROCESSED', 'actual', 'predicted'] # Rename cols

In [57]:
results_rnn.sample(5)

Unnamed: 0,NOTES_PROCESSED,actual,predicted
17,comm corrective afc gate service site checked ...,no comms,no comms
3227,plc obstruction l normalised monitor repeated ...,equipment fault with alarms,equipment fault with alarms
3372,date time cnt station staff reported cctv moni...,camera image flickering,blank screen
1880,date time l schxxx supply air temperature sens...,train/vehicle/equipment fault with alarms ( re...,train/vehicle/equipment fault with alarms ( re...
2792,skg comm link error vnc verification display d...,default message,default message


In [58]:
results_rnn[results_rnn['actual'] != results_rnn['predicted']].shape[0]

560

In [59]:
results_rnn[results_rnn['actual'] != results_rnn['predicted']].sample(5)

Unnamed: 0,NOTES_PROCESSED,actual,predicted
2380,nel fm log dbp dfc se upon arrival lift runnin...,unable to start,tripped
3442,date time ser station staff reported unable do...,blank screen,rms/mss application not responding
3181,hbf c car pec keep self activated facing pgl m...,train/vehicle/equipment fault without alarm/fa...,train/vehicle/equipment fault with alarms ( re...
1833,apu failure replaced apu unit load software fu...,train/vehicle/equipment fault without alarm/fa...,train/vehicle/equipment fault with alarms ( re...
2696,date time opt n b psd door service status faul...,eda lost contact/forced active/limitswitch fault,out of operation/ faulty


## 9. Results

TF-IDF Vectorizer + LSTM: accuracy: 79.99%, val_accuracy: 86.94%

TF-IDF Vectorizer + SimpleRNN: accuracy: 79.85% and val_accuracy: 85.72%

Best model: TF-IDF Vectorizer + LSTM (accuracy: 79.99%, val_accuracy: 86.94%)

In [60]:
# Save file
results_lstm.to_csv('Label Predictions_TFIDFwithLSTM.csv')

In [61]:
# Save model
lstm_model.save('lstm_model.h5')

In [62]:
# # load model and predict
# loaded_model_lstm = load_model('lstm_model.h5')
# predictions_lstm = loaded_model_lstm.predict(X_valid)
# pred_status_lstm = tf.argmax(predictions_lstm, axis=1)