## 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import string
import re
import statistics

import gensim
from gensim.models import Word2Vec
from gensim.models import Doc2Vec 
from gensim.models.doc2vec import TaggedDocument

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import pickle

from xgboost import XGBClassifier

In [2]:
pd.set_option('display.max_rows', 1000); pd.set_option('display.max_columns', 1000); pd.set_option('display.width', 1000)

## 2. Functions

In [3]:
# Performs a series of replacements
def clean_notes(text):
    # Remove '\n'
    text = text.replace("\n", " ")
    #Remove >>>, username, date & time
    text = re.sub(r">>> ([a-z0-9_\.-]+) : \d{2}/\d{2}/\d{4} \d{2}:\d{2} ", "", text)
    #reference code ("Ref JR: " + 6 digits)
    text = re.sub("Ref JR: \d{6} ", "", text)
    return text

In [4]:
# Remove punctuation
def remove_punctuation(text):
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    text = text.translate(translator)
    text = re.sub(r'\s+'," ", text)
    return text

In [5]:
# Define Stopwords
nltk.download('stopwords')
stopwords = stopwords.words("english")

wo_stop_words = ['location', 'loc', 'description', 'fault', 'action', 'work', 'start', 'end']

for i in wo_stop_words:
    if i not in stopwords:
        stopwords.append(i)

[nltk_data] Downloading package stopwords to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# remove stopwords, lemmatize, returns only alphabet characters

nltk.download('punkt') # Tokenizer model
nltk.download('wordnet') # Lexical database
# nltk.download('punkt_tab')

def remove_stopwords_lemmatize(sentence):
    # Tokenize
    tokens = word_tokenize(sentence)
    tokens = [word.lower() for word in tokens]

    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Only contains alphabet characters
    tokens = " ".join([word for word in tokens if word.isalpha()])

    return tokens

[nltk_data] Downloading package punkt to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Chun
[nltk_data]     Quan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Filter out the noise from a text by removing words that appear less than 3
def filter_noise(text):
    text = ' '.join([word for word in word_tokenize(text) if word_counts[word] > 3])
    return text

In [8]:
# Transforms text into numerical vectors using Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y): 
            return self
    def transform(self, X):
            return np.array([np.mean([self.word2vec[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0) for words in X])

In [9]:
# Tokenizes the text of each document for Doc2Vec model
def tokenize_text(text):
    tokens = []
    for word in nltk.word_tokenize(text):
        tokens.append(word)
    return tokens

In [10]:
# Generating labels and features for Doc2Vec model
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
    return targets, regressors

In [None]:
# ML model except XGBoost
def train_model(model, vectorizer, X_train, y_train, X_test, y_test, param_grid):
    # Fit model
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    # Generate predictions
    y_pred = best_model.predict(X_test)
    # Accuracy Score
    score = round((accuracy_score(y_test, y_pred)*100),2)
    
    print(f'Vectorizer = {vectorizer}, Accuracy = {score}%')

    return best_model

In [None]:
# XGBoost model
def xgb_train_model(xg, vectorizer, X_train, y_train, X_test, y_test, param_grid):
     # Fit model
     le = LabelEncoder()
     y_xgb = le.fit_transform(y_train)
     grid_search = GridSearchCV(xg, param_grid, cv=5, n_jobs=-1)
     grid_search.fit(X_train, y_xgb)
     # Get the best model from grid search
     best_model = grid_search.best_estimator_
     # Generate predictions
     y_pred = best_model.predict(X_test)
     y_pred = le.inverse_transform(y_pred)
    # Accuracy Score
     score = round((accuracy_score(y_test, y_pred)*100),2)
    
     print(f'Vectorizer = {vectorizer}, Accuracy = {score}%')

     return best_model

## 3. Datasets and Preprocessing

### a. Training Dataset

Training Data: Jan 2020 to Jun 2023

In [13]:
# Load training datasets
tmp_data_1 = pd.read_csv("0_NEL_MMS_WO (2020-2022)_withnotes.csv")
tmp_data_2 = pd.read_csv("0_NEL_MMS_WO (Jan2023 - Jul2023).csv")

In [14]:
# Ensure both dataset has the same columns
tmp_data_2 = tmp_data_2[tmp_data_1.columns]
# Merge dataset
tmp_data = pd.concat([tmp_data_1, tmp_data_2])

In [15]:
# Filter CM Job
train_df = tmp_data[tmp_data["JOB_TYPE"] == "CM"].copy(deep = True)
print(f'CM Work Order DataFrame: {train_df.shape[0]} rows')

# Extract notes & symptom, drop those that are na
train_df = train_df[["NOTES", "DESCRIPTION_SYMPTOM"]]
train_df = train_df.dropna(how="any")

# Rest axis & rename columns
train_df = train_df.reset_index(drop=True)
train_df = train_df.rename(columns = {'DESCRIPTION_SYMPTOM': 'TARGET'})
print(f'Symptom Work Order DataFrame: {train_df.shape[0]} rows')

CM Work Order DataFrame: 66838 rows
Symptom Work Order DataFrame: 27189 rows


In [16]:
train_df.head()

Unnamed: 0,NOTES,TARGET
0,>>> nepsas26 : 25/01/2020 08:57\nFault: Intrus...,Patron sensor alignment out/Patron Sensor bloc...
1,>>> nesctc02 : 30/01/2020 09:57\nFault: No com...,No Comms
2,>>> nealms01 : 29/01/2020 07:21\nFault : CSC R...,CSC R/W/C-Through Reader
3,>>> nealsn01 : 04/01/2020 10:53\nFault : BNA ...,Other Hardware faults/Other hardware problems
4,>>> nepsas26 : 06/01/2020 14:00\nFault: NBX un...,Magazine 1/Change Notebox


### b. Testing Dataset

Testing Data: Jul 2023 - Dec 2023

In [17]:
# Load testing dataset, ensure both dataset has the same columns as training datasets
tmp_data_3 = pd.read_csv('0_NEL_MMS_WO (Jul2023 - Dec2023).csv')
tmp_data_3 = tmp_data_3[tmp_data.columns]

In [18]:
# Filter CM Job
test_df = tmp_data_3[tmp_data_3["JOB_TYPE"] == "CM"].copy(deep = True)
print(f'CM Work Order DataFrame: {test_df.shape[0]} rows')

# Extract notes & symptom, drop those that are N.A.
test_df = test_df[["NOTES", "DESCRIPTION_SYMPTOM"]]
test_df = test_df.dropna(how="any")

# Rest axis & Rename columns
test_df = test_df.reset_index(drop=True)
test_df = test_df.rename(columns = {'DESCRIPTION_SYMPTOM': 'TARGET'})
print(f'Symptom Work Order DataFrame: {test_df.shape[0]} rows')

CM Work Order DataFrame: 8985 rows
Symptom Work Order DataFrame: 5407 rows


In [19]:
test_df.head()

Unnamed: 0,NOTES,TARGET
0,>>> nealsn01 : 26/07/2022 09:05\nFault : Gate ...,Hang
1,>>> nepsas26 : 28/07/2022 10:36\nFault: Gate s...,ECU Replacement
2,>>> nepsas26 : 28/07/2022 11:07\nFault: Gate s...,ECU Replacement
3,>>> dtacto01 : 19/07/2022 08:49\nFault : Intru...,Patron sensor alignment out/Patron Sensor bloc...
4,>>> dtsgto04 : 06/07/2022 13:11\nFault : Patro...,Patron Sensor Alignment Out/Gate Intrusion


## 4. NLP

### a. Training Dataset

In [20]:
# Apply fucntion
train_df['NOTES'] = train_df['NOTES'].apply(clean_notes)
train_df['NOTES_PROCESSED'] = train_df['NOTES'].apply(remove_punctuation)
train_df['NOTES_PROCESSED'] = train_df['NOTES_PROCESSED'].apply(remove_stopwords_lemmatize)

In [21]:
# Counts the frequency of each word
notes_tok = [word_tokenize(word) for word in train_df['NOTES_PROCESSED']]

word_vocab = []
for line in notes_tok:
    for word in line:
        word_vocab.append(word)

word_counts = Counter(word_vocab)
word_counts

Counter({'intrusion': 414,
         'alarm': 5953,
         'corrective': 4189,
         'site': 5748,
         'service': 8143,
         'checked': 7538,
         'found': 10706,
         'sensor': 2294,
         'intermittent': 351,
         'adjusted': 791,
         'position': 1056,
         'performed': 2876,
         'patron': 155,
         'management': 76,
         'test': 9552,
         'ok': 16587,
         'tested': 5914,
         'put': 4568,
         'gate': 3723,
         'back': 8536,
         'comm': 1121,
         'machine': 2111,
         'degraded': 380,
         'mode': 1635,
         'unplug': 397,
         'plug': 454,
         'lan': 352,
         'cable': 1303,
         'check': 11031,
         'status': 9429,
         'shown': 736,
         'csc': 125,
         'rw': 47,
         'error': 3461,
         'afc': 1113,
         'remote': 1649,
         'reader': 158,
         'secured': 494,
         'connection': 833,
         'reboot': 1479,
         'initalized

In [22]:
# Filter words that appear less than 3
train_df['NOTES_PROCESSED'] = train_df['NOTES_PROCESSED'].apply(filter_noise)

# Remove any that contain only 1 word or are empty 
train_df['LENGTH'] = train_df['NOTES_PROCESSED'].apply(lambda x: len(x.split()))
train_df = train_df[train_df['LENGTH']>1]
train_df = train_df.reset_index(drop=True)
train_df = train_df.drop('LENGTH', axis=1)

# Reshuttle cols
train_df=train_df[['NOTES','NOTES_PROCESSED','TARGET']]

In [23]:
# Convert to lower case, find unique categories
train_df['TARGET'] = train_df['TARGET'].apply(lambda x: x.lower())
sorted(train_df['TARGET'].unique())

[' i/o gp a card failure',
 '(cnt/lti) lift intercom voice communication not working',
 '(do not use) led display com link error',
 '(do not use) led display unable to show priority message',
 '(do not use) pas pmf 906a analogue monitor card fault alarm',
 '(do not use) pas pnc 903a aes/ebu output module fault alarm',
 '(do not use) unknown symptom',
 '1 plc down',
 '12" monitor',
 '18 " monitor/lcd monitor replacement',
 '22" lcd monitor replacement ',
 '24vdc absence',
 'abnormal sound generated from equipment',
 'abnormal wear',
 'adjustment of the swing gate ',
 'air flow not  strong',
 'air leaking',
 'air pressure too strong',
 'air-con cover/ducting/grille dis-lodged ( mis-aligned )',
 'alarm / event message not update / not received',
 'alarm led blinking',
 'alarm ups battery/alarm module/alarm battery',
 'alarm ups module/replacement of alarm battery',
 'alarms / event acknowledge problem -- cannot acknowledge / auto acknowledge',
 'alarms / event messages sent to wrong gws p

In [24]:
# Append other non-indicative class and remove them
tmp_list = ['others', 'miscellaneous', 'fault', 'other symptom not in the list', 'no symptom', 'wrong code selected', 
          'no fault found', 'other fault', 'all other faults not listed', 'other hardware fault', 'other hardware faults', 'faulty',
          'other software faults', 'na-other hardware faults', 'other link failure','other pabx faults', 'other psm printer problems', 
          'other hardware faults/other hardware problem','other hardware faults/other hardware problems', 'other link failure',
          'other software faults/auto rebooting', 'other software faults ', 'no defects found']

train_df = train_df[~train_df["TARGET"].isin(tmp_list)]
train_df = train_df[~train_df["TARGET"].str.contains('do not use', case=False, regex=True)]

In [25]:
# Keep class that where >= 10
value_counts = train_df['TARGET'].value_counts()
keep = value_counts[value_counts >= 10].index
train_df = train_df[train_df['TARGET'].isin(keep)]
train_df = train_df.reset_index(drop=True)

In [26]:
train_df.head()

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET
0,Fault: Intrusion alarm. Corrective action: On...,intrusion alarm corrective site service checke...,patron sensor alignment out/patron sensor bloc...
1,Fault: No comm Corrective action: On site mach...,comm corrective site machine degraded mode unp...,no comms
2,Fault : CSC RW error Corrective action AFC ga...,csc rw error corrective afc gate service site ...,csc r/w/c-through reader
3,Fault: NBX unable to initialise. Corrective a...,nbx unable initialise corrective site service ...,magazine 1/change notebox
4,Fault : Intermtent cannot accept notes Correct...,accept note corrective ttk service site cleanu...,self recovery/ recovered by station staff


### b. Testing Dataset

In [27]:
# Apply fucntion
test_df['NOTES'] = test_df['NOTES'].apply(clean_notes)
test_df['NOTES_PROCESSED'] = test_df['NOTES'].apply(remove_punctuation)
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(remove_stopwords_lemmatize)
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(filter_noise)

In [28]:
# Remove any that contain only 1 word or are empty 
test_df['NOTES_PROCESSED'] = test_df['NOTES_PROCESSED'].apply(lambda x:''.join(x))
test_df['LENGTH'] = test_df['NOTES_PROCESSED'].apply(lambda x: len(x.split()))
test_df = test_df[test_df['LENGTH']>1]
test_df = test_df.reset_index(drop=True)

# Reshuttle cols
test_df = test_df.drop('LENGTH', axis=1)
test_df = test_df[['NOTES','NOTES_PROCESSED','TARGET']] #reshuffle cols

In [29]:
# Convert to lower case, remove non indicative class
test_df['TARGET'] = test_df['TARGET'].apply(lambda x: x.lower())
test_df = test_df[~test_df["TARGET"].isin(tmp_list)]
test_df = test_df[~test_df["TARGET"].str.contains('do not use', case=False, regex=True)]

In [30]:
# Keep class that where >= 10
value_counts = test_df['TARGET'].value_counts()
keep = value_counts[value_counts >= 10].index
test_df = test_df[test_df['TARGET'].isin(keep)]
test_df = test_df.reset_index(drop=True)

In [31]:
test_df.head()

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET
0,"Fault: Gate show ""X"" on screen. Corrective ac...",gate show x screen corrective site service sc ...,ecu replacement
1,"Fault: Gate showing ""X"" on the screen. Correc...",gate showing x screen corrective site oos sc s...,ecu replacement
2,Fault : Intrusion alarm Corrective action : AF...,intrusion alarm corrective afc gate service si...,patron sensor alignment out/patron sensor bloc...
3,Fault : Patron sensor error Corrective action ...,patron sensor error corrective afc gate servic...,patron sensor alignment out/gate intrusion
4,Fault : Intrusion error intermittent Correctiv...,intrusion error intermittent corrective gate s...,patron sensor alignment out/patron sensor bloc...


## 5. Train Test Split

In [32]:
x_train = train_df[['NOTES_PROCESSED']]
y_train = train_df['TARGET']

In [33]:
x_test = test_df[['NOTES_PROCESSED']]
y_test = test_df['TARGET']

## 6. Vectorizer

#### a. COUNT VECTORIZER

denoted as CV

In [34]:
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train['NOTES_PROCESSED'])
x_test_cv = cv.transform(x_test['NOTES_PROCESSED'])

#### b. TF-IDF VECTORIZER

denoted as TFIDF

In [35]:
TFIDF = TfidfVectorizer()
x_train_TFIDF =  TFIDF.fit_transform(x_train['NOTES_PROCESSED'])
x_test_TFIDF = TFIDF.transform(x_test['NOTES_PROCESSED'])

#### c. Word2Vec

denoted as W2V

In [36]:
# Tokenise NOTES_PROCESSED
train_df['NOTES_PROCESSED_TOKENIZE'] = [nltk.word_tokenize(i) for i in train_df['NOTES_PROCESSED']]

In [37]:
# Model initialization, building its vocabulary
w2v_model=Word2Vec(train_df['NOTES_PROCESSED_TOKENIZE'],min_count=1)

In [38]:
# Convert word2Vec trained model vocabulary into a dict with embeding
w2v_dict = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors))

In [39]:
print(list(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors)))

[('ok', array([ 0.9008806 , -0.16512233,  1.1946117 ,  0.7401561 ,  0.47636724,
       -0.8715704 ,  2.221702  ,  0.5937816 , -1.1399463 , -0.6228905 ,
       -1.1555357 ,  0.34720016, -0.7784805 ,  0.6554941 ,  0.09207585,
       -1.1774395 ,  1.0797709 ,  0.57351065, -1.0356046 ,  0.86950415,
        0.67328304,  1.1453675 , -0.97414523,  1.1633253 ,  0.8185856 ,
       -1.8319865 ,  0.47599813, -0.50972855, -0.9599857 , -1.1297814 ,
       -2.5128555 , -0.8165294 ,  0.5985943 , -0.48963392,  0.5873814 ,
       -1.5392153 ,  1.980342  , -0.56352156,  2.0463872 ,  1.0887322 ,
       -0.7536325 , -1.305471  ,  0.07004557,  0.11007233,  2.044471  ,
        1.362126  , -0.42321387,  2.0860753 ,  2.223655  , -0.01358497,
        0.6929866 ,  0.72334707, -0.04244995,  0.5873587 , -0.33619958,
        0.5699993 ,  3.1609285 , -1.2382416 , -0.9565436 , -1.0059549 ,
       -0.6193626 ,  1.1969465 , -0.9443411 , -0.23655769,  0.3115774 ,
        0.17366631, -0.61853194,  0.2812666 , -0.1545757

In [40]:
# Call the function w2v_dict to return either average or zero vector
w2v = MeanEmbeddingVectorizer(w2v_dict)

In [41]:
# Tokenise NOTES_PROCESSED
X_train_tok = [nltk.word_tokenize(i) for i in x_train['NOTES_PROCESSED']]  
X_test_tok = [nltk.word_tokenize(i) for i in x_test['NOTES_PROCESSED']]

In [42]:
# Convert to numerical vector represersentations
X_train_w2v = w2v.transform(X_train_tok)
X_test_w2v = w2v.transform(X_test_tok)

#### d. Doc2Vec

denotated as D2V

In [43]:
# Train Test Split
train = train_df
test = test_df

In [44]:
# Create Tagged Documents for Doc2Vec
train_tagged = train.apply(lambda r: TaggedDocument(words = tokenize_text(r['NOTES_PROCESSED']), tags=[r.TARGET]), axis=1)
test_tagged = test.apply(lambda r: TaggedDocument(words = tokenize_text(r['NOTES_PROCESSED']), tags=[r.TARGET]), axis=1)

In [45]:
# Model initialization, building its vocabulary
model_dbow = Doc2Vec(dm = 0, vector_size = 100, min_count=2, alpha=0.025)
model_dbow.build_vocab([x for x in train_tagged.values])

In [46]:
# Training models
epochs = range(15)
for epoch in epochs:
    print(f'Epoch {epoch+1}')
    model_dbow.train(train_tagged, total_examples = model_dbow.corpus_count, epochs = model_dbow.epochs)
    model_dbow.alpha -= 0.00025
    model_dbow.min_alpha = model_dbow.alpha

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15


In [47]:
# Generate numerical vector represersentations
y_train_d2v, X_train_d2v = vec_for_learning(model_dbow, train_tagged)
y_test_d2v, X_test_d2v = vec_for_learning(model_dbow, test_tagged)

## 7. Machine Learning Models

#### a. Navie Bayes

In [None]:
nb = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]}
train_model(nb, 'Count Vectorizer', x_train_cv, y_train, x_test_cv, y_test, param_grid)
train_model(nb, 'TF-IDF Vectorizerr', x_train_TFIDF, y_train, x_test_TFIDF, y_test, param_grid)

Vectorizer = Count Vectorizer, Accuracy = 75.49%
Vectorizer = TF-IDF Vectorizerr, Accuracy = 67.21%


### b. SVM

In [None]:
svc = SVC()
param_grid = {
    'C': [0.1,1,10]}
train_model(svc, 'Count Vectorizer', x_train_cv, y_train, x_test_cv, y_test, param_grid)
train_model(svc, 'TF-IDF Vectorizer', x_train_TFIDF, y_train, x_test_TFIDF, y_test, param_grid)
train_model(svc, 'Word2Vec', X_train_w2v, y_train, X_test_w2v, y_test, param_grid)
train_model(svc, 'Doc2Vec', X_train_d2v, y_train_d2v, X_test_d2v, y_test_d2v, param_grid)

Vectorizer = Count Vectorizer, Accuracy = 85.7%
Vectorizer = TF-IDF Vectorizer, Accuracy = 88.66%
Vectorizer = Word2Vec, Accuracy = 77.72%
Vectorizer = Doc2Vec, Accuracy = 87.47%


### c.  Logistics Regression

In [None]:
# Solver = liblinear
lr = LogisticRegression(solver = 'liblinear', random_state=42)
param_grid = {
    'C': [0.1,1,10], 'max_iter': [100,200]}
train_model(lr, 'Count Vectorizer', x_train_cv, y_train, x_test_cv, y_test, param_grid)
train_model(lr, 'TF-IDF Vectorizer', x_train_TFIDF, y_train, x_test_TFIDF, y_test, param_grid)
train_model(lr, 'Word2Vec', X_train_w2v, y_train, X_test_w2v, y_test, param_grid)
train_model(lr, 'Doc2Vec', X_train_d2v, y_train_d2v, X_test_d2v, y_test_d2v, param_grid)

Vectorizer = Count Vectorizer, Accuracy = 91.03%
Vectorizer = TF-IDF Vectorizer, Accuracy = 81.13%
Vectorizer = Word2Vec, Accuracy = 77.59%
Vectorizer = Doc2Vec, Accuracy = 85.57%


## d. Random Forest

In [None]:
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],            
    'max_depth': [None, 10, 20],            
    'min_samples_split': [2, 5],              
    'min_samples_leaf': [1, 2]}
train_model(rf, 'Count Vectorizer', x_train_cv, y_train, x_test_cv, y_test, param_grid)
train_model(rf, 'TF-IDF Vectorizer', x_train_TFIDF, y_train, x_test_TFIDF, y_test, param_grid)
train_model(rf, 'Word2Vec', X_train_w2v, y_train, X_test_w2v, y_test, param_grid)
train_model(rf, 'Doc2Vec', X_train_d2v, y_train_d2v, X_test_d2v, y_test_d2v, param_grid)

Vectorizer = Count Vectorizer, Accuracy = 97.02%
Vectorizer = TF-IDF Vectorizer, Accuracy = 97.2%
Vectorizer = Word2Vec, Accuracy = 96.77%
Vectorizer = Doc2Vec, Accuracy = 90.12%


## e. XGBoost

In [None]:
xg = XGBClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]} 
xgb_train_model(xg, 'Count Vectorizer', x_train_cv, y_train, x_test_cv, y_test, param_grid)
xgb_train_model(xg, 'TF-IDF Vectorizer', x_train_TFIDF, y_train, x_test_TFIDF, y_test, param_grid)
xgb_train_model(xg, 'Word2Vec', X_train_w2v, y_train, X_test_w2v, y_test, param_grid)
xgb_train_model(xg, 'Doc2Vec', X_train_d2v, y_train_d2v, X_test_d2v, y_test_d2v, param_grid)

Vectorizer = Count Vectorizer, Accuracy = 88.81%
Vectorizer = TF-IDF Vectorizer, Accuracy = 92.32%
Vectorizer = Word2Vec, Accuracy = 96.36%
Vectorizer = Doc2Vec, Accuracy = 79.94%


Best Model: Random Forest with TF-IDF Vectorizer (Accuracy 97.2%) 


## 8. Results

In [53]:
# Get predictions
train_model(rf, 'TF-IDF Vectorizer', x_train_TFIDF, y_train, x_test_TFIDF, y_test)
pred = rf.predict(x_test_TFIDF)
pred_df = pd.DataFrame(pred, columns=['PREDICTION'])
test_df = test_df.reset_index(drop=True)
results = pd.concat([test_df, pred_df], axis=1)
results = results[['NOTES', 'NOTES_PROCESSED', 'TARGET', 'PREDICTION']]

Vectorizer = TF-IDF Vectorizer, Accuracy = 97.2%


In [54]:
results.sample(5)

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET,PREDICTION
1055,Fault Description: CSO reported cab A air con ...,cso reported cab air con water dripping confir...,pax comfort (for external reported fault. smoo...,pax comfort (for external reported fault. smoo...
242,NEL-FM-2022070203 19/07/20222 10:19 PGL PSC A...,nel fm pgl psc aircon faulty informed attend t...,temperature notcold,temperature notcold
2693,1242hrs PTP PTP staff reported all concourse P...,ptp ptp staff reported concourse pid train tim...,blank / wrong / partial information displayed ...,blank / wrong / partial information displayed ...
3233,Date & Time: 031122/ 1007hrs Location: BNKS ...,date time bnks car air compressor failure cab ...,train/vehicle/equipment fault with alarms ( re...,train/vehicle/equipment fault with alarms ( re...
441,Time: 1546hrs Loc: HBFNS Fault: Train 15: Ava...,time hbfns train availability available car tr...,train/vehicle/equipment fault with alarms ( re...,train/vehicle/equipment fault with alarms ( re...


In [55]:
results[results['TARGET']!=results['PREDICTION']].shape[0]

111

In [56]:
results[results['TARGET']!=results['PREDICTION']].sample(5)

Unnamed: 0,NOTES,NOTES_PROCESSED,TARGET,PREDICTION
3308,Fault: BNA error. Corrective action: On site ...,bna error corrective site service sc show bna ...,bna disable/coins & nets only,bna/notes jammed in bna/notebox problem/ high ...
2139,0813hrs PGL PD35 blank Onsite verify with stn ...,pgl blank onsite verify stn staff check found ...,display blank,default message
3678,Time: 0854hrs Loc: BGKS Fault: Car 72003 Door ...,time bgks car door opening slower sec related ...,train/vehicle/equipment fault with alarms ( re...,train/vehicle/equipment fault without alarm/fa...
108,Time: 0625hrs Loc: WLH Fault: Stn report PD21 ...,time wlh stn report blank vnc verification dis...,display blank,default message
3795,Time: 1357hrs Loc:P ML Fault: Train 21 LED sho...,time p ml train led show cqy car tci tetra iso...,train/vehicle/equipment fault without alarm/fa...,pax comfort (for external reported fault. smoo...


In [None]:
# Save file
results.to_csv('results_symptoms.csv')

In [58]:
# Save
pickle.dump(TFIDF, open('tfidf.pkl', 'wb'))
pickle.dump(rf, open('randomforest_model_symptom.pkl', 'wb'))

In [59]:
# # Load
# with open('tfidf.pkl', 'rb') as file:
#     vectorizer = pickle.load(file)

# with open('randomforest_model_symptom.pkl', 'rb') as file:
#     model = pickle.load(file)