In [1]:
import pickle
import re
import spacy
import nltk
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Download the NLTK 'punkt' and 'stopwords' packages.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Load the small English spaCy pipeline with its parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\unumuser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\unumuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unseen sample text

In [3]:
# List of raw text samples to classify; every element is cleaned by the
# preprocessing loop and then scored by the models loaded further down.
raw_text = ['I find this course somewhat okay, but it still needs improvements. Other than that it is fine.']

In [4]:
# define preprocessing function to remove stopwords and lemmatise the text
def remove_stopwords_lemmatise(text):
    """Lemmatise ``text`` and drop punctuation tokens and English stopwords.

    Args:
        text: The string to process; it is run through the module-level
            spaCy pipeline ``nlp``.

    Returns:
        A single string of the lower-cased lemmas that are not in NLTK's
        English stopword list, joined by single spaces. Returns an empty
        string when no token survives the filtering.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned from the start for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Lower-case each lemma once, skipping tokens spaCy marks as punctuation.
    lemmas = (token.lemma_.lower() for token in nlp(text) if not token.is_punct)
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [5]:
# Clean every raw text sample and collect the results in raw_text_processed.
# Each step must feed the next one: the steps are chained through `text`.
# (Previously every step re-read the untouched `x`, so only the final
# stopword/lemmatisation call had any effect on the stored result.)
# NOTE(review): the pickled vectoriser/tokeniser/models are presumably fitted on
# text cleaned by this same pipeline -- confirm the training notebook applies
# the same chained steps, otherwise inference and training inputs will differ.
raw_text_processed = []
for x in raw_text:
    text = x.lower()
    text = text.strip()
    text = re.sub(r'\d+', '', text)        # drop digit runs
    text = re.sub(r'<br>', '', text)       # drop HTML line breaks
    text = re.sub(r'<br />', '', text)
    text = re.sub(r'[^\w\s]', '', text)    # drop anything that is not a word char or whitespace
    text = re.sub(r' +', ' ', text)        # collapse repeated spaces left by the removals above
    text = remove_stopwords_lemmatise(text)
    raw_text_processed.append(text)

# Multinomial Naive Bayes Testing

In [6]:
# Load the pickled Multinomial Naive Bayes classifier from the working directory.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open('multinomial_nb.p', 'rb') as nb_file:
    nb = pickle.load(nb_file)

In [7]:
# Load the pickled vectoriser used to turn cleaned text into model features.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open('vectoriser.p', 'rb') as vect_file:
    vect = pickle.load(vect_file)

In [8]:
# Transform the cleaned text with the loaded vectoriser, then classify the
# resulting feature matrix with the Naive Bayes model.
x_unseen_dtm = vect.transform(raw_text_processed)
text_predict = nb.predict(x_unseen_dtm)
text_predict  # bare expression so the notebook displays the predicted label(s)

array([5], dtype=int64)

# LSTM RNN Testing

In [13]:
# Load the saved Keras model from the HDF5 file in the working directory.
model = load_model('lstm2.h5')

In [14]:
# Settings for the LSTM text pipeline.
# NOTE(review): these presumably mirror the values used when lstm2.h5 and
# tokeniser.p were created -- confirm against the training notebook.
max_words = 50000 # vocabulary cap: keep only the most common words (not referenced in the cells shown here)
embedding_dim = 64 # not referenced in the cells shown here
max_len = 600 # passed to pad_sequences as maxlen
trunc_type = 'post' # passed to pad_sequences as truncating
padding_type = 'post' # passed to pad_sequences as padding
oov_tok = '<OOV>' # OOV = Out of Vocabulary (not referenced in the cells shown here)

In [15]:
# Load the pickled tokeniser that maps cleaned text to integer sequences.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
with open('tokeniser.p', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [18]:
# Convert the cleaned text to integer sequences, pad/truncate them to max_len,
# and score them with the LSTM model.
seq = tokenizer.texts_to_sequences(raw_text_processed)
padded = pad_sequences(seq, maxlen=max_len, padding=padding_type, truncating=trunc_type)
pred = model.predict(padded)
# pred holds one row of class scores per input text. Taking argmax along axis 1
# gives one class index per text; without an axis NumPy flattens the array and
# returns a single index, which is only correct when exactly one text is scored.
# NOTE(review): presumably class index i corresponds to label i as used by the
# Naive Bayes model -- confirm against the training label encoding.
print(pred, np.argmax(pred, axis=1))

[[4.8705384e-10 1.9373713e-03 7.2418556e-02 8.6108887e-01 5.4922651e-02
  9.6324999e-03]] 3
