# Recipe 6-1. Retrieving Information

Step 1-1 Import the libraries

In [11]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
import pandas as pd
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english') 

Step 1-2 Create/import documents

In [2]:
# Randomly taking sentences from the internet:
Doc1 = ["With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders." ]

Doc2 = ["Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data."]

Doc3 = ["He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems."]

Doc4 = ["But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg."]

# Put all the documents in one list
fin= Doc1+Doc2+Doc3+Doc4

Step 1-3 Download word2vec

In [3]:
#load the model
model = gensim.models.KeyedVectors.load_word2vec_format('D:\\College\\Semester 6\\NLP (Natural Language Processing)\\TugasBab6_Galih Lanjar Pangastuti_2107412037_CCIT 6B\\GoogleNews-vectors-negative300.bin', binary=True) 

Step 1-4 Create IR system

In [4]:
#Preprocessing
def remove_stopwords(text, is_lower_case=False):
    pattern = r'[^a-zA-z0-9\s]'
    text =  re.sub(pattern, ' ', text) 
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [5]:
# Function to get the embedding vector for n dimension, we have used "300"
def get_embedding(word):
    if word in model.vocab:
        return model[word]
    else:
        return np.zeros(300)

In [6]:
# Getting average vector for each document
out_dict = {}
for sen in fin:
    average_vector = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen))]), axis=0))
    dict = { sen : (average_vector) }
    out_dict.update(dict)

In [7]:
# Function to calculate the similarity between the query vector and document vector
def get_sim(query_embedding, average_vector_doc):
    sim = [(1 - scipy.spatial.distance.cosine(query_embedding, average_vector_doc))]
    return sim

In [8]:
# Rank all the documents based on the similarity to get relevant docs
def Ranked_documents(query):
    query_words = (np.mean(np.array([get_embedding(x) for x in nltk.word_tokenize(query.lower())],dtype=float), axis=0))
    rank = []
    
    for k,v in out_dict.items():
        rank.append((k, get_sim(query_words, v)))
        
    rank = sorted(rank,key=lambda t: t[1], reverse=True)
    print('Ranked Documents :')
    return rank

Step 1-5 Results and applications

In [9]:
# Call the IR function with a query
Ranked_documents("cricket")

Ranked Documents :


[('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.44954328830341783]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.',
  [0.23973446930269127]),
 ('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.18673744058462127]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.',
  [0.17995

In [10]:
# take one more example as may be driving.
Ranked_documents("driving")

Ranked Documents :


[('With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders.',
  [0.3528503913165989]),
 ('But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg.',
  [0.19042557661139026]),
 ('He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems.',
  [0.1706653724240128]),
 ('Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data.',
  [0.0887230

# Recipe 6-2. Classifying Text with Deep Learning

Step 2-2 Identifying potential data sources, collection, and understanding

In [12]:
import pandas as pd

In [13]:
#read file
file_content = pd.read_csv('D:\\College\\Semester 6\\NLP (Natural Language Processing)\\TugasBab6_Galih Lanjar Pangastuti_2107412037_CCIT 6B\\spam.csv', encoding = "ISO-8859-1")

In [14]:
#check sample content in the email
file_content['v2'][1]

'Ok lar... Joking wif u oni...'

Step 2-3 Text preprocessing

In [15]:
#Import library
from nltk.corpus import stopwords
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [16]:
# Remove stop words
stop = stopwords.words('english')
file_content['v2'] = file_content['v2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [18]:
# Delete unwanted columns
Email_Data = file_content[['v1', 'v2']]

In [19]:
# Rename column names
Email_Data = Email_Data.rename(columns={"v1":"Target", "v2":"Email"})
Email_Data.head()

Unnamed: 0,Target,Email
0,ham,"Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say early hor... U c already say...
4,ham,"Nah I think goes usf, lives around though"


In [20]:
#Delete punctuations, convert text in lower case and delete the double space
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x.lower()))
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub(' ', ' ', x))
Email_Data['Email'].head(5)

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4             nah i think goes usf lives around though
Name: Email, dtype: object

In [21]:
#Separating text(input) and target classes
list_sentences_rawdata = Email_Data["Email"].fillna("_na_").values
list_classes = ["Target"]
target = Email_Data[list_classes].values
To_Process=Email_Data[['Email', 'Target']]

Step 2-4 Data preparation for model building

In [22]:
#Train and test split with 80:20 ratio
train, test = train_test_split(To_Process, test_size=0.2)

In [23]:
# Define the sequence lengths, max number of words and embedding dimensions
# Sequence length of each sentence. If more, truncate. If less, pad with zeros
MAX_SEQUENCE_LENGTH = 300

# Top 20000 frequently occurring words
MAX_NB_WORDS = 20000

In [1]:
%pip install tensorflow keras

Note: you may need to restart the kernel to use updated packages.


In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Get the frequently occurring words
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Email)
train_sequences = tokenizer.texts_to_sequences(train.Email)
test_sequences = tokenizer.texts_to_sequences(test.Email)

In [26]:
# dictionary containing words and their index
word_index = tokenizer.word_index
print(tokenizer.word_index)

# total words in the corpus
print('Found %s unique tokens.' % len(word_index))

Found 8493 unique tokens.


In [30]:
# get only the top frequent words on train
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(train_data.shape)

# get only the top frequent words on test
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(test_data.shape)

(4457, 300)
(1115, 300)


In [38]:
train_labels = train['Target']
test_labels = test['Target'] 

In [39]:
#import library
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [40]:
# converts the character array to numeric array. Assigns levels to unique labels.
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

['ham' 'spam']
(array([0, 1]), array([3859,  598], dtype=int64))
(array([0, 1]), array([966, 149], dtype=int64))


In [42]:
from keras.utils import to_categorical

# changing data types
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))

print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

Shape of data tensor: (4457, 300)
Shape of label tensor: (4457, 2)
Shape of label tensor: (1115, 2)


In [43]:
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)

300


Step 2-5 Model building and predicting

In [46]:
# Import Libraries
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

print('Training CNN 1D model.')

Training CNN 1D model.


In [49]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])



In [50]:
# We are now fitting our model to the data. Here we have 5 epochs and a batch size of 64 patterns.
model.fit(train_data, labels_train, 
          batch_size=64,
          epochs=5,
          validation_data=(test_data, labels_test))

Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 103ms/step - acc: 0.7974 - loss: 0.4965 - val_acc: 0.8664 - val_loss: 0.3756
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 110ms/step - acc: 0.9303 - loss: 0.2044 - val_acc: 0.8664 - val_loss: 0.3385
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 111ms/step - acc: 0.9767 - loss: 0.0869 - val_acc: 0.1336 - val_loss: 0.7901
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 110ms/step - acc: 0.9818 - loss: 0.0628 - val_acc: 0.1336 - val_loss: 0.7313
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 110ms/step - acc: 0.9903 - loss: 0.0445 - val_acc: 0.9848 - val_loss: 0.6406


<keras.src.callbacks.history.History at 0x19957c6a760>

In [51]:
#predictions on test data
predicted=model.predict(test_data)
predicted

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


array([[0.5139222 , 0.48607785],
       [0.511462  , 0.48853803],
       [0.51676095, 0.48323902],
       ...,
       [0.5122004 , 0.48779958],
       [0.5192138 , 0.48078617],
       [0.5124003 , 0.48759973]], dtype=float32)

model evaluation

In [53]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(labels_test, predicted.round()))

precision: [0.982706 1.      ]
recall: [1.         0.88590604]
fscore: [0.99127758 0.93950178]
support: [966 149]
############################
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.99      0.98      0.98      1115
 samples avg       0.98      0.98      0.98      1115



Define RNN Model

In [55]:
#import library
from tensorflow.keras.layers import SimpleRNN


#model training
print('Training SIMPLERNN model.')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM,input_length=MAX_SEQUENCE_LENGTH))
model.add(SimpleRNN(2, input_shape=(None,1)))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy',
              
optimizer='adam',metrics = ['accuracy']) 
model.fit(train_data, labels_train, batch_size=16, epochs=5, validation_data=(test_data, labels_test))

Training SIMPLERNN model.
Epoch 1/5


  super().__init__(**kwargs)


[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 89ms/step - accuracy: 0.8536 - loss: 0.5652 - val_accuracy: 0.9103 - val_loss: 0.3997
Epoch 2/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 96ms/step - accuracy: 0.9599 - loss: 0.3046 - val_accuracy: 0.9193 - val_loss: 0.3195
Epoch 3/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 95ms/step - accuracy: 0.9879 - loss: 0.1817 - val_accuracy: 0.9247 - val_loss: 0.2726
Epoch 4/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 95ms/step - accuracy: 0.9961 - loss: 0.1146 - val_accuracy: 0.9274 - val_loss: 0.2470
Epoch 5/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 94ms/step - accuracy: 0.9971 - loss: 0.0783 - val_accuracy: 0.9175 - val_loss: 0.2436


<keras.src.callbacks.history.History at 0x19957c6afd0>

In [56]:
# prediction on test data
predicted_Srnn=model.predict(test_data)
predicted_Srnn

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step


array([[0.9946063 , 0.00539366],
       [0.9988959 , 0.00110413],
       [0.99874735, 0.00125265],
       ...,
       [0.892392  , 0.10760799],
       [0.5846516 , 0.41534844],
       [0.997332  , 0.00266801]], dtype=float32)

In [57]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted_Srnn.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(labels_test, predicted_Srnn.round()))

precision: [0.93525896 0.75675676]
recall: [0.97204969 0.56375839]
fscore: [0.95329949 0.64615385]
support: [966 149]
############################
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       966
           1       0.76      0.56      0.65       149

   micro avg       0.92      0.92      0.92      1115
   macro avg       0.85      0.77      0.80      1115
weighted avg       0.91      0.92      0.91      1115
 samples avg       0.92      0.92      0.92      1115



Long Short-Term Memory (LSTM)

In [60]:
#model training
print('Training LSTM model.')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(units=16, activation='relu', recurrent_activation='hard_sigmoid', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.fit(train_data, labels_train, batch_size=16, epochs=5, validation_data=(test_data, labels_test))

Training LSTM model.
Epoch 1/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 141ms/step - accuracy: 0.9022 - loss: 0.2806 - val_accuracy: 0.9749 - val_loss: 0.2942
Epoch 2/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 85ms/step - accuracy: 0.9954 - loss: 0.0195 - val_accuracy: 0.9857 - val_loss: 0.0555
Epoch 3/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 89ms/step - accuracy: 0.9988 - loss: 0.0044 - val_accuracy: 0.9901 - val_loss: 0.0500
Epoch 4/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 140ms/step - accuracy: 0.9997 - loss: 0.0015 - val_accuracy: 0.9839 - val_loss: 0.0938
Epoch 5/5
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 152ms/step - accuracy: 1.0000 - loss: 4.0466e-04 - val_accuracy: 0.8448 - val_loss: 0.3786


<keras.src.callbacks.history.History at 0x1996c5f35b0>

In [61]:
#prediction on text data
predicted_lstm=model.predict(test_data)
predicted_lstm

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step


array([[1.5544491e-01, 8.4455514e-01],
       [9.9772078e-01, 2.2792502e-03],
       [9.9999309e-01, 6.9325743e-06],
       ...,
       [9.9999154e-01, 8.4486956e-06],
       [9.9999940e-01, 5.5004699e-07],
       [9.9991500e-01, 8.5035208e-05]], dtype=float32)

In [62]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as score

precision, recall, fscore, support = score(labels_test, predicted_lstm.round())
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(labels_test, predicted_lstm.round()))

precision: [0.99749059 0.46226415]
recall: [0.82298137 0.98657718]
fscore: [0.90187181 0.62955032]
support: [966 149]
############################
              precision    recall  f1-score   support

           0       1.00      0.82      0.90       966
           1       0.46      0.99      0.63       149

   micro avg       0.84      0.84      0.84      1115
   macro avg       0.73      0.90      0.77      1115
weighted avg       0.93      0.84      0.87      1115
 samples avg       0.84      0.84      0.84      1115



In [64]:
#model training
print('Training Bidirectional LSTM model.')
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
model.fit(train_data, labels_train, batch_size=16, epochs=3, validation_data=(test_data, labels_test))

Training Bidirectional LSTM model.
Epoch 1/3
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 233ms/step - accuracy: 0.8970 - loss: 0.2938 - val_accuracy: 0.9865 - val_loss: 0.0432
Epoch 2/3
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 251ms/step - accuracy: 0.9947 - loss: 0.0224 - val_accuracy: 0.9910 - val_loss: 0.0334
Epoch 3/3
[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 255ms/step - accuracy: 0.9987 - loss: 0.0057 - val_accuracy: 0.9892 - val_loss: 0.0426


<keras.src.callbacks.history.History at 0x19971227f40>

In [65]:
# prediction on test data
predicted_blstm=model.predict(test_data)
predicted_blstm

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step


array([[1.0000000e+00, 5.0778944e-08],
       [1.0000000e+00, 4.3315271e-10],
       [9.9999869e-01, 1.3478026e-06],
       ...,
       [1.0000000e+00, 9.0523818e-11],
       [1.0000000e+00, 5.4790874e-09],
       [1.0000000e+00, 1.1937731e-10]], dtype=float32)

In [66]:
#model evaluation
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(labels_test, predicted_blstm.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
print("############################")
print(sklearn.metrics.classification_report(labels_test, predicted_blstm.round()))

precision: [0.98773006 1.        ]
recall: [1.         0.91946309]
fscore: [0.99382716 0.95804196]
support: [966 149]
############################
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.92      0.96       149

   micro avg       0.99      0.99      0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115
 samples avg       0.99      0.99      0.99      1115



# Recipe 6-3. Next Word Prediction

Step 3-2 Identifying potential data sources, collection, and understanding

In [67]:
file_content = pd.read_csv('D:\\College\\Semester 6\\NLP (Natural Language Processing)\\TugasBab6_Galih Lanjar Pangastuti_2107412037_CCIT 6B\\spam.csv', encoding = "ISO-8859-1")

In [68]:
# Just selecting emails and connverting it into list
Email_Data = file_content[[ 'v2']]
list_data = Email_Data.values.tolist()
list_data

[['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'],
 ['Ok lar... Joking wif u oni...'],
 ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"],
 ['U dun say so early hor... U c already then say...'],
 ["Nah I don't think he goes to usf, he lives around here though"],
 ["FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"],
 ['Even my brother is not like to speak with me. They treat me like aids patent.'],
 ["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"],
 ['WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hour

Step 3-3 Importing and installing necessary libraries

In [70]:
import numpy as np
import random
import pandas as pd
import sys
import os
import time
import codecs
import collections
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()

Step 3-4 Processing the data

In [71]:
#Converting list to string
from collections import Iterable

def flatten(items):
    """Yield items from any nested iterable"""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x
            
TextData=list(flatten(list_data))
TextData = ''.join(TextData)

  from collections import Iterable


In [74]:
# Remove unwanted lines and converting into lower case
TextData = TextData.replace('\n','')
TextData = TextData.lower()
pattern = r'[^a-zA-z0-9\s]'
TextData = re.sub(pattern, ' ', TextData) 

In [75]:
# Tokenizing
tokens = tokenizer.tokenize(TextData)
tokens = [token.strip() for token in tokens]

In [76]:
# get the distinct words and sort it
word_counts = collections.Counter(tokens)
word_c = len(word_counts)

print(word_c)

distinct_words = [x[0] for x in word_counts.most_common()]
distinct_words_sorted = list(sorted(distinct_words))

10476


In [77]:
# Generate indexing for all words
word_index = {x: i for i, x in enumerate(distinct_words_sorted)}

# decide on sentence length
sentence_length = 25

Step 3-5 Data preparation for modeling

In [78]:
# prepare the dataset of input to output pairs encoded as integers
# Generate the data for the model
# input = the input sentence to the model with index
# output = output of the model with index
InputData = []
OutputData = []

for i in range(0, word_c - sentence_length, 1):
    X = tokens[i:i + sentence_length]
    Y = tokens[i + sentence_length]
    InputData.append([word_index[char] for char in X])
    OutputData.append(word_index[Y])
    
print (InputData[:1])
print ("\n")
print(OutputData[:1])

[[4260, 9652, 5187, 7160, 2847, 1651, 6693, 4898, 2170, 6292, 4356, 10220, 5328, 3431, 2166, 2533, 9127, 4315, 1371, 9941, 6640, 5368, 5152, 10104, 9559]]


[6689]


In [79]:
# Generate X
X = numpy.reshape(InputData, (len(InputData), sentence_length, 1))

# One hot encode the output variable
Y = to_categorical(OutputData)
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Step 3-6 Model building

In [80]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

  super().__init__(**kwargs)


In [86]:
# define the checkpoint
file_name_path = "weights-improvement-{epoch:02d}-{loss:.4f}.keras"
checkpoint = ModelCheckpoint(file_name_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint]

In [87]:
# fit the model
model.fit(X, Y, epochs=5, batch_size=128, callbacks=callbacks) 

Epoch 1/5
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 8.3645
Epoch 1: loss improved from inf to 7.74180, saving model to weights-improvement-01-7.7418.keras
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 97ms/step - loss: 8.3570
Epoch 2/5
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - loss: 6.8230
Epoch 2: loss improved from 7.74180 to 6.86264, saving model to weights-improvement-02-6.8626.keras
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 89ms/step - loss: 6.8234
Epoch 3/5
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - loss: 6.6903
Epoch 3: loss improved from 6.86264 to 6.72709, saving model to weights-improvement-03-6.7271.keras
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 87ms/step - loss: 6.6908
Epoch 4/5
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - loss: 6.5759
Epoch 4: loss improved from 6.72709 to 6.

<keras.src.callbacks.history.History at 0x1997f41c490>

In [89]:
# load the network weights
file_name = "weights-improvement-05-6.5288.keras"
model.load_weights(file_name)
model.compile(loss='categorical_crossentropy', optimizer='adam')

Step 3-7 Predicting next word

In [90]:
# Generating random sequence
start = numpy.random.randint(0, len(InputData))
input_sent = InputData[start]

# Generate index of the next word of the email
X = numpy.reshape(input_sent, (1, len(input_sent), 1))

predict_word = model.predict(X, verbose=0)
index = numpy.argmax(predict_word)

print(input_sent)
print ("\n")
print(index)

[1680, 2993, 2018, 4815, 3827, 8511, 8136, 1572, 5888, 645, 5705, 1238, 5705, 8136, 5001, 5888, 9279, 4575, 1454, 9975, 10042, 9127, 1284, 10397, 2990]


4815


In [91]:
# Convert these indexes back to words
word_index_rev = dict((i, c) for i, c in enumerate(tokens))
result = word_index_rev[index]
sent_in = [word_index_rev[value] for value in input_sent]

print(sent_in)
print ("\n")
print(result)

['wk', 'muz', 'she', 'but', 'exam', 'in', '1', 'in', 'finish', 'too', 'and', 'will', 'and', '1', 'our', 'finish', 'number', 'celebrations', 'more', 'you', 'cut', 'you', 'about', 'to', '4']


but
