In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Testing the model

In [3]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
from keras.models import load_model
testModel = load_model("model.h5", compile=False)
testModel.compile(loss="binary_crossentropy", metrics = ["accuracy", recall_m, precision_m, f1_m])


In [52]:
# Loading Data
csvData = pd.read_csv('data/spam_messages_train.csv', usecols = ["label", "text"])

# Removing duplicates
csvData.drop_duplicates(inplace = True)

# Removing NULL entries
csvData.dropna(inplace = True)

In [53]:
corpus = []
stemmer = PorterStemmer()
for text in csvData['text']:
    tokenized_text = word_tokenize(text)
    stemmed_string = ''
    for word in tokenized_text:
        stemmed_string += stemmer.stem(word) + ' '
    corpus.append(stemmed_string)

len(corpus[0])


137

In [45]:
# import pickle
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray() # tokenized text
y = csvData.iloc[:,0].values #spam / ham

In [46]:
del csvData
del corpus

In [47]:
for label in range(len(y)):
    
    if y[label] == "spam":
        y[label] = 1
    elif y[label] == "ham":
        y[label] = 0

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
del x, y, x_train, y_train

In [48]:
x_test = np.array(x_test, dtype=np.int32)
y_test = np.array(y_test, dtype=np.int32)

In [49]:
x_test.shape

(1171, 45681)

In [50]:
# Define the target shape
target_shape = (1171, 50876)

# Calculate the number of zeros to add to each row
num_zeros_to_add = target_shape[1] - x_test.shape[1]

# Pad each row of x_test with zeros
x_test = np.pad(x_test, ((0, 0), (0, num_zeros_to_add)), mode='constant')


In [51]:
loss, accuracy, f1Score, precision, recall = testModel.evaluate(x_test,y_test)
print(loss, accuracy, f1Score, precision, recall)

3.0991268157958984 0.61742103099823 0.37379777431488037 0.5307261943817139 0.4333006739616394
