In [None]:
!python -m nltk.downloader punkt
!python -m nltk.downloader stopwords

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset, keeping just the two columns this notebook works with
columns_to_read = ["label", "text"]
csvData = pd.read_csv('data/dataset.csv', usecols=columns_to_read)

In [None]:
# Clean the frame in a single chain: drop exact duplicate rows first,
# then drop every row that still has a missing value in any column.
csvData = csvData.drop_duplicates().dropna()

In [None]:
# Build the training corpus: one string per message in which every token
# produced by word_tokenize is Porter-stemmed and followed by a single space.
stemmer = PorterStemmer()
corpus = [
    ''.join(stemmer.stem(token) + ' ' for token in word_tokenize(message))
    for message in csvData['text']
]


# Character length of the first stemmed message (shown as the cell output)
len(corpus[0])


In [None]:
# from nltk.stem import WordNetLemmatizer

# corpusLemma = []
# wnl = WordNetLemmatizer()
# for text in csvData['text']:
#     tokenized_text = word_tokenize(text)
#     stemmed_string = ''
#     for word in tokenized_text:
#         stemmed_string += wnl.lemmatize(word) + ' ' # lemmatized word
#     corpusLemma.append(stemmed_string)



In [None]:
# Bag-of-words features: fit the vocabulary on the stemmed corpus and expand
# the resulting sparse count matrix into a dense array.
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()  # document-term count matrix

# Select the labels by column name rather than by position: `usecols` in
# read_csv does not control column order, so position 0 is only "label" if
# the CSV happens to store that column first.
# A writable object-dtype copy is requested so the array can later hold the
# integer class codes regardless of how pandas stores string columns.
y = csvData["label"].to_numpy(dtype=object, copy=True)  # spam / ham

In [None]:
# Release the raw frame and the stemmed corpus; neither is referenced in the cells below
del csvData, corpus

In [None]:
# Encode the labels as integers: "spam" -> 1, "ham" -> 0. Any other value is
# passed through unchanged, exactly as the element-by-element loop did.
# A fresh object array is built instead of writing into `y` in place: `y`
# comes from a DataFrame column, and depending on the pandas version such an
# array can be read-only or string-typed, in which case assigning ints fails.
label_codes = {"spam": 1, "ham": 0}
y = np.array([label_codes.get(value, value) for value in y], dtype=object)


In [None]:
# Hold out 20% of the samples. A fixed seed makes the split repeatable across
# kernel restarts; without it every run trains on a different subset.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Drop the full feature matrix and label vector now that they have been split ...
del x, y
# ... and the held-out 20% split, which no cell below reads
del x_test, y_test

In [None]:
# Give the training arrays explicit numeric dtypes before they reach Keras:
# features as float32, labels as int32.
x_train = np.asarray(x_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.int32)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model

In [None]:
# Fully connected binary classifier: two ReLU hidden layers (800 and 400
# units) followed by a single sigmoid output unit.
input_width = len(x_train[0])  # one input per vocabulary term in the count matrix
model = Sequential([
    Dense(800, input_shape=(input_width,), activation="relu"),
    Dense(400, activation="relu"),
    Dense(1, activation='sigmoid'),
])
model.compile(loss="binary_crossentropy", metrics=["accuracy"])


In [None]:
# `tensorboardCallback` was referenced here without ever being defined in the
# notebook, so this cell raised NameError on a fresh kernel. Define it next
# to its only use; training logs are written under ./logs.
from tensorflow.keras.callbacks import TensorBoard

tensorboardCallback = TensorBoard(log_dir="logs")

# Train for 20 epochs in mini-batches of 100; `hist.history` keeps the
# per-epoch metrics that the plotting cell reads.
hist = model.fit(x_train, y_train, epochs=20, batch_size=100, callbacks=[tensorboardCallback])

# model.save('model.h5')

In [None]:
import matplotlib.pyplot as plt

# Training accuracy per epoch, taken from the History object returned by model.fit.
fig, ax = plt.subplots()
ax.plot(hist.history['accuracy'], color='teal', label='accuracy')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Training accuracy')
ax.legend()  # the label= passed to plot() is only rendered once a legend is drawn
plt.show()

In [None]:
# Persist the trained network to an HDF5 file in the working directory.
# NOTE(review): the fitted CountVectorizer `cv` is not saved here; loading this
# model elsewhere would need the same vocabulary to build inputs - confirm
# whether it should be persisted as well.
saved_model_path = 'best-mail-classifier-colab.h5'
model.save(saved_model_path)

In [None]:
user_text = "Hello Barry. How are you?"

# Apply the same preprocessing the training corpus went through (tokenize,
# then Porter-stem every token); the vectorizer's vocabulary consists of
# stemmed forms, so raw text would miss or mismatch features.
stemmed_user_text = ' '.join(stemmer.stem(word) for word in word_tokenize(user_text))

# Match the training input format: a dense float32 row, not a sparse matrix.
user_features = cv.transform([stemmed_user_text]).toarray().astype(np.float32)
prediction = model.predict(user_features)[0]

prediction

In [None]:
# Threshold the sigmoid output at 0.5: above it the message is reported as spam, otherwise ham
verdict = "spam" if prediction > 0.5 else "ham"
print(verdict)