In [None]:
import re
import string
import nltk
import pandas as pd
from keras import Sequential
from keras.src.layers import Dense
from nltk import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.express as px

In [None]:
# Read the tab-separated corpus (the file is parsed as having no header row)
# and name its two columns in the same expression.
data = (
    pd.read_csv("SMSSpamCollection.csv", sep="\t", header=None)
    .set_axis(["category", "sms"], axis="columns")
)
data

In [None]:
# Numeric target column: 1 where the category equals "spam", 0 everywhere else.
print("0: good\n1: spam")
data["label"] = data["category"].eq("spam").astype("int64")
data

In [None]:
# English stop words consulted by clean_text (kept as the list NLTK returns).
# The corpus is not shipped with the nltk package itself, so on a fresh
# environment the first lookup raises LookupError; download it once and retry
# instead of failing the whole "Restart & Run All".
try:
    stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords", quiet=True)
    stopwords = nltk.corpus.stopwords.words("english")

# One Porter stemmer instance shared by every clean_text call.
ps = PorterStemmer()


# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise one message into a space-separated string of word stems.

    Steps, per whitespace-separated token:
      1. lower-case it and trim punctuation from both ends,
      2. drop it if it is a stop word,
      3. delete any punctuation left inside the token,
      4. stem what remains.

    Punctuation is handled *before* the stop-word check and the stemmer.
    Previously it was removed last, so tokens such as "the," slipped past the
    stop-word filter and tokens such as "winning!" reached the stemmer with
    the punctuation attached and were left unstemmed.

    Args:
        text: the raw message (str).

    Returns:
        The surviving stems joined by single spaces, with no leading or
        trailing whitespace; an empty string when nothing survives.

    Raises:
        TypeError: if ``text`` is not a string (raised by ``re.split``).
    """
    stems = []
    for raw in re.split(r"\s+", text):
        # Edge punctuation is trimmed first so that stop words containing an
        # apostrophe (e.g. "don't") can still be matched verbatim.
        trimmed = raw.lower().strip(string.punctuation)
        if trimmed in stopwords:
            continue
        bare = trimmed.translate(_PUNCT_TABLE)
        # Empty tokens come from leading/trailing whitespace or from tokens
        # that consisted of punctuation only.
        if not bare:
            continue
        stems.append(ps.stem(bare))
    return " ".join(stems)


# Keep the untouched messages in "sms_raw" and always derive "sms" from them.
# Overwriting "sms" with a cleaned version of itself made this cell
# non-idempotent: running it a second time cleaned (and stemmed) text that had
# already been cleaned.
if "sms_raw" not in data.columns:
    data["sms_raw"] = data["sms"]
data["sms"] = data["sms_raw"].apply(clean_text)

data

In [None]:
# Bag-of-words encoding of the cleaned messages.
cv = CountVectorizer()

# NOTE(review): the vocabulary is fitted on every message, including the rows
# that land in the test split below. Confirm whether that leakage is
# acceptable for this experiment.
data_to_train = cv.fit_transform(data["sms"])
# Dense frame; this is the object later cells hand to the model.
data_to_train = pd.DataFrame(data_to_train.toarray())

# A fixed random_state makes the split (and everything measured on it)
# repeatable across kernel restarts; stratify keeps the label distribution
# the same in the train and test parts.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_to_train,
    data["label"],
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

In [None]:
# A single sigmoid unit fed by one input per column of data_to_train.
model = Sequential([
    Dense(
        units=1,
        activation='sigmoid',
        input_shape=(data_to_train.shape[1],),
    ),
])

model.summary()

In [None]:
# Binary cross-entropy with Adam; accuracy is tracked alongside the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Ten epochs; the test split is passed as validation data, so the returned
# history also carries the val_* series.
r = model.fit(
    x=train_data,
    y=train_labels,
    validation_data=(test_data, test_labels),
    epochs=10,
)

In [None]:
def draw_history(r, metrics=("accuracy", "loss")):
    """Show one line chart per metric with its training and validation curves.

    Args:
        r: object exposing ``epoch`` (x-axis values) and ``history`` (mapping
            from series name to per-epoch values), e.g. the value returned by
            ``model.fit``.
        metrics: names of the training series to plot. For each name the
            matching ``val_<name>`` series is added to the same figure when
            the history contains it.

    Raises:
        KeyError: if a requested training series is missing from
            ``r.history``.
    """
    for metric in metrics:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=r.epoch, y=r.history[metric], name=metric))

        # The validation series is optional: it is absent when training ran
        # without validation data, which used to abort plotting with KeyError.
        val_key = f"val_{metric}"
        if val_key in r.history:
            fig.add_trace(go.Scatter(x=r.epoch, y=r.history[val_key], name=val_key))

        # Label the figure so it can be read on its own.
        fig.update_layout(
            title=f"{metric} per epoch",
            xaxis_title="epoch",
            yaxis_title=metric,
        )
        fig.show()


# Plot the accuracy and loss curves recorded in `r`, the result of model.fit.
draw_history(r)

In [None]:
# Score only the held-out split. Predicting on data_to_train would include the
# rows the model was fitted on, so every metric computed from `df` would be
# inflated by training-set performance.
preds = model.predict(test_data).round()

# "true" carries the index of test_labels; the rounded sigmoid outputs are
# flattened to one value per row and stored as integer class ids.
df = pd.DataFrame({
    "true": test_labels,
    "pred": preds.ravel().astype(int),
})

df

In [None]:
# Accuracy of the predictions in `df`, followed by a confusion-matrix heat map
# (y axis labelled with the true classes, x axis with the predicted ones).
y_true, y_pred = df['true'], df['pred']
print(f"Accuracy: {accuracy_score(y_true, y_pred)}")

cm = confusion_matrix(y_true, y_pred)
px.imshow(
    cm,
    x=["p_good", "p_spam"],
    y=["t_good", "t_spam"],
    text_auto=True,
)

In [None]:
# Classify one message typed by the user: clean it with the same routine used
# for the training data, encode it with the fitted vectorizer, and print the
# raw sigmoid output next to its rounded class.
text = clean_text(input("Enter text to predict:"))
print(text)

p = model.predict(cv.transform([text]), verbose=0)

print(f"Sigmoid = {p[0, 0]}, class = {p[0, 0].round()}")