In [7]:
# Standard library
import string

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Download the NLTK data packages this notebook requests, in the same order
# as before; packages that are already present are reported as up-to-date.
for resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/heigetvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/heigetvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/heigetvu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


LOAD DATA

In [8]:
# CSV with one row per text: the text itself is in "Message" and its class
# label is in "Category". The path is relative to the working directory.
DATASET_PATH = "./2cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)

# Extract the two columns used downstream as plain Python lists.
messages, labels = (df[column].values.tolist() for column in ("Message", "Category"))

PREPROCESSING

In [9]:
def lowercase(text):
    """Return ``text`` converted to lower case via ``text.lower()``."""
    lowered = text.lower()
    return lowered

def punctuation_removal(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Mapping each punctuation character to None makes translate() drop it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

def tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    tokens = nltk.word_tokenize(text)
    return tokens

def stopwords_removal(tokens):
    """Drop every token that appears in NLTK's English stop-word list.

    Args:
        tokens: Iterable of token strings. Matching is exact, so tokens are
            compared as given (no case folding is done here).

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    # A set gives constant-time membership tests; with the plain list every
    # token triggered a linear scan over the whole stop-word list.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def steming(tokens):
    """Return a list with the Porter stem of each token, in input order."""
    porter = nltk.PorterStemmer()
    return list(map(porter.stem, tokens))

def preprocess_text(text):
    """Run the full cleaning pipeline on one raw text and return its tokens.

    Steps, in order: lowercase -> strip punctuation -> tokenize ->
    remove stop words -> stem.
    """
    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" reaches the filter as "dont" -- confirm the
    # stop-word list still matches such forms.
    cleaned = punctuation_removal(lowercase(text))
    return steming(stopwords_removal(tokenize(cleaned)))

# Replace each raw message string with its list of preprocessed tokens.
# NOTE: this rebinds ``messages``; re-running this cell without reloading the
# data would feed token lists (not strings) into preprocess_text().
messages = list(map(preprocess_text, messages))

CREATING DICTIONARY

In [10]:
def create_dictionary(messages):
    """Build the vocabulary: every distinct token, in order of first appearance.

    Args:
        messages: Iterable of token sequences (one sequence per message).
            Tokens must be hashable (e.g. strings).

    Returns:
        A list of the unique tokens; a token's position in this list is stable
        and follows the order in which tokens are first encountered.
    """
    # dict preserves insertion order, so fromkeys() de-duplicates while keeping
    # first-occurrence order -- without the quadratic ``token not in list``
    # scan the previous implementation performed for every token.
    return list(dict.fromkeys(token for tokens in messages for token in tokens))

# Vocabulary built from all preprocessed messages; a token's position in this
# list is the feature column it is counted in by create_features().
dictionary = create_dictionary(messages)

CREATING FEATURES BY COUNTING HOW MANY TIMES EACH VOCABULARY WORD APPEARS IN A MESSAGE

In [12]:
def create_features(tokens, dictionary):
    """Turn one message's tokens into a bag-of-words count vector.

    Args:
        tokens: Iterable of hashable tokens (e.g. strings) for one message.
        dictionary: Sequence of vocabulary tokens; position ``i`` defines
            feature ``i``.

    Returns:
        1-D ``numpy.ndarray`` of length ``len(dictionary)`` whose ``i``-th
        entry is how many times ``dictionary[i]`` occurs in ``tokens``.
        Tokens that are not in ``dictionary`` are ignored.
    """
    features = np.zeros(len(dictionary))

    # Build the token -> column lookup once per call instead of running
    # ``token in dictionary`` and ``dictionary.index(token)`` (two linear
    # scans) for every token. setdefault keeps the first position of a
    # repeated entry, which is what list.index() returned.
    index_of = {}
    for position, word in enumerate(dictionary):
        index_of.setdefault(word, position)

    for token in tokens:
        position = index_of.get(token)
        if position is not None:
            features[position] += 1

    return features

# One count vector per message, collected into a single 2-D array
# (rows = messages, columns = vocabulary entries).
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [20]:
# Sanity check: (number of messages, vocabulary size).
X.shape

(5572, 8166)

In [13]:
# Map the string class names to integer ids. The fitted encoder is kept in
# ``le`` so predicted ids can be decoded back to class names later.
le = LabelEncoder().fit(labels)
y = le.transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


SPLIT TRAIN / VAL / TEST

In [14]:
VAL_SIZE = 0.2     # fraction of the full dataset held out for validation
TEST_SIZE = 0.125  # fraction of the remaining 80% held out for testing (0.125 * 0.8 = 0.1 of the full dataset)
SEED = 0

# Both splits shuffle with the same fixed seed so the partition is reproducible.
split_options = {"shuffle": True, "random_state": SEED}

# Step 1: carve the validation set out of the full data.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SIZE, **split_options)
# Step 2: carve the test set out of what is left; X_train / y_train are rebound to the final training split.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=TEST_SIZE, **split_options)

MODEL

In [15]:
# Gaussian Naive Bayes classifier, fitted on the training split only.
# NOTE(review): the features are word counts; confirm that a Gaussian
# likelihood is the intended choice for them.
model = GaussianNB()
print("Starting training...")
model.fit(X_train, y_train)  # fits in place; fit() returns the same estimator
print("Training complted")



Starting training...
Training complted


In [16]:
# Predict once for each held-out split.
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Accuracy on test set: {test_accuracy}")
print(f"Accuracy on validation set: {val_accuracy}")

# NOTE(review): the two prints below recompute and repeat exactly the values
# printed above; kept so the cell's output is unchanged -- candidate for removal.
print(f"Accuracy on test set: {accuracy_score(y_test, y_test_pred)}")
print(f"Accuracy on validation set: {accuracy_score(y_val, y_val_pred)}")

Accuracy on test set: 0.8602150537634409
Accuracy on validation set: 0.8816143497757848
Accuracy on test set: 0.8602150537634409
Accuracy on validation set: 0.8816143497757848


In [None]:
def prediction(text, model, dictionary):
    """Classify one raw text and return its decoded class label.

    Args:
        text: Raw, unprocessed input string.
        model: Fitted classifier exposing ``predict``.
        dictionary: Vocabulary passed to ``create_features`` to build the
            feature vector.

    Returns:
        The predicted class as decoded by the module-level encoder ``le``.
    """
    tokens = preprocess_text(text)
    # predict() is given a 2-D array of shape (n_samples, n_features); here n_samples == 1.
    sample = np.array(create_features(tokens, dictionary)).reshape(1, -1)
    encoded = model.predict(sample)
    # Relies on the global LabelEncoder ``le`` fitted earlier in the notebook.
    decoded = le.inverse_transform(encoded)
    return decoded[0]

# Smoke test: classify a single hand-written sentence with the trained model.
print(prediction("I am actually thinking a way of doing something useful", model, dictionary))

ham
