# Spam Email Classification with Naive Bayes

In [1]:
# Import Libraries (standard library first, then third-party)
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources used below: the stop-word corpus for
# remove_stopwords and the punkt tokenizer data for word_tokenize.
# Kept after the imports so every dependency is visible in one block.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KietKlat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KietKlat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\KietKlat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 1. Read and Split the dataset

In [2]:
# Load the dataset from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv("../data/2cls_spam_text_cls.csv")
# Preview the first rows; the columns used later are "Category" and "Message"
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Pull the raw message texts and their labels out of the frame as plain Python lists
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()



## Pre-processing the dataset

### Processing steps:
- `Message` -> `Lower case` -> `Punctuation Removal` -> `Tokenize` -> `Remove Stopword` -> `Stemming` 

#### Step 1: Lower case


In [4]:
def lower_case(text):
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered


#### Step 2: Punctuation Removal
- Eliminates all ASCII punctuation marks (the characters in `string.punctuation`), such as `,`, `.` and `'`

In [5]:
def punctuation_removal(text):
  """Return ``text`` with every character of ``string.punctuation`` deleted.

  Only the ASCII punctuation listed in ``string.punctuation`` is removed;
  all other characters (letters, digits, whitespace, non-ASCII symbols)
  are kept unchanged.
  """
  # Mapping a code point to None tells str.translate to drop that character.
  deletion_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(deletion_table)


#### Step 3: Tokenize

In [6]:
def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

#### Step 4: Remove Stopword
- Filters out common words that don't carry significant meaning

In [7]:
# Stop-word sets keyed by language, filled lazily on first use so the corpus
# file is read once instead of once per message.
STOPWORD_SETS = {}


def remove_stopwords(tokens, language='english'):
    """Return the tokens that are not stop words, in their original order.

    Args:
        tokens: iterable of string tokens. The comparison is an exact match,
            so tokens should already be lower-cased.
        language: name of the NLTK stop-word list to use (default 'english').

    Returns:
        A new list containing every token that is not in the stop-word set.
    """
    stop_words = STOPWORD_SETS.get(language)
    if stop_words is None:
        # A frozenset gives constant-time membership tests and guards the
        # cached value against accidental mutation.
        stop_words = frozenset(nltk.corpus.stopwords.words(language))
        STOPWORD_SETS[language] = stop_words

    return [token for token in tokens if token not in stop_words]

In [8]:
# Sanity check: run steps 1-4 (lower case, punctuation removal, tokenize,
# stop-word removal) on a sample sentence
test = "Hello, world! This is a test."
print(remove_stopwords(tokenize(punctuation_removal(lower_case(test)))))

['hello', 'world', 'test']


#### Step 5: Stemming
- Reduces words to their root form, grouping similar words together

In [9]:
def stemming(tokens):
  """Return the Porter stem of every token, keeping the input order."""
  porter = nltk.stem.PorterStemmer()
  return list(map(porter.stem, tokens))

In [10]:
# Sanity check: chain all five preprocessing steps by hand on a sample sentence
s = "The cat's toys are scattered everywhere."
print(stemming(remove_stopwords(tokenize(punctuation_removal(lower_case(s))))))

['cat', 'toy', 'scatter', 'everywher']


In [11]:
# Combine all preprocessing steps into a single function
def preprocess_text(text):
  """Run the full preprocessing pipeline on one raw message.

  The steps are applied in this order: lower case -> punctuation removal ->
  tokenize -> stop-word removal -> stemming.

  Args:
    text: the raw message string.

  Returns:
    The list of stemmed tokens produced by the last step.

  Note:
    Punctuation is stripped before stop words are filtered, so a contraction
    such as "don't" reaches the filter as "dont".
    NOTE(review): stop-word entries that contain an apostrophe therefore
    never match - confirm this is acceptable.
  """
  pipeline = (lower_case, punctuation_removal, tokenize, remove_stopwords, stemming)
  result = text
  for step in pipeline:
    # Each step consumes the previous step's output.
    result = step(result)
  return result



In [12]:
# Sanity check: the combined helper applied to a sample sentence
s = "The cat's toys are scattered everywhere."
print(preprocess_text(s))

['cat', 'toy', 'scatter', 'everywher']


In [13]:
# Pre-processing the dataset
# Read the raw texts straight from `df` instead of from `messages`: rebinding
# `messages` from itself would, on a second run of this cell, pass token lists
# to preprocess_text and fail. Sourcing from `df` keeps the cell re-runnable.
messages = [preprocess_text(message) for message in df["Message"].values.tolist()]

In [14]:
# Each entry is now a list of stemmed tokens rather than a raw string
print(messages[:5])  # Display the first 5 preprocessed messages

[['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat'], ['ok', 'lar', 'joke', 'wif', 'u', 'oni'], ['free', 'entri', '2', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', 'text', 'fa', '87121', 'receiv', 'entri', 'questionstd', 'txt', 'ratetc', 'appli', '08452810075over18'], ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'], ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though']]


### Create a dictionary

In [15]:
def create_dictionary(messages):
  """Build the vocabulary: every distinct token, in first-seen order.

  Args:
    messages: iterable of token lists (one list per message). Tokens must be
      hashable, which holds for the string tokens used here.

  Returns:
    A list of the unique tokens, ordered by first appearance.
  """
  # dict keys are unique and keep insertion order, so this de-duplicates in a
  # single pass instead of scanning a growing list for every token.
  unique_tokens = dict.fromkeys(token for tokens in messages for token in tokens)
  return list(unique_tokens)

# Vocabulary over the whole preprocessed corpus.
# NOTE(review): this is built before the train/validation/test split, so it also
# contains tokens that occur only in held-out messages - confirm this is intended.
dictionary = create_dictionary(messages)

### Create a features vector

In [16]:
def create_features(tokens, dictionary):
  """Turn one tokenized message into a bag-of-words count vector.

  Args:
    tokens: iterable of tokens for a single message. Tokens must be hashable,
      which holds for the string tokens used here.
    dictionary: sequence of vocabulary tokens; position i of the result counts
      occurrences of dictionary[i].

  Returns:
    A 1-D NumPy float array of length len(dictionary). Tokens that are not in
    the dictionary are ignored.
  """
  # Map each vocabulary entry to its first position (the same position
  # list.index() would return), so each token costs one hash lookup instead
  # of two linear scans of the vocabulary.
  position = {}
  for idx, word in enumerate(dictionary):
    position.setdefault(word, idx)

  features = np.zeros(len(dictionary))
  for token in tokens:
    idx = position.get(token)
    if idx is not None:
      features[idx] += 1
  return features

# Dense feature matrix: one row per message, one column per dictionary token
X = np.array([create_features(tokens, dictionary) for tokens in messages])


### Pre-processing label data
- ham -> 0
- spam -> 1


In [17]:
# Encode the string labels as integers; `le` is kept so predictions can be
# mapped back to class names later with inverse_transform
le = LabelEncoder()
y = le.fit_transform(labels)
# classes_[i] is the label that was assigned the integer code i
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")


Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


## Split the dataset into train, validation and test sets (70% / 20% / 10%)

In [18]:
# Fraction of the full dataset held out for validation
VAL_SIZE = 0.2
# Fraction of the remaining 80% held out for testing: 0.125 * 0.8 = 0.1 of the full dataset
TEST_SIZE = 0.125 # 0.1/(1-0.2)
# Fixed seed so both shuffled splits are reproducible
SEED = 0

# First split: 80% provisional train / 20% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED)

# Second split: carve the test set out of the provisional train set (final ratio 70 / 20 / 10)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=TEST_SIZE, shuffle=True, random_state=SEED)




## Training the model

In [19]:
# Fit a Gaussian Naive Bayes classifier on the training split only.
# NOTE(review): GaussianNB models each feature as a continuous Gaussian, while
# these features are word counts; scikit-learn documents MultinomialNB as the
# variant for count features - consider comparing the two on the validation set.
gaussNB = GaussianNB()
print("Training the model...")
# fit() returns the estimator itself, so `model` and `gaussNB` are the same object
model = gaussNB.fit(X_train, y_train)
print("Model trained successfully.")


Training the model...


Model trained successfully.


## Model Evaluation

In [20]:
# Predict on the two held-out splits
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Accuracy = fraction of messages whose predicted label equals the true label
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Val accuracy: {val_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

Val accuracy: 0.8816
Test accuracy: 0.8602


## Predict

In [21]:
def predict(text, model, dictionary, label_encoder):
  """Classify one raw message and return its label name.

  Args:
    text: the raw message string.
    model: fitted classifier exposing ``predict``.
    dictionary: vocabulary passed to ``create_features`` to vectorize the message.
    label_encoder: encoder whose ``inverse_transform`` maps the predicted
      integer code back to the original label.

  Returns:
    The decoded label for the message.
  """
  tokens = preprocess_text(text)
  # The model expects a 2-D input, so the single vector becomes a 1-row matrix.
  row = np.array(create_features(tokens, dictionary)).reshape(1, -1)
  encoded = model.predict(row)
  return label_encoder.inverse_transform(encoded)[0]

# Example usage: classify one hand-written sentence with the trained model
test_input = "I am actually thinking a way of doing something useful"
prediction_cls = predict(test_input,model, dictionary, le)
print(f"Prediction for '{test_input}': {prediction_cls}")

   
    

Prediction for 'I am actually thinking a way of doing something useful': ham
