<a href="https://colab.research.google.com/github/KhangTheKangaroo/Text-Classification-Emails-/blob/main/TextClassification4Email.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R # Download the dataset

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 91.2MB/s]


In [5]:
# Import necessary libraries

import nltk
nltk.download('stopwords') # English stopword list used by remove_stopwords
nltk.download('punkt') # Tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab') # Tokenizer tables that newer NLTK releases load for word_tokenize
import pandas as pd
import numpy as np
import matplotlib as plt
import string

from sklearn.model_selection import train_test_split # Split the data into train/validation/test subsets
from sklearn.naive_bayes import GaussianNB # Naive Bayes classifier that models each feature as Gaussian within a class
from sklearn.metrics import accuracy_score # Fraction of predictions that match the true labels
from sklearn.preprocessing import LabelEncoder # Map class names to integer labels (and back)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Read the CSV fetched by the download cell into a DataFrame
dataset_path = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(dataset_path)

# Pull the message texts and their category names out as plain Python lists
msgs = df['Message'].tolist()
labels = df['Category'].tolist()

In [8]:
def lowercase_text(txt):
  """Return a copy of `txt` with every cased character converted to lowercase."""
  lowered = txt.lower()
  return lowered

def remove_punctuation(txt):
  """Return `txt` with every character listed in string.punctuation removed.

  Only the ASCII punctuation set is stripped; all other characters
  (letters, digits, whitespace, non-ASCII symbols) are kept in order.
  """
  return ''.join(ch for ch in txt if ch not in string.punctuation)

def tokenize(txt):
  """Split `txt` into a list of tokens using nltk.word_tokenize."""
  word_list = nltk.word_tokenize(txt)
  return word_list

def remove_stopwords(tokens):
  """Return the tokens that are not English stopwords, in their original order.

  Args:
    tokens: iterable of string tokens (already lowercased by the pipeline;
      the comparison against the stopword list is case-sensitive).

  Returns:
    A new list holding every token that is absent from NLTK's English
    stopword list.
  """
  # A set gives constant-time membership tests; the list returned by NLTK
  # would be scanned from the start for every single token.
  stopwords = set(nltk.corpus.stopwords.words('english'))
  return [token for token in tokens if token not in stopwords]

def stemming(tokens):
  """Return a list with the Porter stem of each token, in the same order."""
  stem = nltk.PorterStemmer().stem # One stemmer instance shared by all tokens
  return list(map(stem, tokens))

def preprocess_text(txt):
  """Turn one raw message string into a list of cleaned, stemmed tokens.

  Steps, in order: lowercase, strip punctuation, tokenize, drop stopwords,
  stem each remaining token.
  """
  cleaned = remove_punctuation(lowercase_text(txt))
  kept_tokens = remove_stopwords(tokenize(cleaned))
  return stemming(kept_tokens)

# Replace each raw message string with its list of preprocessed tokens.
# NOTE: this overwrites `msgs`, so the cell is not idempotent -- running it a
# second time would pass token lists (not strings) to preprocess_text. Re-run
# the data-loading cell first if this cell has to be executed again.
msgs = [preprocess_text(message) for message in msgs] # Process the text

In [10]:
# Create a dictionary of UNIQUE words
def create_dict(msgs):
  """Collect the vocabulary: every distinct token, in first-seen order.

  Args:
    msgs: iterable of token lists (one list per message).

  Returns:
    A list of unique tokens ordered by their first appearance across `msgs`.
    An empty input yields an empty list.
  """
  vocabulary = []
  seen = set() # Constant-time "already added?" check; the list only keeps the order
  for tokens in msgs:
    for token in tokens:
      if token not in seen:
        seen.add(token)
        vocabulary.append(token)
  return vocabulary

# Vocabulary list of unique tokens built from all preprocessed messages.
# NOTE: this name shadows the built-in `dict` type for the rest of the notebook;
# it is kept because later cells refer to the vocabulary by this name.
dict = create_dict(msgs)

In [12]:
def create_features(tokens, dict):
  """Build a bag-of-words count vector for one tokenized message.

  Args:
    tokens: iterable of tokens belonging to a single message.
    dict: vocabulary as a sequence of tokens. Entry i of the result counts how
      often dict[i] occurs in `tokens`. (The parameter name shadows the
      built-in `dict` inside this function.)

  Returns:
    A 1-D float numpy array of length len(dict). Tokens that are not in the
    vocabulary are ignored. If the vocabulary contains a token more than once,
    only its first position is incremented.
  """
  features = np.zeros(len(dict))

  # Map each vocabulary token to its first position once per call, instead of
  # scanning the vocabulary list twice (`in` + `.index`) for every token.
  positions = {}
  for position, word in enumerate(dict):
    positions.setdefault(word, position)

  for token in tokens:
    position = positions.get(token)
    if position is not None:
      features[position] += 1

  return features

# Stack one count vector per message into a 2-D feature matrix:
# one row per message, one column per vocabulary token.
X = np.array([create_features(tokens, dict) for tokens in msgs])

In [13]:
# Learn the category names, then map every label to its integer code
le = LabelEncoder().fit(labels)
y = le.transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded Labels: {y}")

Classes: ['ham' 'spam']
Encoded Labels: [0 0 1 ... 0 0 0]


In [14]:
# Split the dataset into 3 parts: Training, Validation, Testing (7/2/1)

VAL_SIZE = 0.2 # Fraction of the full dataset held out for validation
TEST_SIZE = 0.125 # Fraction of the remaining 80% held out for testing: 0.125 * 0.8 = 0.1 of the full dataset
SEED = 0 # Fixed seed so both shuffled splits are reproducible

# First split: carve the validation set off the full data.
X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=VAL_SIZE, random_state=SEED, shuffle=True)
print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)

# Second split: carve the test set off what is left. X_train is overwritten here,
# and the final training labels are the lowercase `y_train` (the capitalised
# `Y_train` still holds the labels from before this second split).
X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=TEST_SIZE, random_state=SEED, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4457, 8166) (1115, 8166) (4457,) (1115,)
(3899, 8166) (558, 8166) (3899,) (558,)


In [15]:
# Fit a Gaussian Naive Bayes classifier on the training split

print('Start Training...')
model = GaussianNB().fit(X_train, y_train)
print('Training completed.')

Start Training...
Training completed.


In [16]:
# Score the fitted model on the held-out validation and test splits
# (the two accuracies should be close to each other)

y_val_pred = model.predict(X_val)
val_acc = accuracy_score(Y_val, y_val_pred)

y_test_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Val Accuracy:{val_acc}")
print(f"Test Accuracy:{test_acc}")

Val Accuracy:0.8816143497757848
Test Accuracy:0.8602150537634409


In [21]:
def predict(text, model, dict):
  """Classify a single raw message and return its decoded class label.

  Args:
    text: raw message string.
    model: fitted classifier exposing `predict`.
    dict: vocabulary list used to build the feature vector (same one the
      model was trained with).

  Returns:
    The predicted class label, decoded through the notebook-level
    LabelEncoder `le`.
  """
  processed_text = preprocess_text(text) # Same cleaning/stemming pipeline as the training data
  # Vectorize the preprocessed tokens, not the raw string: iterating over the
  # raw string would count single characters instead of stemmed words.
  features = create_features(processed_text, dict)
  features = np.array(features).reshape(1, -1) # Shape (1, n_features): a single-sample 2-D array
  prediction = model.predict(features) # Array holding one encoded label
  prediction_cls = le.inverse_transform(prediction)[0] # Turn the encoded label back into its class name

  return prediction_cls

# Two hand-written sample messages to try the classifier on
test_input1 = 'Wanna hangout?'
test_input2 = 'FREE MONEY! CLAIM NOW!!'

for sample_text in (test_input1, test_input2):
  prediction_cls = predict(sample_text, model, dict)
  print(f"Prediction: {prediction_cls}")

Prediction: ham
Prediction: spam
